# House Prices - Advanced Regression Techniques

In this notebook we analyse the data and experiment with it.

*To view the full code, refer to `kaggle.ipynb` - This notebook was submitted to the competition.*

In [59]:
import pandas as pd

import torch
from torch import nn

torch.__version__

'2.0.1+cu117'

# Loading the Data

In [60]:
# Relative paths of the train and test CSV files that are loaded in the next cell.
train_dataset_path = "data/train.csv"
test_dataset_path = "data/test.csv"

In [61]:
# Read both splits into DataFrames in one pass over the configured paths.
train_dataset, test_dataset = (
    pd.read_csv(csv_path) for csv_path in (train_dataset_path, test_dataset_path)
)

# Analysing the Data

In [62]:
# Preview the first five rows of the training split (includes the SalePrice target).
train_dataset.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [63]:
# Preview the first five rows of the test split.
test_dataset.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


## Missing Values

In [64]:
from helper_functions import pd_to_csv

# `isna().sum()` gives the number of NaN entries per column.
# NOTE(review): `pd_to_csv` is a project helper; presumably it writes the Series to the
# given path - confirm in helper_functions.

# Saving Missing Values in train_dataset
pd_to_csv(train_dataset.isna().sum(), "raw_data/train_missing_values.csv")

# Saving Missing Values in test_dataset
pd_to_csv(test_dataset.isna().sum(), "raw_data/test_missing_values.csv")

'DONE'

In [65]:
# Viewing the Missing Values
# Reads back the per-column NaN counts that the previous cell saved for the training split.
train_missing_values = pd.read_csv("raw_data/train_missing_values.csv")

train_missing_values

Unnamed: 0.1,Unnamed: 0,0
0,Id,0
1,MSSubClass,0
2,MSZoning,0
3,LotFrontage,259
4,LotArea,0
...,...,...
76,MoSold,0
77,YrSold,0
78,SaleType,0
79,SaleCondition,0


## DataTypes

In [66]:
# Column dtypes of the training split; `object` columns hold non-numeric values.
train_dataset.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

*Here we can see that not all columns are integer or float; some are `object` (i.e. strings), which means we need to convert them to numbers before they can be turned into tensors.*

## Duplicated Values

In [67]:
# Count fully duplicated rows in each split, then report both in a single print call.
train_duplicates = train_dataset.duplicated().sum()
test_duplicates = test_dataset.duplicated().sum()

print(
    "Training Dataset Duplicated Values",
    train_duplicates,
    "\n----------------\n",
    "Test Dataset Duplicated Values",
    test_duplicates,
    sep="\n",
)

Training Dataset Duplicated Values
0

----------------

Test Dataset Duplicated Values
0


**NOTE**: After viewing the data and `data_description`, I came to understand that no data is actually missing: everything is given, and we just need to convert it to the desired format.

# Pre-Processing Data

This involves transforming the data into the form that works best for our ML model.

To know how it was achieved in detail, please refer to [Transform Data Guide](https://github.com/adityajideveloper/kaggle-competition/house-prices/transform_data.md)

In [68]:
from helper_functions import transform_csv

# NOTE(review): `transform_csv` is a project helper; presumably it converts the
# non-numeric columns to numbers and writes the result to the given path (the next cell
# reads these files back and shows numeric values) - confirm in helper_functions.
transform_csv(train_dataset, "raw_data/train.csv")
transform_csv(test_dataset, "raw_data/test.csv")

## Converting Data to Tensor

In [69]:
# Load the transformed train/test CSV files from `raw_data/`.
df = pd.read_csv("raw_data/train.csv")
df_test = pd.read_csv("raw_data/test.csv")

df.head(5)

Unnamed: 0.1,Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,1,60,5,65.0,8450,2,0.0,0,0,...,0,0.0,0.0,0.0,0,2,2008,0,0,208500
1,1,2,20,5,80.0,9600,2,0.0,0,0,...,0,0.0,0.0,0.0,0,5,2007,0,0,181500
2,2,3,60,5,68.0,11250,2,0.0,1,0,...,0,0.0,0.0,0.0,0,9,2008,0,0,223500
3,3,4,70,5,60.0,9550,2,0.0,1,0,...,0,0.0,0.0,0.0,0,2,2006,0,1,140000
4,4,5,60,5,84.0,14260,2,0.0,1,0,...,0,0.0,0.0,0.0,0,12,2008,0,0,250000


In [70]:
# Removing ID
# Select the columns to discard by name in a single step: the leftover index column
# that `read_csv` labels "Unnamed: ..." and the "Id" identifier. Unlike a positional
# `df.columns[0]` drop, re-running this cell is a no-op instead of silently removing a
# real feature column.
df = df.drop(
    columns=[col for col in df.columns if str(col).startswith("Unnamed") or col == "Id"]
)

In [71]:
# Splitting the Data
# Hold out 20% of the labelled rows for evaluation. Note that `test_set` is carved out of
# the training CSV (`df`); it is not the unlabelled competition file loaded as `df_test`.
# `random_state=42` makes the shuffled split repeatable.

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [72]:
# Separate the feature matrix (X) from the SalePrice target (y) for each partition,
# converting both to NumPy arrays.
X_train, y_train = (
    train_set.drop(columns="SalePrice").to_numpy(),
    train_set["SalePrice"].to_numpy(),
)
X_test, y_test = (
    test_set.drop(columns="SalePrice").to_numpy(),
    test_set["SalePrice"].to_numpy(),
)

In [73]:
# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [74]:
# Wrap each NumPy array in a tensor and move it to the selected device.
X_train, y_train, X_test, y_test = (
    torch.from_numpy(array).to(device)
    for array in (X_train, y_train, X_test, y_test)
)

In [75]:
# Sanity-check the dimensions of every split.
for split_name, split_tensor in {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}.items():
    print(f"{split_name} Shape -> {split_tensor.shape}")

X_train Shape -> torch.Size([1168, 79])
y_train Shape -> torch.Size([1168])
X_test Shape -> torch.Size([292, 79])
y_test Shape -> torch.Size([292])


In [76]:
# Cast every split to float32 so inputs and targets share one floating-point dtype.
X_train, y_train, X_test, y_test = (
    tensor.to(torch.float32) for tensor in (X_train, y_train, X_test, y_test)
)

X_train.dtype, y_train.dtype, X_test.dtype, y_test.dtype

(torch.float32, torch.float32, torch.float32, torch.float32)

# Creating Neural network

In [77]:
class House_Price_Model_V0(nn.Module):
    """Fully connected regressor that maps a feature vector to a single value.

    The network is a stack of nine linear layers
    (in_features -> 128 -> 256 -> 128 -> 64 -> 32 -> 16 -> 4 -> 2 -> 1) with a ReLU
    after every layer except the last, so the output is unbounded.

    Args:
        in_features: Number of input features per sample. Defaults to 79, the
            width of the feature matrix used in this notebook.
    """

    def __init__(self, in_features: int = 79) -> None:
        super().__init__()

        # Refer to https://www.linkedin.com/pulse/choosing-number-hidden-layers-neurons-neural-networks-sachdev#:~:text=If%20data%20is%20less%20complex,hidden%20layers%20can%20be%20used.
        # When struggling with hidden layers.

        # Attribute names are kept as `layer_<n>` so `state_dict` keys stay stable.
        self.layer_0 = nn.Linear(in_features=in_features, out_features=128)
        self.layer_1 = nn.Linear(in_features=128, out_features=256)
        self.layer_2 = nn.Linear(in_features=256, out_features=128)
        self.layer_3 = nn.Linear(in_features=128, out_features=64)
        self.layer_4 = nn.Linear(in_features=64, out_features=32)
        self.layer_5 = nn.Linear(in_features=32, out_features=16)
        self.layer_6 = nn.Linear(in_features=16, out_features=4)
        self.layer_7 = nn.Linear(in_features=4, out_features=2)
        self.layer_8 = nn.Linear(in_features=2, out_features=1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the forward pass; the output has one value in its last dimension."""
        hidden_layers = (
            self.layer_0,
            self.layer_1,
            self.layer_2,
            self.layer_3,
            self.layer_4,
            self.layer_5,
            self.layer_6,
            self.layer_7,
        )
        for layer in hidden_layers:
            x = self.relu(layer(x))

        # No activation on the output layer: this is a regression head.
        return self.layer_8(x)

In [78]:
# Seed the RNG *before* building the model: the linear layers are randomly initialised
# at construction time, so seeding afterwards cannot make the starting weights (and
# therefore the reported scores) reproducible across kernel restarts.
torch.manual_seed(42)

model = House_Price_Model_V0().to(device)

We will be using `MSELoss` as Loss function and `Adam` as optimizer.

In [79]:
# Mean squared error on the raw (un-logged) sale prices.
loss_fn = nn.MSELoss()

# Adam over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [80]:
# This function was written as described in Competition Evaluation Overview
# https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview/evaluation

def acc_fn(y_pred, y_true):
    """Return the root-mean-squared error between log(y_true) and log(y_pred).

    Despite the name, this is an error measure: lower is better and 0 means a
    perfect match.

    Args:
        y_pred: Tensor of predicted values.
        y_true: Tensor of target values; the two tensors are subtracted
            element-wise (with broadcasting), so callers pass matching shapes.

    Returns:
        A 0-dim tensor holding the score. Because `torch.log` is applied to both
        inputs directly, non-positive values yield `nan`/`inf`.
    """
    log_residual = torch.log(y_true) - torch.log(y_pred)
    return log_residual.square().mean().sqrt()

## Initial Testing of Model

In [81]:
# Score the untrained network on the held-out split to get a baseline.
model.eval()
with torch.inference_mode():
    y_init_pred = model(X_test)
    initial_score = acc_fn(y_init_pred.squeeze(dim=1), y_test)

print(f"Initial Score -> {initial_score}")

Initial Score -> 12.456887245178223


# Training Our Model

In [82]:
# NOTE: `model` is instantiated in an earlier cell, so this seed does not influence its
# initial weights.
torch.manual_seed(42)

def train_model(model, X_train, y_train, X_test, y_test, epochs=30_000, log_every=3000):
    """Train `model` with full-batch gradient steps and periodically print scores.

    Uses the notebook-level `loss_fn`, `optimizer` and `acc_fn`; `optimizer` must
    have been built from the parameters of the `model` passed in.

    Args:
        model: The network to train; it is updated in place.
        X_train: Training features.
        y_train: Training targets, one value per row of `X_train`.
        X_test: Held-out features used only for reporting.
        y_test: Held-out targets used only for reporting.
        epochs: Number of optimisation steps to run.
        log_every: Print the scores every `log_every` epochs; the final epoch is
            always printed (exactly once).

    Raises:
        ValueError: If `log_every` is not a positive integer.
    """
    if log_every <= 0:
        raise ValueError("log_every must be a positive integer")

    last_epoch = epochs - 1

    for epoch in range(epochs):
        model.train()

        y_train_pred = model(X_train).squeeze(dim=1)
        loss = loss_fn(y_train_pred, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The scores are only ever printed, so skip the metric computation and the
        # extra forward pass on epochs that are not reported.
        if epoch % log_every != 0 and epoch != last_epoch:
            continue

        # Evaluating Our Model
        model.eval()
        with torch.inference_mode():
            # Train score uses the predictions made before this epoch's update;
            # test score uses the freshly updated weights.
            train_acc = acc_fn(y_train_pred, y_train)
            test_acc = acc_fn(model(X_test).squeeze(dim=1), y_test)

        print(f"Epoch: {epoch} | Train Score: {train_acc} | Test Score: {test_acc}")


In [83]:
# Run the training loop on the train split, reporting scores against the held-out split.
train_model(model, X_train, y_train, X_test, y_test)

Epoch: 0 | Train Score: 12.484933853149414 | Test Score: 12.447531700134277
Epoch: 3000 | Train Score: 0.40854892134666443 | Test Score: 0.4341622292995453
Epoch: 6000 | Train Score: 0.28999897837638855 | Test Score: 0.32397833466529846
Epoch: 9000 | Train Score: 0.277414470911026 | Test Score: 0.3121585249900818
Epoch: 12000 | Train Score: 0.24445638060569763 | Test Score: 0.2676754295825958
Epoch: 15000 | Train Score: 0.20876722037792206 | Test Score: 0.2151678204536438
Epoch: 18000 | Train Score: 0.20054158568382263 | Test Score: 0.20749273896217346
Epoch: 21000 | Train Score: 0.1907290667295456 | Test Score: 0.21145117282867432
Epoch: 24000 | Train Score: 0.16221730411052704 | Test Score: 0.19028161466121674
Epoch: 27000 | Train Score: 0.1376049965620041 | Test Score: 0.17789295315742493
Epoch: 29999 | Train Score: 0.12762823700904846 | Test Score: 0.1820656955242157


In [84]:
# Score the trained network on the held-out split.
model.eval()
with torch.inference_mode():
    sample_y_value = model(X_test)
    final_score = acc_fn(sample_y_value.squeeze(dim=1), y_test)

print(f"Score -> {final_score}")

Score -> 0.1820656955242157


# Submission

In [85]:
import csv

# Converting Data to Tensor
# Drop the leading index column and the "Id" column so only feature columns remain.
submission_features = df_test.drop(columns=[df_test.columns[0], "Id"])
test_data = torch.from_numpy(submission_features.to_numpy()).to(device).to(torch.float)
test_ids = df_test["Id"].to_numpy()

print(f"Total Ids -> {len(test_ids)}")

# Predict every row in a single batched forward pass instead of one call per row.
model.eval()
with torch.inference_mode():
    sale_prices = model(test_data).squeeze(dim=1).tolist()

# `newline=""` is what the csv module requires for files it writes; without it,
# platforms that translate line endings end up with blank lines between rows.
with open("raw_data/submission.csv", "w", newline="") as f:
    writer = csv.writer(f)

    writer.writerow(["Id", "SalePrice"])
    writer.writerows(zip(test_ids, sale_prices))

Total Ids -> 1459


# Saving the Model

In [86]:
# Saving the entire model object rather than only its `state_dict()`; this is acceptable
# here because the model is small.
# NOTE(review): a whole-model checkpoint is pickled, so loading it presumably requires
# the `House_Price_Model_V0` class to be importable - confirm against the PyTorch
# saving/loading guide. The `model/` directory must already exist.
torch.save(model, "model/modelv0.pth")