In [321]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import cv2
import os

In [322]:
def data_preparation(train_data_path: str, test_data_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    raw_train_data_df = pd.read_csv(train_data_path, index_col=0)
    train_SellPrice = raw_train_data_df.pop("SalePrice")
    numeric_features = raw_train_data_df.select_dtypes(include="number").columns.tolist()
    train_data_df = pd.get_dummies(raw_train_data_df, dtype="float32")
    train_data_df["SalePrice"] = train_SellPrice.apply(np.log)
    
    raw_test_data_df = pd.read_csv(test_data_path, index_col=0)
    test_data_df = pd.get_dummies(raw_test_data_df, dtype="float32")
    test_data_df = test_data_df.reindex(columns=train_data_df.columns, fill_value=0)
    
    standard_scaler = preprocessing.StandardScaler()
    train_data_df[numeric_features] = standard_scaler.fit_transform(train_data_df[numeric_features])
    test_data_df[numeric_features] = standard_scaler.transform(test_data_df[numeric_features])
    return train_data_df, test_data_df
    

In [323]:
class HouseDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame):
        prices = df.pop("SalePrice").values
        features = df.values
        
        self.features = torch.tensor(features, dtype=torch.float32)
        self.prices = torch.tensor(prices, dtype=torch.float32)
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.prices[idx]

In [324]:
train_path = "./data/train.csv"
test_path = "./data/test.csv"

full_train_dataset_df, test_dataset_df = data_preparation(train_path, test_path)
full_train_dataset = HouseDataset(full_train_dataset_df)

print(type(full_train_dataset))
print(type(full_train_dataset[0]))
print(type(full_train_dataset[0][0]))
print(type(full_train_dataset[0][1]))
print(full_train_dataset[0][1])
print(full_train_dataset[0][0].shape)
print(full_train_dataset[0][1].shape)
print(full_train_dataset[0][0])

<class '__main__.HouseDataset'>
<class 'tuple'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
tensor(12.2477)
torch.Size([287])
torch.Size([])
tensor([ 0.0734, -0.2080, -0.2071,  0.6515, -0.5172,  1.0510,  0.8787,  0.5100,
         0.5754, -0.2887, -0.9446, -0.4593, -0.7934,  1.1619, -0.1202,  0.3703,
         1.1078, -0.2411,  0.7897,  1.2276,  0.1638, -0.2115,  0.9122, -0.9512,
         0.9924,  0.3117,  0.3510, -0.7522,  0.2165, -0.3593, -0.1163, -0.2702,
        -0.0687, -0.0877, -1.5991,  0.1388,  0.0000,  0.0000,  0.0000,  1.0000,
         0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0

In [325]:
train_path = "./data/train.csv"
test_path = "./data/test.csv"

full_train_dataset_df, test_dataset_df = data_preparation(train_path, test_path)

full_train_dataset = HouseDataset(full_train_dataset_df)

train_dataset, val_dataset = torch.utils.data.random_split(full_train_dataset, [0.95, 0.05])
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16 ,shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=16)

test_dataset = HouseDataset(test_dataset_df)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

In [326]:
print(len(full_train_dataset))
print(len(train_dataset))
print(len(val_dataset))
a, b = full_train_dataset[0]
print(a)
print(b)


1460
1387
73
tensor([ 0.0734, -0.2080, -0.2071,  0.6515, -0.5172,  1.0510,  0.8787,  0.5100,
         0.5754, -0.2887, -0.9446, -0.4593, -0.7934,  1.1619, -0.1202,  0.3703,
         1.1078, -0.2411,  0.7897,  1.2276,  0.1638, -0.2115,  0.9122, -0.9512,
         0.9924,  0.3117,  0.3510, -0.7522,  0.2165, -0.3593, -0.1163, -0.2702,
        -0.0687, -0.0877, -1.5991,  0.1388,  0.0000,  0.0000,  0.0000,  1.0000,
         0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  

In [327]:
class HouseNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(287, 100)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(100, 100)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(100, 1)
        self.activation = nn.ReLU()
        
    def forward(self, x):
        x = self.dropout1(self.activation(self.fc1(x)))
        x = self.dropout2(self.activation(self.fc2(x)))
        x = self.fc3(x)
        return x
    

In [328]:
model = HouseNetwork()
loss_fun = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.0001, momentum=0.9)

In [330]:
epochs = 5

for epoch in range(epochs):
    epoch_loss = 0.0
    for house, price in train_dataloader:
        optimizer.zero_grad()
        prediction = model(house)
        prediction = prediction.squeeze()
        loss = loss_fun(prediction, price)
        epoch_loss+=loss.item()
        loss.backward()
        optimizer.step()
    print(epoch_loss)    
    
    

nan
nan
nan
nan
nan


In [172]:
print(model)

HouseNetwork(
  (fc1): Linear(in_features=287, out_features=100, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=100, out_features=1, bias=True)
  (activation): ReLU()
)
