In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import cv2
import os

In [57]:
class HouseDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = pd.read_csv(data)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data.iloc[idx, :-1], self.data.iloc[idx, -1].item()

In [63]:
def data_preparation(train_data_path: str, test_data_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    raw_train_data_df = pd.read_csv(train_data_path, index_col=0)
    train_SellPrice = raw_train_data_df.pop("SalePrice")
    numeric_features = raw_train_data_df.select_dtypes(include="number").columns.tolist()
    train_data_df = pd.get_dummies(raw_train_data_df)
    train_data_df["SalePrice"] = train_SellPrice.apply(np.log)
    
    raw_test_data_df = pd.read_csv(test_data_path, index_col=0)
    test_data_df = pd.get_dummies(raw_test_data_df)
    test_data_df = test_data_df.reindex(columns=train_data_df.columns, fill_value=0)
    
    standard_scaler = preprocessing.StandardScaler()
    train_data_df[numeric_features] = standard_scaler.fit_transform(train_data_df[numeric_features])
    test_data_df[numeric_features] = standard_scaler.transform(test_data_df[numeric_features])
    return train_data_df, test_data_df
    

In [64]:
raw_train_data_df = pd.read_csv("data/train.csv", index_col=0)
train_SalePrice = raw_train_data_df["SalePrice"]
raw_train_data_df.drop("SalePrice", axis=1, inplace=True)
numeric_features = raw_train_data_df.select_dtypes(include="number").columns.tolist()
train_data_df = pd.get_dummies(raw_train_data_df)
train_data_df["SalePrice"] = train_SalePrice.apply(np.log)
print(train_SalePrice)
print(train_data_df)

raw_test_data_df = pd.read_csv("data/test.csv", index_col=0)
test_data_df = pd.get_dummies(raw_test_data_df)
print(len(test_data_df.columns))
test_data_df = test_data_df.reindex(columns=train_data_df.columns, fill_value=0)
print(len(test_data_df.columns))

Id
1       208500
2       181500
3       223500
4       140000
5       250000
         ...  
1456    175000
1457    210000
1458    266500
1459    142125
1460    147500
Name: SalePrice, Length: 1460, dtype: int64
      MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
Id                                                                            
1             60         65.0     8450            7            5       2003   
2             20         80.0     9600            6            8       1976   
3             60         68.0    11250            7            5       2001   
4             70         60.0     9550            7            5       1915   
5             60         84.0    14260            8            5       2000   
...          ...          ...      ...          ...          ...        ...   
1456          60         62.0     7917            6            5       1999   
1457          20         85.0    13175            6            6       1978  

In [59]:
standard_scaler = preprocessing.StandardScaler()
train_data_df[numeric_features] = standard_scaler.fit_transform(train_data_df[numeric_features])
test_data_df[numeric_features] = standard_scaler.transform(test_data_df[numeric_features])

In [16]:
full_train_dataset = HouseDataset("data/train.csv")
train_dataset, val_dataset = torch.utils.data.random_split(full_train_dataset, [0.95, 0.05])

In [20]:
print(len(full_train_dataset))
print(len(train_dataset))
print(len(val_dataset))
a, b = full_train_dataset[0]
print(a)
print(b)

1460
1387
73
Id                    1
MSSubClass           60
MSZoning             RL
LotFrontage        65.0
LotArea            8450
                  ...  
MiscVal               0
MoSold                2
YrSold             2008
SaleType             WD
SaleCondition    Normal
Name: 0, Length: 80, dtype: object
208500


In [5]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16 ,shuffle=True)
val_dataset = torch.utils.data.DataLoader(val_dataset, batch_size=16)

In [7]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(287, 100)
        self.linear2 = nn.Linear(100, 100)
        self.linear3 = nn.Linear(100, 1)
        self.activation = nn.ReLU()
        
    def forward(self, x):
        x = self.activation(self.linear1(x))
        x = self.activation(self.linear2(x))
        x = self.activation(self.linear3(x))
        return x
    

In [8]:
model = Network()

In [9]:
loss_fn = nn.CrossEntropyLoss()

In [10]:
optimizer = torch.optim.SGD(model.parameters(), lr = 0.001, momentum=0.9)

In [11]:
epochs = 5

for epoch in range(epochs):
    loss_val = 0
    for house, price in (pbar := tqdm(train_dataloader)):
        optimizer.zero_grad()
        pred = model(house)
        loss = loss_fn(pred, price)
        loss.backward()
        loss_val += loss.item()
        
        optimizer.step()
        
        pbar.set_description(f"loss: {loss.item():.5f}")

loss: 1.40576: 100%|██████████| 3750/3750 [01:38<00:00, 37.97it/s]


1.562981344850858


loss: 0.73487: 100%|██████████| 3750/3750 [01:51<00:00, 33.77it/s]


1.1037349614898364


loss: 0.21521: 100%|██████████| 3750/3750 [01:14<00:00, 50.40it/s] 


0.7472377012009422


loss: 1.10486: 100%|██████████| 3750/3750 [01:25<00:00, 43.96it/s] 


0.6867399440020323


loss: 0.77737: 100%|██████████| 3750/3750 [01:28<00:00, 42.60it/s]

0.653815554845333





In [12]:
print(model)

NeuralNumbers(
  (flat): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=784, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=100, bias=True)
  (linear3): Linear(in_features=100, out_features=10, bias=True)
  (activation): ReLU()
)
