In [37]:
#Data Cleaning
import pandas as pd
import os
import numpy as np

#Data is saved on diffrent CSVs for each city
#To make things easier, we can combine the data into one dataframe
dfs = []
for file in os.listdir("data"):
    df = pd.read_csv(os.path.join("data", file))
    df["City"] = file.replace(".csv", "")
    dfs.append(df)
df = pd.concat(dfs)

#As documented on kaggle, 9 implies that this information was not found for a home.
#Therefore we replaced all 9s with np.nan as is standard for empty values

temp = df["No. of Bedrooms"].copy()
df = df.applymap(lambda x: (np.nan if x == 9  else  x))
df["No. of Bedrooms"] = temp
df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,30000000,3340,JP Nagar Phase 1,4,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1,7888000,1045,Dasarahalli on Tumkur Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
2,4866000,1179,Kannur on Thanisandra Main Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
3,8358000,1675,Doddanekundi,3,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
4,6845000,1670,Kengeri,3,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7714,14500000,1180,Mira Road East,2,0,,,,,,...,,,,,,,,,,Mumbai
7715,14500000,530,Naigaon East,1,1,,,,,,...,,,,,,,,,,Mumbai
7716,4100000,700,Shirgaon,1,0,,,,,,...,,,,,,,,,,Mumbai
7717,2750000,995,Mira Road East,2,0,,,,,,...,,,,,,,,,,Mumbai


In [38]:
cleaned_df = df[~df.isnull().any(axis=1)]
cleaned_df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,30000000,3340,JP Nagar Phase 1,4,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1,7888000,1045,Dasarahalli on Tumkur Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
2,4866000,1179,Kannur on Thanisandra Main Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
3,8358000,1675,Doddanekundi,3,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
4,6845000,1670,Kengeri,3,0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Bangalore
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1393,62000000,1450,Worli,3,0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1394,2500000,540,Virar East,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1395,19000000,1267,Belapur,3,1,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai
1396,14900000,1245,Airoli,2,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mumbai


Below, I one-hot encode the location and city columns

In [39]:
cleaned_df = pd.get_dummies(cleaned_df,columns=["Location", "City"])
##

In [40]:
import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(
    cleaned_df.drop("Price", axis=1), 
    cleaned_df["Price"], 
    test_size=0.33, 
    random_state=42
)


In [41]:
reg = GradientBoostingRegressor()
reg.fit(X_train, y_train)
mse = mean_squared_error(y_test, reg.predict(X_test))
reg.score(X_test,y_test)

0.6047263612905617

## Neural Network

In [42]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch

In [43]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

class dataframeDataset(Dataset):
    def __init__(self, X, y, zscored=True, train_mean=0, train_std=0):
        self.X = X
        self.Y = y
        if (zscored):
            self.Y_zscored = (self.Y - self.Y.mean())/self.Y.std(ddof=0)
        else:
            if (train_std == 0):
                raise "train_std not set"
            self.Y_zscored = (self.Y - train_mean)/train_std 

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        #print("idx", torch.Tensor(self.X.iloc[idx]).dtype, float(self.Y.iloc[idx]))
        #if (self.zscored):
        return (
            torch.Tensor(self.X.iloc[idx]), 
            torch.Tensor([float(self.Y_zscored.iloc[idx])])
        )
        # else:
        #     return (
        #         torch.Tensor(self.X.iloc[idx]), 
        #         torch.Tensor([float(self.Y.iloc[idx])])
        #     )
    
    def get_mean_std(self):
        return self.Y.mean(), self.Y.std(ddof=0)

dataframeDataset(X_train, y_train) 

<__main__.dataframeDataset at 0x240121d9910>

In [44]:
from torch.utils.data import DataLoader
train_dataset = dataframeDataset(X_train, y_train) 
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

train_mean, train_std = train_dataset.get_mean_std()

test_dataset = dataframeDataset(X_test, y_test, zscored=False, train_mean=train_mean, train_std=train_std) 
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [45]:
X_train.shape[1]

893

In [46]:
class NNRegressor_V1(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(893, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
model = NNRegressor_V1()

In [47]:
from sklearn.metrics import r2_score

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    losses = []
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            losses.append(loss)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    
    return losses


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, test_r2 = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)

            test_r2 += r2_score(y, pred)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches
    test_r2 /= num_batches
    print(f"Test Error: Avg loss: {test_loss:>8f} Avg r^2 {test_r2:>8f} \n")

In [48]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=.0001)

epochs = 20

losses = []

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    losses.append(train_loop(train_dataloader, model, loss_fn, optimizer))
print("Done!") 
test_loop(test_dataloader, model, loss_fn)

Epoch 1
-------------------------------
loss: 0.509983  [   16/ 6762]
loss: 0.191351  [ 1616/ 6762]
loss: 0.488435  [ 3216/ 6762]
loss: 0.674525  [ 4816/ 6762]
loss: 0.252242  [ 6416/ 6762]
Epoch 2
-------------------------------
loss: 0.237055  [   16/ 6762]
loss: 0.307220  [ 1616/ 6762]
loss: 0.307647  [ 3216/ 6762]
loss: 0.916976  [ 4816/ 6762]
loss: 0.262694  [ 6416/ 6762]
Epoch 3
-------------------------------
loss: 0.374956  [   16/ 6762]
loss: 0.653102  [ 1616/ 6762]
loss: 0.173809  [ 3216/ 6762]
loss: 0.165382  [ 4816/ 6762]
loss: 0.663912  [ 6416/ 6762]
Epoch 4
-------------------------------
loss: 0.323596  [   16/ 6762]
loss: 0.502840  [ 1616/ 6762]
loss: 0.149498  [ 3216/ 6762]
loss: 0.223130  [ 4816/ 6762]
loss: 0.230025  [ 6416/ 6762]
Epoch 5
-------------------------------
loss: 7.274990  [   16/ 6762]
loss: 0.467998  [ 1616/ 6762]
loss: 0.202764  [ 3216/ 6762]
loss: 0.260585  [ 4816/ 6762]
loss: 37.219872  [ 6416/ 6762]
Epoch 6
-------------------------------
loss: 0.1

In [None]:
plt.plot(losses)