In [101]:
#Data Cleaning
import pandas as pd
import os
import numpy as np

#Data is saved on diffrent CSVs for each city
#To make things easier, we can combine the data into one dataframe
dfs = []
for file in os.listdir("data"):
    df = pd.read_csv(os.path.join("data", file))
    df["City"] = file.replace(".csv", "")
    dfs.append(df)
df = pd.concat(dfs)

#As documented on kaggle, 9 implies that this information was not found for a home.
#Therefore we replaced all 9s with np.nan as is standard for empty values

temp = df["No. of Bedrooms"].copy()
df = df.applymap(lambda x: (np.nan if x == 9  else  x))
df["No. of Bedrooms"] = temp
df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,6968000,1340,Nizampet,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
1,29000000,3498,Hitech City,4,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
2,6590000,1318,Manikonda,2,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
3,5739000,1295,Alwal,3,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
4,5679000,1145,Kukatpally,2,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6202,5364000,590,Chandapura,1,0,,,,,,...,,,,,,,,,,Bangalore
6203,8716000,1179,Kasavanahalli,2,0,,,,,,...,,,,,,,,,,Bangalore
6204,7373000,1143,Kasavanahalli,2,0,,,,,,...,,,,,,,,,,Bangalore
6205,4985000,1680,Kasavanahalli,3,0,,,,,,...,,,,,,,,,,Bangalore


In [102]:
cleaned_df = df[~df.isnull().any(axis=1)]
cleaned_df

Unnamed: 0,Price,Area,Location,No. of Bedrooms,Resale,MaintenanceStaff,Gymnasium,SwimmingPool,LandscapedGardens,JoggingTrack,...,BED,VaastuCompliant,Microwave,GolfCourse,TV,DiningTable,Sofa,Wardrobe,Refrigerator,City
0,6968000,1340,Nizampet,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
1,29000000,3498,Hitech City,4,0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
2,6590000,1318,Manikonda,2,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
3,5739000,1295,Alwal,3,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
4,5679000,1145,Kukatpally,2,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hyderabad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946,8306999,1184,Hosa Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1947,4883000,655,Hosa Road,1,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1948,11500000,1680,Hosa Road,3,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore
1949,8378000,1195,Hosa Road,2,0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bangalore


Below, I one-hot encode the location and city columns

In [103]:
cleaned_df = pd.get_dummies(cleaned_df,columns=["Location", "City"])
##

In [104]:
import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(
    cleaned_df.drop("Price", axis=1), 
    cleaned_df["Price"], 
    test_size=0.33, 
    random_state=42
)


In [105]:
reg = GradientBoostingRegressor()
reg.fit(X_train, y_train)
mse = mean_squared_error(y_test, reg.predict(X_test))
reg.score(X_test,y_test)

0.5835689139851632

## Neural Network

In [106]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch

In [107]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

class dataframeDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.Y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        #print("idx", torch.Tensor(self.X.iloc[idx]).dtype, float(self.Y.iloc[idx]))
        return (
            torch.Tensor(self.X.iloc[idx]), 
            torch.Tensor([float(self.Y.iloc[idx])])
        )

dataframeDataset(X_train, y_train) 

<__main__.dataframeDataset at 0x7f7ccbf13220>

In [108]:
from torch.utils.data import DataLoader
train_dataset = dataframeDataset(X_train, y_train) 
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = dataframeDataset(X_test, y_test) 
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [109]:
X_train.shape[1]

893

In [110]:
class NNRegressor_V1(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(893, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
model = NNRegressor_V1()

In [111]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    

    for thing in enumerate(dataloader):
        print(thing)

    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches
    print(f"Test Error: Avg loss: {test_loss:>8f} \n")

In [112]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=.001)

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
print("Done!") 
test_loop(test_dataloader, model, loss_fn)

Epoch 1
-------------------------------
(0, [tensor([[1.7420e+03, 3.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.1000e+03, 2.0000e+00, 1.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [9.0000e+02, 3.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [1.3500e+03, 2.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.6910e+03, 3.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [5.3000e+02, 1.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.0000e+00]]), tensor([[13700000.],
        [ 8500000.],
        [ 4500000.],
        [ 7569000.],
        [14400000.],
        [ 4656000.],
        [ 5380000.],
        [ 3932000.],
        [ 2100000.],
        [ 3500000.],
        [ 4850000.],
        [ 6950000.],
        [ 3321000.],
        [ 6061000.],
        [10900000.],
        [ 3467000.]])])
(1, [tensor([[1.278

KeyboardInterrupt: 