## House Price Prediction Using Pytorch

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('./houseprice.csv',usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                         "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [3]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [4]:
df.shape

(1201, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
MSSubClass     1201 non-null int64
MSZoning       1201 non-null object
LotFrontage    1201 non-null float64
LotArea        1201 non-null int64
Street         1201 non-null object
LotShape       1201 non-null object
YearBuilt      1201 non-null int64
1stFlrSF       1201 non-null int64
2ndFlrSF       1201 non-null int64
SalePrice      1201 non-null int64
dtypes: float64(1), int64(6), object(3)
memory usage: 103.2+ KB


In [6]:
for i in df.columns:
    print("Column name {} and unique values are {}".format(i,len(df[i].unique())))

Column name MSSubClass and unique values are 15
Column name MSZoning and unique values are 5
Column name LotFrontage and unique values are 110
Column name LotArea and unique values are 869
Column name Street and unique values are 2
Column name LotShape and unique values are 4
Column name YearBuilt and unique values are 112
Column name 1stFlrSF and unique values are 678
Column name 2ndFlrSF and unique values are 368
Column name SalePrice and unique values are 597


In [7]:
import datetime
datetime.datetime.now().year

2020

In [8]:
df['Total Years']=datetime.datetime.now().year-df['YearBuilt']

In [9]:
df.drop("YearBuilt",axis=1,inplace=True)

In [10]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', '1stFlrSF', '2ndFlrSF', 'SalePrice', 'Total Years'],
      dtype='object')

In [11]:
cat_features=["MSSubClass", "MSZoning", "Street", "LotShape"]
out_feature="SalePrice"

In [12]:
from sklearn.preprocessing import LabelEncoder
lbl_encoders={}
lbl_encoders["MSSubClass"]=LabelEncoder()
lbl_encoders["MSSubClass"].fit_transform(df["MSSubClass"])

array([5, 0, 5, ..., 6, 0, 0], dtype=int64)

In [13]:
lbl_encoders

{'MSSubClass': LabelEncoder()}

In [14]:
from sklearn.preprocessing import LabelEncoder
lbl_encoders={}
for feature in cat_features:
    lbl_encoders[feature]=LabelEncoder()
    df[feature]=lbl_encoders[feature].fit_transform(df[feature])

In [15]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,Total Years
0,5,3,65.0,8450,1,3,856,854,208500,17
1,0,3,80.0,9600,1,3,1262,0,181500,44
2,5,3,68.0,11250,1,0,920,866,223500,19
3,6,3,60.0,9550,1,0,961,756,140000,105
4,5,3,84.0,14260,1,0,1145,1053,250000,20


In [16]:
### Stacking and Converting Into Tensors
cat_features=np.stack([df['MSSubClass'],df['MSZoning'],df['Street'],df['LotShape']],1)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]], dtype=int64)

In [19]:
### Convert numpy to Tensors
import torch
cat_features=torch.tensor(cat_features,dtype=torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [20]:
#### create continuous variable
cont_features=[]
for i in df.columns:
    if i in ["MSSubClass", "MSZoning", "Street", "LotShape","SalePrice"]:
        pass
    else:
        cont_features.append(i)

In [21]:
cont_features


['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'Total Years']

In [22]:
### Stacking continuous variable to a tensor
cont_values=np.stack([df[i].values for i in cont_features],axis=1)
cont_values=torch.tensor(cont_values,dtype=torch.float)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    17.],
        [   80.,  9600.,  1262.,     0.,    44.],
        [   68., 11250.,   920.,   866.,    19.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    79.],
        [   68.,  9717.,  1078.,     0.,    70.],
        [   75.,  9937.,  1256.,     0.,    55.]])

In [23]:
cont_values.dtype

torch.float32

In [24]:
### Dependent Feature 
y=torch.tensor(df['SalePrice'].values,dtype=torch.float).reshape(-1,1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
MSSubClass     1201 non-null int64
MSZoning       1201 non-null int32
LotFrontage    1201 non-null float64
LotArea        1201 non-null int64
Street         1201 non-null int32
LotShape       1201 non-null int32
1stFlrSF       1201 non-null int64
2ndFlrSF       1201 non-null int64
SalePrice      1201 non-null int64
Total Years    1201 non-null int64
dtypes: float64(1), int32(3), int64(6)
memory usage: 89.1 KB


In [26]:
cat_features.shape,cont_values.shape,y.shape

(torch.Size([1201, 4]), torch.Size([1201, 5]), torch.Size([1201, 1]))

In [27]:
len(df['MSSubClass'].unique())

15

In [28]:
#### Embedding Size For Categorical columns
cat_dims=[len(df[col].unique()) for col in ["MSSubClass", "MSZoning", "Street", "LotShape"]]

In [29]:
cat_dims

[15, 5, 2, 4]

In [30]:
embedding_dim= [(x, min(50, (x + 1) // 2)) for x in cat_dims]

In [31]:
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
embed_representation=nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [33]:
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [34]:
cat_featuresz=cat_features[:4]
cat_featuresz

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [35]:
pd.set_option('display.max_rows', 500)
embedding_val=[]
for i,e in enumerate(embed_representation):
    embedding_val.append(e(cat_features[:,i]))

In [36]:
embedding_val

[tensor([[-0.0192,  0.2156,  0.0783,  ..., -1.7073, -0.1094, -0.8735],
         [ 0.8725,  1.6844,  1.6478,  ..., -0.6130, -0.4029,  1.1516],
         [-0.0192,  0.2156,  0.0783,  ..., -1.7073, -0.1094, -0.8735],
         ...,
         [ 0.3033, -1.8885,  2.0891,  ..., -2.2290, -1.4483, -0.2902],
         [ 0.8725,  1.6844,  1.6478,  ..., -0.6130, -0.4029,  1.1516],
         [ 0.8725,  1.6844,  1.6478,  ..., -0.6130, -0.4029,  1.1516]],
        grad_fn=<EmbeddingBackward>), tensor([[ 0.3318, -0.0883, -0.5150],
         [ 0.3318, -0.0883, -0.5150],
         [ 0.3318, -0.0883, -0.5150],
         ...,
         [ 0.3318, -0.0883, -0.5150],
         [ 0.3318, -0.0883, -0.5150],
         [ 0.3318, -0.0883, -0.5150]], grad_fn=<EmbeddingBackward>), tensor([[1.0299],
         [1.0299],
         [1.0299],
         ...,
         [1.0299],
         [1.0299],
         [1.0299]], grad_fn=<EmbeddingBackward>), tensor([[ 1.1506, -0.1699],
         [ 1.1506, -0.1699],
         [ 0.8582,  0.4397],
     

In [37]:
z = torch.cat(embedding_val, 1)
z

tensor([[-0.0192,  0.2156,  0.0783,  ...,  1.0299,  1.1506, -0.1699],
        [ 0.8725,  1.6844,  1.6478,  ...,  1.0299,  1.1506, -0.1699],
        [-0.0192,  0.2156,  0.0783,  ...,  1.0299,  0.8582,  0.4397],
        ...,
        [ 0.3033, -1.8885,  2.0891,  ...,  1.0299,  1.1506, -0.1699],
        [ 0.8725,  1.6844,  1.6478,  ...,  1.0299,  1.1506, -0.1699],
        [ 0.8725,  1.6844,  1.6478,  ...,  1.0299,  1.1506, -0.1699]],
       grad_fn=<CatBackward>)

In [38]:
#### Implement dropupout
droput=nn.Dropout(.4)

In [39]:
final_embed=droput(z)
final_embed

tensor([[-0.0000,  0.3593,  0.0000,  ...,  0.0000,  1.9177, -0.2832],
        [ 1.4542,  2.8073,  0.0000,  ...,  0.0000,  1.9177, -0.2832],
        [-0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.7329],
        ...,
        [ 0.5055, -0.0000,  3.4819,  ...,  0.0000,  0.0000, -0.2832],
        [ 1.4542,  2.8073,  2.7464,  ...,  0.0000,  0.0000, -0.2832],
        [ 0.0000,  0.0000,  2.7464,  ...,  1.7164,  1.9177, -0.2832]],
       grad_fn=<DropoutBackward>)

In [40]:
##### Create a Feed Forward Neural Network
import torch
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardNN(nn.Module):

    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum((out for inp,out in embedding_dim))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [41]:
len(cont_features)

5

In [42]:
torch.manual_seed(100)
model=FeedForwardNN(embedding_dim,len(cont_features),1,[100,50],p=0.4)

In [43]:
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

### Define Loss and Optimization:

In [44]:
loss_function=nn.MSELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.01)

In [45]:
df.shape

(1201, 10)

In [46]:
cont_values

tensor([[   65.,  8450.,   856.,   854.,    17.],
        [   80.,  9600.,  1262.,     0.,    44.],
        [   68., 11250.,   920.,   866.,    19.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    79.],
        [   68.,  9717.,  1078.,     0.,    70.],
        [   75.,  9937.,  1256.,     0.,    55.]])

In [47]:
cont_values.shape

torch.Size([1201, 5])

In [48]:
batch_size=1200
test_size=int(batch_size*0.15)
train_categorical=cat_features[:batch_size-test_size]
test_categorical=cat_features[batch_size-test_size:batch_size]
train_cont=cont_values[:batch_size-test_size]
test_cont=cont_values[batch_size-test_size:batch_size]
y_train=y[:batch_size-test_size]
y_test=y[batch_size-test_size:batch_size]

In [49]:
len(train_categorical),len(test_categorical),len(train_cont),len(test_cont),len(y_train),len(y_test)

(1020, 180, 1020, 180, 1020, 180)

In [51]:
epochs=5000
final_losses=[]
for i in range(epochs):
    i=i+1
    y_pred=model(train_categorical,train_cont)
    loss=torch.sqrt(loss_function(y_pred,y_train)) ### RMSE
    final_losses.append(loss)
    if i%10==1:
        print("Epoch number: {} and the loss : {}".format(i,loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch number: 1 and the loss : 200496.75
Epoch number: 11 and the loss : 200494.890625
Epoch number: 21 and the loss : 200491.390625
Epoch number: 31 and the loss : 200485.46875
Epoch number: 41 and the loss : 200477.578125
Epoch number: 51 and the loss : 200466.296875
Epoch number: 61 and the loss : 200452.265625
Epoch number: 71 and the loss : 200434.84375
Epoch number: 81 and the loss : 200414.6875
Epoch number: 91 and the loss : 200389.25
Epoch number: 101 and the loss : 200364.28125
Epoch number: 111 and the loss : 200333.90625
Epoch number: 121 and the loss : 200299.984375
Epoch number: 131 and the loss : 200261.109375
Epoch number: 141 and the loss : 200215.640625
Epoch number: 151 and the loss : 200171.59375
Epoch number: 161 and the loss : 200123.71875
Epoch number: 171 and the loss : 200073.578125
Epoch number: 181 and the loss : 200012.171875
Epoch number: 191 and the loss : 199945.328125
Epoch number: 201 and the loss : 199881.625
Epoch number: 211 and the loss : 199823.390

Epoch number: 1761 and the loss : 155703.375
Epoch number: 1771 and the loss : 154808.890625
Epoch number: 1781 and the loss : 153846.921875
Epoch number: 1791 and the loss : 153286.1875
Epoch number: 1801 and the loss : 153102.6875
Epoch number: 1811 and the loss : 152619.1875
Epoch number: 1821 and the loss : 152121.296875
Epoch number: 1831 and the loss : 151916.15625
Epoch number: 1841 and the loss : 151281.5
Epoch number: 1851 and the loss : 150878.859375
Epoch number: 1861 and the loss : 150371.34375
Epoch number: 1871 and the loss : 150598.515625
Epoch number: 1881 and the loss : 149920.46875
Epoch number: 1891 and the loss : 149321.75
Epoch number: 1901 and the loss : 148658.53125
Epoch number: 1911 and the loss : 148179.46875
Epoch number: 1921 and the loss : 147490.296875
Epoch number: 1931 and the loss : 147421.8125
Epoch number: 1941 and the loss : 146660.15625
Epoch number: 1951 and the loss : 147045.984375
Epoch number: 1961 and the loss : 145861.125
Epoch number: 1971 an

Epoch number: 3501 and the loss : 68048.8515625
Epoch number: 3511 and the loss : 69845.515625
Epoch number: 3521 and the loss : 67833.140625
Epoch number: 3531 and the loss : 68493.3359375
Epoch number: 3541 and the loss : 67272.765625
Epoch number: 3551 and the loss : 67025.0
Epoch number: 3561 and the loss : 67344.640625
Epoch number: 3571 and the loss : 67146.1171875
Epoch number: 3581 and the loss : 64850.76953125
Epoch number: 3591 and the loss : 65638.3515625
Epoch number: 3601 and the loss : 64937.67578125
Epoch number: 3611 and the loss : 65205.53515625
Epoch number: 3621 and the loss : 64020.546875
Epoch number: 3631 and the loss : 64302.16796875
Epoch number: 3641 and the loss : 63260.30078125
Epoch number: 3651 and the loss : 62470.17578125
Epoch number: 3661 and the loss : 62443.7890625
Epoch number: 3671 and the loss : 61582.6328125
Epoch number: 3681 and the loss : 60403.1015625
Epoch number: 3691 and the loss : 60223.015625
Epoch number: 3701 and the loss : 60900.894531

In [52]:
#### Validate the Test Data
y_pred=""
with torch.no_grad():
    y_pred=model(test_categorical,test_cont)
    loss=torch.sqrt(loss_function(y_pred,y_test))
print('RMSE: {}'.format(loss))

RMSE: 41801.68359375


In [53]:
data_verify=pd.DataFrame(y_test.tolist(),columns=["Test"])

In [54]:
data_predicted=pd.DataFrame(y_pred.tolist(),columns=["Prediction"])

In [55]:
data_predicted

Unnamed: 0,Prediction
0,163325.40625
1,179278.25
2,161876.3125
3,188466.75
4,147170.828125
5,179108.1875
6,235840.0625
7,295837.6875
8,151762.109375
9,429709.34375


In [56]:
final_output=pd.concat([data_verify,data_predicted],axis=1)
final_output['Difference']=final_output['Test']-final_output['Prediction']
final_output.head()

Unnamed: 0,Test,Prediction,Difference
0,130000.0,163325.40625,-33325.40625
1,138887.0,179278.25,-40391.25
2,175500.0,161876.3125,13623.6875
3,195000.0,188466.75,6533.25
4,142500.0,147170.828125,-4670.828125


In [57]:
#### Saving The Model
#### Save the model
torch.save(model,'HousePrice.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [58]:
torch.save(model.state_dict(),'HouseWeights.pt')

In [59]:
### Loading the saved Model
embs_size=[(15, 8), (5, 3), (2, 1), (4, 2)]
model1=FeedForwardNN(embs_size,5,1,[100,50],p=0.4)

In [60]:
model1.load_state_dict(torch.load('HouseWeights.pt'))

In [61]:
model1.eval()

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)