In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
# fetch data 

housing_data = pd.read_csv('house_price_subset_expanded.csv')

housing_data.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,LotShape,GarageType,SalePrice
0,8450,7,5,1710,2,3,2,Normal,Reg,Attchd,208500
1,9600,6,8,1262,2,3,2,Normal,Reg,Attchd,181500
2,11250,7,5,1786,2,3,2,Normal,IR1,Attchd,223500
3,9550,7,5,1717,1,3,3,Abnorml,IR1,Detchd,140000
4,14260,8,5,2198,2,4,3,Normal,IR1,Attchd,250000


In [3]:
# subset data to only include columns for Normal Sale Condition

len(housing_data)

subset_data = housing_data[housing_data['SaleCondition'] == 'Normal'].drop(columns = ['SaleCondition'])

subset_data.head()

len(subset_data)

1460

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LotShape,GarageType,SalePrice
0,8450,7,5,1710,2,3,2,Reg,Attchd,208500
1,9600,6,8,1262,2,3,2,Reg,Attchd,181500
2,11250,7,5,1786,2,3,2,IR1,Attchd,223500
4,14260,8,5,2198,2,4,3,IR1,Attchd,250000
5,14115,5,5,1362,1,1,2,IR1,Attchd,143000


1198

In [4]:
subset_data.isna().sum()

LotArea          0
OverallQual      0
OverallCond      0
GrLivArea        0
FullBath         0
BedroomAbvGr     0
GarageCars       0
LotShape         0
GarageType      61
SalePrice        0
dtype: int64

In [5]:
print(set(subset_data['LotShape']))
print(set(subset_data['GarageType']))

{'IR1', 'IR3', 'Reg', 'IR2'}
{nan, 'Detchd', 'Basment', 'CarPort', '2Types', 'Attchd', 'BuiltIn'}


In [6]:
subset_data['LotShape'].replace(['IR1', 'IR2', 'IR3'], 'Irr', inplace = True)
subset_data['GarageType'].replace(['BuiltIn', 'Basment', '2Types', 'CarPort'], 'Other', inplace = True)
subset_data['GarageType'].fillna('Other', inplace = True)
print(set(subset_data['LotShape']))
print(set(subset_data['GarageType']))
subset_data.head(20)

{'Reg', 'Irr'}
{'Attchd', 'Detchd', 'Other'}


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LotShape,GarageType,SalePrice
0,8450,7,5,1710,2,3,2,Reg,Attchd,208500
1,9600,6,8,1262,2,3,2,Reg,Attchd,181500
2,11250,7,5,1786,2,3,2,Irr,Attchd,223500
4,14260,8,5,2198,2,4,3,Irr,Attchd,250000
5,14115,5,5,1362,1,1,2,Irr,Attchd,143000
6,10084,8,5,1694,2,3,2,Reg,Attchd,307000
7,10382,7,6,2090,2,3,2,Irr,Attchd,200000
9,7420,5,6,1077,1,2,1,Reg,Attchd,118000
10,11200,5,5,1040,1,3,1,Reg,Detchd,129500
12,12968,5,6,912,1,2,1,Irr,Detchd,144000


In [7]:
pd.get_dummies(subset_data, drop_first=False) # Change drop_first to True after demonstration
subset_data.head(20)
# This is a simple way, but it has some downsides (read up about the problems)

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SalePrice,LotShape_Irr,LotShape_Reg,GarageType_Attchd,GarageType_Detchd,GarageType_Other
0,8450,7,5,1710,2,3,2,208500,0,1,1,0,0
1,9600,6,8,1262,2,3,2,181500,0,1,1,0,0
2,11250,7,5,1786,2,3,2,223500,1,0,1,0,0
4,14260,8,5,2198,2,4,3,250000,1,0,1,0,0
5,14115,5,5,1362,1,1,2,143000,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,6,5,1647,2,3,2,175000,0,1,1,0,0
1456,13175,6,6,2073,2,3,2,210000,0,1,1,0,0
1457,9042,7,9,2340,2,4,1,266500,0,1,1,0,0
1458,9717,5,6,1078,1,2,1,142125,0,1,1,0,0


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,LotShape,GarageType,SalePrice
0,8450,7,5,1710,2,3,2,Reg,Attchd,208500
1,9600,6,8,1262,2,3,2,Reg,Attchd,181500
2,11250,7,5,1786,2,3,2,Irr,Attchd,223500
4,14260,8,5,2198,2,4,3,Irr,Attchd,250000
5,14115,5,5,1362,1,1,2,Irr,Attchd,143000
6,10084,8,5,1694,2,3,2,Reg,Attchd,307000
7,10382,7,6,2090,2,3,2,Irr,Attchd,200000
9,7420,5,6,1077,1,2,1,Reg,Attchd,118000
10,11200,5,5,1040,1,3,1,Reg,Detchd,129500
12,12968,5,6,912,1,2,1,Irr,Detchd,144000


In [8]:
from sklearn.preprocessing import OneHotEncoder

def get_ohe(df, col):
    ohe = OneHotEncoder(drop='first', handle_unknown='error', sparse=False, dtype='int')
    ohe.fit(df[[col]])
    temp_df = pd.DataFrame(data=ohe.transform(df[[col]]), columns=ohe.get_feature_names())
    # If you have a newer version, replace with columns=ohe.get_feature_names_out()
    df.drop(columns=[col], axis=1, inplace=True)
    df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
    return df

In [9]:
subset_data = get_ohe(subset_data, 'LotShape')
subset_data = get_ohe(subset_data, 'GarageType')
subset_data.head(20)



Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SalePrice,x0_Reg,x0_Detchd,x0_Other
0,8450,7,5,1710,2,3,2,208500,1,0,0
1,9600,6,8,1262,2,3,2,181500,1,0,0
2,11250,7,5,1786,2,3,2,223500,0,0,0
3,14260,8,5,2198,2,4,3,250000,0,0,0
4,14115,5,5,1362,1,1,2,143000,0,0,0
5,10084,8,5,1694,2,3,2,307000,1,0,0
6,10382,7,6,2090,2,3,2,200000,0,0,0
7,7420,5,6,1077,1,2,1,118000,1,0,0
8,11200,5,5,1040,1,3,1,129500,1,1,0
9,12968,5,6,912,1,2,1,144000,0,1,0


In [10]:
# X_train, X_test, y_train, y_test = train_test_split(subset_data.drop(columns = ['SalePrice']), subset_data['SalePrice'], test_size=0.25)
X_train, X_test, y_train, y_test = train_test_split(subset_data.drop(columns = ['SalePrice']), subset_data['SalePrice'], test_size=0.25, random_state=50)
# Pick a random_state as below and keep using the same number (example 35) to repeat the same test and train data
# X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25, random_state=35)
X_train
X_test
y_train
y_test

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,x0_Reg,x0_Detchd,x0_Other
175,10900,6,7,1392,1,3,1,0,0,0
575,5600,4,5,1092,2,3,0,1,0,1
1123,9750,7,6,2097,2,3,2,1,1,0
1021,14303,8,5,1987,2,2,2,0,0,0
328,7200,5,5,894,1,2,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...
1195,9042,7,9,2340,2,4,1,1,0,0
229,11287,7,6,1982,2,3,2,1,0,0
70,8530,7,5,1474,2,3,2,0,0,1
132,5400,6,7,1616,2,3,1,1,1,0


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,x0_Reg,x0_Detchd,x0_Other
679,10004,6,6,1516,1,3,2,1,0,0
1023,9587,7,5,1166,2,2,2,0,0,0
453,11146,8,5,1717,2,3,3,0,0,0
284,8544,3,4,1040,2,2,2,1,1,0
739,13501,8,5,1636,2,3,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1016,3136,7,5,1405,2,2,2,0,0,0
813,10206,3,3,944,1,2,2,1,1,0
27,10552,5,5,1700,1,4,2,0,0,0
2,11250,7,5,1786,2,3,2,0,0,0


175     161750
575      55000
1123    274300
1021    301500
328     123000
         ...  
1195    266500
229     228500
70      168500
132     152000
109     180000
Name: SalePrice, Length: 898, dtype: int64

679     167000
1023    190000
453     255500
284      87500
739     255000
         ...  
1016    171750
813      82000
27      165500
2       223500
24       40000
Name: SalePrice, Length: 300, dtype: int64

In [11]:
model = LinearRegression(fit_intercept = True)
model.fit(X_train, y_train) 

# The following gives the R-square score
model.score(X_train, y_train) 

# This is the coefficient Beta_1, ..., Beta_7
model.coef_

# This is the coefficient Beta_0
model.intercept_

LinearRegression()

0.8040606762260561

array([ 8.54126329e-01,  2.01432502e+04,  3.41432204e+03,  6.86559154e+01,
       -2.94320675e+03, -1.39994833e+04,  1.69436706e+04, -9.70936029e+03,
       -1.58908503e+04, -1.12726542e+03])

-50735.48946572392

In [12]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_SalePrice'])
# When extending to multiple features remove .array.reshape(-1, 1)
test_output.head()

Unnamed: 0,pred_SalePrice
679,182473.32
1023,195582.14
453,257830.57
284,76453.11
739,254280.91


In [13]:
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
mean_absolute_error = abs(test_output['pred_SalePrice'] - test_output['SalePrice']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_SalePrice,SalePrice
679,182473.32,167000
1023,195582.14,190000
453,257830.57,255500
284,76453.11,87500
739,254280.91,255000


Mean absolute error is 
19675.554460243766


#### Visualize data

In [14]:
# define function to import viz libraries
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [15]:
cols = X_train.columns
for col in cols:
    plot_data = []
    plot_data.append(go.Scatter(x= X_train[col], y= y_train, name = 'Train data actual', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_train[col], y= model.predict(X_train), name = 'Train data predicted', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_test[col], y= y_test, name = 'Test data actual', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_test[col], y= model.predict(X_test), name = 'Test data predicted', mode = 'markers'))

    layout = go.Layout(xaxis = dict(title=col), yaxis = dict(title= 'SalePrice'), 
                       title = 'Plot of predicted and actual')
    fig = go.Figure(data= plot_data, layout=layout)
    plotly.offline.iplot(fig)


In [16]:
mean_absolute_error_ratio = abs(test_output['pred_SalePrice'] - test_output['SalePrice']).mean()/test_output['SalePrice'].mean()
print('Mean absolute error ratio is ')
print(mean_absolute_error_ratio)
# Without the categorical variables:
# Mean absolute error ratio is 
# 0.11562408981964112

Mean absolute error ratio is 
0.11254262105041833
