In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
# fetch data 

housing_data = pd.read_csv('Datasets/house_price_subset.csv')

housing_data.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,SalePrice
0,8450,7,5,1710,2,3,2,Normal,208500
1,9600,6,8,1262,2,3,2,Normal,181500
2,11250,7,5,1786,2,3,2,Normal,223500
3,9550,7,5,1717,1,3,3,Abnorml,140000
4,14260,8,5,2198,2,4,3,Normal,250000


In [3]:
# subset data to only include columns for Normal Sale Condition

len(housing_data)

subset_data = housing_data[housing_data['SaleCondition'] == 'Normal'].drop(columns = ['SaleCondition'])

subset_data.head()

len(subset_data)

1460

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SalePrice
0,8450,7,5,1710,2,3,2,208500
1,9600,6,8,1262,2,3,2,181500
2,11250,7,5,1786,2,3,2,223500
4,14260,8,5,2198,2,4,3,250000
5,14115,5,5,1362,1,1,2,143000


1198

In [4]:
subset_data.isna().sum()

LotArea         0
OverallQual     0
OverallCond     0
GrLivArea       0
FullBath        0
BedroomAbvGr    0
GarageCars      0
SalePrice       0
dtype: int64

In [5]:
# X_train, X_test, y_train, y_test = train_test_split(subset_data.drop(columns = ['SalePrice']), subset_data['SalePrice'], test_size=0.25)
X_train, X_test, y_train, y_test = train_test_split(subset_data.drop(columns = ['SalePrice']), subset_data['SalePrice'], test_size=0.25, random_state=35)
# Pick a random_state as below and keep using the same number (example 35) to repeat the same test and train data
# X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25, random_state=35)
X_train
X_test
y_train
y_test

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
574,10500,5,7,1109,1,3,1
662,13560,6,3,1392,1,2,2
97,10921,4,5,960,1,3,1
191,7472,7,9,1479,1,4,2
769,53504,8,5,3279,3,4,3
...,...,...,...,...,...,...,...
911,9286,5,7,1268,1,3,1
1176,6951,5,5,923,1,3,1
1231,7728,5,6,1190,1,3,2
1295,8400,5,5,1052,1,3,1


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
1350,11643,5,5,2634,2,6,4
1438,7407,6,7,1236,1,2,2
975,2651,7,5,1382,2,3,2
163,5500,4,6,882,1,1,0
1225,10482,6,8,1138,1,3,1
...,...,...,...,...,...,...,...
132,7388,5,6,1327,1,3,2
582,11841,6,5,816,1,3,0
1390,9100,7,5,1525,2,3,2
94,9337,6,5,1786,2,3,2


574     139000
662     110000
97       94750
191     184000
769     538000
         ...  
911     143500
1176    119500
1231    132500
1295    138500
1168    235000
Name: SalePrice, Length: 898, dtype: int64

1350    200000
1438    149700
975     165000
163     103200
1225    145000
         ...  
132     150750
582     118500
1390    235000
94      204750
777     142500
Name: SalePrice, Length: 300, dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns)
X_train
X_test
y_train
y_test

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
0,-0.01,-0.78,1.15,-0.79,-1.00,0.18,-1.03
1,0.28,-0.03,-2.33,-0.22,-1.00,-1.10,0.39
2,0.03,-1.54,-0.59,-1.08,-1.00,0.18,-1.03
3,-0.29,0.73,2.90,-0.05,-1.00,1.45,0.39
4,4.03,1.49,-0.59,3.55,2.69,1.45,1.82
...,...,...,...,...,...,...,...
893,-0.12,-0.78,1.15,-0.47,-1.00,0.18,-1.03
894,-0.34,-0.78,-0.59,-1.16,-1.00,0.18,-1.03
895,-0.27,-0.78,0.28,-0.63,-1.00,0.18,0.39
896,-0.20,-0.78,-0.59,-0.90,-1.00,0.18,-1.03


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
0,0.10,-0.78,-0.59,2.26,0.85,4.00,3.24
1,-0.30,-0.03,1.15,-0.53,-1.00,-1.10,0.39
2,-0.74,0.73,-0.59,-0.24,0.85,0.18,0.39
3,-0.48,-1.54,0.28,-1.24,-1.00,-2.37,-2.45
4,-0.01,-0.03,2.02,-0.73,-1.00,0.18,-1.03
...,...,...,...,...,...,...,...
295,-0.30,-0.78,0.28,-0.35,-1.00,0.18,0.39
296,0.12,-0.03,-0.59,-1.37,-1.00,0.18,-2.45
297,-0.14,0.73,-0.59,0.04,0.85,0.18,0.39
298,-0.11,-0.03,-0.59,0.57,0.85,0.18,0.39


574     139000
662     110000
97       94750
191     184000
769     538000
         ...  
911     143500
1176    119500
1231    132500
1295    138500
1168    235000
Name: SalePrice, Length: 898, dtype: int64

1350    200000
1438    149700
975     165000
163     103200
1225    145000
         ...  
132     150750
582     118500
1390    235000
94      204750
777     142500
Name: SalePrice, Length: 300, dtype: int64

In [7]:
model = LinearRegression(fit_intercept = True)
model.fit(X_train, y_train) 

# The following gives the R-square score
model.score(X_train, y_train) 

# This is the coefficient Beta_1, ..., Beta_7
model.coef_

# This is the coefficient Beta_0
model.intercept_

LinearRegression()

0.8208015412689109

array([10142.32230792, 29963.31168563,  2821.45352121, 34298.99827469,
         -44.04935573, -9427.28874807, 12547.96875896])

177367.58129175947

In [8]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_SalePrice'])
# When extending to multiple features remove .array.reshape(-1, 1)
test_output.head()

Unnamed: 0,pred_SalePrice
0,233688.24
1,173894.77
2,185042.88
3,76307.97
4,142695.42


In [9]:
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
mean_absolute_error = abs(test_output['pred_SalePrice'] - test_output['SalePrice']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_SalePrice,SalePrice
23,118454.19,129900
24,324127.24,154000
25,361344.63,256300
27,301899.44,306000
28,130819.69,207500


Mean absolute error is 
51318.670780662345


In [10]:
abs(test_output['pred_SalePrice'] - test_output['SalePrice']).mean()/test_output['SalePrice'].mean()



0.3039332619816686

#### Visualize data

In [11]:
# define function to import viz libraries
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [12]:
cols = X_train.columns
for col in cols:
    plot_data = []
    plot_data.append(go.Scatter(x= X_train[col], y= y_train, name = 'Train data actual', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_train[col], y= model.predict(X_train), name = 'Train data predicted', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_test[col], y= y_test, name = 'Test data actual', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_test[col], y= model.predict(X_test), name = 'Test data predicted', mode = 'markers'))

    layout = go.Layout(xaxis = dict(title=col), yaxis = dict(title= 'SalePrice'), 
                       title = 'Plot of predicted and actual')
    fig = go.Figure(data= plot_data, layout=layout)
    plotly.offline.iplot(fig)
