In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
# fetch data 

housing_data = pd.read_csv('house_price_subset.csv')

housing_data.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,SalePrice
0,8450,7,5,1710,2,3,2,Normal,208500
1,9600,6,8,1262,2,3,2,Normal,181500
2,11250,7,5,1786,2,3,2,Normal,223500
3,9550,7,5,1717,1,3,3,Abnorml,140000
4,14260,8,5,2198,2,4,3,Normal,250000


In [3]:
# subset data to only include columns for Normal Sale Condition

len(housing_data)

subset_data = housing_data[housing_data['SaleCondition'] == 'Normal'].drop(columns = ['SaleCondition'])

subset_data.head()

len(subset_data)

1460

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SalePrice
0,8450,7,5,1710,2,3,2,208500
1,9600,6,8,1262,2,3,2,181500
2,11250,7,5,1786,2,3,2,223500
4,14260,8,5,2198,2,4,3,250000
5,14115,5,5,1362,1,1,2,143000


1198

In [4]:
subset_data.isna().sum()

LotArea         0
OverallQual     0
OverallCond     0
GrLivArea       0
FullBath        0
BedroomAbvGr    0
GarageCars      0
SalePrice       0
dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(subset_data.drop(columns = ['SalePrice']), subset_data['SalePrice'], test_size=0.25)
# Pick a random_state as below and keep using the same number (example 35) to repeat the same test and train data
# X_train, X_test, y_train, y_test = train_test_split(subset_data['bmi'], subset_data['charges'], test_size=0.25, random_state=35)
X_train
X_test
y_train
y_test

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
1092,8400,6,5,1694,2,4,2
313,215245,7,5,2036,2,3,2
1204,10140,5,6,1074,1,3,2
124,17043,6,5,1586,2,3,2
1396,57200,5,5,1687,1,3,2
...,...,...,...,...,...,...,...
705,5600,4,5,1092,2,3,0
1406,8445,5,7,768,1,2,2
55,10175,6,5,1425,2,3,2
980,12122,7,9,999,1,3,2


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
1033,8125,7,5,1654,2,3,3
1190,32463,4,4,1622,1,3,4
35,13418,8,5,2452,3,4,3
1201,10400,7,5,1779,2,3,2
394,10134,5,6,1034,1,2,1
...,...,...,...,...,...,...,...
603,3182,7,5,1200,2,2,2
1118,13825,5,6,1601,1,3,1
619,12244,8,5,2262,2,4,3
988,12046,6,6,2030,2,4,2


1092    136500
313     375000
1204    153500
124     181000
1396    160000
         ...  
705      55000
1406    133000
55      180500
980     178400
1358    177500
Name: SalePrice, Length: 898, dtype: int64

1033    230000
1190    168000
35      309000
1201    197900
394     109000
         ...  
603     151000
1118    140000
619     305000
988     195000
429     175000
Name: SalePrice, Length: 300, dtype: int64

In [6]:
model = LinearRegression(fit_intercept = True)
model.fit(X_train, y_train) 

# The following gives the R-square score
model.score(X_train, y_train) 

# This is the coefficient Beta_1, ..., Beta_7
model.coef_

# This is the coefficient Beta_0
model.intercept_

LinearRegression()

0.7995533812290185

array([ 9.78394709e-01,  2.25387758e+04,  2.71192055e+03,  6.54827669e+01,
        8.67433945e+02, -1.21476317e+04,  1.76101322e+04])

-80728.4862363954

In [7]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_SalePrice'])
# When extending to multiple features remove .array.reshape(-1, 1)
test_output.head()

Unnamed: 0,pred_SalePrice
1033,224982.87
1190,193114.04
35,293675.34
1201,217783.93
394,120043.45


In [8]:
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
mean_absolute_error = abs(test_output['pred_SalePrice'] - test_output['SalePrice']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_SalePrice,SalePrice
1033,224982.87,230000
1190,193114.04,168000
35,293675.34,309000
1201,217783.93,197900
394,120043.45,109000


Mean absolute error is 
21860.193513723367


#### Visualize data

In [9]:
# define function to import viz libraries
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [10]:
cols = X_train.columns
for col in cols:
    plot_data = []
    plot_data.append(go.Scatter(x= X_train[col], y= y_train, name = 'Train data actual', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_train[col], y= model.predict(X_train), name = 'Train data predicted', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_test[col], y= y_test, name = 'Test data actual', mode = 'markers'))
    plot_data.append(go.Scatter(x= X_test[col], y= model.predict(X_test), name = 'Test data predicted', mode = 'markers'))

    layout = go.Layout(xaxis = dict(title=col), yaxis = dict(title= 'SalePrice'), 
                       title = 'Plot of predicted and actual')
    fig = go.Figure(data= plot_data, layout=layout)
    plotly.offline.iplot(fig)
