In [1]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats in DataFrame output with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode: display the value of every bare expression in a
# cell, not only the last one. Later cells rely on this to show several results each.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [2]:
# Load the house-price subset; the path is resolved relative to the working directory
df = pd.read_csv(filepath_or_buffer='house_price_subset.csv')

# Preview the first five rows to check that the columns loaded as expected
df.head(n=5)

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,SalePrice
0,8450,7,5,1710,2,3,2,Normal,208500
1,9600,6,8,1262,2,3,2,Normal,181500
2,11250,7,5,1786,2,3,2,Normal,223500
3,9550,7,5,1717,1,3,3,Abnorml,140000
4,14260,8,5,2198,2,4,3,Normal,250000


In [3]:
# Keep only the rows whose SaleCondition is 'Normal'
df = df[df['SaleCondition'] == 'Normal']

df.head()

# Number of rows left after the filter
len(df)

# SaleCondition is constant after the filter, so it carries no information for the
# model. Drop it by name rather than by position so the cell stays correct if the
# column order changes, and reassign instead of mutating the filtered slice in place.
df = df.drop(columns=['SaleCondition'])

df.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SaleCondition,SalePrice
0,8450,7,5,1710,2,3,2,Normal,208500
1,9600,6,8,1262,2,3,2,Normal,181500
2,11250,7,5,1786,2,3,2,Normal,223500
4,14260,8,5,2198,2,4,3,Normal,250000
5,14115,5,5,1362,1,1,2,Normal,143000


1198

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars,SalePrice
0,8450,7,5,1710,2,3,2,208500
1,9600,6,8,1262,2,3,2,181500
2,11250,7,5,1786,2,3,2,223500
4,14260,8,5,2198,2,4,3,250000
5,14115,5,5,1362,1,1,2,143000


In [4]:
# Features: every remaining column except the target
x = df.drop(['SalePrice'], axis = 1)

# Hold out 25% of the rows for testing. random_state is fixed so the same rows land in
# train and test on every run; without it the scores and errors computed further down
# change each time the notebook is re-executed.
X_train, X_test, y_train, y_test = train_test_split(x, df['SalePrice'], test_size=0.25, random_state=35)
X_train
X_test
y_train
y_test

Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
513,9187,6,5,1080,1,3,2
1110,8000,6,5,1658,2,3,2
1268,14100,8,9,3447,3,4,3
780,7875,7,5,1253,2,3,2
1071,11700,6,6,1556,1,4,2
...,...,...,...,...,...,...,...
145,2522,6,5,1709,2,3,2
242,5000,5,4,1440,1,3,1
1448,11767,4,7,1346,1,2,1
124,17043,6,5,1586,2,3,2


Unnamed: 0,LotArea,OverallQual,OverallCond,GrLivArea,FullBath,BedroomAbvGr,GarageCars
276,9196,7,5,1560,2,3,2
1137,6342,5,8,1020,1,2,0
337,9135,7,5,1536,2,3,2
451,70761,7,5,1533,2,2,2
177,13650,5,5,1920,2,4,2
...,...,...,...,...,...,...,...
385,3182,8,5,1269,2,2,2
932,11670,9,5,1905,2,3,3
645,10530,6,5,981,1,3,2
1189,7500,7,5,1804,2,3,2


513     134000
1110    188000
1268    381000
780     176000
1071    154000
         ...  
145     130000
242      79000
1448    112000
124     181000
1069    135000
Name: SalePrice, Length: 898, dtype: int64

276     201000
1137     94000
337     214000
451     280000
177     172500
         ...  
385     192000
932     320000
645     143250
1189    189000
563     185000
Name: SalePrice, Length: 300, dtype: int64

In [5]:
# Linear regression of SalePrice on all feature columns of X_train, with an intercept
model = LinearRegression(fit_intercept = True)

model.fit(X_train, y_train) # X_train is already a 2-D DataFrame, so no reshape is needed

# The following gives the R-square score on the training data (not a held-out score)
model.score(X_train, y_train) # same 2-D input as above; no reshape needed

# Fitted coefficients: one value per feature column of X_train
model.coef_

# This is the coefficient Beta_0 (the intercept)
model.intercept_

LinearRegression()

0.8146581491497346

array([ 1.15627214e+00,  2.30738502e+04,  2.39862622e+03,  6.12987943e+01,
        1.65674906e+03, -1.13094431e+04,  1.66711936e+04])

-79353.08270153886

In [6]:
# Predict on the held-out rows and keep X_test's index on the result, so the
# predictions can be matched to the actual prices by row label afterwards.
test_output = pd.DataFrame({'pred_charges': model.predict(X_test)}, index=X_test.index)
test_output.head()

Unnamed: 0,pred_charges
276,203143.75
1137,104100.89
337,201602.05
451,283984.02
177,172904.21


In [7]:
# Attach the actual prices to the predictions by row index. Starting from the
# prediction column alone keeps this cell re-runnable: merging onto an already merged
# frame would yield SalePrice_x / SalePrice_y and break the 'SalePrice' lookups below.
test_output = test_output[['pred_charges']].merge(y_test, left_index = True, right_index = True)
test_output.head()

# Mean absolute error in price units, then the same error as a fraction of the mean
# actual sale price in the test set
mean_absolute_error = (test_output['pred_charges'] - test_output['SalePrice']).abs().mean()
error = mean_absolute_error/(test_output['SalePrice']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)
print(error)

Unnamed: 0,pred_charges,SalePrice
276,203143.75,201000
1137,104100.89,94000
337,201602.05,214000
451,283984.02,280000
177,172904.21,172500


Mean absolute error is 
23617.209442665226
0.13449318717590458


#### Visualize data

In [8]:
# Import the visualization libraries and initialise plotly for notebook output
# (no function is defined here; this cell only imports and configures)
import plotly
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): the wildcard import below puts every plotly.graph_objs name into the
# notebook namespace; the plotting cell that follows uses the explicit `go.` prefix.
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [9]:
# go.Scatter needs one-dimensional x values, but X_train / X_test hold several feature
# columns. Plot prices against a single feature instead; the model still predicts from
# all features, so predicted points will not fall on a straight line.
PLOT_FEATURE = 'GrLivArea'

# (trace name, x values, y values) for each of the four series to draw
series_to_plot = [
    ('Train data actual', X_train[PLOT_FEATURE], y_train),
    ('Train data predicted', X_train[PLOT_FEATURE], model.predict(X_train)),
    ('Test data actual', X_test[PLOT_FEATURE], y_test),
    ('Test data predicted', X_test[PLOT_FEATURE], model.predict(X_test)),
]
plot_data = [go.Scatter(x= xs, y= ys, name = label, mode = 'markers') for label, xs, ys in series_to_plot]

# Label the x axis with the feature actually plotted
layout = go.Layout(xaxis = dict(title=PLOT_FEATURE), yaxis = dict(title= 'SalePrice'), 
                   title = 'Plot of predicted and actual')
fig = go.Figure(data= plot_data, layout=layout)
plotly.offline.iplot(fig)
