# Prediction of house price with Linear Regression, LASSO and Extra tree Regressor



In [9]:
import pandas
import numpy as np
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.model_selection import cross_val_score,GridSearchCV
#from sklearn import grid_search
from sklearn.ensemble import ExtraTreesRegressor
import statsmodels.api as sm
float_formatter = lambda x: "%.2f" % x
np.set_printoptions(formatter={'float_kind':float_formatter})
import sklearn.feature_selection as f

In [3]:
######################################################################################

g = pandas.read_csv("kc-house-data.csv",encoding = "ISO-8859-1")
g["price"]    = g["price"]/1000



X               = g[["sqft_above","sqft_basement","sqft_lot","sqft_living","floors","bedrooms",
                     "yr_built","lat","long","bathrooms"]].values
Y               = g["price"].values
zipcodes        = pandas.get_dummies(g["zipcode"]).values
condition       = pandas.get_dummies(g["condition"]).values
grade           = pandas.get_dummies(g["grade"]).values
X               = np.concatenate((X,zipcodes),axis=1)
X               = np.concatenate((X,condition),axis=1)
X               = np.concatenate((X,grade),axis=1)



In [4]:
#######################################################################

model = sm.OLS(g["price"],X)
results = model.fit()
print(results.summary())



                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.792
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     881.0
Date:                Sun, 04 Oct 2020   Prob (F-statistic):               0.00
Time:                        15:49:49   Log-Likelihood:            -1.4134e+05
No. Observations:               21613   AIC:                         2.829e+05
Df Residuals:                   21519   BIC:                         2.836e+05
Df Model:                          93                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0681      0.002     34.701      0.0

In [6]:
#######################################################################
 
clf            = LinearRegression()
clf.fit(X, g["price"].values)
scores = cross_val_score(clf,X , g["price"].values, cv=3)
print("Linear Regression Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(clf.coef_)
print("LinearRegression # coeffs :" + str(clf.coef_.shape[0]))





Linear Regression Accuracy: 0.79 (+/- 0.03)
[0.07 0.04 0.00 0.11 -18.19 -22.89 -0.91 90.84 -312.49 31.75 -193.77
 -171.86 -213.18 546.04 89.64 71.09 55.94 117.82 -55.20 -85.99 -10.54
 -42.92 -85.39 -251.43 41.94 10.86 -96.08 74.51 -169.20 -165.45 -204.24
 157.74 -5.24 -89.55 994.82 329.25 -145.16 48.85 49.60 43.56 -148.77
 -91.37 -142.13 -94.40 11.83 -108.00 -44.36 19.98 33.22 -81.73 -187.78
 213.93 74.16 232.18 -122.93 74.87 -123.29 239.05 356.63 85.41 45.68 40.16
 -57.17 225.83 99.07 -29.34 -60.33 -101.17 22.26 48.72 -114.51 -161.49
 -94.04 -123.04 -178.28 -29.25 -139.79 -181.53 -171.63 116.91 -64.44
 -19.23 -0.21 21.96 61.93 -270.32 -240.36 -303.31 -335.20 -345.63 -334.03
 -296.29 -206.75 -68.37 155.20 618.20 1626.85]
LinearRegression # coeffs :97


In [7]:
#######################################################################

clf            = Lasso(max_iter = 100000000)
clf.fit(X, g["price"].values)
scores = cross_val_score(clf,X , g["price"].values, cv=3)
print("Lasso Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(clf.coef_)
print("Lasso # coeffs :" + str(clf.coef_[clf.coef_>0].shape[0]))



Lasso Accuracy: 0.74 (+/- 0.02)
[0.01 0.00 0.00 0.20 14.78 -38.13 -1.94 525.31 -99.23 46.11 -0.00 -0.00
 -0.00 487.44 0.00 61.76 0.00 35.04 0.00 -30.15 -0.00 -24.31 0.00 -22.11
 0.00 0.00 -63.81 0.00 -0.00 -0.00 -0.00 90.64 -0.00 -0.00 631.16 279.63
 -0.00 0.00 0.00 -0.00 -0.00 -0.00 -16.43 -0.00 -0.00 0.00 -26.54 -0.00
 0.00 -41.64 -0.00 26.01 22.85 106.21 -0.00 0.00 -0.00 32.18 251.28 25.18
 23.83 0.00 -0.00 101.64 23.37 -0.00 0.00 -69.58 3.62 1.19 -0.00 -0.00
 -74.23 -0.00 -14.06 -0.00 -0.00 -0.00 -0.00 67.54 -0.00 -0.00 -16.58 0.00
 26.94 -0.00 0.00 -0.00 -40.36 -114.14 -105.35 -69.92 0.00 101.33 260.70
 507.46 43.23]
Lasso # coeffs :29


In [8]:
######################################################################

clf            = ExtraTreesRegressor()
parameters     = {'max_depth':np.arange(1,15)}
clfgrid        = GridSearchCV(clf, parameters)
clfgrid.fit(X, g["price"].values)
scores = cross_val_score(clf,X , g["price"].values, cv=3)
print("Extratrees Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#######################################################################################




Extratrees Accuracy: 0.84 (+/- 0.01)
