In [19]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from math import sqrt

In [3]:
# Load the cleaned housing data from the CSV in the working directory.
df = pd.read_csv("house_data_cleaned.csv")

In [4]:
# Preview the first five rows to confirm the load and see the available columns.
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bed_bath_sum,renovation_yes,sqft_living_plus_lot,sqft_living_plus_lot_15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,...,,98178,47.5112,-122.257,1340,5650,4.0,0,6830,6990
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,1991.0,98125,47.721,-122.319,1690,7639,5.25,1,9812,9329
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,,98028,47.7379,-122.233,2720,8062,3.0,0,10770,10782
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,,98136,47.5208,-122.393,1360,5000,7.0,0,6960,6360
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,,98074,47.6168,-122.045,1800,7503,5.0,0,9760,9303


In [5]:
# Working frame: the target (price) plus the six candidate predictors studied below.
df6 = df[[
    'price',
    'sqft_living',
    'grade',
    'sqft_above',
    'sqft_living15',
    'bathrooms',
    'view',
]]

In [6]:
# Preview the reduced frame.
df6.head(n=5)

Unnamed: 0,price,sqft_living,grade,sqft_above,sqft_living15,bathrooms,view
0,221900.0,1180,7,1180,1340,1.0,0.0
1,538000.0,2570,7,2170,1690,2.25,0.0
2,180000.0,770,6,770,2720,1.0,0.0
3,604000.0,1960,7,1050,1360,3.0,0.0
4,510000.0,1680,8,1680,1800,2.0,0.0


In [7]:
# Boolean mask of column pairs whose absolute Pearson correlation exceeds 0.8;
# the diagonal is always True because each column correlates perfectly with itself.
df6.corr().abs() > 0.8

Unnamed: 0,price,sqft_living,grade,sqft_above,sqft_living15,bathrooms,view
price,True,False,False,False,False,False,False
sqft_living,False,True,False,True,False,False,False
grade,False,False,True,False,False,False,False
sqft_above,False,True,False,True,False,False,False
sqft_living15,False,False,False,False,True,False,False
bathrooms,False,False,False,False,False,True,False
view,False,False,False,False,False,False,True


## Notes on above
- at a .7 correlation threshold, sqft_living is clearly the best predictor, since it is correlated with everything but view
- at a .8 correlation threshold, sqft_living correlates only with sqft_above
- first test: remove sqft_above and see the outcome!

## Test[1] : dropping sqft_above

In [8]:
# Test[1] design matrix: every df6 column except the target and the dropped
# sqft_above, which leaves sqft_living, grade, sqft_living15, bathrooms, view.
X1 = df6.drop(columns=['price', 'sqft_above'])

# Regression target.
y = df6['price']

In [9]:
# Ordinary least squares with statsmodels.
# `predictors` is kept as a plain NumPy array for any later cell that uses it.
predictors = np.asarray(X1)

# Build the design matrix from the DataFrame rather than the bare array:
# add_constant prepends a 'const' intercept column and keeps the column names,
# so the summary labels each coefficient (sqft_living, grade, ...) instead of
# the anonymous x1..x5 that an ndarray produces.
predictors_int = sm.add_constant(X1)

# OLS(dependent variable, design matrix).fit()
model = sm.OLS(y, predictors_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.574
Model:,OLS,Adj. R-squared:,0.574
Method:,Least Squares,F-statistic:,5809.0
Date:,"Tue, 16 Apr 2019",Prob (F-statistic):,0.0
Time:,10:26:40,Log-Likelihood:,-298190.0
No. Observations:,21597,AIC:,596400.0
Df Residuals:,21591,BIC:,596400.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.5e+05,1.29e+04,-42.756,0.000,-5.75e+05,-5.25e+05
x1,182.8500,3.596,50.851,0.000,175.802,189.898
x2,9.896e+04,2335.091,42.378,0.000,9.44e+04,1.04e+05
x3,-2.7493,3.876,-0.709,0.478,-10.347,4.849
x4,-3.059e+04,3330.820,-9.185,0.000,-3.71e+04,-2.41e+04
x5,9.579e+04,2240.663,42.751,0.000,9.14e+04,1e+05

0,1,2,3
Omnibus:,16620.975,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1088824.458
Skew:,3.161,Prob(JB):,0.0
Kurtosis:,37.205,Cond. No.,24500.0


In [10]:
# Now using scikit-learn.
ss = StandardScaler()

# Cast to float64 before scaling: X1 mixes integer and float columns, and
# handing integer data to StandardScaler makes it emit conversion warnings.
# The scaled values are unchanged because the scaler works in float64 anyway.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set means/variances leak into the scaling; consider fitting it on the
# training split only.
X1_scaled = ss.fit_transform(X1.astype('float64'))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [11]:
# Seed NumPy's global RNG so the shuffle inside train_test_split is repeatable.
np.random.seed(33)

# Hold out 25% of the rows for testing (scikit-learn's default, spelled out here).
X1_train, X1_test, Y_train, Y_test = train_test_split(
    X1_scaled,
    y,
    test_size=0.25,
)

In [12]:
# Fit a plain linear regression on the standardized training split.
lr = LinearRegression()
lr.fit(X=X1_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:
# Coefficient of determination (R^2) of the fitted model on the held-out split.
lr.score(X=X1_test, y=Y_test)

0.5815117275967299

In [14]:
# Fitted coefficients, in the same order as the X1 columns. The model was trained
# on StandardScaler output, so each value is the price change per one standard
# deviation of that feature, not per raw unit.
lr.coef_

array([161493.65221168, 118316.48323463,   3152.76714081, -25662.7335309 ,
        69021.0727499 ])

In [15]:
# Score the fitted model on the held-out split; predict once and reuse the result.
y_pred = lr.predict(X1_test)
r_2 = metrics.r2_score(Y_test, y_pred)
mae = metrics.mean_absolute_error(Y_test, y_pred)
mse = metrics.mean_squared_error(Y_test, y_pred)

# Variance inflation factor, one value per predictor:
#     VIF_j = 1 / (1 - R_j^2)
# where R_j^2 comes from regressing predictor j on the remaining predictors.
# Plugging the price model's R^2 into that formula does not measure
# multicollinearity. The diagonal of the inverse predictor correlation matrix
# equals these per-feature VIFs exactly.
vif = pd.Series(
    np.diag(np.linalg.inv(X1.corr().values)),
    index=X1.columns,
    name='vif',
)

In [20]:
# Price vs. the top-6 benchmark predictors with sqft_above dropped.
# The held-out test split is 25% of the rows.
report_lines = [
    f'Our list of predictors: {X1.columns}',
    f"The R-squared score of our test is {r_2}",
    f"The MAE score of our test is {mae}",
    f"The MSE score of our test is {mse}",
    f"The RMSE score of our test is ${sqrt(mse)}",
]
print("\n".join(report_lines))

Our list of predictors: Index(['sqft_living', 'grade', 'sqft_living15', 'bathrooms', 'view'], dtype='object')
The R-squared score of our test is 0.5815117275967299
The MAE score of our test is 157595.44542674246
The MSE score of our test is 60759037804.78902
The RMSE score of our test is $246493.4843049386


##  Test[1] Observations
- 

In [None]:
# Display `vif` as computed in the metrics cell. This cell carries no execution
# count, so it has not been run in this saved copy of the notebook.
vif