## Step 0: Import modules, packages

In [1]:
import numpy as np
import pandas as pd
from math import pi

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression as lr
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

In [2]:
cd ..

/Users/alphonsowoodbury/DS/mod2project/flatiron_mod2_project_kch


---

## Step 1: Import and prep data

In [3]:
df = pd.read_csv('kc_cleaned.csv')
df.zipcode = df.zipcode.astype('str') #always imports as int...
df = df.drop(['lat','long'],axis=1) #not a feature
df

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,zipcode,sqft_living15,sqft_lot15,has_waterfront,has_basement,has_renovation,eff_built
0,221900.0,3,1,1180.0,5650.0,1,0,3,7,98178,1340.0,5650.0,0,0,0,65
1,538000.0,3,2,2570.0,7242.0,2,0,3,7,98125,1690.0,7639.0,0,1,1,29
2,180000.0,2,1,770.0,10000.0,1,0,3,6,98028,2720.0,8062.0,0,0,0,87
3,604000.0,4,3,1960.0,5000.0,1,0,5,7,98136,1360.0,5000.0,0,1,0,55
4,510000.0,3,2,1680.0,8080.0,1,0,3,8,98074,1800.0,7503.0,0,0,0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,360000.0,3,2,1530.0,1131.0,3,0,3,8,98103,1530.0,1509.0,0,0,0,11
21593,400000.0,4,2,2310.0,5813.0,2,0,3,8,98146,1830.0,7200.0,0,0,0,6
21594,402101.0,2,0,1020.0,1350.0,2,0,3,7,98144,1020.0,2007.0,0,0,0,11
21595,400000.0,3,2,1600.0,2388.0,2,0,3,8,98027,1410.0,1287.0,0,0,0,16


##### **question:** are 'has_waterfront',etc already dummy encoded?

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 16 columns):
price             21597 non-null float64
bedrooms          21597 non-null int64
bathrooms         21597 non-null int64
sqft_living       21597 non-null float64
sqft_lot          21597 non-null float64
floors            21597 non-null int64
view              21597 non-null int64
condition         21597 non-null int64
grade             21597 non-null int64
zipcode           21597 non-null object
sqft_living15     21597 non-null float64
sqft_lot15        21597 non-null float64
has_waterfront    21597 non-null int64
has_basement      21597 non-null int64
has_renovation    21597 non-null int64
eff_built         21597 non-null int64
dtypes: float64(5), int64(10), object(1)
memory usage: 2.6+ MB


Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'view',
       'condition', 'grade', 'zipcode', 'sqft_living15', 'sqft_lot15',
       'has_waterfront', 'has_basement', 'has_renovation', 'eff_built'],
      dtype='object')

In [5]:
cols = df.columns
x_cols = cols[1:]
x_cols = ['bathrooms', 'sqft_living','view', 'sqft_lot', 'condition', 'grade', 'zipcode', 'sqft_living15', 'sqft_lot15', 'has_waterfront', 'has_basement', 'has_renovation', 'eff_built']

In [6]:
df[x_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 13 columns):
bathrooms         21597 non-null int64
sqft_living       21597 non-null float64
view              21597 non-null int64
sqft_lot          21597 non-null float64
condition         21597 non-null int64
grade             21597 non-null int64
zipcode           21597 non-null object
sqft_living15     21597 non-null float64
sqft_lot15        21597 non-null float64
has_waterfront    21597 non-null int64
has_basement      21597 non-null int64
has_renovation    21597 non-null int64
eff_built         21597 non-null int64
dtypes: float64(4), int64(8), object(1)
memory usage: 2.1+ MB


In [7]:
categorical_variables = ['zipcode']
categorical_variables

['zipcode']

---

## Step 2: Process categorical feature(s) (*run get_dummies*)

In [8]:
df_ohe = pd.get_dummies(df[x_cols], columns=categorical_variables,drop_first=True)
df_ohe.head()

Unnamed: 0,bathrooms,sqft_living,view,sqft_lot,condition,grade,sqft_living15,sqft_lot15,has_waterfront,has_basement,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
0,1,1180.0,0,5650.0,3,7,1340.0,5650.0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,2570.0,0,7242.0,3,7,1690.0,7639.0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,770.0,0,10000.0,3,6,2720.0,8062.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1960.0,0,5000.0,5,7,1360.0,5000.0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,2,1680.0,0,8080.0,3,8,1800.0,7503.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print(df_ohe.shape)

(21597, 81)


In [10]:
df_ohe.corr()

Unnamed: 0,bathrooms,sqft_living,view,sqft_lot,condition,grade,sqft_living15,sqft_lot15,has_waterfront,has_basement,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
bathrooms,1.000000,0.698591,0.175111,0.086029,-0.127328,0.607156,0.510950,0.082039,0.059587,0.126748,...,-0.062797,-0.018181,-0.046927,-0.027891,-0.080724,-0.006668,-0.047062,-0.023197,-0.046386,0.007889
sqft_living,0.698591,1.000000,0.281715,0.173453,-0.059445,0.762779,0.756402,0.184342,0.104637,0.201198,...,-0.058106,-0.026323,-0.046455,-0.001475,-0.074826,0.028933,-0.042364,-0.024066,-0.041815,0.010832
view,0.175111,0.281715,1.000000,0.075054,0.045622,0.249082,0.278928,0.073083,0.380543,0.176905,...,0.025785,-0.015687,-0.011922,0.061657,-0.032063,0.082709,0.043647,-0.008966,0.053909,0.050897
sqft_lot,0.086029,0.173453,0.075054,1.000000,-0.008830,0.114731,0.144763,0.718204,0.021459,-0.034889,...,-0.017066,-0.007625,-0.016883,-0.003925,-0.010386,-0.008433,-0.018171,-0.009560,-0.012657,-0.028480
condition,-0.127328,-0.059445,0.045622,-0.008830,1.000000,-0.146896,-0.093072,-0.003126,0.016648,0.130542,...,-0.021116,-0.026858,0.014123,0.027666,-0.028394,0.014164,-0.015198,-0.009660,0.005819,0.018398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zipcode_98177,-0.006668,0.028933,0.082709,-0.008433,0.014164,0.029681,0.031837,-0.004281,-0.003787,0.042987,...,-0.012708,-0.005623,-0.015873,-0.011925,-0.012276,1.000000,-0.012113,-0.008702,-0.012528,-0.013341
zipcode_98178,-0.047062,-0.042364,0.043647,-0.018171,-0.015198,-0.079451,-0.054631,-0.018725,0.037313,0.022689,...,-0.012883,-0.005701,-0.016092,-0.012089,-0.012445,-0.012113,1.000000,-0.008822,-0.012700,-0.013525
zipcode_98188,-0.023197,-0.024066,-0.008966,-0.009560,-0.009660,-0.042148,-0.040701,-0.008772,-0.006567,-0.002855,...,-0.009255,-0.004095,-0.011560,-0.008684,-0.008940,-0.008702,-0.008822,1.000000,-0.009123,-0.009716
zipcode_98198,-0.046386,-0.041815,0.053909,-0.012657,0.005819,-0.054154,-0.046004,-0.013782,0.035501,-0.008267,...,-0.013324,-0.005896,-0.016642,-0.012503,-0.012871,-0.012528,-0.012700,-0.009123,1.000000,-0.013988


---

## Step 3: Feature Selection
* VIF
* ?

In [11]:
vif = [variance_inflation_factor(df_ohe.values, i) for i in range(df_ohe.shape[1])]
list(zip(x_cols, vif))

[('bathrooms', 15.397105827887552),
 ('sqft_living', 24.458109966917203),
 ('view', 1.5505921904606732),
 ('sqft_lot', 2.3945596602206187),
 ('condition', 32.03425425480559),
 ('grade', 87.37752196958772),
 ('zipcode', 30.557314915411713),
 ('sqft_living15', 2.7617982689567913),
 ('sqft_lot15', 1.2184021751166572),
 ('has_waterfront', 2.0846552117310266),
 ('has_basement', 1.1294567446668518),
 ('has_renovation', 7.807981142796038),
 ('eff_built', 1.3933406456207227)]

## Step 4: Set up and Train Model

In [12]:
from sklearn.model_selection import train_test_split

X = df_ohe
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=55)

In [14]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [15]:
reg = lr.fit(X_train,y_train)

In [16]:
reg.score(X_train,y_train)

0.8001991829826596

In [17]:
reg.coef_

array([ 2.34261711e+04,  1.61186115e+02,  5.57895457e+04,  2.45034071e-01,
        2.28817969e+04,  6.68191315e+04,  1.64080835e+01, -1.16775401e-01,
        6.96846195e+05, -3.82048852e+04,  8.45475291e+04,  1.00119814e+03,
        2.57376723e+04, -1.92572709e+04,  7.96401127e+05,  2.75865700e+05,
        2.48851489e+05,  2.27866544e+05,  2.40224759e+05,  7.83266440e+04,
        1.13856701e+05,  1.07983293e+05,  8.67732298e+04, -1.89241959e+04,
       -3.06167703e+04,  1.62626131e+05,  1.59685658e+05,  1.20419373e+05,
        1.92090936e+05, -3.30055847e+03,  9.38037691e+03, -9.88508496e+01,
        3.49370942e+05,  1.93235448e+05,  2.72639317e+04,  1.35296215e+06,
        4.87900496e+05, -2.41431831e+02,  9.29925462e+04,  2.15893804e+05,
        2.06815635e+05,  3.68512678e+04,  9.69482933e+04,  2.81064022e+04,
        8.68577059e+04,  7.40832501e+04, -1.25381717e+04,  1.50053650e+05,
        1.58653336e+05,  1.70060642e+05,  1.21677063e+05, -3.83680199e+04,
        4.53731055e+05,  

---

## Step 5: Test and validate Model

In [18]:
y_pred = lr.predict(X_test) #prdict price on test data
y_pred

array([ 476018.23029201, 1678900.557377  ,  245767.12533277, ...,
        370394.95865402,  417100.05169392,  462808.07571328])

In [19]:
y_error = round((y_pred - y_test),2)
y_error.mean()

371.4325939955152

#### Questions:

* now to do R2?
* how do you plot your y_pred against y_test?

In [24]:
# plt.figure(figsize=(20,20))
# plt.scatter(y_test,y_pred)

y_test is answer to the running the model on x_test dataset (error, residual(lowest possible))

## Step...?