In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
import pandas as pd
from scipy import stats
from pydataset import data
import numpy as np
import env
import matplotlib.pyplot as plt
import os
import prepare
import acquire
import seaborn as sns
import explore
from math import sqrt
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoLars
from sklearn.linear_model import TweedieRegressor

In [2]:
df = acquire.get_zillow_sfr_data()

In [3]:
train, val, test = prepare.clean_prep_zillow(df)

In [4]:
train.shape

(38619, 6)

In [5]:
x_train, y_train, x_val, y_val, x_test, y_test = prepare.modeling_split(train, val, test, 'tax_value')

In [6]:
x_train_scaled, x_val_scaled, x_test_scaled = prepare.prep_2_model(x_train, 
                                                                   x_val, x_test, x_train)


In [7]:
x_val_scaled.head().T


Unnamed: 0,28238,9010,42538,55990,29379
bedroom_1.0,0.0,0.0,0.0,0.0,0.0
bedroom_2.0,0.0,0.0,0.0,0.0,1.0
bedroom_3.0,1.0,0.0,0.0,0.0,0.0
bedroom_5.0,0.0,0.0,0.0,0.0,0.0
bedroom_6.0,0.0,0.0,0.0,0.0,0.0
bedroom_7.0,0.0,0.0,0.0,0.0,0.0
bathroom_1.0,0.0,0.0,0.0,0.0,1.0
bathroom_1.5,0.0,0.0,0.0,0.0,0.0
bathroom_2.5,0.0,1.0,0.0,0.0,0.0
bathroom_3.0,0.0,0.0,0.0,0.0,0.0


In [8]:
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select = 7)
rfe.fit(x_train_scaled, y_train)

In [9]:
ranks = rfe.ranking_
columns = x_train_scaled.columns.tolist()

In [10]:
feature_ranks = pd.DataFrame({'ranking':ranks, 'feature':columns})

In [11]:
feature_ranks.sort_values('ranking')

Unnamed: 0,ranking,feature
12,1,bathroom_4.5
17,1,bathroom_7.0
16,1,bathroom_6.5
15,1,bathroom_6.0
14,1,bathroom_5.5
25,1,sqft_scaled
5,1,bedroom_7.0
13,2,bathroom_5.0
10,3,bathroom_3.5
11,4,bathroom_4.0


In [12]:
pf = PolynomialFeatures(degree = 2)

In [13]:
pf.fit(x_train_scaled, y_train)

In [14]:
x_poly=pf.transform(x_train_scaled)

In [15]:
lmtwo = LinearRegression()

In [16]:
lmtwo.fit(x_poly, y_train)

In [17]:
preds_df = pd.DataFrame({'actual': y_train})

In [18]:
preds_df['baseline'] = preds_df.actual.mean()

In [19]:
preds_df.head()

Unnamed: 0,actual,baseline
11547,236760.0,414518.714234
29091,360606.0,414518.714234
19611,341328.0,414518.714234
24109,372312.0,414518.714234
45109,381638.0,414518.714234


In [20]:
sqrt(mean_squared_error(preds_df['baseline'], preds_df['actual']))

356106.2796197136

In [21]:
preds_df['poly_preds'] = lmtwo.predict(x_poly)
preds_df.head()

Unnamed: 0,actual,baseline,poly_preds
11547,236760.0,414518.714234,338895.378418
29091,360606.0,414518.714234,258117.574951
19611,341328.0,414518.714234,386502.108398
24109,372312.0,414518.714234,265407.97168
45109,381638.0,414518.714234,321301.100098


In [22]:
sqrt(mean_squared_error(preds_df['poly_preds'], preds_df['actual']))

266794.15823686164

In [23]:
pf2 = PolynomialFeatures(degree = 3)
pf2.fit(x_train_scaled, y_train)
x_poly2=pf.transform(x_train_scaled)
lmthree = LinearRegression()
lmthree.fit(x_poly2, y_train)
preds_df['poly_preds2'] = lmthree.predict(x_poly)

In [24]:
lasso = LassoLars(alpha = 0.15)

In [25]:
lasso.fit(x_train_scaled, y_train)

In [26]:
lasso_preds = lasso.predict(x_train_scaled)

In [27]:
preds_df['lasso_preds'] = lasso_preds
preds_df.head()

Unnamed: 0,actual,baseline,poly_preds,poly_preds2,lasso_preds
11547,236760.0,414518.714234,338895.378418,338895.378418,347808.379646
29091,360606.0,414518.714234,258117.574951,258117.574951,269515.714869
19611,341328.0,414518.714234,386502.108398,386502.108398,305963.807869
24109,372312.0,414518.714234,265407.97168,265407.97168,279297.220545
45109,381638.0,414518.714234,321301.100098,321301.100098,314070.532418


In [28]:
sqrt(mean_squared_error(preds_df['lasso_preds'], preds_df['actual']))

270926.5503506549

In [29]:
tweedie = TweedieRegressor(power = 0)

In [30]:
tweedie.fit(x_train_scaled, y_train)

In [31]:
preds_df['tweedie'] = tweedie.predict(x_train_scaled)
preds_df.head()

Unnamed: 0,actual,baseline,poly_preds,poly_preds2,lasso_preds,tweedie
11547,236760.0,414518.714234,338895.378418,338895.378418,347808.379646,399606.342347
29091,360606.0,414518.714234,258117.574951,258117.574951,269515.714869,380084.763092
19611,341328.0,414518.714234,386502.108398,386502.108398,305963.807869,415478.83344
24109,372312.0,414518.714234,265407.97168,265407.97168,279297.220545,380253.125751
45109,381638.0,414518.714234,321301.100098,321301.100098,314070.532418,402835.524173


In [32]:
sqrt(mean_squared_error(preds_df['tweedie'], preds_df['actual']))

346263.85531092784

In [33]:
tweedietwo = TweedieRegressor(power = 1)
tweedietwo.fit(x_train_scaled, y_train)
preds_df['tweedietwo'] = tweedietwo.predict(x_train_scaled)

In [34]:
preds_df.head()

Unnamed: 0,actual,baseline,poly_preds,poly_preds2,lasso_preds,tweedie,tweedietwo
11547,236760.0,414518.714234,338895.378418,338895.378418,347808.379646,399606.342347,327331.468016
29091,360606.0,414518.714234,258117.574951,258117.574951,269515.714869,380084.763092,260390.418697
19611,341328.0,414518.714234,386502.108398,386502.108398,305963.807869,415478.83344,328825.32701
24109,372312.0,414518.714234,265407.97168,265407.97168,279297.220545,380253.125751,264637.382641
45109,381638.0,414518.714234,321301.100098,321301.100098,314070.532418,402835.524173,324618.530737


In [35]:
tweedietwo_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweedietwo']))
tweedietwo_rmse

273909.17140425305

In [36]:
tweediethree = TweedieRegressor(power = 2)
tweediethree.fit(x_train_scaled, y_train)
preds_df['tweediethree'] = tweediethree.predict(x_train_scaled)

In [37]:
tweediethree_rmse = sqrt(mean_squared_error(preds_df['actual'], preds_df['tweediethree']))
tweediethree_rmse

346452.8670043818

In [38]:
preds_df.head()

Unnamed: 0,actual,baseline,poly_preds,poly_preds2,lasso_preds,tweedie,tweedietwo,tweediethree
11547,236760.0,414518.714234,338895.378418,338895.378418,347808.379646,399606.342347,327331.468016,392730.21517
29091,360606.0,414518.714234,258117.574951,258117.574951,269515.714869,380084.763092,260390.418697,372290.756544
19611,341328.0,414518.714234,386502.108398,386502.108398,305963.807869,415478.83344,328825.32701,407862.441496
24109,372312.0,414518.714234,265407.97168,265407.97168,279297.220545,380253.125751,264637.382641,372437.897121
45109,381638.0,414518.714234,321301.100098,321301.100098,314070.532418,402835.524173,324618.530737,395666.036432
