In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import env
from sklearn.model_selection import train_test_split
from acquire import get_zillow_data
from prepare import * 

from sklearn.feature_selection import  SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor

import warnings
warnings.filterwarnings('ignore')

### Acquire Zillow data

In [2]:
# Load the Zillow dataset through the project's acquire helper.
df = get_zillow_data()

Reading from csv file...


### Remove outliers

In [3]:
# Columns screened for outliers. 1.5 is the multiplier handed to remove_outliers
# (presumably the IQR fence width -- TODO confirm in prepare.py).
outlier_columns = ['bedrooms', 'bathrooms', 'sqft', 'tax_value']
df = remove_outliers(df, 1.5, outlier_columns)

Number of observations removed: 5670


### Split data

In [4]:
# Partition the cleaned frame into train / validate / test subsets.
splits = split_zillow_data(df)
train, validate, test = splits

Dataframe has been split: 
Train: (26076, 6)
Validate: (11176, 6)
Test: (9313, 6)


In [14]:
# Remove the columns that are not used as model inputs from every split.
unused_columns = ['county', 'year_built']
train, validate, test = (
    split.drop(columns=unused_columns) for split in (train, validate, test)
)

### Create X and Y variables

In [15]:
# Separate predictors from the target for each split. The target is selected
# with double brackets, so each y_* is a one-column DataFrame that later cells
# extend with prediction columns.
target = 'tax_value'

X_train, y_train = train.drop(columns=[target]), train[[target]]
X_validate, y_validate = validate.drop(columns=[target]), validate[[target]]
X_test, y_test = test.drop(columns=[target]), test[[target]]

In [16]:
# Score every feature against the target with univariate F-tests and keep the
# three highest-scoring ones.
# The target is passed as the 1-D `tax_value` column rather than the whole
# y_train DataFrame: f_regression expects y of shape (n_samples,), and selecting
# the column explicitly keeps this cell correct even if it is re-run after
# later cells have added prediction columns to y_train.
kbest = SelectKBest(f_regression, k=3)
kbest.fit(X_train, y_train['tax_value'])

SelectKBest(k=3, score_func=<function f_regression at 0x7f97518a50d0>)

In [18]:
# Tabulate the p-value and F-score SelectKBest computed for each feature.
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)
kbest_results

Unnamed: 0,p,f
bedrooms,1.3263519999999998e-274,1284.317532
bathrooms,0.0,5133.668196
sqft,0.0,7975.53871


In [19]:
# Names of the columns SelectKBest retained (boolean mask over X_train's columns).
kbest_mask = kbest.get_support()
X_train.columns[kbest_mask]

Index(['bedrooms', 'bathrooms', 'sqft'], dtype='object')

In [20]:
# SelectKBest.transform returns a bare array; wrap it back into a DataFrame
# carrying the original row index and the names of the selected columns.
kbest_columns = X_train.columns[kbest.get_support()]
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train), index=X_train.index, columns=kbest_columns
)
X_train_transformed.head()

Unnamed: 0,bedrooms,bathrooms,sqft
49486,3.0,2.0,1528.0
40685,3.0,3.0,2501.0
489,3.0,3.0,1299.0
23962,3.0,2.0,1671.0
12881,4.0,2.5,2373.0


In [21]:
# Recursive feature elimination driven by a plain linear regression, keeping
# three features.
# The target is passed as the 1-D `tax_value` column rather than the whole
# y_train DataFrame, so the estimator is fit on a single target and the cell
# stays correct if re-run after prediction columns have been added to y_train.
model = LinearRegression()
rfe = RFE(model, n_features_to_select=3)
rfe.fit(X_train, y_train['tax_value'])

RFE(estimator=LinearRegression(), n_features_to_select=3)

In [22]:
# Show the RFE rank assigned to each feature (1 = selected).
pd.Series(rfe.ranking_, index=X_train.columns, name='rfe_ranking').to_frame()

Unnamed: 0,rfe_ranking
bedrooms,1
bathrooms,1
sqft,1


In [23]:
# Names of the columns RFE retained (boolean mask over X_train's columns).
rfe_mask = rfe.get_support()
X_train.columns[rfe_mask]

Index(['bedrooms', 'bathrooms', 'sqft'], dtype='object')

In [24]:
# RFE.transform returns a bare array; wrap it back into a DataFrame carrying
# the original row index and the names of the selected columns.
rfe_columns = X_train.columns[rfe.get_support()]
X_train_transformed = pd.DataFrame(
    rfe.transform(X_train), index=X_train.index, columns=rfe_columns
)
X_train_transformed.head()

Unnamed: 0,bedrooms,bathrooms,sqft
49486,3.0,2.0,1528.0
40685,3.0,3.0,2501.0
489,3.0,3.0,1299.0
23962,3.0,2.0,1671.0
12881,4.0,2.5,2373.0


### Create a baseline

In [25]:
# Baseline: predict one constant (the training mean, then the training median)
# for every row and measure RMSE on train and validate.
# Wrap the targets in DataFrames so prediction columns can be attached to them.
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

# Both constants come from the training target only.
taxvalue_pred_mean = y_train['tax_value'].mean()
taxvalue_pred_median = y_train['tax_value'].median()

baselines = (
    ("RMSE using Mean\nTrain/In-Sample: ", 'taxvalue_pred_mean', taxvalue_pred_mean),
    ("RMSE using Median\nTrain/In-Sample: ", 'taxvalue_pred_median', taxvalue_pred_median),
)

for header, pred_col, pred_value in baselines:
    # Broadcast the constant prediction to every row of both splits.
    y_train[pred_col] = pred_value
    y_validate[pred_col] = pred_value

    # RMSE is the square root of the mean squared error.
    rmse_train = mean_squared_error(y_train['tax_value'], y_train[pred_col]) ** (1/2)
    rmse_validate = mean_squared_error(y_validate['tax_value'], y_validate[pred_col]) ** (1/2)

    print(header, round(rmse_train, 2),
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))


RMSE using Mean
Train/In-Sample:  262435.17 
Validate/Out-of-Sample:  265189.74
RMSE using Median
Train/In-Sample:  266147.4 
Validate/Out-of-Sample:  269193.53


### LinearRegression (OLS)


In [26]:
# Ordinary least squares on the raw features.
# `normalize=True` has been dropped: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2 (where passing it raises TypeError), and
# for an unpenalised linear regression it does not change the predictions.
lm = LinearRegression()

# y_train is a DataFrame that also holds prediction columns, so fit against
# the tax_value column only.
lm.fit(X_train, y_train.tax_value)

# predict train
y_train['taxvalue_pred_lm'] = lm.predict(X_train)

# evaluate: rmse (square root of the mean squared error)
rmse_train = mean_squared_error(y_train.tax_value, y_train.taxvalue_pred_lm)**(1/2)

# predict validate
y_validate['taxvalue_pred_lm'] = lm.predict(X_validate)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.taxvalue_pred_lm)**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)


RMSE for OLS using LinearRegression
Training/In-Sample:  227654.69265710737 
Validation/Out-of-Sample:  230004.51518317722


### LassoLars

In [30]:
# LassoLars with alpha=1.0, fit on the raw features. fit() returns the
# estimator itself, so construction and fitting are chained. The target is the
# tax_value column of y_train, which also holds prediction columns.
lars = LassoLars(alpha=1.0).fit(X_train, y_train['tax_value'])

# Store the predictions for both splits next to the actual values.
y_train['taxvalue_pred_lars'] = lars.predict(X_train)
y_validate['taxvalue_pred_lars'] = lars.predict(X_validate)

# RMSE (square root of the mean squared error) per split.
rmse_train = mean_squared_error(
    y_train['tax_value'], y_train['taxvalue_pred_lars']
) ** (1/2)
rmse_validate = mean_squared_error(
    y_validate['tax_value'], y_validate['taxvalue_pred_lars']
) ** (1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)


RMSE for Lasso + Lars
Training/In-Sample:  227655.02050881245 
Validation/Out-of-Sample:  230005.92764848098


### TweedieRegressor (GLM)


In [None]:
# Tweedie GLM with power=1 and alpha=0 (no penalty), fit on the raw features.
# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE(review): the RMSEs recorded for this cell (262435.17 train / 265189.74
# validate) are the same as the mean-baseline RMSEs, which suggests the model
# ended up predicting the training mean -- possibly a convergence problem on
# unscaled inputs that the global warning filter hides. TODO confirm, e.g. by
# fitting on MinMaxScaler-transformed features and/or raising max_iter.
glm = TweedieRegressor(power=1, alpha=0).fit(X_train, y_train['tax_value'])

# Store the predictions for both splits next to the actual values.
y_train['taxvalue_pred_glm'] = glm.predict(X_train)
y_validate['taxvalue_pred_glm'] = glm.predict(X_validate)

# RMSE (square root of the mean squared error) per split.
rmse_train = mean_squared_error(
    y_train['tax_value'], y_train['taxvalue_pred_glm']
) ** (1/2)
rmse_validate = mean_squared_error(
    y_validate['tax_value'], y_validate['taxvalue_pred_glm']
) ** (1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)


RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  262435.1694660326 
Validation/Out-of-Sample:  265189.74235114787


### PolynomialFeatures

In [34]:
# Expand the (unscaled) feature matrices into degree-2 polynomial features.
pf = PolynomialFeatures(degree=2)

# Learn the expansion from the training features only, then apply it to train.
X_train_degree2 = pf.fit(X_train).transform(X_train)

# Apply the same fitted expansion to the validate and test features.
X_validate_degree2, X_test_degree2 = pf.transform(X_validate), pf.transform(X_test)


### LinearRegression on degree-2 polynomial features

In [35]:
# Linear regression on the degree-2 polynomial features.
# `normalize=True` has been dropped: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2 (where passing it raises TypeError), and
# for an unpenalised linear regression it does not change the predictions.
lm2 = LinearRegression()

# y_train is a DataFrame that also holds prediction columns, so fit against
# the tax_value column only.
lm2.fit(X_train_degree2, y_train.tax_value)

# predict train
y_train['taxvalue_pred_lm2'] = lm2.predict(X_train_degree2)

# evaluate: rmse (square root of the mean squared error)
rmse_train = mean_squared_error(y_train.tax_value, y_train.taxvalue_pred_lm2)**(1/2)

# predict validate
y_validate['taxvalue_pred_lm2'] = lm2.predict(X_validate_degree2)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.taxvalue_pred_lm2)**(1/2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)


RMSE for Polynomial Model, degrees=2
Training/In-Sample:  227250.0009561301 
Validation/Out-of-Sample:  229680.69001349027


### Evaluate the OLS model (LinearRegression) on the test set

In [37]:
# Wrap the test target in a DataFrame so the prediction column can be attached.
y_test = pd.DataFrame(y_test)

# Score the held-out test split with the OLS model (`lm`).
# The prediction column is named after the model that produces it; it was
# previously labelled `taxvalue_pred_glm` even though the values come from `lm`.
y_test['taxvalue_pred_lm'] = lm.predict(X_test)

# evaluate: rmse (square root of the mean squared error)
rmse_test = mean_squared_error(y_test.tax_value, y_test.taxvalue_pred_lm)**(1/2)

print("RMSE for OLS Model using LinearRegression\nOut-of-Sample Performance: ", rmse_test)


RMSE for OLS Model using LinearRegression
Out-of-Sample Performance:  231186.70174349454
