In [1]:
import pandas as pd
import numpy as np
import wrangle as w

# modeling methods
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, RFE

In [2]:
# Acquire the three modelling frames from the project-local `wrangle` module.
# NOTE(review): the split itself happens inside w.wrangle_zillow, which is not
# visible here — confirm it is seeded so a rerun reproduces the same frames.
train, validate, test = w.wrangle_zillow()

opening data from local file


In [3]:
# Scale the three frames with the project-local helper.
# NOTE(review): confirm w.scale_data fits its scaler on train only and leaves
# tax_value unscaled — the RMSE figures recorded further down are dollar-sized.
train_scaled, validate_scaled, test_scaled = w.scale_data(train, validate, test)

In [4]:
# Columns removed from every feature matrix. 'tax_value' is the target;
# NOTE(review): the reason 'lot_size_binned' is excluded is not recorded here.
drops = [
    'tax_value',
    'lot_size_binned',
]

In [5]:
# Feature matrices: the scaled frames minus the columns listed in `drops`.
X_train_scaled = train_scaled.drop(labels=drops, axis='columns')
X_validate_scaled = validate_scaled.drop(labels=drops, axis='columns')
X_test_scaled = test_scaled.drop(labels=drops, axis='columns')

# Targets: the tax_value column of each scaled frame.
y_train = train_scaled['tax_value']
y_validate = validate_scaled['tax_value']
y_test = test_scaled['tax_value']

# Feature selection

In [6]:
# Univariate selection: score each feature against the target with the
# f_regression test and keep the 4 best-scoring columns.
f_selector = SelectKBest(score_func=f_regression, k=4)
f_selector.fit(X_train_scaled, y_train)

# get_support() yields a boolean mask aligned with X_train_scaled's columns.
feature_mask = f_selector.get_support()

# Names of the selected columns, as a plain list.
f_feature_kbest = X_train_scaled.columns[feature_mask].tolist()

In [7]:
# Display the columns picked by SelectKBest.
f_feature_kbest

['bathrooms', 'area', 'garage_sqft', 'year_built']

In [8]:
# Manual override: replaces the SelectKBest result with a hand-picked list
# ('region' in place of 'bathrooms'). Every later "kbest" model uses this list.
# NOTE(review): the reason for the substitution is not recorded in the notebook.
f_feature_kbest = ['region', 'area', 'garage_sqft', 'year_built']

In [9]:
# Recursive feature elimination: wrap a linear regression in RFE and ask it
# to keep 4 features.
lm = LinearRegression()
f_selector = RFE(estimator=lm, n_features_to_select=4)  # reuses the name of the SelectKBest selector
f_selector.fit(X_train_scaled, y_train)

# Boolean mask over X_train_scaled's columns marking the surviving features.
feature_mask = f_selector.get_support()

# Names of the surviving columns, as a plain list.
f_feature_rfe = X_train_scaled.columns[feature_mask].tolist()

In [10]:
# Display the columns picked by RFE.
f_feature_rfe

['bedrooms', 'area', 'cars_garage', 'garage_sqft']

In [11]:
# Manual override: replaces the RFE result with a hand-picked list of five
# features (RFE itself was asked for four). Every later "-rfe" model uses this list.
# NOTE(review): the reason for the substitution is not recorded in the notebook.
f_feature_rfe = ['pools', 'year_built', 'area', 'garage_sqft', 'region']

# Establish Baseline

In [12]:
# Wrap each target in a DataFrame so prediction columns can be added beside it.
y_train, y_validate, y_test = (
    pd.DataFrame(target) for target in (y_train, y_validate, y_test)
)

In [13]:
# Baseline predictions. Both statistics come from the TRAIN target only and are
# broadcast to every row of train and validate.
y_train['tax_value_pred_mean'] = y_train['tax_value'].mean()
y_train['tax_value_pred_med'] = y_train['tax_value'].median()

y_validate['tax_value_pred_mean'] = y_train['tax_value'].mean()
y_validate['tax_value_pred_med'] = y_train['tax_value'].median()

In [14]:
# RMSE of the mean baseline on train and validate.
rmse_train_mu = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_mean'],
) ** 0.5
rmse_validate_mu = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_mean'],
) ** 0.5

# RMSE of the median baseline on train and validate.
rmse_train_med = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_med'],
) ** 0.5
rmse_validate_med = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_med'],
) ** 0.5

In [15]:
# Start the model-comparison table with the mean baseline as its first row.
# The 'R2' column is filled with explained_variance_score on validate.
metric_df = pd.DataFrame([
    dict(
        model='mean_baseline',
        RMSE_train=rmse_train_mu,
        RMSE_validate=rmse_validate_mu,
        difference=rmse_validate_mu - rmse_train_mu,
        R2=explained_variance_score(y_validate['tax_value'],
                                    y_validate['tax_value_pred_mean']),
    )
])

In [16]:
# Display the comparison table (baseline row only at this point).
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0


In [17]:
# Ordinary least squares on the f_feature_kbest columns.
lm = LinearRegression().fit(X_train_scaled[f_feature_kbest], y_train['tax_value'])

# In-sample predictions and their RMSE.
y_train['tax_value_pred_lm'] = lm.predict(X_train_scaled[f_feature_kbest])
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_lm'],
) ** 0.5

# Out-of-sample (validate) predictions and their RMSE.
y_validate['tax_value_pred_lm'] = lm.predict(X_validate_scaled[f_feature_kbest])
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_lm'],
) ** 0.5

In [18]:
# Add the OLS (kbest features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'OLS Regressor',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_lm)
            }
        ]),
    ],
    ignore_index=True,
)

In [19]:
# Display the comparison table after adding the OLS (kbest) row.
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.207054


In [20]:
# Ordinary least squares on the f_feature_rfe columns.
lm = LinearRegression().fit(X_train_scaled[f_feature_rfe], y_train['tax_value'])

# In-sample predictions and RMSE. This reuses the 'tax_value_pred_lm' column,
# overwriting the predictions stored by the kbest OLS cell.
y_train['tax_value_pred_lm'] = lm.predict(X_train_scaled[f_feature_rfe])
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_lm'],
) ** 0.5

# Validate predictions and RMSE (same column reuse as above).
y_validate['tax_value_pred_lm'] = lm.predict(X_validate_scaled[f_feature_rfe])
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_lm'],
) ** 0.5

In [21]:
# Add the OLS (rfe features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'OLS-rfe',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_lm)
            }
        ]),
    ],
    ignore_index=True,
)

In [22]:
# Display the comparison table after adding the OLS-rfe row.
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.207054
2,OLS-rfe,215268.700711,219013.771067,3745.070355,0.208196


# Lasso + Lars

In [23]:
# LassoLars (alpha=0.1) on the f_feature_kbest columns.
lars = LassoLars(alpha=0.1).fit(X_train_scaled[f_feature_kbest], y_train['tax_value'])

# In-sample predictions and their RMSE.
y_train['tax_value_pred_lars'] = lars.predict(X_train_scaled[f_feature_kbest])
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_lars'],
) ** 0.5

# Validate predictions and their RMSE.
y_validate['tax_value_pred_lars'] = lars.predict(X_validate_scaled[f_feature_kbest])
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_lars'],
) ** 0.5

In [24]:
# Add the LassoLars (kbest features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'Lasso + Lars',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_lars)
            }
        ]),
    ],
    ignore_index=True,
)

In [25]:
# Display the comparison table after adding the LassoLars (kbest) row.
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.207054
2,OLS-rfe,215268.700711,219013.771067,3745.070355,0.208196
3,Lasso + Lars,215520.881033,219171.181556,3650.300523,0.207057


In [26]:
# LassoLars on the f_feature_rfe columns. Note alpha=0.01 here, whereas the
# kbest variant used alpha=0.1.
lars = LassoLars(alpha=0.01).fit(X_train_scaled[f_feature_rfe], y_train['tax_value'])

# In-sample predictions and RMSE (overwrites the 'tax_value_pred_lars' column
# written by the kbest LassoLars cell).
y_train['tax_value_pred_lars'] = lars.predict(X_train_scaled[f_feature_rfe])
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_lars'],
) ** 0.5

# Validate predictions and RMSE (same column reuse as above).
y_validate['tax_value_pred_lars'] = lars.predict(X_validate_scaled[f_feature_rfe])
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_lars'],
) ** 0.5

In [27]:
# Add the LassoLars (rfe features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'Lasso + Lars-rfe',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_lars)
            }
        ]),
    ],
    ignore_index=True,
)

In [28]:
# Display the comparison table after adding the LassoLars-rfe row.
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.207054
2,OLS-rfe,215268.700711,219013.771067,3745.070355,0.208196
3,Lasso + Lars,215520.881033,219171.181556,3650.300523,0.207057
4,Lasso + Lars-rfe,215268.700753,219013.719581,3745.018828,0.208196


In [29]:
# Tweedie GLM (power=1, alpha=0) on the f_feature_kbest columns.
# NOTE(review): the metrics recorded for this model in the table below equal
# the mean baseline's to every printed digit — check whether the fit actually
# converged (e.g. look for a ConvergenceWarning) before trusting this row.
glm = TweedieRegressor(power=1, alpha=0).fit(
    X_train_scaled[f_feature_kbest], y_train['tax_value'])

# In-sample predictions and their RMSE.
y_train['tax_value_pred_glm'] = glm.predict(X_train_scaled[f_feature_kbest])
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_glm'],
) ** 0.5

# Validate predictions and their RMSE.
y_validate['tax_value_pred_glm'] = glm.predict(X_validate_scaled[f_feature_kbest])
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_glm'],
) ** 0.5

In [30]:
# Add the Tweedie GLM (kbest features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'Tweedie Regressor',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_glm)
            }
        ]),
    ],
    ignore_index=True,
)

In [31]:
# Display the comparison table after adding the Tweedie (kbest) row.
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.2070538
2,OLS-rfe,215268.700711,219013.771067,3745.070355,0.208196
3,Lasso + Lars,215520.881033,219171.181556,3650.300523,0.2070569
4,Lasso + Lars-rfe,215268.700753,219013.719581,3745.018828,0.2081963
5,Tweedie Regressor,243722.129324,246130.971338,2408.842014,1.110223e-16


In [32]:
# Tweedie GLM (power=1, alpha=0) on the f_feature_rfe columns.
# NOTE(review): the metrics recorded for this model in the table below equal
# the mean baseline's to every printed digit — check whether the fit actually
# converged (e.g. look for a ConvergenceWarning) before trusting this row.
glm = TweedieRegressor(power=1, alpha=0).fit(
    X_train_scaled[f_feature_rfe], y_train['tax_value'])

# In-sample predictions and RMSE (overwrites the 'tax_value_pred_glm' column
# written by the kbest Tweedie cell).
y_train['tax_value_pred_glm'] = glm.predict(X_train_scaled[f_feature_rfe])
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_glm'],
) ** 0.5

# Validate predictions and RMSE (same column reuse as above).
y_validate['tax_value_pred_glm'] = glm.predict(X_validate_scaled[f_feature_rfe])
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_glm'],
) ** 0.5

In [33]:
# Add the Tweedie GLM (rfe features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'Tweedie Regressor-rfe',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_glm)
            }
        ]),
    ],
    ignore_index=True,
)

In [34]:
# Display the comparison table after adding the Tweedie-rfe row.
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.2070538
2,OLS-rfe,215268.700711,219013.771067,3745.070355,0.208196
3,Lasso + Lars,215520.881033,219171.181556,3650.300523,0.2070569
4,Lasso + Lars-rfe,215268.700753,219013.719581,3745.018828,0.2081963
5,Tweedie Regressor,243722.129324,246130.971338,2408.842014,1.110223e-16
6,Tweedie Regressor-rfe,243722.129324,246130.971338,2408.842014,1.110223e-16


In [35]:
# Expand the f_feature_kbest columns into degree-2 polynomial terms
# (bias, linear, squared and pairwise-interaction columns).
pf = PolynomialFeatures(degree=2)

# Fit the transformer on the training features only, then expand them.
X_train_degree2 = pf.fit_transform(X_train_scaled[f_feature_kbest])

# Validate and test reuse the transformer fitted on train: transform only,
# never refit a preprocessing step on held-out data.
X_validate_degree2 = pf.transform(X_validate_scaled[f_feature_kbest])
X_test_degree2 = pf.transform(X_test_scaled[f_feature_kbest])

In [36]:
# Linear regression on the degree-2 expansion of the kbest features.
lm2 = LinearRegression().fit(X_train_degree2, y_train['tax_value'])

# In-sample predictions and their RMSE.
y_train['tax_value_pred_lm2'] = lm2.predict(X_train_degree2)
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_lm2'],
) ** 0.5

# Validate predictions and their RMSE.
y_validate['tax_value_pred_lm2'] = lm2.predict(X_validate_degree2)
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_lm2'],
) ** 0.5

In [37]:
# Add the polynomial (kbest features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'Polynomial',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_lm2)
            }
        ]),
    ],
    ignore_index=True,
)

In [38]:
# Display the comparison table after adding the Polynomial (kbest) row.
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.2070538
2,OLS-rfe,215268.700711,219013.771067,3745.070355,0.208196
3,Lasso + Lars,215520.881033,219171.181556,3650.300523,0.2070569
4,Lasso + Lars-rfe,215268.700753,219013.719581,3745.018828,0.2081963
5,Tweedie Regressor,243722.129324,246130.971338,2408.842014,1.110223e-16
6,Tweedie Regressor-rfe,243722.129324,246130.971338,2408.842014,1.110223e-16
7,Polynomial,211053.371691,214005.652335,2952.280645,0.2439874


In [39]:
# Expand the f_feature_rfe columns into degree-2 polynomial terms
# (bias, linear, squared and pairwise-interaction columns).
# This rebinds pf and the X_*_degree2 arrays created for the kbest features.
pf = PolynomialFeatures(degree=2)

# Fit the transformer on the training features only, then expand them.
X_train_degree2 = pf.fit_transform(X_train_scaled[f_feature_rfe])

# Validate and test reuse the transformer fitted on train: transform only,
# never refit a preprocessing step on held-out data.
X_validate_degree2 = pf.transform(X_validate_scaled[f_feature_rfe])
X_test_degree2 = pf.transform(X_test_scaled[f_feature_rfe])

In [40]:
# Linear regression on whichever degree-2 expansion is currently bound to
# X_train_degree2 (in notebook order: the f_feature_rfe expansion).
lm2 = LinearRegression().fit(X_train_degree2, y_train['tax_value'])

# In-sample predictions and RMSE (overwrites the 'tax_value_pred_lm2' column
# written by the previous polynomial model).
y_train['tax_value_pred_lm2'] = lm2.predict(X_train_degree2)
rmse_train = mean_squared_error(
    y_true=y_train['tax_value'],
    y_pred=y_train['tax_value_pred_lm2'],
) ** 0.5

# Validate predictions and RMSE (same column reuse as above).
y_validate['tax_value_pred_lm2'] = lm2.predict(X_validate_degree2)
rmse_validate = mean_squared_error(
    y_true=y_validate['tax_value'],
    y_pred=y_validate['tax_value_pred_lm2'],
) ** 0.5

In [41]:
# Add the polynomial (rfe features) scores as a new row of metric_df.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row frame and concatenated instead; the resulting table is the same.
# NOTE: 'R2' holds explained_variance_score, which matches R² only when the
# residuals average to zero.
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([
            {
                'model' : 'Polynomial-rfe',
                'RMSE_train' : rmse_train,
                'RMSE_validate' : rmse_validate,
                'difference' : rmse_validate - rmse_train,
                'R2' : explained_variance_score(y_validate.tax_value,
                                               y_validate.tax_value_pred_lm2)
            }
        ]),
    ],
    ignore_index=True,
)

In [42]:
# Display the full comparison table (all validate-stage models).
metric_df

Unnamed: 0,model,RMSE_train,RMSE_validate,difference,R2
0,mean_baseline,243722.129324,246130.971338,2408.842014,0.0
1,OLS Regressor,215520.87718,219171.616444,3650.739264,0.2070538
2,OLS-rfe,215268.700711,219013.771067,3745.070355,0.208196
3,Lasso + Lars,215520.881033,219171.181556,3650.300523,0.2070569
4,Lasso + Lars-rfe,215268.700753,219013.719581,3745.018828,0.2081963
5,Tweedie Regressor,243722.129324,246130.971338,2408.842014,1.110223e-16
6,Tweedie Regressor-rfe,243722.129324,246130.971338,2408.842014,1.110223e-16
7,Polynomial,211053.371691,214005.652335,2952.280645,0.2439874
8,Polynomial-rfe,210452.027938,213462.815257,3010.787319,0.2478174


In [43]:
# Hand-typed notes emitted via print. The figures are string literals, not
# computed values, and do not match any row of metric_df displayed above.
# NOTE(review): presumably results from earlier runs with 3/4/5 features —
# confirm, and consider moving these notes to a markdown cell.
print('3 varialbe, OLS-rfe best')
print('OLS-rfe	214552.202335	218110.669318	3558.466983	0.214702')
print('4 vaiable, polynomial best')
print('Polynomial	210641.884099	214250.079823	3608.195724	0.242266')
print('5 varaible, polynomila best')
print('Polynomial	209763.653824	213557.375839	3793.722014	0.247151')

3 varialbe, OLS-rfe best
OLS-rfe	214552.202335	218110.669318	3558.466983	0.214702
4 vaiable, polynomial best
Polynomial	210641.884099	214250.079823	3608.195724	0.242266
5 varaible, polynomila best
Polynomial	209763.653824	213557.375839	3793.722014	0.247151


In [44]:
# Hard-coded copy of row 7 ('Polynomial') of metric_df as displayed above;
# it is a string literal and will not follow the table if the models are rerun.
print('7	Polynomial	211053.371691	214005.652335	2952.280645	2.439874e-01')

7	Polynomial	211053.371691	214005.652335	2952.280645	2.439874e-01


In [45]:
# Score the held-out test set with the polynomial model.
# NOTE(review): lm2 and X_test_degree2 are whatever the most recently run
# polynomial cells left behind — in notebook order that is the f_feature_rfe
# variant, not the kbest 'Polynomial' row printed just above. Confirm which
# model was intended.
y_test['tax_value_pred_lm2'] = lm2.predict(X_test_degree2)
rmse_test = mean_squared_error(
    y_true=y_test['tax_value'],
    y_pred=y_test['tax_value_pred_lm2'],
) ** 0.5

In [46]:
# One-row summary of test-set performance. As in metric_df, the 'R2' column is
# filled with explained_variance_score.
test_metric_df = pd.DataFrame([
    dict(
        model='Polynomial test',
        RMSE_test=rmse_test,
        R2=explained_variance_score(y_test['tax_value'],
                                    y_test['tax_value_pred_lm2']),
    )
])

In [47]:
# Display the test-set summary.
test_metric_df

Unnamed: 0,model,RMSE_test,R2
0,Polynomial test,213160.518547,0.24401


In [48]:
# Scratch arithmetic: baseline train RMSE (243722.129324, as shown in
# metric_df) minus a pasted-in literal.
# NOTE(review): 215566.186715 matches no value displayed in this notebook —
# presumably an RMSE from an earlier run; confirm or replace with a variable.
(243722.129324 - 215566.186715)

28155.94260900002

In [49]:
# Scratch arithmetic: baseline train RMSE (243722.129324, as shown in
# metric_df) minus a pasted-in literal.
# NOTE(review): 213583.064565 matches no value displayed in this notebook —
# presumably an RMSE from an earlier run; confirm or replace with a variable.
(243722.129324 - 213583.064565)

30139.064759

In [51]:
# RMSE reduction from the mean baseline (train) to the polynomial model (test).
# Uses the live variables rather than pasted-in literals: the two literals were
# the displayed values of rmse_train_mu and rmse_test, and would silently go
# stale whenever the models are rerun.
rmse_train_mu - rmse_test

30561.610776999994

In [50]:
# Scales the per-house RMSE difference by the total row count of all three splits.
# NOTE(review): 213583.064565 matches no value displayed in this notebook
# (the test RMSE shown is 213160.518547) — confirm which run it came from.
# NOTE(review): RMSE is a root-mean-square, so multiplying an RMSE difference
# by a row count is not a sum of per-house error reductions; treat the
# resulting "total" with caution.
(243722.129324 - 213583.064565) * (len(train) + len(validate) + len(test))

1371839810.635403

For houses that had a transaction in 2017, our model reduced the prediction error relative to the baseline by \\$30,139 per house, for a total of \\$1,371,839,810.64 (about 1.37 billion dollars).