In [1]:
import wrangle
import pandas as pd
import numpy as np
import modeling

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.metrics import explained_variance_score

import matplotlib.pyplot as plt

In [2]:
train, validate, test = wrangle.wrangle_zillow()

In [3]:
train.shape, validate.shape, test.shape

((25908, 18), (8637, 18), (8637, 18))

In [4]:
train.describe().astype(int).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parcelid,25908,13007162,2586803,10711855,11503757,12653312,14128263,162960814
bathrooms,25908,2,0,0,2,2,2,5
bedrooms,25908,3,0,0,3,3,4,6
sqft,25908,1722,666,128,1240,1573,2073,5256
fireplacecnt,25908,0,0,0,0,0,0,4
garagecarcnt,25908,0,0,0,0,0,2,9
lotsize,25908,7160,3127,236,5457,6624,8039,21663
poolcnt,25908,0,0,0,0,0,0,1
logerror,25908,0,0,-4,0,0,0,3
tract,25908,3050,2821,2,746,1955,5300,9800


In [104]:
features = ['age', 'bedrooms' , 'sqft', 'tract', '4plusBath', 'Orange', 'Los Angeles', 'poolcnt', 'garagecarcnt']

In [105]:
# create X and y
X_train = train[features]
y_train = train['tax_value']

X_validate = validate[features]
y_validate = validate['tax_value']

In [106]:
# create MinMaxScaler and fit to train
scaler = MinMaxScaler()
X_train[features] = scaler.fit_transform(X_train[features])
X_validate[features] = scaler.transform(X_validate[features])

# make a dataframe
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)



In [107]:
# 1. Predict value_pred_mean
value_pred_mean = y_train['tax_value'].mean()
y_train['value_pred_mean'] = value_pred_mean
y_validate['value_pred_mean'] = value_pred_mean

# 2. compute value_pred_median
value_pred_median = y_train['tax_value'].median()
y_train['value_pred_median'] = value_pred_median
y_validate['value_pred_median'] = value_pred_median

# 3. RMSE of value_pred_mean
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_mean)**(1/2)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_mean)**(1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

# 4. RMSE of value_pred_median
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_median)**(1/2)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_median)**(1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

RMSE using Mean
Train/In-Sample:  233115.06 
Validate/Out-of-Sample:  234810.22
RMSE using Median
Train/In-Sample:  235121.87 
Validate/Out-of-Sample:  236574.11


In [108]:
# # scale y
# target_scaler = QuantileTransformer()
# y_train.tax_value = target_scaler.fit_transform(y_train[['tax_value']])
# y_validate.tax_value = target_scaler.transform(y_validate[['tax_value']])

In [109]:
# create the model object
lm = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_lm'] = lm.predict(X_train)
#y_train['value_pred_lm'] = target_scaler.inverse_transform(y_train[['value_pred_lm']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_lm)**(1/2)

# predict validate
y_validate['value_pred_lm'] = lm.predict(X_validate)
#y_validate['value_pred_lm'] = target_scaler.inverse_transform(y_validate[['value_pred_lm']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_lm)**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  202273.0100312374 
Validation/Out-of-Sample:  202743.27911783007


In [110]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lars.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_llars'] = lars.predict(X_train)
#y_train['value_pred_llars'] = target_scaler.inverse_transform(y_train[['value_pred_llars']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_llars)**(1/2)

# predict validate
y_validate['value_pred_llars'] = lars.predict(X_validate)
#y_validate['value_pred_llars'] = target_scaler.inverse_transform(y_validate[['value_pred_llars']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_llars)**(1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Lasso + Lars
Training/In-Sample:  202276.96474841915 
Validation/Out-of-Sample:  202739.13431126162


In [111]:
# create the model object
glm = TweedieRegressor(power=0, alpha=0) # changed power to 0 since we normalized the target

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_glm'] = glm.predict(X_train)
#y_train['value_pred_glm'] = target_scaler.inverse_transform(y_train[['value_pred_glm']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_glm)**(1/2)

# predict validate
y_validate['value_pred_glm'] = glm.predict(X_validate)
#y_validate['value_pred_glm'] = target_scaler.inverse_transform(y_validate[['value_pred_glm']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_glm)**(1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  202273.01003127068 
Validation/Out-of-Sample:  202743.27988502494


# Polynomial Regression

In [112]:
# create the model object
pf = PolynomialFeatures(degree=2)

# create the X and y datasets
X2_train = train[features]

X2_validate = validate[features]

 

In [113]:
# fit the model to our training data. We must specify the column in y_train, 
lm2 = LinearRegression(normalize=True)

# since we have converted it to a dataframe from a series! 
lm2.fit(X2_train, y_train.tax_value)

# predict train
y_train['value_pred_lm2'] = lm2.predict(X2_train)
#y_train['value_pred_lm2'] = target_scaler.inverse_transform(y_train[['value_pred_lm2']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_lm2)**(1/2)

# predict validate
y_validate['value_pred_lm2'] = lm2.predict(X2_validate)
#y_validate['value_pred_lm2'] = target_scaler.inverse_transform(y_validate[['value_pred_lm2']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_lm2)**(1/2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Polynomial Model, degrees=2
Training/In-Sample:  202273.0100312374 
Validation/Out-of-Sample:  202743.27911783004


In [114]:
X_train

Unnamed: 0,age,bedrooms,sqft,tract,4plusBath,Orange,Los Angeles,poolcnt,garagecarcnt
50660,0.396947,0.666667,0.256630,0.438661,0.0,0.0,1.0,1.0,0.000000
14557,0.496183,0.500000,0.276326,0.561951,0.0,0.0,1.0,0.0,0.000000
47007,0.389313,0.666667,0.420827,0.077159,0.0,1.0,0.0,0.0,0.222222
26586,0.404580,0.500000,0.254095,0.006532,0.0,0.0,0.0,0.0,0.222222
50383,0.175573,0.500000,0.260335,0.042968,0.0,1.0,0.0,0.0,0.222222
...,...,...,...,...,...,...,...,...,...
637,0.229008,0.666667,0.500390,0.007553,0.0,0.0,0.0,0.0,0.222222
37673,0.244275,0.500000,0.245125,0.077261,0.0,1.0,0.0,0.0,0.222222
32816,0.328244,0.333333,0.238300,0.101449,0.0,1.0,0.0,0.0,0.222222
23077,0.435115,0.500000,0.221919,0.140131,0.0,0.0,1.0,0.0,0.000000
