In [1]:
import wrangle
import pandas as pd
import numpy as np
import modeling
import explore
import prepare

import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import SelectKBest, f_regression, RFE

from sklearn.preprocessing import MinMaxScaler, QuantileTransformer, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.metrics import explained_variance_score

import matplotlib.pyplot as plt

In [2]:
train, validate, test = wrangle.wrangle_zillow()

In [3]:
train.shape, validate.shape, test.shape

((25908, 20), (8637, 20), (8637, 20))

In [4]:
train.tract

50660    4300
14557    5508
47007     758
26586      66
50383     423
         ... 
637        76
37673     759
32816     996
23077    1375
34188     994
Name: tract, Length: 25908, dtype: int64

In [5]:
train.describe().astype(int).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parcelid,25908,13007162,2586803,10711855,11503757,12653312,14128263,162960814
bathrooms,25908,2,0,0,2,2,2,5
bedrooms,25908,3,0,0,3,3,4,6
sqft,25908,1722,666,128,1240,1573,2073,5256
fireplacecnt,25908,0,0,0,0,0,0,4
garagecarcnt,25908,0,0,0,0,0,2,9
lotsize,25908,7160,3127,236,5457,6624,8039,21663
poolcnt,25908,0,0,0,0,0,0,1
lat,25908,34,0,33,33,34,34,34
long,25908,-118,0,-119,-118,-118,-117,-117


In [7]:
features = ['age', 'lat', 'sqft', 'long' , '4plusBath', 'tract', 
            'poolcnt', '3to5garage', 'Los Angeles', 'bedrooms','Orange']

In [8]:
# create X and y
X_train = train[features]
y_train = train['tax_value']

X_validate = validate[features]
y_validate = validate['tax_value']

In [9]:
# create MinMaxScaler and fit to train
scaler = MinMaxScaler()
X_train[features] = scaler.fit_transform(X_train[features])
X_validate[features] = scaler.transform(X_validate[features])

# make a dataframe
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)



In [10]:
# 1. Predict value_pred_mean
value_pred_mean = y_train['tax_value'].mean()
y_train['value_pred_mean'] = value_pred_mean
y_validate['value_pred_mean'] = value_pred_mean

# 2. compute value_pred_median
value_pred_median = y_train['tax_value'].median()
y_train['value_pred_median'] = value_pred_median
y_validate['value_pred_median'] = value_pred_median

# 3. RMSE of value_pred_mean
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_mean)**(1/2)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_mean)**(1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

# 4. RMSE of value_pred_median
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_median)**(1/2)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_median)**(1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

RMSE using Mean
Train/In-Sample:  233115.06 
Validate/Out-of-Sample:  234810.22
RMSE using Median
Train/In-Sample:  235121.87 
Validate/Out-of-Sample:  236574.11


In [11]:
# # scale y
# target_scaler = QuantileTransformer()
# y_train.tax_value = target_scaler.fit_transform(y_train[['tax_value']])
# y_validate.tax_value = target_scaler.transform(y_validate[['tax_value']])

In [12]:
# create the model object
lm = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_lm'] = lm.predict(X_train)
#y_train['value_pred_lm'] = target_scaler.inverse_transform(y_train[['value_pred_lm']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_lm)**(1/2)

# predict validate
y_validate['value_pred_lm'] = lm.predict(X_validate)
#y_validate['value_pred_lm'] = target_scaler.inverse_transform(y_validate[['value_pred_lm']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_lm)**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  198534.50872653947 
Validation/Out-of-Sample:  199033.93261691992


In [13]:
# create the model object
lars = LassoLars(alpha=0)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lars.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_llars'] = lars.predict(X_train)
#y_train['value_pred_llars'] = target_scaler.inverse_transform(y_train[['value_pred_llars']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_llars)**(1/2)

# predict validate
y_validate['value_pred_llars'] = lars.predict(X_validate)
#y_validate['value_pred_llars'] = target_scaler.inverse_transform(y_validate[['value_pred_llars']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_llars)**(1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Lasso + Lars
Training/In-Sample:  198534.50872653944 
Validation/Out-of-Sample:  199033.93261691992


In [14]:
# create the model object
glm = TweedieRegressor(power=0, alpha=0) # changed power to 0 since we normalized the target

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train, y_train['tax_value'])

# predict train
y_train['value_pred_glm'] = glm.predict(X_train)
#y_train['value_pred_glm'] = target_scaler.inverse_transform(y_train[['value_pred_glm']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_glm)**(1/2)

# predict validate
y_validate['value_pred_glm'] = glm.predict(X_validate)
#y_validate['value_pred_glm'] = target_scaler.inverse_transform(y_validate[['value_pred_glm']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_glm)**(1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  198534.508726578 
Validation/Out-of-Sample:  199033.93199449396


# Polynomial Regression

In [15]:
# create the model object
pf = PolynomialFeatures(degree=2)

# create the X and y datasets
X2_train = train[features]

X2_validate = validate[features]

 

In [16]:
# fit the model to our training data. We must specify the column in y_train, 
lm2 = LinearRegression(normalize=True)

# since we have converted it to a dataframe from a series! 
lm2.fit(X2_train, y_train.tax_value)

# predict train
y_train['value_pred_lm2'] = lm2.predict(X2_train)
#y_train['value_pred_lm2'] = target_scaler.inverse_transform(y_train[['value_pred_lm2']])

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.value_pred_lm2)**(1/2)

# predict validate
y_validate['value_pred_lm2'] = lm2.predict(X2_validate)
#y_validate['value_pred_lm2'] = target_scaler.inverse_transform(y_validate[['value_pred_lm2']])

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.value_pred_lm2)**(1/2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Polynomial Model, degrees=2
Training/In-Sample:  198534.50872653947 
Validation/Out-of-Sample:  199033.9326169199


In [17]:
y_train

Unnamed: 0,tax_value,value_pred_mean,value_pred_median,value_pred_lm,value_pred_llars,value_pred_glm,value_pred_lm2
50660,64364.0,368653.892234,338000.0,289625.419558,289625.419558,289625.489749,289625.419558
14557,81549.0,368653.892234,338000.0,324850.901411,324850.901411,324850.906316,324850.901411
47007,330190.0,368653.892234,338000.0,444134.292402,444134.292402,444134.279974,444134.292402
26586,564363.0,368653.892234,338000.0,333247.685002,333247.685002,333247.681371,333247.685002
50383,330797.0,368653.892234,338000.0,447932.922221,447932.922221,447932.999272,447932.922221
...,...,...,...,...,...,...,...
637,315132.0,368653.892234,338000.0,513692.913892,513692.913892,513693.075653,513692.913892
37673,507625.0,368653.892234,338000.0,372590.573090,372590.573090,372590.557819,372590.573090
32816,467000.0,368653.892234,338000.0,397960.509842,397960.509842,397960.392362,397960.509842
23077,380000.0,368653.892234,338000.0,337139.270218,337139.270218,337139.291114,337139.270218


In [18]:
def zillow_model(model, X_train, y_train, X_val, y_val, features):
    
    
    model.fit(X_train[features], y_train)
    
    yhat = model.predict(X_train[features])
    yhat_val = model.predict(X_val[features])
    
    rmse = mean_squared_error(y_train, yhat, squared=False)
    rmse_val = mean_squared_error(y_val, yhat_val, squared=False)
    print("RMSE for",model,"\nTraining/In-Sample: ", rmse,
          "\nValidation/Out-of-Sample: ", rmse_val)
    return model, rmse, rmse_val, features
      


In [57]:
results = pd.DataFrame(columns = ['model', 'RMSE_train', 'RMSE_val', 'features'])

In [59]:
model_list = [LinearRegression(normalize=True), LassoLars(alpha=0), TweedieRegressor(power=0, alpha=0) ]

Unnamed: 0,model,RMSE_train,RMSE_val,features


In [60]:
zillow_model(LinearRegression(normalize=True), train, train.tax_value, validate, validate.tax_value, features))

RMSE for LinearRegression(normalize=True) 
Training/In-Sample:  198534.50872653947 
Validation/Out-of-Sample:  199033.93261691995


In [61]:
results.loc[len(results)] = newresult

In [62]:
results

Unnamed: 0,model,RMSE_train,RMSE_val,features
0,LinearRegression(normalize=True),198534.508727,199033.932617,"[age, lat, sqft, long, bedrooms, 4plusBath, tr..."


In [109]:
model_list = [LinearRegression(normalize=False), LassoLars(alpha=0), TweedieRegressor(power=0, alpha=0) ]

In [110]:
for model in model_list:
    newresult = zillow_model(model, train, train.tax_value, validate, validate.tax_value, features)
    results.loc[len(results)] = newresult

RMSE for LinearRegression(normalize=False) 
Training/In-Sample:  198330.46907100012 
Validation/Out-of-Sample:  198704.69124455002
RMSE for LassoLars(alpha=0) 
Training/In-Sample:  198330.46907100012 
Validation/Out-of-Sample:  198704.691244551
RMSE for TweedieRegressor(alpha=0, power=0) 
Training/In-Sample:  200334.6817113633 
Validation/Out-of-Sample:  200640.60348222996


In [66]:
train.columns

Index(['parcelid', 'bathrooms', 'bedrooms', 'sqft', 'county', 'fireplacecnt',
       'garagecarcnt', 'lotsize', 'poolcnt', 'lat', 'long', 'logerror',
       'tract', 'tax_value', 'Los Angeles', 'Orange', 'Ventura', 'age',
       '4plusBath', '3to5garage'],
      dtype='object')

In [108]:
features = ['bathrooms',
  'bedrooms',
  'sqft',
  'fireplacecnt',
  'garagecarcnt',
  'lotsize',
  'poolcnt',
  'lat',
  'long',
  'tract',
  'Los Angeles',
  'Orange',
  'age',
  '4plusBath',
  '3to5garage']

In [107]:
results[results.RMSE_val == results.RMSE_val.min()].features.tolist()

[['bathrooms',
  'bedrooms',
  'sqft',
  'fireplacecnt',
  'garagecarcnt',
  'lotsize',
  'poolcnt',
  'lat',
  'long',
  'tract',
  'Los Angeles',
  'Orange',
  'age',
  '4plusBath',
  '3to5garage']]

In [111]:
results.RMSE_val.min()

198704.69124455002