In [1]:
#the basics
import numpy as np
import pandas as pd
from itertools import product

#viz
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(palette='colorblind')

#modeling tools
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, RFE

#My modules
import wrangle
import utils

# Model Prep

###### Acquire and prepare the data

In [2]:
#acquire the zillow data and split it into three subsets, keeping the zip code columns
#NOTE(review): unpacked as (train, test, validate) - confirm this matches the return order of wrangle.wrangle_zillow
tr, te, val = wrangle.wrangle_zillow(include_zip=True,val_ratio=.15,test_ratio=.15)

In [3]:
#dimensions of the train subset as (rows, columns)
tr.shape

(35956, 343)

In [4]:
#first 12 column labels of train: shows where the target, labels, and features sit
tr.columns[0:12]

Index(['value', 'zipcode', 'county', 'bed', 'bath', 'sf', 'sf_per_bed',
       'yearbuilt', 'Orange_CA', 'Ventura_CA', '95983', '95984'],
      dtype='object')

##### Create scaled data subsets

In [5]:
#build two scaled copies of each subset: default scaler (_mm) and kind='standard' (_st)
#NOTE(review): the _mm suffix assumes scale_zillow defaults to min/max scaling - confirm in wrangle
tr_mm, te_mm, val_mm = wrangle.scale_zillow(tr,te,val)
tr_st, te_st, val_st = wrangle.scale_zillow(tr,te,val,kind='standard')

In [6]:
#confirm the scaled train subset kept the same column layout as the unscaled one
tr_mm.columns[:12]

Index(['value', 'zipcode', 'county', 'bed', 'bath', 'sf', 'sf_per_bed',
       'yearbuilt', 'Orange_CA', 'Ventura_CA', '95983', '95984'],
      dtype='object')

##### Create model input subsets

In [7]:
#Column layout (see the column listings above):
#  0     -> target ('value')
#  1-2   -> zipcode / county labels (not model inputs)
#  3-9   -> numeric features + county dummies
#  10+   -> zip code dummies
feats_all = slice(3, None)    #every feature, zip dummies included
feats_no_zip = slice(3, 10)   #everything except the zip dummies

#train
X_tr_mm_all, X_tr_st_all = tr_mm.iloc[:, feats_all], tr_st.iloc[:, feats_all]
X_tr_mm, X_tr_st = tr_mm.iloc[:, feats_no_zip], tr_st.iloc[:, feats_no_zip]

#test
X_te_mm_all, X_te_st_all = te_mm.iloc[:, feats_all], te_st.iloc[:, feats_all]
X_te_mm, X_te_st = te_mm.iloc[:, feats_no_zip], te_st.iloc[:, feats_no_zip]

#validate
X_val_mm_all, X_val_st_all = val_mm.iloc[:, feats_all], val_st.iloc[:, feats_all]
X_val_mm, X_val_st = val_mm.iloc[:, feats_no_zip], val_st.iloc[:, feats_no_zip]

#targets come from the unscaled frames, so one y per subset serves both scalings
y_tr = tr['value']
y_te = te['value']
y_val = val['value']

In [8]:
#first two rows of the non-zip feature subset, to sanity check the column slice
X_tr_mm.head(2)

Unnamed: 0,bed,bath,sf,sf_per_bed,yearbuilt,Orange_CA,Ventura_CA
10290,0.25,0.142857,0.099435,0.128388,0.789855,0,0
45994,0.5,0.214286,0.180413,0.095926,0.84058,1,0


##### Feature Selection

In [9]:
#let's look at which features SelectKBest (f_regression scoring) keeps as k grows
for k in (2, 3, 4, 5, 15):
    #Create selector - both names point at this one object; no separate
    #selector is ever fit on the standard-scaled data
    f_selector_st = SelectKBest(score_func=f_regression, k=k)
    f_selector_mm = f_selector_st
    #fit to the train features and target
    f_selector_mm.fit(X_tr_mm_all, y_tr)
    #boolean mask of kept columns -> print their names
    kept = f_selector_mm.get_support()
    print(X_tr_mm_all.columns[kept])

Index(['bath', 'sf'], dtype='object')
Index(['bath', 'sf', 'sf_per_bed'], dtype='object')
Index(['bed', 'bath', 'sf', 'sf_per_bed'], dtype='object')
Index(['bed', 'bath', 'sf', 'sf_per_bed', 'yearbuilt'], dtype='object')
Index(['bed', 'bath', 'sf', 'sf_per_bed', 'yearbuilt', 'Orange_CA', '96030',
       '96050', '96086', '96116', '96117', '96120', '96957', '96975', '96978'],
      dtype='object')


**NOTES:** Based off this, I want to run linear regression on:
- ['bath','sf']
- ['bed','bath','sf']
- top 15, but dropping sf_per_bed:
  - ['bed', 'bath', 'sf', 'yearbuilt', 'Orange_CA', '96030',
       '96050', '96086', '96116', '96117', '96120', '96957', '96975', '96978']
       

##### Get a baseline

In [10]:
#one results frame per subset, seeded with the actual target values
tr_res = tr['value'].to_frame(name='actual')
val_res = val['value'].to_frame(name='actual')
te_res = te['value'].to_frame(name='actual')

#Candidate baselines are computed on train only and broadcast to train and validate
train_mean = tr['value'].mean()
train_median = tr['value'].median()
for res in (tr_res, val_res):
    res['base_mean'] = train_mean
    res['base_median'] = train_median

#summary stats: the baseline columns should be constant
tr_res.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,35956.0,496260.673045,499061.0,1000.0,194083.0,373508.0,613296.75,4993132.0
base_mean,35956.0,496260.673045,1.897014e-07,496260.673045,496260.673045,496260.673045,496260.673045,496260.7
base_median,35956.0,373508.0,0.0,373508.0,373508.0,373508.0,373508.0,373508.0


In [11]:
#score every candidate baseline (all columns after 'actual') on train and validate
candidate_cols = tr_res.columns[1:]
for col in candidate_cols:
    rmse_tr = utils.rmse(tr_res['actual'], tr_res[col])
    rmse_val = utils.rmse(val_res['actual'], val_res[col])
    print(f'{col} has a rmse of {rmse_tr:.0f} on train and {rmse_val:.0f} on validate')

base_mean has a rmse of 499054 on train and 519798 on validate
base_median has a rmse of 513929 on train and 535369 on validate


The mean baseline has a lower RMSE than the median on both train and validate, so use the mean as the baseline.

In [12]:
#the train mean is the official baseline for both train and validate
baseline_value = tr['value'].mean()
candidate_cols = ['base_mean','base_median']

#add the baseline column and discard the candidates in one pass per frame
tr_res = tr_res.assign(baseline=baseline_value).drop(columns=candidate_cols)
val_res = val_res.assign(baseline=baseline_value).drop(columns=candidate_cols)

#take a peek
tr_res.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actual,35956.0,496260.673045,499061.0,1000.0,194083.0,373508.0,613296.75,4993132.0
baseline,35956.0,496260.673045,1.897014e-07,496260.673045,496260.673045,496260.673045,496260.673045,496260.7


##### Feature selection helper (TODO, time permitting: extend what this function outputs)

In [13]:
def select_kbest(X,y,k):
    '''
    Pick the k highest-scoring features with
    sklearn.feature_selection.SelectKBest, scored by f_regression.

    Returns: list of the column names of X that were selected,
             in the order they appear in X
    Inputs:
      (R) X: Pandas Dataframe of features and values
      (R) y: target variable
      (R) k: number of features to select
    '''
    #build the selector, then fit it to the features and target
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)
    #integer positions of the columns that were kept
    kept_positions = selector.get_support(indices=True)
    #translate positions into column names and hand back a plain list
    return X.columns[kept_positions].tolist()

# Modeling

- OLS - LinearRegression:
  - normalize - true
  - scaler: standard, min/max
  - features: all, all but zip, top 3
- Lasso + LARS 
  - NOTE: 
    - Does feature selection
    - Y should be normal
    - alpha penalizes more features
  - alpha: .25, .5, .75, 1, 1.5
- Generalized Linear Model - TweedieRegressor
  - power: 1 (poisson) 
  - power: 2 (gamma), link: 'log'
- PolynomialFeatures >> LinearRegression
  - NOTE: I do not expect any benefit from this
  - degree: 2


In [14]:
#initialize model dictionary and performance
model_dict ={}
mod_perf = pd.DataFrame()

#####  Linear Regression

In [15]:
#perform Linear regression on the top-k features of the min/max-scaled data
for k in [2,3,4,5,6,15,20]:
    #generate model name
    model_name = 'lr_mm_k'+str(k)
    #get kbest features
    k_best = select_kbest(X_tr_mm_all,y_tr,k)
    #create and fit model on train
    #NOTE: LinearRegression's `normalize` argument was deprecated in
    #scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError).
    #Dropping it is safe here: the inputs are already scaled, and ordinary
    #least squares fitted values do not change when features are rescaled.
    model = LinearRegression().fit(X_tr_mm_all[k_best],y_tr)
    #add model to dictionary; keep the feature list with it so the stored
    #model can later predict on validate/test without redoing selection
    model_dict[model_name] = {
        "model_name": model_name,
        "model": model,
        "features": k_best}
    #generate and store model predictions
    tr_res[model_name] = model.predict(X_tr_mm_all[k_best])

#take a peek
tr_res.head()

Unnamed: 0,actual,baseline,lr_mm_k2,lr_mm_k3,lr_mm_k4,lr_mm_k5,lr_mm_k6,lr_mm_k15,lr_mm_k20
10290,140000,496260.673045,250553.581897,305833.395485,366386.814236,299754.960694,300440.263953,275630.630845,92614.017142
45994,312139,496260.673045,477914.108758,440025.378161,397411.107874,337820.603042,339148.946464,368775.907897,368385.329745
32278,517318,496260.673045,568943.438849,526325.237148,485738.092709,493338.229819,490841.108056,461664.747345,461693.322649
20637,241333,496260.673045,491762.74425,464034.943122,429710.678515,381184.314231,380926.663541,349256.723041,362414.242131
37265,226781,496260.673045,232351.620928,280149.424426,351379.629736,422800.400892,415981.676814,390114.600058,378298.696754


In [16]:
#Score every prediction column (baseline included) against the train actuals
scored_cols = tr_res.columns[1:]
for name in scored_cols:
    train_rmse = utils.rmse(tr_res['actual'], tr_res[name])
    #store as a whole number in the single 'rmse' row
    mod_perf.loc['rmse', name] = round(train_rmse)

mod_perf

Unnamed: 0,baseline,lr_mm_k2,lr_mm_k3,lr_mm_k4,lr_mm_k5,lr_mm_k6,lr_mm_k15,lr_mm_k20
rmse,499054.0,392355.0,388397.0,384208.0,381880.0,378563.0,360776.0,355014.0


In [17]:
#repeat with the other scale (standard-scaled features)
for k in [2,3,4,5,6,15,20]:
    #generate model name
    model_name = 'lr_st_k'+str(k)
    #get kbest features
    k_best = select_kbest(X_tr_st_all,y_tr,k)
    #create and fit model on train
    #NOTE: LinearRegression's `normalize` argument was deprecated in
    #scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError).
    #Dropping it is safe here: the inputs are already scaled, and ordinary
    #least squares fitted values do not change when features are rescaled.
    model = LinearRegression().fit(X_tr_st_all[k_best],y_tr)
    #add model to dictionary; keep the feature list with it so the stored
    #model can later predict on validate/test without redoing selection
    model_dict[model_name] = {
        "model_name": model_name,
        "model": model,
        "features": k_best}
    #generate and store model predictions
    tr_res[model_name] = model.predict(X_tr_st_all[k_best])
    #gather model performance
    mod_perf.loc['rmse',model_name] = round(utils.rmse(tr_res.actual,tr_res[model_name]))

#take a peek at performance
mod_perf

Unnamed: 0,baseline,lr_mm_k2,lr_mm_k3,lr_mm_k4,lr_mm_k5,lr_mm_k6,lr_mm_k15,lr_mm_k20,lr_st_k2,lr_st_k3,lr_st_k4,lr_st_k5,lr_st_k6,lr_st_k15,lr_st_k20
rmse,499054.0,392355.0,388397.0,384208.0,381880.0,378563.0,360776.0,355014.0,392355.0,388397.0,384208.0,381880.0,378563.0,360776.0,355014.0


In [18]:
#list the model names currently tracked in the performance table
mod_perf.columns

Index(['baseline', 'lr_mm_k2', 'lr_mm_k3', 'lr_mm_k4', 'lr_mm_k5', 'lr_mm_k6',
       'lr_mm_k15', 'lr_mm_k20', 'lr_st_k2', 'lr_st_k3', 'lr_st_k4',
       'lr_st_k5', 'lr_st_k6', 'lr_st_k15', 'lr_st_k20'],
      dtype='object')

In [19]:
#Both scaling methods produced identical RMSEs, so only min/max is carried forward.
#Remove the standard-scaled columns from the predictions and the performance table.
drp_cols = [f'lr_st_k{k}' for k in (2,3,4,5,6,15,20)]
mod_perf = mod_perf.drop(columns=drp_cols)
tr_res = tr_res.drop(columns=drp_cols)

##### Generalized Linear Model - TweedieRegressor
- power: 1 (poisson)
- power: 2 (gamma), link: 'log'
- alphas: 0,.25,.5,.75,1,2

In [20]:
#performance table for the GLMs (one 'rmse' row, one column per model)
glm_mod_perf = pd.DataFrame()
#predictions frame shares the train index so every column lines up
#row-for-row with tr_res.actual when scored
tr_res_glm_mm = pd.DataFrame(index=tr_res.index)
ks = [2,3,4,5,6,15,20]
alphas = [0,.25,.5,.75,1,2]
#product() returns a one-shot iterator; materialize it as a list so BOTH
#model families below iterate the full (k, alpha) grid. Left as a bare
#iterator, the first loop exhausts it and the Gamma models never get fit.
params = list(product(ks,alphas))

#feature selection depends only on k, so do it once per k, not once per (k, alpha)
k_best_by_k = {k: select_kbest(X_tr_mm_all,y_tr,k) for k in ks}

#(model name prefix, TweedieRegressor settings) for each GLM family
glm_families = [
    #Generalized Linear Model using Poisson
    ('pow1_k', {'power': 1}),
    #Generalized Linear Model using Gamma with a log link
    #(prefix includes '_k' so names parse the same way as the Poisson ones)
    ('pow2_linkLog_k', {'power': 2, 'link': 'log'}),
]

for prefix, family_kwargs in glm_families:
    for k, a in params:
        #generate model name
        model_name = prefix+str(k)+'_a'+str(a)
        #look up the kbest features for this k
        k_best = k_best_by_k[k]
        #create and fit model on train; alpha is applied to every family
        model = TweedieRegressor(alpha=a,**family_kwargs).fit(X_tr_mm_all[k_best],y_tr)
        #add model to dictionary; keep the feature list with it so the stored
        #model can later predict on validate/test without redoing selection
        model_dict[model_name] = {
            "model_name": model_name,
            "model": model,
            "features": k_best}
        #generate and store model predictions
        tr_res_glm_mm[model_name] = model.predict(X_tr_mm_all[k_best])
        #gather model performance
        glm_mod_perf.loc['rmse',model_name] = round(utils.rmse(tr_res.actual,tr_res_glm_mm[model_name]))

In [24]:
#rank the GLM models by train RMSE, lowest (best) first
glm_mod_perf.T.sort_values(by='rmse',ascending=True)

Unnamed: 0,rmse
pow1_k20_a2,371885.0
pow1_k20_a1,371893.0
pow1_k20_a0.75,371895.0
pow1_k20_a0.5,371897.0
pow1_k20_a0.25,371899.0
pow1_k20_a0,371901.0
pow1_k15_a2,382205.0
pow1_k15_a1,382216.0
pow1_k15_a0.75,382218.0
pow1_k15_a0.5,382221.0


# TO DO:
### Just finished GLM (GLM and LR are complete)
### Do a few lassos and maybe a polynomial
### Choose the top 10(ish) models to run against validate
### Create a new results subset (maybe just predict again using the stored models)
### Create a new performance subset >> look at how I did it in classification