## Load and pre-process data

In [1]:
import numpy as np
import pandas as pd

cluster_data = pd.read_csv('../../data/all_areas_clusters_hier.csv') # contains clustering groups
PCdata = pd.read_csv('../../data/zillow_withPCA.csv') # contains prinicpal components

In [2]:
clusters = cluster_data[['Zipcode','Date','Clusters']] # subset the cluster col + two to merge on
pc_df = PCdata.drop(['City','State','Metro','County','SizeRank','State-County','Year'],axis=1) # the relevant PC columns

In [3]:
# dummify cluster column
clusters['Clusters'].value_counts() # we will drop the cluster 0 column bc its the largest of the groups

0    7747
3    3965
2    3477
1    2562
Name: Clusters, dtype: int64

In [9]:
cluster_dummy = pd.get_dummies(clusters['Clusters'],prefix="Cluster",drop_first=True)
clusters = pd.concat((clusters,cluster_dummy),axis=1)

In [10]:
zillow_df = pc_df.merge(clusters, "inner", on = ["Date","Zipcode"]) # merge on date + zipcode
# make sure n_rows post-merge == 17751
zillow_df.set_index('Date',inplace=True)

In [11]:
zillow_df.head()

Unnamed: 0_level_0,Zipcode,Rent,median_age,gini_index,housing_availability,move_within_city,cityPC1,cityPC2,econPC1,econPC2,econPC3,Clusters,Cluster_1,Cluster_2,Cluster_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-01-01,10025,3566.0,0.00812,0.009306,0.007598,0.008591,-0.007053,0.009281,0.003816,0.002187,0.002316,2,0,1,0
2015-01-01,10023,3413.0,0.008513,0.008738,0.008109,0.007416,-0.004638,0.00434,0.01216,0.003883,0.002966,1,1,0,0
2015-01-01,10002,3508.0,0.008572,0.009103,0.007013,0.005223,-0.007252,0.004014,-0.005684,0.004695,-0.002163,2,0,1,0
2015-01-01,11226,1876.0,0.006822,0.007221,0.007228,0.004562,-0.010629,0.005907,-0.003696,-0.001239,0.000227,2,0,1,0
2015-01-01,10467,1442.0,0.006625,0.0075,0.006907,0.009433,-0.011097,0.006011,-0.009925,0.001891,0.003597,2,0,1,0


In [12]:
# separate target from features
y = np.log(zillow_df['Rent']) # log-scaled
X = zillow_df.drop('Rent',axis=1)

In [13]:
# remove other features from X that will not go into the regression model
X.drop(['Zipcode'],axis=1,inplace=True)

In [14]:
from PCARandomForest import train_test
Xtrain,Xtest,ytrain,ytest = train_test(X,y)

## OLS

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

ols = LinearRegression()

In [16]:
def OLSRegression(model,Xtrain,Xtest,ytrain,ytest):
    '''
    Input a (tuned) model, X and y train/test df's.
    This function will output test and training R2 as well as RMSE
    Reminder-to-self: This is just to play around. use statsmodels to check AIC, VIF, etc.
    '''
    model.fit(Xtrain,ytrain)
    print(f'training R2: {model.score(Xtrain,ytrain)}')
    print(f'test R2: {model.score(Xtest,ytest)}')
    
    ypred = model.predict(Xtest)
    RMSE = mean_squared_error(ytest,ypred,squared=False)
    print(f'RMSE: {RMSE}')

### Model 1
Trying just the PCs

In [17]:
pc_train = Xtrain.filter(regex='PC')
pc_test = Xtest.filter(regex='PC')

In [18]:
OLSRegression(ols,pc_train,pc_test,ytrain,ytest) # yikes

training R2: 0.48202084944712686
test R2: 0.4441023303339105
RMSE: 0.211323505558185


### Model 2
Trying with clustering cols

In [19]:
OLSRegression(model = ols, Xtrain = Xtrain.filter(regex='Cluster|PC'), Xtest = Xtest.filter(regex='Cluster|PC'),
             ytrain = ytrain, ytest = ytest)

# performs better with lower RMSE when including the cluster cols

training R2: 0.5048337034382637
test R2: 0.46929249135311535
RMSE: 0.20648000202099204


### Model 3
Try everything

In [20]:
OLSRegression(ols,Xtrain,Xtest,ytrain,ytest) # best performance but still poor

training R2: 0.5724424148883809
test R2: 0.5436446906166782
RMSE: 0.19147052737331083


## Trying a lasso

In [21]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [22]:
lasso = Lasso()
params = {"alpha":[0.000001, 0.0001, 0.001, 0.005, 0.01, 0.03, 0.05, 0.08, 0.1, 0.25, 0.5]}
grid = GridSearchCV(lasso,param_grid=params, return_train_score=True)

In [24]:
grid.fit(Xtrain,ytrain)

GridSearchCV(cv=None, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-06, 0.0001, 0.001, 0.005, 0.01, 0.03,
                                   0.05, 0.08, 0.1, 0.25, 0.5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [25]:
grid.cv_results_

{'mean_fit_time': array([0.01058841, 0.00762119, 0.00454865, 0.00455823, 0.00467553,
        0.00455475, 0.00489941, 0.00526156, 0.00463109, 0.00496836,
        0.00472212]),
 'std_fit_time': array([3.38331329e-03, 8.54401719e-04, 2.66469241e-04, 3.61591896e-04,
        5.22809270e-05, 4.01630572e-04, 2.71682113e-04, 4.54439036e-04,
        2.26334835e-04, 3.36505756e-04, 2.31875668e-04]),
 'mean_score_time': array([0.00235152, 0.00164795, 0.00155025, 0.00166173, 0.00193863,
        0.00166454, 0.00172319, 0.00189466, 0.00174823, 0.00187573,
        0.00166359]),
 'std_score_time': array([6.99728432e-04, 1.10293682e-04, 4.66456430e-05, 1.38809205e-04,
        2.79812514e-04, 1.86144218e-04, 1.42908470e-04, 1.70430392e-04,
        1.03323537e-04, 1.26946608e-04, 6.17051235e-05]),
 'param_alpha': masked_array(data=[1e-06, 0.0001, 0.001, 0.005, 0.01, 0.03, 0.05, 0.08,
                    0.1, 0.25, 0.5],
              mask=[False, False, False, False, False, False, False, False,
         

In [29]:
grid.best_score_ # not ideal

0.5606342377914968