## Load and pre-process data

In [1]:
import numpy as np
import pandas as pd

cluster_data = pd.read_csv('../../data/all_areas_clusters_hier.csv') # contains clustering groups
PCdata = pd.read_csv('../../data/zillow_withPCA.csv') # contains prinicpal components

In [2]:
clusters = cluster_data[['Zipcode','Date','Clusters']] # subset the cluster col + two to merge on
pc_df = PCdata.drop(['City','State','Metro','County','SizeRank','State-County','Year'],axis=1) # the relevant PC columns

In [3]:
zillow_df = pc_df.merge(clusters, "inner", on = ["Date","Zipcode"]) # merge on date + zipcode
# make sure n_rows post-merge == 17751
zillow_df.set_index('Date',inplace=True)

In [4]:
# separate target from features
y = np.log(zillow_df['Rent']) # log-scaled
X = zillow_df.drop('Rent',axis=1)

In [5]:
# remove other features from X that will not go into the regression model
X.drop(['Zipcode','housing_availability'],axis=1,inplace=True)

In [6]:
# dummify cluster column
X['Clusters'].value_counts() # we will drop the cluster 0 column bc its the largest of the groups

0    7747
3    3965
2    3477
1    2562
Name: Clusters, dtype: int64

In [7]:
cluster_dummy = pd.get_dummies(X['Clusters'],prefix="Cluster",drop_first=True)

In [8]:
X = pd.concat((X,cluster_dummy),axis=1).drop('Clusters',axis=1)

In [9]:
from PCARandomForest import train_test
Xtrain,Xtest,ytrain,ytest = train_test(X,y)

## OLS

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

ols = LinearRegression()

In [24]:
def OLSRegression(model,Xtrain,Xtest,ytrain,ytest):
    '''
    Input a (tuned) model, X and y train/test df's.
    This function will output test and training R2 as well as RMSE
    Reminder-to-self: This is just to play around. use statsmodels to check AIC, VIF, etc.
    '''
    model.fit(Xtrain,ytrain)
    print(f'training R2: {model.score(Xtrain,ytrain)}')
    print(f'test R2: {model.score(Xtest,ytest)}')
    
    ypred = model.predict(Xtest)
    RMSE = mean_squared_error(ytest,ypred,squared=False)
    print(f'RMSE: {RMSE}')

### Model 1
Trying just the PCs

In [16]:
pc_train = Xtrain.filter(regex='PC')
pc_test = Xtest.filter(regex='PC')

In [25]:
OLSRegression(ols,pc_train,pc_test,ytrain,ytest) # yikes

training R2: 0.48202084944712686
test R2: 0.4441023303339105
RMSE: 0.211323505558185


In [29]:
OLSRegression?

### Model 2
Trying with clustering cols

In [31]:
OLSRegression(model = ols, Xtrain = Xtrain.filter(regex='Cluster|PC'), Xtest = Xtest.filter(regex='Cluster|PC'),
             ytrain = ytrain, ytest = ytest)

# performs better with lower RMSE when including the cluster cols

training R2: 0.5048337034382637
test R2: 0.46929249135311507
RMSE: 0.2064800020209921


### Model 3
Try everything

In [32]:
OLSRegression(ols,Xtrain,Xtest,ytrain,ytest) # best performance but still poor

training R2: 0.5714957825302953
test R2: 0.541552111086042
RMSE: 0.19190901142446715


## Trying regression in R because a stepwise approach with more information would be nice

In [42]:
big_df = PCdata.merge(clusters, "inner", on = ["Date","Zipcode"]) # merge on date + zipcode

In [45]:
cluster_dummy = pd.get_dummies(big_df['Clusters'],prefix="Cluster",drop_first=True)

In [50]:
big_df = pd.concat((big_df,cluster_dummy),axis=1).drop('Clusters',axis=1)

In [52]:
big_df.to_csv('../../data/zillow_df_0331.csv',index=False)

## Yeah use lasso

In [70]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [90]:
lasso

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [98]:
lasso = Lasso()
params = {"alpha":[0.000001, 0.0001, 0.001, 0.005, 0.01, 0.03, 0.05, 0.08, 0.1, 0.25, 0.5]}
grid = GridSearchCV(lasso,param_grid=params, return_train_score=True)

In [99]:
grid.fit(Xtrain,ytrain)

GridSearchCV(cv=None, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-06, 0.0001, 0.001, 0.005, 0.01, 0.03,
                                   0.05, 0.08, 0.1, 0.25, 0.5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [100]:
grid.cv_results_

{'mean_fit_time': array([0.00841689, 0.00492058, 0.00367703, 0.00333967, 0.00328054,
        0.00322886, 0.00325184, 0.00371075, 0.00367661, 0.00340924,
        0.00372725]),
 'std_fit_time': array([0.00145286, 0.00067229, 0.00027388, 0.00037857, 0.00013899,
        0.00011543, 0.00014818, 0.00021466, 0.00021392, 0.00032545,
        0.00019809]),
 'mean_score_time': array([0.00243506, 0.00142999, 0.00134363, 0.00134292, 0.00135198,
        0.00138621, 0.00144038, 0.00140548, 0.00169282, 0.00161409,
        0.00135241]),
 'std_score_time': array([6.73476750e-04, 1.53317303e-04, 1.11351595e-04, 1.08703353e-04,
        1.42003963e-04, 1.55881808e-04, 3.23340763e-04, 1.46580006e-04,
        1.36538838e-04, 1.20140728e-04, 7.91738671e-05]),
 'param_alpha': masked_array(data=[1e-06, 0.0001, 0.001, 0.005, 0.01, 0.03, 0.05, 0.08,
                    0.1, 0.25, 0.5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False],
        fil