## Load and pre-process data

In [1]:
import numpy as np
import pandas as pd

# Cluster-group assignments.
cluster_data = pd.read_csv(
    '../../data/all_areas_clusters_hier.csv'
)
# Zillow data with the principal components attached.
PCdata = pd.read_csv(
    '../../data/zillow_withPCA.csv'
)

In [2]:
# Keep only the cluster label plus the two columns used as merge keys.
clusters = cluster_data.loc[:, ['Zipcode', 'Date', 'Clusters']]
# Discard the location/rank/year columns, leaving the columns relevant to the PC models.
pc_df = PCdata.drop(
    columns=['City', 'State', 'Metro', 'County', 'SizeRank', 'State-County', 'Year']
)

In [3]:
# Inner-join the PC features with the cluster labels on (Date, Zipcode),
# then index the result by Date. Chaining set_index (instead of inplace=True)
# keeps the cell a single expression that is safe to re-run.
zillow_df = pc_df.merge(clusters, how="inner", on=["Date", "Zipcode"]).set_index('Date')
# Enforce the expected post-merge row count rather than leaving it as a comment.
assert len(zillow_df) == 17751, f"unexpected row count after merge: {len(zillow_df)}"

In [4]:
# Target is the natural log of Rent; the features are every other column.
y = zillow_df['Rent'].pipe(np.log)
X = zillow_df.drop(columns='Rent')

In [5]:
# Remove the target and the features that will not go into the regression model.
# X is rebuilt from zillow_df in one expression (rather than mutated with
# inplace=True) so this cell can be re-run without raising a KeyError on
# columns that were already dropped.
X = zillow_df.drop(columns=['Rent', 'Zipcode', 'housing_availability'])

In [6]:
# Check the size of each cluster before dummifying the column.
# Cluster 0 is the largest group, so its dummy column is the one we will drop.
X['Clusters'].value_counts(sort=True, ascending=False)

0    7747
3    3965
2    3477
1    2562
Name: Clusters, dtype: int64

In [7]:
# One-hot encode the cluster label; drop_first leaves out the first level's column.
cluster_dummy = pd.get_dummies(
    X.loc[:, 'Clusters'],
    prefix="Cluster",
    drop_first=True,
)

In [8]:
# Swap the raw cluster label for its dummy columns (dummies are appended on the right).
X = pd.concat([X.drop(columns='Clusters'), cluster_dummy], axis='columns')

In [9]:
from PCARandomForest import train_test
# Presumably splits X and y into train and test portions via the project helper.
# NOTE(review): train_test comes from PCARandomForest; its split ratio, shuffling
# and random seed are not visible in this notebook — confirm before comparing runs.
Xtrain,Xtest,ytrain,ytest = train_test(X,y)

## OLS

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# One estimator with default settings; OLSRegression refits this same instance
# for each of the models below.
ols = LinearRegression()

In [24]:
def OLSRegression(model,Xtrain,Xtest,ytrain,ytest):
    '''
    Fit a regression model and print its train/test R2 and its test RMSE.

    Parameters
    ----------
    model : estimator exposing fit / score / predict (e.g. LinearRegression).
        It is fit in place on the training data, so any previous fit is replaced.
    Xtrain, Xtest : training and test feature sets.
    ytrain, ytest : training and test targets.

    Returns
    -------
    None. The three metrics are printed, not returned.

    Notes
    -----
    RMSE is computed on the test set, in the units of the target that is passed in.
    Reminder-to-self: This is just to play around. use statsmodels to check AIC, VIF, etc.
    '''
    model.fit(Xtrain,ytrain)
    print(f'training R2: {model.score(Xtrain,ytrain)}')
    print(f'test R2: {model.score(Xtest,ytest)}')

    ypred = model.predict(Xtest)
    # Take the square root ourselves instead of passing squared=False:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas np.sqrt(MSE) gives the same value on every version.
    RMSE = np.sqrt(mean_squared_error(ytest,ypred))
    print(f'RMSE: {RMSE}')

### Model 1
Fit OLS on the principal-component (PC) columns only.

In [16]:
# Keep only the columns whose name matches 'PC', for both splits.
pc_train, pc_test = (
    split.filter(regex='PC') for split in (Xtrain, Xtest)
)

In [25]:
# PC-only fit; the resulting scores are weak ("yikes").
OLSRegression(
    model=ols,
    Xtrain=pc_train,
    Xtest=pc_test,
    ytrain=ytrain,
    ytest=ytest,
)

training R2: 0.48202084944712686
test R2: 0.4441023303339105
RMSE: 0.211323505558185


In [29]:
# Show the function's docstring. help() is plain Python, unlike the IPython-only
# `?` suffix, so the cell still runs if the notebook is exported to a script.
help(OLSRegression)

### Model 2
Fit OLS on the PC columns plus the cluster dummy columns.

In [31]:
# Fit on the cluster dummy columns together with the PC columns.
OLSRegression(
    ols,
    Xtrain.filter(regex='Cluster|PC'),
    Xtest.filter(regex='Cluster|PC'),
    ytrain,
    ytest,
)

# Including the cluster columns gives a lower RMSE than the PC-only model.

training R2: 0.5048337034382637
test R2: 0.46929249135311507
RMSE: 0.2064800020209921


### Model 3
Fit OLS on every column of X (the PCs, the cluster dummies, and all remaining features).

In [32]:
# All columns of X: the best scores of the three models, but the fit is still poor.
OLSRegression(
    model=ols,
    Xtrain=Xtrain,
    Xtest=Xtest,
    ytrain=ytrain,
    ytest=ytest,
)

training R2: 0.5714957825302953
test R2: 0.541552111086042
RMSE: 0.19190901142446715
