In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

# Both model tables are read with the same options: the 'Date' column is
# parsed as datetimes and used as the row index.
_csv_kwargs = {'index_col': 'Date', 'parse_dates': True}
model1 = pd.read_csv('../../data/fivePCModel.csv', **_csv_kwargs)
model2 = pd.read_csv('../../data/PCModel2.csv', **_csv_kwargs)

In [48]:
# Single encoder instance used by preProc, which calls fit_transform on it once
# per categorical column -- so it is re-fit on every use and afterwards only
# holds the classes of the column it encoded last.
LE = LabelEncoder() #sklearn.preprocessing

def preProc(zillow_df, ytype='log'):
    """Split a model table into a feature matrix X and a rent target y.

    Parameters
    ----------
    zillow_df : pandas.DataFrame
        Table containing at least the columns 'Rent', 'State-County' and
        'State'. Those three columns are excluded from the features.
    ytype : {'log', 'normal'}, default 'log'
        'log' returns the natural log of 'Rent'; 'normal' returns 'Rent'
        unchanged.

    Returns
    -------
    X : pandas.DataFrame
        Object-dtype columns (integer-encoded with the module-level ``LE``)
        followed by all remaining columns.
    y : pandas.Series
        The target selected by ``ytype``.

    Raises
    ------
    ValueError
        If ``ytype`` is not 'log' or 'normal'.
    """
    # Validate first: an unknown ytype would otherwise leave `y` unassigned and
    # only fail at the return statement with an unhelpful UnboundLocalError.
    if ytype not in ('log', 'normal'):
        raise ValueError(f"ytype must be 'log' or 'normal', got {ytype!r}")

    features = zillow_df.drop(['Rent','State-County','State'],axis=1)
    cat_cols = features.select_dtypes(include = object).copy()
    num_cols = features.select_dtypes(exclude = object).copy()
    for col in cat_cols.columns:
        # LE is re-fit for each column, so the codes are per-column.
        cat_cols[col] = LE.fit_transform(cat_cols[col])

    # Encoded categorical columns come first, then the remaining columns.
    X = pd.concat([cat_cols,num_cols],axis=1)

    if ytype == 'log':
        y = np.log(zillow_df['Rent'])
    else:
        y = zillow_df['Rent']
    return X,y

In [51]:
X,y = preProc(model1,'log')
# Chronological hold-out: rows dated before 2019 train the model, the rest
# test it. The same boolean mask selects rows from both X and y, so features
# and targets stay aligned even if the table is not sorted by date (mixing a
# date mask with positional slices would silently misalign them).
train_mask = X.index < '2019-01-01'
Xtrain, Xtest = X.loc[train_mask], X.loc[~train_mask]
ytrain, ytest = y.loc[train_mask], y.loc[~train_mask]
train_index = Xtrain.shape[0]  # number of training rows

In [70]:
def randForest(model,Xtrain,Xtest,ytrain,ytest):
    """Fit ``model`` and report its fit quality and feature importances.

    Fits ``model`` on the training data, prints ``model.score`` for the
    training and test sets (labelled R2) and the test-set RMSE, then returns
    the feature importances.

    Parameters
    ----------
    model : estimator
        Object providing ``fit``, ``score``, ``predict`` and, after fitting,
        ``feature_importances_``. It is fitted in place.
    Xtrain, Xtest : pandas.DataFrame
        Feature matrices; ``Xtrain.columns`` labels the importances.
    ytrain, ytest : array-like
        Targets matching ``Xtrain`` and ``Xtest`` respectively.

    Returns
    -------
    pandas.DataFrame
        Columns 'Columns' and 'Feature_importances', sorted by importance in
        descending order.
    """
    model.fit(Xtrain,ytrain)
    print(f'training R2: {model.score(Xtrain,ytrain)}')
    print(f'test R2: {model.score(Xtest,ytest)}')

    # Predict with the estimator that was passed in, not a notebook global, so
    # the RMSE always describes the same model as the scores above.
    ypred = model.predict(Xtest)
    # RMSE computed directly with numpy: root of the mean squared residual per
    # output column, averaged over columns (a single value for a 1-D target).
    # This avoids relying on the `squared=` keyword of mean_squared_error,
    # which is not available in every scikit-learn release.
    residuals = np.asarray(ytest, dtype=float) - np.asarray(ypred, dtype=float)
    RMSE = float(np.mean(np.sqrt(np.mean(residuals ** 2, axis=0))))
    print(f'RMSE: {RMSE}')

    feature_imps = pd.DataFrame({'Columns':Xtrain.columns,'Feature_importances':model.feature_importances_})
    return feature_imps.sort_values('Feature_importances',ascending=False)

In [71]:
from sklearn.ensemble import RandomForestRegressor
# Fixed seed: the forest's sampling is random, so without random_state the
# scores and importances printed below change on every run.
rfc = RandomForestRegressor(random_state=42)

randForest(rfc,Xtrain,Xtest,ytrain,ytest)

training R2: 0.9988398439019632
test R2: 0.9792883532558285
RMSE: 0.04079037470119717


Unnamed: 0,Columns,Feature_importances
1,Metro,0.393796
6,PC1,0.294761
3,Zipcode,0.196494
2,County,0.035391
9,PC4,0.020974
8,PC3,0.015757
5,Year,0.012606
7,PC2,0.012321
4,SizeRank,0.00955
0,City,0.004262


In [58]:
# Restrict both splits to the five principal-component columns only.
pc_columns = ['PC1','PC2','PC3','PC4','PC5']
mini_train = Xtrain[pc_columns]
mini_test = Xtest[pc_columns]

In [72]:
# Re-fit the same estimator on the principal-component-only features.
randForest(model=rfc, Xtrain=mini_train, Xtest=mini_test, ytrain=ytrain, ytest=ytest)

training R2: 0.9977768250651446
test R2: 0.9383261818255458
RMSE: 0.07038835208147487


Unnamed: 0,Columns,Feature_importances
0,PC1,0.361624
1,PC2,0.259888
2,PC3,0.18645
3,PC4,0.127496
4,PC5,0.064542


In [67]:
x2,y2 = preProc(model2,'log')
# Chronological hold-out: rows dated before 2019 train the model, the rest
# test it. The same boolean mask selects rows from both x2 and y2, so features
# and targets stay aligned even if the table is not sorted by date.
train_mask2 = x2.index < '2019-01-01'
X2train, X2test = x2.loc[train_mask2], x2.loc[~train_mask2]
y2train, y2test = y2.loc[train_mask2], y2.loc[~train_mask2]
train_index = X2train.shape[0]  # number of training rows

In [68]:
# Evaluate against y2test, the targets that belong to X2test.
randForest(rfc,X2train,X2test,y2train,y2test)

training R2: 0.9994701701004696
test R2: 0.9813419004994669


Unnamed: 0,Columns,Feature_importances
1,Metro,0.392856
8,PC1,0.195652
11,secondPC1,0.131733
3,Zipcode,0.083705
17,housing_availability,0.053208
16,gini_index,0.043275
10,PC3,0.019375
9,PC2,0.016912
2,County,0.009987
13,PersonalIncome,0.008707


In [73]:
# Runs the external script; per the message it prints, the merged table is
# made available afterwards under the name `zillow_full`.
%run ../Jane/extract_data.py

Your data is ready! Merged table name is zillow_full


In [76]:
# pd.isnull(zillow_full).sum() # check for missingness

In [77]:
# Display the merged table produced by extract_data.py.
zillow_full

Unnamed: 0,Zipcode,City,State,Metro,County,SizeRank,Date,Rent,Year,State-County,...,pct_poverty,housing_availability,home_density,pct_employed,pct_jobs_nightlife,pct_unemployed,move_within_city,move_new_city,avg_commute_time,pct_college
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,2015-01-01,3566.0,2015,NY-New York County,...,0.145510,1.153877,2.214388,0.932888,0.088021,0.066820,0.084888,0.051420,30.096886,0.216281
1,10023,New York,NY,New York-Newark-Jersey City,New York County,3,2015-01-01,3413.0,2015,NY-New York County,...,0.082199,1.231472,1.853056,0.955315,0.082397,0.044685,0.073276,0.039193,27.057535,0.262305
2,10002,New York,NY,New York-Newark-Jersey City,New York County,7,2015-01-01,3508.0,2015,NY-New York County,...,0.276575,1.065116,2.235927,0.929920,0.168466,0.070080,0.051605,0.018042,30.594358,0.172389
3,11226,New York,NY,New York-Newark-Jersey City,Kings County,11,2015-01-01,1876.0,2015,NY-Kings County,...,0.174138,1.097732,2.729889,0.928099,0.103083,0.071901,0.045075,0.010311,42.388151,0.127736
4,10467,New York,NY,New York-Newark-Jersey City,Bronx County,12,2015-01-01,1442.0,2015,NY-Bronx County,...,0.278866,1.048949,2.824023,0.874915,0.129880,0.125085,0.093202,0.008001,43.596975,0.086000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17746,11416,New York,NY,New York-Newark-Jersey City,Queens County,1818,2020-01-01,2184.0,2020,NY-Queens County,...,0.152349,1.076348,3.758487,0.926849,0.118820,0.073151,0.064774,0.016516,45.186076,0.105364
17747,11436,New York,NY,New York-Newark-Jersey City,Queens County,1820,2020-01-01,2057.0,2020,NY-Queens County,...,0.110755,1.076510,3.597384,0.908314,0.064390,0.091686,0.053295,0.014245,46.974765,0.091597
17748,11366,New York,NY,New York-Newark-Jersey City,Queens County,1826,2020-01-01,2185.0,2020,NY-Queens County,...,0.120030,1.085258,3.162098,0.915833,0.117797,0.084167,0.056246,0.006280,40.697500,0.187850
17749,11109,New York,NY,New York-Newark-Jersey City,New York County,1828,2020-01-01,3325.0,2020,NY-New York County,...,0.055509,1.376439,2.024712,0.963464,0.034176,0.036536,0.136599,0.120883,28.097646,0.328373
