In [5]:
import pandas as pd
import numpy as np

In [6]:
def impute_by_county(df, colname, method, group_cols=('City', 'State', 'County')):
    '''Return one column of df with its missing values filled from a per-group statistic.

    Rows are grouped by ``group_cols``; each missing value in ``df[colname]`` is
    replaced by the result of ``method`` computed over that row's group.
    Values that are already present are kept, and ``df`` is not modified.

    Args:

    df: pd.DataFrame, Dataframe holding ``colname`` and every column in ``group_cols``
    colname: str, name of the column in df to impute
    method: str, Central tendency method by which to impute (examples: "mean","median");
        passed straight to ``GroupBy.transform``
    group_cols: str or sequence of str, columns that define the groups
        (default: City, State, County)

    Returns:

    pd.Series aligned with ``df.index``. A value stays missing when every value
    in its group is missing.

    Raises:

    TypeError: if df is not a pd.DataFrame or colname is not a str
    KeyError: if colname or any grouping column is absent from df
    '''

    if not isinstance(df, pd.DataFrame):
        raise TypeError('df argument must be of type pd.DataFrame')

    if not isinstance(colname, str):
        raise TypeError('colname must be the column name as a string')

    # Accept a single column name as well as a sequence of names.
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Fail early with one message that lists every missing column.
    missing = [name for name in [colname, *keys] if name not in df.columns]
    if missing:
        raise KeyError(f'columns not found in df: {missing}')

    group_stat = df.groupby(keys)[colname].transform(method)
    return df[colname].fillna(group_stat)

## Using the full Zillow file

In [7]:
# Load the full Zillow file. Per the original note, the cleaning applied is the
# one from zillow_rent_index/casey/pipeline-execution-test.ipynb
zillow_data = pd.read_csv(filepath_or_buffer='../../data/zillow_full.csv')

In [8]:
# Remove the RegionID and do_date columns, then convert Date to datetimes.
zillow_data = (
    zillow_data
    .drop(columns=['RegionID', 'do_date'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [9]:
# Missing-value count per column, before imputation.
zillow_data.isna().sum()

Zipcode                    0
City                       0
State                      0
Metro                      0
County                     0
SizeRank                   0
Date                       0
Rent                       0
Year                       0
State-County               0
PersonalIncome           291
Vol_moderate_income     5238
Vol_low_income          5238
total_pop                 61
households                61
median_age                61
median_income             61
income_per_capita         61
gini_index                61
pct_poverty               61
housing_availability      61
home_density              61
pct_employed              61
pct_jobs_nightlife        61
pct_unemployed            61
move_within_city          61
move_new_city             61
avg_commute_time         122
pct_college               61
dtype: int64

In [10]:
# Keep the per-column missing-value counts so the affected columns can be picked out.
null_data = zillow_data.isna().sum()

In [11]:
# Names of the columns that contain at least one missing value.
null_cols = null_data.loc[null_data > 0].index
null_cols

Index(['PersonalIncome', 'Vol_moderate_income', 'Vol_low_income', 'total_pop',
       'households', 'median_age', 'median_income', 'income_per_capita',
       'gini_index', 'pct_poverty', 'housing_availability', 'home_density',
       'pct_employed', 'pct_jobs_nightlife', 'pct_unemployed',
       'move_within_city', 'move_new_city', 'avg_commute_time', 'pct_college'],
      dtype='object')

In [12]:
# Fill every column that has gaps with the mean of its City/State/County group.
# Each column is imputed independently, so all of them can be assigned at once.
zillow_data = zillow_data.assign(
    **{name: impute_by_county(zillow_data, name, 'mean') for name in null_cols}
)

In [13]:
# Index the rows by Date (the Date column is moved into the index).
zillow_data = zillow_data.set_index('Date')

In [14]:
# Preview the first five rows after imputation and re-indexing.
zillow_data.head(n=5)

Unnamed: 0_level_0,Zipcode,City,State,Metro,County,SizeRank,Rent,Year,State-County,PersonalIncome,...,pct_poverty,housing_availability,home_density,pct_employed,pct_jobs_nightlife,pct_unemployed,move_within_city,move_new_city,avg_commute_time,pct_college
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3566.0,2015,NY-New York County,52904.0,...,0.14551,1.153877,2.214388,0.932888,0.088021,0.06682,0.084888,0.05142,30.096886,0.216281
2015-01-01,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3413.0,2015,NY-New York County,52904.0,...,0.082199,1.231472,1.853056,0.955315,0.082397,0.044685,0.073276,0.039193,27.057535,0.262305
2015-01-01,10002,New York,NY,New York-Newark-Jersey City,New York County,7,3508.0,2015,NY-New York County,52904.0,...,0.276575,1.065116,2.235927,0.92992,0.168466,0.07008,0.051605,0.018042,30.594358,0.172389
2015-01-01,11226,New York,NY,New York-Newark-Jersey City,Kings County,11,1876.0,2015,NY-Kings County,52904.0,...,0.174138,1.097732,2.729889,0.928099,0.103083,0.071901,0.045075,0.010311,42.388151,0.127736
2015-01-01,10467,New York,NY,New York-Newark-Jersey City,Bronx County,12,1442.0,2015,NY-Bronx County,52904.0,...,0.278866,1.048949,2.824023,0.874915,0.12988,0.125085,0.093202,0.008001,43.596975,0.086


In [15]:
# Re-count missing values per column to confirm the imputation left no gaps.
zillow_data.isna().sum()

Zipcode                 0
City                    0
State                   0
Metro                   0
County                  0
SizeRank                0
Rent                    0
Year                    0
State-County            0
PersonalIncome          0
Vol_moderate_income     0
Vol_low_income          0
total_pop               0
households              0
median_age              0
median_income           0
income_per_capita       0
gini_index              0
pct_poverty             0
housing_availability    0
home_density            0
pct_employed            0
pct_jobs_nightlife      0
pct_unemployed          0
move_within_city        0
move_new_city           0
avg_commute_time        0
pct_college             0
dtype: int64

In [161]:
# Show the first 15 rows.
zillow_data.iloc[:15]

Unnamed: 0_level_0,Zipcode,City,State,Metro,County,SizeRank,Rent,Year,State-County,PersonalIncome,...,pct_poverty,housing_availability,home_density,pct_employed,pct_jobs_nightlife,pct_unemployed,move_within_city,move_new_city,avg_commute_time,pct_college
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3566.0,2015,NY-New York County,52904.0,...,0.14551,1.153877,2.214388,0.932888,0.088021,0.06682,0.084888,0.05142,30.096886,0.216281
2015-01-01,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3413.0,2015,NY-New York County,52904.0,...,0.082199,1.231472,1.853056,0.955315,0.082397,0.044685,0.073276,0.039193,27.057535,0.262305
2015-01-01,10002,New York,NY,New York-Newark-Jersey City,New York County,7,3508.0,2015,NY-New York County,52904.0,...,0.276575,1.065116,2.235927,0.92992,0.168466,0.07008,0.051605,0.018042,30.594358,0.172389
2015-01-01,11226,New York,NY,New York-Newark-Jersey City,Kings County,11,1876.0,2015,NY-Kings County,52904.0,...,0.174138,1.097732,2.729889,0.928099,0.103083,0.071901,0.045075,0.010311,42.388151,0.127736
2015-01-01,10467,New York,NY,New York-Newark-Jersey City,Bronx County,12,1442.0,2015,NY-Bronx County,52904.0,...,0.278866,1.048949,2.824023,0.874915,0.12988,0.125085,0.093202,0.008001,43.596975,0.086
2015-01-01,78660,Pflugerville,TX,Austin-Round Rock,Travis County,13,1133.0,2015,TX-Travis County,50350.0,...,0.069724,1.035899,2.999833,0.959805,0.080879,0.037766,0.018869,0.126194,26.03747,0.171346
2015-01-01,94109,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,14,3243.0,2015,CA-San Francisco County,64192.0,...,0.119091,1.130693,1.711882,0.960364,0.133581,0.039383,0.096988,0.092718,27.902058,0.355347
2015-01-01,10016,New York,NY,New York-Newark-Jersey City,New York County,15,3405.0,2015,NY-New York County,52904.0,...,0.096666,1.197884,1.765701,0.961919,0.0584,0.037024,0.10263,0.057348,23.632181,0.350922
2015-01-01,11201,New York,NY,New York-Newark-Jersey City,Kings County,18,2814.0,2015,NY-Kings County,52904.0,...,0.129985,1.095272,2.238323,0.946164,0.069407,0.053599,0.112268,0.06079,29.722454,0.270299
2015-01-01,11235,New York,NY,New York-Newark-Jersey City,Kings County,19,1887.0,2015,NY-Kings County,52904.0,...,0.193871,1.094518,2.421147,0.939791,0.070872,0.058887,0.077987,0.007833,43.212166,0.212126


## Importing previously imputed data

In [86]:
# Reload the already-imputed file; Zipcode is read as text.
# NOTE: this rebinds zillow_data, replacing the frame built in the cells above.
zillow_data = pd.read_csv(
    filepath_or_buffer='../../data/zillow_full_imputed.csv',
    dtype=dict(Zipcode=str),
)

In [87]:
# Index the reloaded frame by Date. Date is not parsed on load here, so the
# index presumably holds plain date strings -- TODO confirm.
zillow_data = zillow_data.set_index('Date')

In [88]:
# (rows, columns) of the reloaded frame.
zillow_data.shape

(17751, 28)

In [89]:
# Column names available for modelling.
zillow_data.columns

Index(['Zipcode', 'City', 'State', 'Metro', 'County', 'SizeRank', 'Rent',
       'Year', 'State-County', 'PersonalIncome', 'Vol_moderate_income',
       'Vol_low_income', 'total_pop', 'households', 'median_age',
       'median_income', 'income_per_capita', 'gini_index', 'pct_poverty',
       'housing_availability', 'home_density', 'pct_employed',
       'pct_jobs_nightlife', 'pct_unemployed', 'move_within_city',
       'move_new_city', 'avg_commute_time', 'pct_college'],
      dtype='object')

In [90]:
# Number of rows per State-County value, most frequent first.
zillow_data['State-County'].value_counts()

NY-Queens County           2989
NY-Kings County            2257
NY-New York County         2013
FL-Miami-Dade County       1647
TX-Travis County           1525
FL-Broward County          1220
NY-Bronx County            1098
CA-Alameda County           976
CA-Santa Clara County       854
FL-Palm Beach County        793
NY-Richmond County          732
CA-San Mateo County         610
CA-San Francisco County     427
CA-Contra Costa County      427
CA-Marin County             122
CA-Sonoma County             61
Name: State-County, dtype: int64

## Random Forest regressor

In [37]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [94]:
# Preprocessing objects. `ss` is created here but not used in the cells shown.
LE, ss = LabelEncoder(), StandardScaler()

# Model inputs: every column except the target (Rent) and the State-County / State columns.
features = zillow_data.drop(columns=['Rent', 'State-County', 'State'])

# Target: natural log of Rent, so every score reported later is on the log scale.
y = zillow_data['Rent'].pipe(np.log)

In [95]:
from sklearn.compose import ColumnTransformer

# Zipcode was loaded as text; cast it to int so it is treated as a numeric
# column by the dtype-based split that follows.
features = features.astype({'Zipcode': int})

In [99]:
# Split the features by dtype: object columns are treated as categorical,
# everything else as numeric. Copies are taken so later edits do not touch `features`.
cat_cols = features.select_dtypes(include=['object']).copy()
num_cols = features.select_dtypes(exclude=['object']).copy()

In [100]:
# Replace each categorical column with integer codes. LE is refitted for every
# column, so afterwards it only remembers the classes of the last one.
cat_cols = cat_cols.assign(
    **{name: LE.fit_transform(cat_cols[name]) for name in cat_cols.columns}
)

# Final design matrix: encoded categorical columns first, numeric columns after.
X = pd.concat((cat_cols, num_cols), axis='columns')

In [101]:
# (rows, columns) of the design matrix.
X.shape

(17751, 25)

In [102]:
from sklearn.ensemble import RandomForestRegressor
# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fix the seed so the scores and feature importances reported below are the
# same on every Restart & Run All.
rfc = RandomForestRegressor(random_state=42)

In [103]:
# First and last index labels, i.e. the dates of the first and last rows.
print(X.index[0], X.index[-1], sep='\n')

2015-01-01
2020-01-01


In [104]:
# Chronological split: rows dated before 2019-01-01 train the model, rows dated
# on or after it are held out for testing.
# One boolean mask selects both sides of X and y. Positional slicing would only
# give the same result if the rows were sorted by date, and would silently mix
# periods otherwise.
# The index labels print as YYYY-MM-DD, so comparing against '2019-01-01'
# orders them chronologically (presumably plain strings -- TODO confirm).
is_train = X.index < '2019-01-01'
Xtrain = X.loc[is_train]
Xtest = X.loc[~is_train]
ytrain = y.loc[is_train]
ytest = y.loc[~is_train]
# Number of training rows, kept under its original name for any later use.
train_index = Xtrain.shape[0]


In [105]:
# Fit the forest on the training rows; the target is log-rent.
rfc.fit(Xtrain,ytrain)

RandomForestRegressor()

In [106]:
# R^2 on the training rows (log-rent scale).
rfc.score(Xtrain,ytrain)

0.9994761194854755

In [107]:
# R^2 on the held-out rows (log-rent scale).
rfc.score(Xtest,ytest)

0.9841671418712025

In [76]:
# Importance of each feature, in the column order of the matrix the model was fitted on.
rfc.feature_importances_

array([0.00167064, 0.00158435, 0.39256525, 0.00297062, 0.00398324,
       0.07849101, 0.00530482, 0.00793884, 0.00610174, 0.00561054,
       0.00192252, 0.00393   , 0.00123416, 0.00370091, 0.00692234,
       0.12738826, 0.02786916, 0.00497314, 0.06234597, 0.00354769,
       0.00211722, 0.00986635, 0.00285022, 0.00280346, 0.01068545,
       0.21356327, 0.00805885])

In [108]:
# Pair each feature name with its importance so the two can be sorted together.
rfc_imp = pd.DataFrame(
    dict(Columns=X.columns, Feature_importances=rfc.feature_importances_)
)

In [109]:
# Rank the features from most to least important.
rfc_imp.sort_values('Feature_importances', ascending=False)

Unnamed: 0,Columns,Feature_importances
1,Metro,0.393426
23,avg_commute_time,0.211093
13,income_per_capita,0.128891
3,Zipcode,0.081815
16,housing_availability,0.062019
14,gini_index,0.027622
22,move_new_city,0.010829
19,pct_jobs_nightlife,0.009716
24,pct_college,0.008441
5,Year,0.008261


In [81]:
# Predictions for the held-out rows. They are on the log-rent scale because the
# model was fitted on np.log(Rent).
ypred = rfc.predict(Xtest)

In [82]:
from sklearn.metrics import mean_squared_error

In [84]:
# Root-mean-squared error on the log-rent scale (y is np.log of Rent), not in rent units.
# The square root is taken explicitly: the `squared` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
print(f'RMSE: {RMSE}')

RMSE: 0.03470971420655388
