In [148]:
import pandas as pd
import numpy as np

In [149]:
def impute_by_county(df, colname, method, group_cols=('City', 'State', 'County')):
    '''Return ``df[colname]`` with missing values filled by a per-group statistic.

    Rows are grouped by ``group_cols``, ``method`` is evaluated within each group,
    and the result is used to fill only the missing entries of the column.
    ``df`` itself is not modified.

    Args:
        df: pd.DataFrame, frame holding ``colname`` and every column in ``group_cols``.
        colname: str, name of the column in ``df`` to impute.
        method: str or callable, aggregation handed to ``GroupBy.transform``
            (examples: "mean", "median").
        group_cols: str or sequence of str, columns that define the groups.
            Defaults to ('City', 'State', 'County').

    Returns:
        pd.Series indexed like ``df``. An entry stays missing when the group
        statistic itself is missing (e.g. every value in the group is NaN).

    Raises:
        TypeError: if ``df`` is not a DataFrame or ``colname`` is not a str.
        KeyError: if ``colname`` or a grouping column is not present in ``df``.
    '''

    if not isinstance(df, pd.DataFrame):
        raise TypeError('df argument must be of type pd.DataFrame')

    if not isinstance(colname, str):
        raise TypeError('colname must be the column name as a string')

    # Accept a single column name as well as any sequence of names.
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Fail early with one message naming every absent column.
    missing = [name for name in [colname, *keys] if name not in df.columns]
    if missing:
        raise KeyError('Column(s) not found in df: {}'.format(missing))

    # transform() broadcasts each group's statistic back onto the original rows,
    # so the result aligns with df[colname] and can be passed straight to fillna.
    group_stat = df.groupby(keys)[colname].transform(method)
    return df[colname].fillna(group_stat)

## Using the full Zillow file

In [150]:
# Load the full Zillow CSV from a path relative to the working directory.
# The cleaning applied to it follows
# zillow_rent_index/casey/pipeline-execution-test.ipynb.
# NOTE(review): confirm the two notebooks are still in sync.
zillow_data = pd.read_csv('../../data/zillow_full.csv')

In [151]:
# Remove the 'RegionID' and 'do_date' columns.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell safe to
# re-run: a second execution no longer raises KeyError for the already-removed columns.
zillow_data = zillow_data.drop(columns=['RegionID', 'do_date'], errors='ignore')
# Convert 'Date' to datetime64 values.
zillow_data['Date'] = pd.to_datetime(zillow_data['Date'])

In [152]:
# Count the missing values in every column.
zillow_data.isna().sum()

Zipcode                    0
City                       0
State                      0
Metro                      0
County                     0
SizeRank                   0
Date                       0
Rent                       0
Year                       0
State-County               0
PersonalIncome           291
Vol_moderate_income     5238
Vol_low_income          5238
total_pop                 61
households                61
median_age                61
median_income             61
income_per_capita         61
gini_index                61
pct_poverty               61
housing_availability      61
home_density              61
pct_employed              61
pct_jobs_nightlife        61
pct_unemployed            61
move_within_city          61
move_new_city             61
avg_commute_time         122
pct_college               61
dtype: int64

In [153]:
# Per-column count of missing values, kept for selecting the columns to impute.
null_data = zillow_data.isna().sum()

In [154]:
# Names of the columns that contain at least one missing value.
null_cols = null_data.loc[null_data.ge(1)].index
null_cols

Index(['PersonalIncome', 'Vol_moderate_income', 'Vol_low_income', 'total_pop',
       'households', 'median_age', 'median_income', 'income_per_capita',
       'gini_index', 'pct_poverty', 'housing_availability', 'home_density',
       'pct_employed', 'pct_jobs_nightlife', 'pct_unemployed',
       'move_within_city', 'move_new_city', 'avg_commute_time', 'pct_college'],
      dtype='object')

In [155]:
# Replace each incomplete column with its group-mean-imputed version.
for column_name in null_cols:
    imputed_values = impute_by_county(zillow_data, column_name, 'mean')
    zillow_data[column_name] = imputed_values

In [156]:
# Make 'Date' the index. Guarded and rebound (rather than inplace=True) so that
# re-running the cell after 'Date' has already become the index is a no-op
# instead of a KeyError.
if 'Date' in zillow_data.columns:
    zillow_data = zillow_data.set_index('Date')

In [157]:
# Preview the first rows of the frame.
zillow_data.head()

Unnamed: 0_level_0,Zipcode,City,State,Metro,County,SizeRank,Rent,Year,State-County,PersonalIncome,...,pct_poverty,housing_availability,home_density,pct_employed,pct_jobs_nightlife,pct_unemployed,move_within_city,move_new_city,avg_commute_time,pct_college
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3566.0,2015,NY-New York County,52904.0,...,0.14551,1.153877,2.214388,0.932888,0.088021,0.06682,0.084888,0.05142,30.096886,0.216281
2015-01-01,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3413.0,2015,NY-New York County,52904.0,...,0.082199,1.231472,1.853056,0.955315,0.082397,0.044685,0.073276,0.039193,27.057535,0.262305
2015-01-01,10002,New York,NY,New York-Newark-Jersey City,New York County,7,3508.0,2015,NY-New York County,52904.0,...,0.276575,1.065116,2.235927,0.92992,0.168466,0.07008,0.051605,0.018042,30.594358,0.172389
2015-01-01,11226,New York,NY,New York-Newark-Jersey City,Kings County,11,1876.0,2015,NY-Kings County,52904.0,...,0.174138,1.097732,2.729889,0.928099,0.103083,0.071901,0.045075,0.010311,42.388151,0.127736
2015-01-01,10467,New York,NY,New York-Newark-Jersey City,Bronx County,12,1442.0,2015,NY-Bronx County,52904.0,...,0.278866,1.048949,2.824023,0.874915,0.12988,0.125085,0.093202,0.008001,43.596975,0.086


In [158]:
# Re-count missing values per column to check the result of the imputation.
zillow_data.isna().sum()

Zipcode                 0
City                    0
State                   0
Metro                   0
County                  0
SizeRank                0
Rent                    0
Year                    0
State-County            0
PersonalIncome          0
Vol_moderate_income     0
Vol_low_income          0
total_pop               0
households              0
median_age              0
median_income           0
income_per_capita       0
gini_index              0
pct_poverty             0
housing_availability    0
home_density            0
pct_employed            0
pct_jobs_nightlife      0
pct_unemployed          0
move_within_city        0
move_new_city           0
avg_commute_time        0
pct_college             0
dtype: int64