In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV

In [2]:
# Load the Zillow multi-family rental CSV. RegionName is forced to str
# (presumably so zip codes keep any leading zeros -- TODO confirm).
zillow = pd.read_csv(
    '../../data/zillow/Zip_Zri_MultiFamilyResidenceRental.csv',
    dtype={'RegionName': str},
)

In [3]:
# RegionName is referred to as 'Zipcode' from here on.
column_renames = {'RegionName': 'Zipcode'}
zillow = zillow.rename(columns=column_renames)

In [4]:
# Drop the monthly columns labelled '2010-09' through '2014-12' (inclusive).
# The 52 'YYYY-MM' labels are generated instead of typed out by hand, which
# removes the risk of a typo or a skipped month in the literal list.
months_to_drop = (
    pd.period_range('2010-09', '2014-12', freq='M')
    .strftime('%Y-%m')
    .tolist()
)

# Reassign rather than mutate in place, and ignore labels that are already
# gone, so re-running this cell on the trimmed frame does not raise KeyError.
zillow = zillow.drop(columns=months_to_drop, errors='ignore')

In [5]:
# from transformers.py
#
# County lists per metro area, keyed below by two-letter state code, and a
# flat list of 'STATE-County Name' keys built from them.

sf_counties = [
    'Alameda County',
    'Contra Costa County',
    'Marin County',
    'Napa County',
    'San Mateo County',
    'Santa Clara County',
    'Solano County',
    'Sonoma County',
    'San Francisco County',
]
# NY Metro:
ny_counties = [
    'New York County',
    'Bronx County',
    'Queens County',
    'Kings County',
    'Richmond County',
]
# Greater Austin Metro:
tx_counties = ['Travis County']
# Miami Metro:
mia_counties = ['Miami-Dade County', 'Broward County', 'Palm Beach County']

counties_dict = {
    'CA': sf_counties,
    'NY': ny_counties,
    'TX': tx_counties,
    'FL': mia_counties,
}

# Flatten to 'STATE-County Name' strings, in dict order then list order.
all_counties = [
    '%s-%s' % (state, county)
    for state, counties in counties_dict.items()
    for county in counties
]

In [6]:
# Keep only rows whose 'STATE-County' key appears in all_counties, then drop
# the columns listed below (including the temporary key) and index the
# remaining columns by Zipcode.
zillow = (
    zillow
    .assign(**{'State-County': zillow['State'] + '-' + zillow['CountyName']})
    .loc[lambda frame: frame['State-County'].isin(all_counties)]
    .drop(
        columns=[
            'RegionID',
            'City',
            'State',
            'Metro',
            'CountyName',
            'SizeRank',
            'State-County',
        ]
    )
    .set_index('Zipcode')
)

In [12]:
# Count the null cells in each row of zillow, then expose the counts as a
# frame with the former index as a column and the count named 'n_missing'.
null_counts = zillow.isnull().sum(axis=1)
zipcodes = null_counts.reset_index().rename(columns={0: 'n_missing'})

In [15]:
# Zip codes whose row has no missing values at all.
is_complete = zipcodes['n_missing'] == 0
test_zips = zipcodes.loc[is_complete, 'Zipcode'].to_list()

In [16]:
# Display the zip codes that have no missing values (n_missing == 0).
test_zips

['10025',
 '11226',
 '10467',
 '78660',
 '94109',
 '11235',
 '11375',
 '10009',
 '94565',
 '11206',
 '10462',
 '10456',
 '10019',
 '10003',
 '11230',
 '11209',
 '11221',
 '10024',
 '11207',
 '78745',
 '94110',
 '33025',
 '10458',
 '11211',
 '11377',
 '11234',
 '11355',
 '33024',
 '11385',
 '11373',
 '11233',
 '33009',
 '10027',
 '33186',
 '95123',
 '11372',
 '33311',
 '11218',
 '11223',
 '10021',
 '33313',
 '95051',
 '11225',
 '11101',
 '11204',
 '11213',
 '11219',
 '94538',
 '78758',
 '78741',
 '11368',
 '11203',
 '11220',
 '10031',
 '95014',
 '95112',
 '33433',
 '11354',
 '94087',
 '33436',
 '33141',
 '33324',
 '94086',
 '33178',
 '10466',
 '10460',
 '33418',
 '33312',
 '11374',
 '95008',
 '10472',
 '33065',
 '11210',
 '11434',
 '94115',
 '33020',
 '11224',
 '11691',
 '33125',
 '10010',
 '33142',
 '33180',
 '10461',
 '33130',
 '11106',
 '11222',
 '11432',
 '94403',
 '33126',
 '11217',
 '94103',
 '33161',
 '33435',
 '94107',
 '10065',
 '94010',
 '10459',
 '94066',
 '33134',
 '78744',
