In [1]:
#Data Management
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import geopandas as gpd

#Data visualization
import pylab as pl
%pylab inline

#Feature selection
from scipy import stats
from sklearn import cross_validation
import statsmodels.formula.api as smf

Populating the interactive namespace from numpy and matplotlib




# Heat/hot water complaints and Class C violations
* weekly
* CBG level
* 2014-2016

In [2]:
comvio = pd.read_csv('../data/output/00JOIN_COM_VIO_CBG_WEEK.csv', dtype = {'CBG': str},
                    parse_dates = ['Date'])
comvio = comvio.drop(['Unnamed: 0'], axis = 1)
comvio['year'] = comvio['Date'].dt.year

In [3]:
comvio.head(2)

Unnamed: 0,CBG,com_count,Date,vio_count,year
0,360050001001,0.0,2014-03-17,0.0,2014
1,360050002001,0.0,2014-03-17,0.0,2014


In [4]:
comvio.dtypes

CBG                  object
com_count           float64
Date         datetime64[ns]
vio_count           float64
year                  int64
dtype: object

# Demographic variables
* ACS

In [5]:
demo_2014 = pd.read_csv('../data/output/00VAR_Demographic_ACS_CBG_2014.csv', dtype = {'Id2': str})
demo_2015 = pd.read_csv('../data/output/00VAR_Demographic_ACS_CBG_2015.csv', dtype = {'Id2': str})
demo_2014['CBG'] = demo_2014['Id2']
demo_2015['CBG'] = demo_2015['Id2']
demo_2014 = demo_2014.drop(['Unnamed: 0', 'Id2'], axis = 1)
demo_2015 = demo_2015.drop(['Unnamed: 0', 'Id2'], axis = 1)
demo_2016 = demo_2015
demo_2016['year'] = 2016

In [6]:
frames = [demo_2014, demo_2015, demo_2016]
demo = pd.concat(frames)
demo = demo.fillna(0)
demo = demo.replace(np.inf, 0)
demo = demo.replace(-np.inf, 0)

In [7]:
demo.head(2)

Unnamed: 0,pop,female_r,elderly_r,white_r,black_r,asian_r,hh,hh_child_6_r,hh_living_alone_r,edu_high_r,non_eng_r,unemployed_r,vacancy_r,med_rent,med_income,merried_r,longercommute_r,year,CBG
0,1158.0,0.61399,0.076857,0.290155,0.489637,0.016408,240.0,0.0,0.311404,0.185526,0.08114,0.105618,0.142857,615.0,40781.0,0.282022,0.25486,2014,360010001001
1,1043.0,0.566635,0.085331,0.380633,0.543624,0.027804,332.0,0.313253,0.284916,0.102941,0.0,0.078189,0.191874,659.0,28700.0,0.363755,0.088608,2014,360010001002


In [8]:
print len(demo_2016)
print len(demo_2016)
print len(demo_2016)
print len(demo_2014) + len(demo_2015) + len(demo_2016)
print len(demo)

15463
15463
15463
46389
46389


# Physical condition variables
* ACS

In [9]:
phys_2014 = pd.read_csv('../data/output/phys_cond_14.csv', dtype = {'Id2': str})

In [13]:
phys_2014 = phys_2014.rename(columns = {'utility_gas_r': 'gas_utility_r', 'homeowner_r': 'ownership_r'})
phys_2014.head(2)

Unnamed: 0.1,Unnamed: 0,Id2,gas_utility_r,Geography,ownership_r,avg_bldng_age
0,0,360050001000,,"Block Group 0, Census Tract 1, Bronx County, N...",,
1,1,360050001001,,"Block Group 1, Census Tract 1, Bronx County, N...",,


In [11]:
phys_2015 = pd.read_csv('../data/output/phys_cond_15.csv', dtype = {'Id2': str})

In [12]:
phys_2015.head(2)

Unnamed: 0.1,Unnamed: 0,Id2,Geography,gas_utility_r,avg_bldng_age,ownership_r
0,0,360050001000,"Block Group 0, Census Tract 1, Bronx County, N...",,,
1,1,360050001001,"Block Group 1, Census Tract 1, Bronx County, N...",,,


In [14]:
phys_2014['CBG'] = phys_2014['Id2']
phys_2015['CBG'] = phys_2015['Id2']
phys_2014 = phys_2014.drop(['Unnamed: 0', 'Id2'], axis = 1)
phys_2015 = phys_2015.drop(['Unnamed: 0', 'Id2'], axis = 1)
phys_2016 = phys_2015
phys_2014['year'] = 2014
phys_2015['year'] = 2015
phys_2016['year'] = 2016

In [15]:
frames = [phys_2014, phys_2015, phys_2016]
phys = pd.concat(frames)
phys = phys.fillna(0)
phys = phys.replace(np.inf, 0)
phys = phys.replace(-np.inf, 0)

In [16]:
phys.head(2)

Unnamed: 0,CBG,Geography,avg_bldng_age,gas_utility_r,ownership_r,year
0,360050001000,"Block Group 0, Census Tract 1, Bronx County, N...",0.0,0.0,0.0,2014
1,360050001001,"Block Group 1, Census Tract 1, Bronx County, N...",0.0,0.0,0.0,2014


In [17]:
print len(phys_2014)
print len(phys_2015)
print len(phys_2016)
print len(phys_2014) + len(phys_2015) +  len(phys_2016)
print len(phys)

6493
6493
6493
19479
19479


In [18]:
phys = phys.drop(['Geography'], axis = 1)
phys.head(2)

Unnamed: 0,CBG,avg_bldng_age,gas_utility_r,ownership_r,year
0,360050001000,0.0,0.0,0.0,2014
1,360050001001,0.0,0.0,0.0,2014


# Join demographic + physical condition

In [19]:
iv = pd.merge(demo, phys, how = 'left', on = ['CBG', 'year'])

In [20]:
iv.head(2)

Unnamed: 0,pop,female_r,elderly_r,white_r,black_r,asian_r,hh,hh_child_6_r,hh_living_alone_r,edu_high_r,...,vacancy_r,med_rent,med_income,merried_r,longercommute_r,year,CBG,avg_bldng_age,gas_utility_r,ownership_r
0,1158.0,0.61399,0.076857,0.290155,0.489637,0.016408,240.0,0.0,0.311404,0.185526,...,0.142857,615.0,40781.0,0.282022,0.25486,2014,360010001001,,,
1,1043.0,0.566635,0.085331,0.380633,0.543624,0.027804,332.0,0.313253,0.284916,0.102941,...,0.191874,659.0,28700.0,0.363755,0.088608,2014,360010001002,,,


In [21]:
iv = iv.drop_duplicates()
print len(iv)

30926


# Urban Form data (MapPLUTO)
* from MapPLUTO 
* CBG level

In [None]:
# Data will be loaded

# Weather condition and holiday data
* open source

In [22]:
wh = pd.read_csv('../data/output/00VAR_weather_holiday_2013_2016.csv', parse_dates = ['Date'])
wh = wh.drop(['Unnamed: 0'], axis = 1)

In [23]:
wh.head(2)

Unnamed: 0,Temp_min (F),Prep_sum (in),Snow (in),WindSpeed (mph),Date,Weather_events,Holiday_Count
0,23.0,0.0,0.0,16.0,2012-12-31,0.0,0.0
1,30.0,0.64,0.0,9.0,2013-01-07,0.0,1.0


# Join all data
* Dependent variavle (weekly heat/hot water complaints at CBG level)
* Independent variables (Demographic + Physical condition + Weather and holidays)

In [24]:
df = pd.merge(comvio, iv, how = 'left', on = ['CBG', 'year'])
df = df.drop_duplicates()
print len(df)
print len(comvio)

869064
869064


In [25]:
df = pd.merge(df, wh, how ='left', on = 'Date')
df = df.drop_duplicates()
print len(df)
print len(comvio)

869064
869064


In [26]:
df.head(2)

Unnamed: 0,CBG,com_count,Date,vio_count,year,pop,female_r,elderly_r,white_r,black_r,...,longercommute_r,avg_bldng_age,gas_utility_r,ownership_r,Temp_min (F),Prep_sum (in),Snow (in),WindSpeed (mph),Weather_events,Holiday_Count
0,360050001001,0.0,2014-03-17,0.0,2014,8430.0,0.092052,0.00344,0.140451,0.575445,...,0.0,0.0,0.0,0.0,25.0,0.72,0.0,14.0,0.0,0.0
1,360050002001,0.0,2014-03-17,0.0,2014,1519.0,0.516129,0.143515,0.134957,0.420013,...,0.623209,63.282609,0.813043,0.656522,25.0,0.72,0.0,14.0,0.0,0.0


### Export dataset for modeling

In [27]:
df.to_csv('../data/output/00DATASET_READY.csv')