In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
%matplotlib inline

# Join all data (weekly + zipcode level)
* Weekly Heat/hot water complaints
* Weekly class C HPD HMC violations
* Demographic & socio-economic characteristics 

In [2]:
# Weekly heat/hot water complaints and class C violations (cleaned, zipcode level).
# Zipcodes are read as strings, 'Date' is parsed as a datetime, and the zipcode
# column is re-published under the upper-case name 'ZIPCODE' used as the join key.
df_weekly = (
    pd.read_csv(
        'data/output/weekly_heathotwater_com_vio.csv',
        dtype={'Zipcode': str},
        parse_dates=['Date'],
    )
    # assign() appends 'ZIPCODE' as the last column before the originals are dropped
    .assign(ZIPCODE=lambda weekly: weekly['Zipcode'])
    .drop(columns=['Unnamed: 0', 'Zipcode'])
)


# Explanatory (ACS) variables per zipcode, minus the saved index column and the
# 'Id2' / 'native_r' columns that are not carried into the joined data.
df_acs = (
    pd.read_csv('data/output/acs_zip.csv', dtype={'ZIPCODE': str})
    .drop(columns=['Unnamed: 0', 'Id2', 'native_r'])
)

In [3]:
# Preview the first two rows of the weekly complaints/violations frame
df_weekly.head(n=2)

Unnamed: 0,Count_com,Date,Count_vio,Year,Month,Week,ZIPCODE
0,9.0,2014-03-17,0.0,2014.0,3.0,12.0,10001
1,9.0,2014-03-24,1.0,2014.0,3.0,13.0,10001


In [4]:
# Preview the first two rows of the ACS explanatory variables
df_acs.head(n=2)

Unnamed: 0,POPULATION,ZIPCODE,disability_r,edu_high_r,edu_low_r,employment_r,hh,hh_with6_r,utility_gas_r,tank_gas_r,...,below_poverty_r,pop,male_pop_r,female_pop_r,elder_pop_r,white_r,black_r,asian_r,towork_less30_r,towork_over30_r
0,18681.0,11436,0.123,0.177,0.357,0.578,5454.0,0.002915,0.814,0.021,...,0.137,19425.0,0.489936,0.510064,0.000587,0.062445,0.701982,0.069086,0.252197,0.747803
1,62426.0,11213,0.106,0.219,0.31,0.532,24163.0,0.000857,0.781,0.01,...,0.276,64603.0,0.454638,0.545362,0.00017,0.199464,0.683467,0.017058,0.278333,0.721667


In [5]:
# Attach the ACS variables to every weekly record via the shared 'ZIPCODE' key.
# The left join keeps all weekly rows, including any whose zipcode has no ACS
# match; rows that are exact duplicates after the join are then removed.
df = (
    df_weekly
    .merge(df_acs, how='left', on='ZIPCODE')
    .drop_duplicates()
)

# Reindex based on the week (currently disabled)
#index = df['Date']
#df.index = index

In [6]:
# Preview the first five rows of the joined frame
df.head(n=5)

Unnamed: 0,Count_com,Date,Count_vio,Year,Month,Week,ZIPCODE,POPULATION,disability_r,edu_high_r,...,below_poverty_r,pop,male_pop_r,female_pop_r,elder_pop_r,white_r,black_r,asian_r,towork_less30_r,towork_over30_r
0,9.0,2014-03-17,0.0,2014.0,3.0,12.0,10001,22413.0,0.078,0.667,...,0.204,23537.0,0.502485,0.497515,0.000501,0.602456,0.121893,0.191996,0.624704,0.375296
1,9.0,2014-03-24,1.0,2014.0,3.0,13.0,10001,22413.0,0.078,0.667,...,0.204,23537.0,0.502485,0.497515,0.000501,0.602456,0.121893,0.191996,0.624704,0.375296
2,9.0,2014-03-31,0.0,2014.0,3.0,14.0,10001,22413.0,0.078,0.667,...,0.204,23537.0,0.502485,0.497515,0.000501,0.602456,0.121893,0.191996,0.624704,0.375296
3,17.0,2014-04-07,0.0,2014.0,4.0,15.0,10001,22413.0,0.078,0.667,...,0.204,23537.0,0.502485,0.497515,0.000501,0.602456,0.121893,0.191996,0.624704,0.375296
4,2.0,2014-04-14,0.0,2014.0,4.0,16.0,10001,22413.0,0.078,0.667,...,0.204,23537.0,0.502485,0.497515,0.000501,0.602456,0.121893,0.191996,0.624704,0.375296


In [7]:
# Summary statistics of the joined frame.
# NOTE(review): in the saved output the non-null counts differ between columns
# (e.g. Count_com vs. Count_vio vs. POPULATION), so some values are missing —
# confirm this is expected before modelling.
df.describe()



Unnamed: 0,Count_com,Count_vio,Year,Month,Week,POPULATION,disability_r,edu_high_r,edu_low_r,employment_r,...,below_poverty_r,pop,male_pop_r,female_pop_r,elder_pop_r,white_r,black_r,asian_r,towork_less30_r,towork_over30_r
count,26137.0,26547.0,26547.0,26547.0,26547.0,27645.0,27447.0,27447.0,27447.0,27447.0,...,27447.0,27448.0,27447.0,27447.0,27447.0,27447.0,27447.0,27447.0,27447.0,27447.0
mean,22.432299,9.020341,2015.023694,6.537349,26.84567,45985.637945,0.101681,0.38416,0.232783,0.585925,...,0.182151,48092.535157,0.478122,0.521878,0.000427,0.467314,0.222075,0.141062,0.334278,0.665722
std,47.106726,14.430171,0.819416,3.415917,14.967008,25956.569123,0.035197,0.213843,0.092047,0.080053,...,0.10323,26431.500512,0.026245,0.026245,0.000511,0.26217,0.2543,0.134774,0.122368,0.122368
min,0.0,0.0,2014.0,1.0,1.0,0.0,0.0,0.09,0.019,0.398,...,0.031,0.0,0.401037,0.405244,0.0,0.018846,0.000749,0.0,0.144436,0.315239
25%,,,,,,,,,,,...,,,,,,,,,,
50%,,,,,,,,,,,...,,,,,,,,,,
75%,,,,,,,,,,,...,,,,,,,,,,
max,716.0,135.0,2017.0,12.0,53.0,109069.0,0.239,0.898,0.456,0.87,...,0.479,112709.0,0.594756,0.598963,0.005067,0.989016,0.912797,0.73318,0.684761,0.855564


In [8]:
# Recompute the calendar fields from 'Date'.
# NOTE(review): presumably this fills rows where Year/Month/Week were missing
# in the weekly input (their counts in describe() are lower than other columns) — confirm.
# pd.to_datetime is a no-op for an already-parsed column and guards against
# 'Date' having been left as strings; the result keeps df's index, so the
# assignments below align row-by-row even after drop_duplicates().
dates = pd.to_datetime(df['Date'])
df['Year'] = dates.dt.year
df['Month'] = dates.dt.month
# DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week returns the same ISO-8601 week number (requires pandas >= 1.1).
df['Week'] = dates.dt.isocalendar().week

# Export data as csv (the index is written too, as before, so downstream
# readers will still see it as an 'Unnamed: 0' column)
df.to_csv('data/output/data_ready.csv')

In [9]:
# Disabled spot check: inspect the joined rows dated 2014-05-05
#df[df['Date']=='2014-05-05']