In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
%matplotlib inline

# Data cleaning
## * Independent variables (ACS)

In [2]:
# All these data was premilinarily cleaned based on ACS raw data

a = pd.read_csv('data/acs/disability.csv', dtype = {'Id2': str})
b = pd.read_csv('data/acs/edu.csv', dtype = {'Id2': str})
c = pd.read_csv('data/acs/employment.csv', dtype = {'Id2': str})
d = pd.read_csv('data/acs/hh.csv', dtype = {'Id2': str})
e = pd.read_csv('data/acs/housing_physical.csv', dtype = {'Id2': str})
f = pd.read_csv('data/acs/married_r.csv', dtype = {'Id2': str})
g = pd.read_csv('data/acs/non_eng.csv', dtype = {'Id2': str})
h = pd.read_csv('data/acs/poverty.csv', dtype = {'Id2': str})
i = pd.read_csv('data/acs/race.csv', dtype = {'Id2': str})
j = pd.read_csv('data/acs/towork.csv', dtype = {'Id2': str})
k = pd.read_csv('data/acs/pop.csv', dtype = {'Id2': str})
zipcode = gpd.read_file('data/ZIP_CODE_040114/ZIP_CODE_040114.shp')

In [3]:
zipcode.dtypes

AREA          float64
BLDGZIP        object
COUNTY         object
CTY_FIPS       object
POPULATION    float64
PO_NAME        object
SHAPE_AREA    float64
SHAPE_LEN     float64
STATE          object
ST_FIPS        object
URL            object
ZIPCODE        object
geometry       object
dtype: object

In [4]:
# Make a key 'Id2' which is matched with other ACS data
# Drop unrelated columns

zipcode['Id2'] = zipcode['ZIPCODE']
zipcode = zipcode.drop(['COUNTY', 'AREA', 'BLDGZIP', 'geometry', 'CTY_FIPS', 'PO_NAME', 'SHAPE_AREA', 'SHAPE_LEN', 'STATE', 'ST_FIPS',
                       'URL'], axis = 1)

In [5]:
# Make a percentage data (ratio data)

a['disability_r'] = a['disability_r']/100.0

b['edu_high_r'] = b['edu_high_r'] / 100.0
b['edu_low_r'] = b['edu_low_r'] / 100.0

c['employment_r'] = c['employment_r'] / 100.0

d['hh_with6_r'] = d['hh_with6'] / d['hh']
d = d.drop(['hh_with6'], axis = 1)

e['occupied_hu_own_r'] = e['occupied_hu_owner'] / e['occupied_hu']
e['occupied_hu_rent_r'] = e['occupied_hu_rent'] / e['occupied_hu']
e['built_before1980_r'] = e['built_before1980'] / 100.0
e['utility_gas_r'] = e['utility_gas_r'] / 100.0
e['tank_gas_r'] = e['tank_gas_r'] / 100.0
e['electricity_r'] = e['electricity_r'] / 100.0
e['fueloil_r'] = e['fueloil_r'] / 100.0
e['coal_r'] = e['coal_r'] / 100.0
e['other_fuels_r'] = e['other_fuels_r'] / 100.0
e = e.drop(['occupied_hu_owner', 'occupied_hu_rent', 'occupied_hu', 'built_before1980'], axis =1)

f['married_r'] = f['married_r'] / 100.0

g['non_eng_r'] = g['non_eng_r'] / 100.0

h['below_poverty_r'] = h['below_poverty_r'] / 100.0

i = pd.merge(k, i, how = 'left', on = 'Id2')

i['male_pop_r'] = i['male_pop'] / i['pop']
i['female_pop_r'] = i['female_pop'] / i['pop']
i['elder_pop_r'] = i['elderly_pop'] / i['pop']
i['white_r'] = i['white'] / i['pop']
i['black_r'] = i['black'] / i['pop']
i['asian_r'] = i['asian'] / i['pop']
i['native_r'] = i['native'] / i['pop']
i = i.drop(['male_pop', 'female_pop', 'elderly_pop', 'white', 'black', 'asian', 'native'], axis = 1)

j['towork_less30_r'] = j['towork_less30'] / j['towork_total']
j['towork_over30_r'] = j['towork_over30'] / j['towork_total']
j = j.drop(['towork_total', 'towork_less30', 'towork_over30'], axis = 1)

In [6]:
# Merge cleaned acs data with NYC zipcode 
acs_zip = pd.merge(zipcode, a, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, b, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, c, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, d, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, e, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, f, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, g, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, h, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, i, how = 'left', on = 'Id2')
acs_zip = pd.merge(acs_zip, j, how = 'left', on = 'Id2')

In [7]:
# Drop dulicates 
acs_zip = acs_zip.drop_duplicates()

In [8]:
#Export csv
acs_zip.to_csv('data/output/acs_zip.csv')

In [9]:
acs_zip

Unnamed: 0,POPULATION,ZIPCODE,Id2,disability_r,edu_high_r,edu_low_r,employment_r,hh,hh_with6_r,utility_gas_r,...,pop,male_pop_r,female_pop_r,elder_pop_r,white_r,black_r,asian_r,native_r,towork_less30_r,towork_over30_r
0,18681.0,11436,11436,0.123,0.177,0.357,0.578,5454.0,0.002915,0.814,...,19425.0,0.489936,0.510064,0.000587,0.062445,0.701982,0.069086,0.008134,0.252197,0.747803
1,62426.0,11213,11213,0.106,0.219,0.310,0.532,24163.0,0.000857,0.781,...,64603.0,0.454638,0.545362,0.000170,0.199464,0.683467,0.017058,0.003282,0.278333,0.721667
2,83866.0,11212,11212,0.120,0.125,0.366,0.461,31443.0,0.000681,0.801,...,88668.0,0.422362,0.577638,0.000130,0.049409,0.836040,0.012823,0.003259,0.201659,0.798341
3,56527.0,11225,11225,0.069,0.315,0.252,0.575,23133.0,0.000990,0.648,...,60180.0,0.455500,0.544500,0.000189,0.208109,0.701113,0.023845,0.004503,0.220052,0.779948
4,72280.0,11218,11218,0.086,0.418,0.212,0.607,24909.0,0.001104,0.674,...,75691.0,0.492502,0.507498,0.000137,0.572750,0.085255,0.171077,0.003739,0.266002,0.733998
5,106132.0,11226,11226,0.079,0.249,0.321,0.596,36009.0,0.000642,0.741,...,99026.0,0.449680,0.550320,0.000116,0.121352,0.720164,0.030739,0.001171,0.184667,0.815333
6,92561.0,11219,11219,0.090,0.170,0.335,0.542,26390.0,0.000940,0.812,...,97670.0,0.512163,0.487837,0.000105,0.656189,0.012839,0.249125,0.001464,0.392820,0.607180
7,67067.0,11210,11210,0.070,0.342,0.310,0.564,21925.0,0.000771,0.768,...,67432.0,0.456059,0.543941,0.000160,0.296165,0.594792,0.048226,0.001172,0.322547,0.677453
8,80857.0,11230,11230,0.118,0.389,0.229,0.526,30653.0,0.000757,0.651,...,88589.0,0.483017,0.516983,0.000161,0.695955,0.074129,0.142038,0.004493,0.334741,0.665259
9,77354.0,11204,11204,0.089,0.251,0.310,0.534,24480.0,0.000878,0.806,...,80486.0,0.493266,0.506734,0.000163,0.637788,0.008225,0.293865,0.002460,0.323334,0.676666


In [10]:
len(acs_zip)

248