In [88]:
%%javascript
// Toggle the visibility of the page element whose id is "menubar".
// NOTE(review): relies on a global `$` (presumably jQuery provided by the
// classic Notebook frontend) — confirm it exists in the frontend you use.
$('#menubar').toggle();

<IPython.core.display.Javascript object>

Data from [data.census.gov](https://data.census.gov/cedsci/table?q=ACSST5Y2018.S2001%20New%20York%20city,%20New%20York&g=0100000US.860000&y=2018&tid=ACSST5Y2018.S2001&moe=false&hidePreview=true).

It seems we must use the 5-year estimates from 2018 (2019 doesn't have 5-year estimates).
The 1-year estimates don't provide ZIP-code granularity.

See also [When to Use 1-year, 3-year, or 5-year Estimates](https://www.census.gov/programs-surveys/acs/guidance/estimates.html)

In [13]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
import seaborn as sns
sns.set()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Relative path to the CSV export of census table ACSST5Y2018.S2001 (the same
# table id appears in the data.census.gov link in this notebook). The trailing
# part of the file name looks like a download timestamp — TODO confirm.
FILE_PATH = "data/ACSST5Y2018.S2001_data_with_overlays_2020-11-20T230734.csv"

In [20]:
# Read the census export. File line index 1 (the row right after the header)
# is skipped; presumably it holds the human-readable column labels that come
# with the "with_overlays" export — TODO confirm.
# Several columns carry text placeholders such as "(X)", "-" and "**", which
# are read as-is here rather than converted to NaN.
income = pd.read_csv(filepath_or_buffer=FILE_PATH, skiprows=[1])
# Preview the first rows (head() defaults to 5).
income.head()

Unnamed: 0,GEO_ID,NAME,S2001_C01_001E,S2001_C01_001M,S2001_C02_001E,S2001_C02_001M,S2001_C03_001E,S2001_C03_001M,S2001_C04_001E,S2001_C04_001M,S2001_C05_001E,S2001_C05_001M,S2001_C06_001E,S2001_C06_001M,S2001_C01_002E,S2001_C01_002M,S2001_C02_002E,S2001_C02_002M,S2001_C03_002E,S2001_C03_002M,S2001_C04_002E,S2001_C04_002M,S2001_C05_002E,S2001_C05_002M,S2001_C06_002E,...,S2001_C06_018M,S2001_C01_019E,S2001_C01_019M,S2001_C02_019E,S2001_C02_019M,S2001_C03_019E,S2001_C03_019M,S2001_C04_019E,S2001_C04_019M,S2001_C05_019E,S2001_C05_019M,S2001_C06_019E,S2001_C06_019M,S2001_C01_020E,S2001_C01_020M,S2001_C02_020E,S2001_C02_020M,S2001_C03_020E,S2001_C03_020M,S2001_C04_020E,S2001_C04_020M,S2001_C05_020E,S2001_C05_020M,S2001_C06_020E,S2001_C06_020M
0,8600000US00601,ZCTA5 00601,4165.0,407.0,4165.0,407.0,2347.0,260.0,2347.0,260.0,1818.0,261.0,1818.0,261.0,14448,1245,(X),(X),14331,1329,(X),(X),14641,2242,(X),...,(X),19840,2299,(X),(X),17241,16214,(X),(X),19950,1990,(X),(X),27333,18271,(X),(X),37667,5276,(X),(X),19697,10134,(X),(X)
1,8600000US00602,ZCTA5 00602,12288.0,603.0,12288.0,603.0,7017.0,409.0,7017.0,409.0,5271.0,442.0,5271.0,442.0,13322,1139,(X),(X),13222,1292,(X),(X),13487,1639,(X),...,(X),19317,1409,(X),(X),26048,1923,(X),(X),18176,1296,(X),(X),29073,5815,(X),(X),43676,17019,(X),(X),26864,2872,(X),(X)
2,8600000US00603,ZCTA5 00603,13838.0,697.0,13838.0,697.0,7840.0,495.0,7840.0,495.0,5998.0,425.0,5998.0,425.0,15980,677,(X),(X),16205,851,(X),(X),15656,1237,(X),...,(X),23089,2843,(X),(X),30125,8970,(X),(X),19411,2836,(X),(X),40570,6690,(X),(X),49946,9084,(X),(X),32171,6709,(X),(X)
3,8600000US00606,ZCTA5 00606,1489.0,175.0,1489.0,175.0,1003.0,149.0,1003.0,149.0,486.0,126.0,486.0,126.0,10554,2480,(X),(X),9913,3259,(X),(X),11231,4545,(X),...,(X),23194,9551,(X),(X),17188,16394,(X),(X),25321,8802,(X),(X),-,**,(X),(X),-,**,(X),(X),-,**,(X),(X)
4,8600000US00610,ZCTA5 00610,9100.0,619.0,9100.0,619.0,5116.0,381.0,5116.0,381.0,3984.0,399.0,3984.0,399.0,15298,717,(X),(X),14931,1031,(X),(X),15629,1026,(X),...,(X),25833,3599,(X),(X),28591,4740,(X),(X),21994,4142,(X),(X),35234,3263,(X),(X),38152,2679,(X),(X),33143,7189,(X),(X)


- How to deal with ZCTAs (ZIP Code Tabulation Areas)?

> In most instances the ZCTA code is the same as the ZIP Code for an area.

(Quoted from the [Census Bureau's ZCTA guidance](https://www.census.gov/programs-surveys/geography/guidance/geo-areas/zctas.html).)

In [76]:
# NAME values look like "ZCTA5 00601": dropping the first 6 characters
# ("ZCTA5" plus the space that follows it) leaves the zip code as a string.
income["processed_zip"] = income["NAME"].str[6:]

In [77]:
# Load the complaints sample and preview its first two rows.
sample_complaints = pd.read_csv(filepath_or_buffer="data/sample_complaints.csv")
sample_complaints.head(n=2)

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,BBL,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Open Data Channel Type,Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location
0,25356119,04/13/2013 03:55:00 PM,04/13/2013 03:55:00 PM,DEP,Department of Environmental Protection,Sewer,Sewer Backup (Use Comments) (SA),,11429.0,215-27 HOLLIS AVENUE,HOLLIS AVENUE,215 ST,216 ST,,,ADDRESS,Queens Village,,,Closed,,The Department of Environmental Protection has...,04/13/2013 03:55:00 PM,13 QUEENS,4111010000.0,QUEENS,1055561.0,198095.0,PHONE,Unspecified,QUEENS,,,,,,,,40.710114,-73.742781,"(40.7101144, -73.7427809)"
1,25356123,04/13/2013 03:36:00 PM,04/13/2013 04:00:00 PM,DEP,Department of Environmental Protection,Water System,Hydrant Leaking (WC1),,10460.0,EAST TREMONT AVENUE,EAST TREMONT AVENUE,PROSPECT AVENUE,MAPES AVENUE,,,BLOCKFACE,BRONX,,,Closed,,The Department of Environmental Protection det...,04/13/2013 04:00:00 PM,06 BRONX,,BRONX,1015108.0,246825.0,PHONE,Unspecified,BRONX,,,,,,,,40.8441,-73.888469,"(40.8440997, -73.8884686)"


In [78]:
# Share of complaints whose "Incident Zip" is missing (NaN), as a percentage.
# "Invalid" in the printed label means missing here: only NaN is checked.
missing_zip_pct = 100 * sample_complaints["Incident Zip"].isna().mean()
print("% of invalid zips", missing_zip_pct)
# Drop those rows by rebinding the name rather than mutating in place.
sample_complaints = sample_complaints.dropna(subset=["Incident Zip"])

% ofinvalid zips 0.40040040040040037


In [79]:
# "Incident Zip" is read as a float (e.g. 11429.0): cast through int to drop
# the ".0", then left-pad to 5 digits. Without the padding a zip with leading
# zeros (83.0 -> "83") could never match the zero-padded strings in
# income["processed_zip"] (e.g. "00601").
sample_complaints["processed_zip"] = (
    sample_complaints["Incident Zip"].astype(int).astype(str).str.zfill(5)
)

In [82]:
# Attach the census row to every complaint that shares its zip and show the
# join key next to one census column as a sanity check.
# - The default join is "inner", so complaints with no matching zip are dropped.
# - validate="m:1" makes pandas raise if a zip occurs more than once in `income`.
(
    sample_complaints
    .merge(income, on="processed_zip", validate="m:1")
    [["processed_zip", "S2001_C01_001E"]]
)

Unnamed: 0,processed_zip,S2001_C01_001E
0,11429,15130.0
1,11429,15130.0
2,11429,15130.0
3,10460,25474.0
4,10460,25474.0
...,...,...
989,11364,19615.0
990,11419,26256.0
991,11419,26256.0
992,11358,18953.0


In [87]:
# Note kept as a bare string expression (it is the cell's displayed value):
# a column code paired with its long description.
# NOTE(review): presumably copied from the table's metadata/label row — confirm.
"""
"S2001_C03_013E","Estimate!!Male!!Population 16 years and over with earnings!!FULL-TIME, YEAR-ROUND WORKERS WITH EARNINGS!!Median earnings (dollars) for full-time, year-round workers with earnings"
"""


'\n"S2001_C03_013E","Estimate!!Male!!Population 16 years and over with earnings!!FULL-TIME, YEAR-ROUND WORKERS WITH EARNINGS!!Median earnings (dollars) for full-time, year-round workers with earnings"\n'