In [1]:
# NOTE(review): `packages_import_statement` and the `pd` alias are not defined in
# this notebook; they must already exist in the kernel (e.g. from a startup
# script) for this cell to run -- TODO confirm and make the imports explicit.
print(packages_import_statement)
# Use the fully-qualified option name. The abbreviated "max_columns" pattern is
# ambiguous on pandas versions that also define "styler.render.max_columns",
# where set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 0)


import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import os

import re
import time 

from pprint import pprint




Data pulled from [IPUMS](https://usa.ipums.org/usa/index.shtml). Screenshot of our extract screen:

![ipums screenshot](images/ipums_data_pull.png)

In [2]:
# Dtypes pinned at load time: SAMPLE as an integer, both PUMA code columns as text.
dtypes_use = dict(SAMPLE=int, PUMA=str, PWPUMA00=str)

# Load the raw IPUMS csv exactly as downloaded.
df = pd.read_csv(
    "./ipums_data/ipums_data_v4.csv",
    dtype=dtypes_use,
)
print(df.shape)
df.head(n=2)

(289250, 34)


Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,REGION,STATEFIP,COUNTYFIP,DENSITY,CITY,PUMA,STRATA,GQ,PERNUM,PERWT,EMPSTAT,EMPSTATD,WKSWORK2,UHRSWORK,POVERTY,DIFFREM,DIFFPHYS,DIFFSENS,DIFFEYE,PWSTATE2,PWCOUNTY,PWPUMA00,TRANWORK,CARPOOL,RIDERS,TRANTIME,DEPARTS,ARRIVES
0,2017,201701,241934,2017000002252,55.0,2017002419341,11,9,5,415.4,0,500,50009,1,1,55.0,1,10,5,40,158,1,1,1,1,36,0,3100,10,1,1,90,702,834
1,2017,201701,241939,2017000002881,75.0,2017002419391,11,9,1,1433.9,0,100,10009,1,1,75.0,1,10,6,55,501,1,1,1,1,36,27,2800,10,1,1,60,732,834


In [3]:
### Cleaning steps - weight columns need to be divided by 100
# NOTE(review): the preview of the raw file already shows HHWT/PERWT as floats
# (e.g. 55.0), so the implied decimals may already be applied in this csv --
# confirm against the IPUMS codebook that the /100 is still required.
df['HHWT_CLEANED'] = df['HHWT'].astype(float).div(100)
df['PERWT_CLEANED'] = df['PERWT'].astype(float).div(100)

### Keep only the hour part of DEPARTS/ARRIVES (floor division by 100 drops the
### last two digits) - minutes are too granular
df["DEPARTS_HOUR"] = df["DEPARTS"].floordiv(100)
df["ARRIVES_HOUR"] = df["ARRIVES"].floordiv(100)

In [4]:
## Keep only rows whose place of work is Manhattan.
## The PWPUMA code is only meaningful within a state, so the filter needs both
## the place-of-work state and the place-of-work PUMA.
## The IPUMS extract was already filtered on PWSTATE2; repeating it here is
## deliberate extra caution.
works_in_ny = df['PWSTATE2'].eq(36)
works_in_manhattan = df["PWPUMA00"].eq("3800")

df = df.loc[works_in_ny & works_in_manhattan]

In [5]:
## Grouping keys for the main (travel) table
agg_columns = [
    'SAMPLE',
    'STATEFIP',
    'COUNTYFIP',
    'PUMA',
    'PWPUMA00',
    'TRANWORK',
    'DEPARTS_HOUR',
    'ARRIVES_HOUR',
]

## Independent per-year copies (selected by SAMPLE id) used for the separate files
df_17, df_18, df_19 = (
    df[df['SAMPLE'] == sample_id].copy()
    for sample_id in (201701, 201801, 201901)
)

In [6]:
# Preview: summed person weights per grouping key, first five rows
(
    df.groupby(by=agg_columns)
    .agg({"PERWT_CLEANED": "sum"})
    .rename(columns={"PERWT_CLEANED": "EST_PERSONS"})
    .reset_index()
    .head(5)
)

Unnamed: 0,SAMPLE,STATEFIP,COUNTYFIP,PUMA,PWPUMA00,TRANWORK,DEPARTS_HOUR,ARRIVES_HOUR,EST_PERSONS
0,201701,9,1,100,3800,10,5,5,1.1
1,201701,9,1,100,3800,10,5,7,1.14
2,201701,9,1,100,3800,10,6,8,2.56
3,201701,9,1,100,3800,10,7,7,1.23
4,201701,9,1,100,3800,10,7,9,0.42


In [7]:
### Group by agg columns and sum up PERWT_CLEANED
### Save resulting data as travel_data csvs: all years combined, then one per year.
# The same pipeline runs for every frame, so drive it from a single table of
# (source frame, output path) pairs instead of four copy-pasted statements.
travel_outputs = [
    (df, "./ipums_data/ipums_nyc_commute_travel_data.csv"),
    (df_17, "./ipums_data/ipums_nyc_commute_travel_data_2017.csv"),
    (df_18, "./ipums_data/ipums_nyc_commute_travel_data_2018.csv"),
    (df_19, "./ipums_data/ipums_nyc_commute_travel_data_2019.csv"),
]

for source_frame, out_path in travel_outputs:
    (
        source_frame.groupby(by=agg_columns)
        .agg({"PERWT_CLEANED": "sum"})
        .rename({"PERWT_CLEANED": "EST_PERSONS"}, axis=1)
        .reset_index()
        # The row index is still written (pandas default) so the files keep
        # their existing layout.
        .to_csv(out_path)
    )

In [8]:
### Get health information by commuting pattern + place of origin
health_columns = ["DIFFREM", "DIFFPHYS", "DIFFSENS", "DIFFEYE"]
agg_columns_health = [
    'SAMPLE', 'STATEFIP', 'COUNTYFIP', 'PUMA',
    'PWPUMA00', 'TRANWORK',
] + health_columns

### Save resulting data as health_commute_data csvs: all years combined, then one per year.
# The same pipeline runs for every frame, so drive it from a single table of
# (source frame, output path) pairs instead of four copy-pasted statements.
health_outputs = [
    (df, "./ipums_data/ipums_nyc_health_commute_data.csv"),
    (df_17, "./ipums_data/ipums_nyc_health_commute_data_2017.csv"),
    (df_18, "./ipums_data/ipums_nyc_health_commute_data_2018.csv"),
    (df_19, "./ipums_data/ipums_nyc_health_commute_data_2019.csv"),
]

for source_frame, out_path in health_outputs:
    (
        source_frame.groupby(by=agg_columns_health)
        .agg({"PERWT_CLEANED": "sum"})
        .rename({"PERWT_CLEANED": "EST_PERSONS"}, axis=1)
        .reset_index()
        # The row index is still written (pandas default) so the files keep
        # their existing layout.
        .to_csv(out_path)
    )

In [9]:
# Preview: summed person weights per commute/health grouping key, first five rows
(
    df.groupby(by=agg_columns_health)
    .agg({"PERWT_CLEANED": "sum"})
    .rename(columns={"PERWT_CLEANED": "EST_PERSONS"})
    .reset_index()
    .head(5)
)

Unnamed: 0,SAMPLE,STATEFIP,COUNTYFIP,PUMA,PWPUMA00,TRANWORK,DIFFREM,DIFFPHYS,DIFFSENS,DIFFEYE,EST_PERSONS
0,201701,9,1,100,3800,10,1,1,1,1,7.89
1,201701,9,1,100,3800,32,1,1,1,1,0.63
2,201701,9,1,100,3800,32,1,2,1,1,0.97
3,201701,9,1,100,3800,36,1,1,1,1,1.06
4,201701,9,1,100,3800,37,1,1,1,1,19.04


In [10]:
##### TRANWORK reference table - (code, description) pairs taken from the codebook
tranwork_key = [
    (0, "N/A"),
    (10, "Auto, truck, or van"),
    (11, "Auto"),
    (12, "Driver"),
    (13, "Passenger"),
    (14, "Truck"),
    (15, "Van"),
    (20, "Motorcycle"),
    (31, "Bus"),
    (32, "Bus or trolley bus"),
    (33, "Bus or streetcar"),
    (34, "Light rail, streetcar, or trolley (Carro público in PR)"),
    (35, "Streetcar or trolley car (publico in Puerto Rico, 2000)"),
    (36, "Subway or elevated"),
    (37, "Long-distance train or commuter train"),
    (38, "Taxicab"),
    (39, "Ferryboat"),
    (50, "Bicycle"),
    (60, "Walked only"),
    (70, "Other"),
    (80, "Worked at home"),
]

# One row per code; written out (with the default row index) as a lookup csv.
tranwork_reference = pd.DataFrame(tranwork_key, columns=['TRANWORK', "TRANWORK_DESC"])
tranwork_reference.to_csv("./ipums_data/ipums_tranwork_reference.csv")
    

In [11]:
##### Geography table - the raw file comes from
# https://www.census.gov/geographies/reference-files/2019/demo/popest/2019-fips.html
# Raw header -> short column name. The "Consolidtated" spelling is kept on
# purpose: the key has to match the header in the raw file for the rename to apply.
geo_column_names = {
    "Summary Level": "SUMMARY_LEVEL",
    "State Code (FIPS)": "STATEFIP",
    "County Code (FIPS)": "COUNTYFIP",
    "County Subdivision Code (FIPS)": "COUNTY_SUBDIV_FIP",
    "Place Code (FIPS)": "PLACE_FIP",
    "Consolidtated City Code (FIPS)": "CONSOL_CITY_FIP",
    "Area Name (including legal/statistical area description)": "AREA_NAME",
}

# The first four lines of the raw file are skipped before the header row.
geo_df = (
    pd.read_csv("./ipums_data/census_data_geocodes_us_2019.csv", skiprows=4)
    .rename(columns=geo_column_names)
)
geo_df.to_csv("./ipums_data/geocodes_us_2019_clean.csv")

In [12]:
# Show the first five rows of the cleaned geography table
geo_df.head(n=5)

Unnamed: 0,SUMMARY_LEVEL,STATEFIP,COUNTYFIP,COUNTY_SUBDIV_FIP,PLACE_FIP,CONSOL_CITY_FIP,AREA_NAME
0,10,0,0,0,0,0,United States
1,40,1,0,0,0,0,Alabama
2,50,1,1,0,0,0,Autauga County
3,50,1,3,0,0,0,Baldwin County
4,50,1,5,0,0,0,Barbour County


In [13]:
##### PUMA name table - the raw file comes from
# https://www2.census.gov/geo/docs/reference/puma/2010_PUMA_Names.txt
# NOTE(review): the URL points at the 2010 PUMA names file while the local copy
# is named *_2019 -- confirm which vintage this actually is.
puma_column_names = {
    "STATEFP": "STATEFIP",
    "PUMA5CE": "PUMA",
    "PUMA NAME": "PUMA_NAME",
}

puma_df = (
    pd.read_csv("./ipums_data/census_puma_names_2019.txt")
    .rename(columns=puma_column_names)
)
puma_df.to_csv("./ipums_data/pumanames_us_2019_clean.csv")

In [14]:
# Show the first five rows of the cleaned PUMA name table
puma_df.head(n=5)

Unnamed: 0,STATEFIP,PUMA,PUMA_NAME
0,1,100,"Lauderdale, Colbert, Franklin & Marion (Northe..."
1,1,200,Limestone & Madison (Outer) Counties--Huntsvil...
2,1,301,Huntsville (North) & Madison (East) Cities
3,1,302,Huntsville City (Central & South)
4,1,400,DeKalb & Jackson Counties
