In [1]:
import pandas as pd
import geopandas as gpd
from functools import reduce

In [2]:
# Commuter estimates from the HBD file (first CSV column is used as the index)
hbd = pd.read_csv('./original_data/est_commuters_HBD.csv', index_col=0)

# Commuter origin counts, restricted to the 2019 records and re-indexed from 0
ipums = (
    pd.read_csv('./original_data/commuter_origin_counts.csv', index_col=0)
    .loc[lambda frame: frame['YEAR'] == 2019]
    .reset_index(drop=True)
)

#### We will only visualize common transmodes of hbd & ipums

In [3]:
# Transport modes present in both datasets.
# The intersection is sorted because iteration order of a set of strings can
# differ from one interpreter run to the next, which would make the printed
# list (and anything built from `transmode`) non-reproducible.
transmode = sorted(set(hbd.TransMode.unique()) & set(ipums.TransMode.unique()))
print('In Both:',transmode)

# Keep only the HBD rows whose mode is shared with ipums.
hbd = hbd[hbd['TransMode'].isin(transmode)].reset_index(drop=True)

In Both: ['Ferry', 'AutoOccupants', 'CommuterRail', 'Subway', 'Bicycle', 'Bus']


#### Check the PointEntryExit of each sector

In [4]:
def entry_check(dataframe):
    """Return the (Sector, PointEntryExit) pairs whose entry name is reused.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Sector`` and ``PointEntryExit``.

    Returns
    -------
    pandas.DataFrame
        The distinct (Sector, PointEntryExit) pairs, indexed from 0 after
        de-duplication, limited to the rows whose ``PointEntryExit`` value
        appears in more than one distinct pair.
    """
    sector_entry = (
        dataframe[["Sector", "PointEntryExit"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # Filter the counts Series directly instead of wrapping it in a DataFrame:
    # the name given to the value_counts() result differs between pandas
    # versions, so looking the counts up by column name is not portable.
    entry_counts = sector_entry["PointEntryExit"].value_counts()
    repeat_entry = entry_counts[entry_counts > 1].index
    return sector_entry[sector_entry["PointEntryExit"].isin(repeat_entry)]
# Display the (Sector, PointEntryExit) pairs whose PointEntryExit name occurs in more than one pair
entry_check(hbd)

Unnamed: 0,Sector,PointEntryExit
34,Brooklyn,Ferry
43,NewJersey,AmtrakNECorridor
45,NewJersey,Ferry
54,Queens,AmtrakNECorridor
55,Queens,Ferry
61,StatenIsland,Ferry


In [5]:
### this step is only for the convenience of visualization:
### PointEntryExit names that occur under several sectors get a sector suffix
### so that every entry name is unique
ENTRY_RENAMES = {
    ('Ferry', 'Brooklyn'): 'Ferry_Brooklyn',
    ('Ferry', 'Queens'): 'Ferry_Queens',
    ('Ferry', 'NewJersey'): 'Ferry_NewJersey',
    ('Ferry', 'StatenIsland'): 'Ferry_StatenIsland',
    ('AmtrakNECorridor', 'NewJersey'): 'AmtrakNECorridor_NewJersey',
    ('AmtrakNECorridor', 'Queens'): 'AmtrakNECorridor_Queens',
}
for (entry_name, sector_name), unique_name in ENTRY_RENAMES.items():
    is_entry = hbd['PointEntryExit'] == entry_name
    in_sector = hbd['Sector'] == sector_name
    hbd.loc[is_entry & in_sector, 'PointEntryExit'] = unique_name

#### Check what kinds of TransMode each entry has, and whether they are 'one-way'
For example, as shown in the table below, '10thAve', '1stAve', '2ndAve'... are one-way roads (for buses and autos),\
but some slow traffic (bicycles) passes through these entries in the opposite direction

In [6]:
# Lift the row limit so the whole table below is displayed
pd.set_option('display.max_rows', None)

# Total estimated commuters per (entry, mode, direction)
hbd_entry_dir = (
    hbd.groupby(by=['PointEntryExit', 'TransMode', 'Direction'])['Estimated_Commuters']
    .sum()
    .reset_index()
)

## build a complete dataframe for 'sanity check', 62*6*2=744 (entry*mode*dir)
## each factor carries a constant 'tmp' key so that merging on it yields the cross product
PointEntryExit = pd.DataFrame({'PointEntryExit': hbd['PointEntryExit'].unique()}).assign(tmp=1)
TransMode = pd.DataFrame({'TransMode': transmode}).assign(tmp=1)
Direction = pd.DataFrame({'Direction': ['In', 'Out']}).assign(tmp=1)
tmp = (
    PointEntryExit
    .merge(TransMode, on='tmp', how='inner')
    .merge(Direction, on='tmp', how='inner')
    .drop(columns='tmp')
)

## using this table, we can observe what kinds of TransMode each entry has, and whether they are 'one-way'
check = (
    pd.merge(tmp, hbd_entry_dir, on=['PointEntryExit', 'TransMode', 'Direction'], how='outer')
    .pivot_table(
        values='Estimated_Commuters',
        index=['PointEntryExit'],
        columns=['TransMode', 'Direction'],
    )
)
# '-' marks the combinations for which no commuters were recorded
check.fillna('-')

TransMode,AutoOccupants,AutoOccupants,Bicycle,Bicycle,Bus,Bus,CommuterRail,CommuterRail,Ferry,Ferry,Subway,Subway
Direction,In,Out,In,Out,In,Out,In,Out,In,Out,In,Out
PointEntryExit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
10thAveAmsterdamAve,-,25841.0,219.0,1504.0,-,1697.0,-,-,-,-,-,-
11thAveWestEndAve,18030.0,12663.0,791.0,91.0,1191.0,943.0,-,-,-,-,-,-
12thAveWestSideHighway,74175.0,64316.0,42.0,140.0,-,-,-,-,-,-,-,-
14thStTunnel,-,-,-,-,-,-,-,-,-,-,123513.0,123278.0
1stAve,-,32855.0,79.0,3515.0,-,8942.0,-,-,-,-,-,-
2ndAve,52336.0,-,4275.0,142.0,8245.0,-,-,-,-,-,-,-
2ndAveLocal,-,-,-,-,-,-,-,-,-,-,79827.0,72200.0
3rdAve,-,34899.0,88.0,2227.0,-,5488.0,-,-,-,-,-,-
53rdStTunnel,-,-,-,-,-,-,-,-,-,-,145109.0,145534.0
5thAve,28001.0,-,2498.0,32.0,12817.0,-,-,-,-,-,-,-


#### Reshaping the HBD dataset for easier spatial visualization

In [7]:
# Restore the default row limit that was lifted for the sanity-check table.
# reset_option is documented as taking only the option name, so the stray
# second positional argument (None) is dropped.
pd.reset_option('display.max_rows')

# One row per (Sector, PointEntryExit, TransMode, Hour) with separate 'In' and
# 'Out' columns; a direction with no record for that row becomes 0.
# NOTE(review): pivot_table aggregates with its default (mean); this presumably
# relies on there being at most one record per
# (Sector, PointEntryExit, TransMode, Hour, Direction) — confirm.
hbd_reshape = (
    hbd.pivot_table(
        values='Estimated_Commuters',
        index=['Sector', 'PointEntryExit', 'TransMode', 'Hour'],
        columns='Direction',
    )
    .fillna(0)
    .astype(int)
    .reset_index()
)
hbd_reshape

Direction,Sector,PointEntryExit,TransMode,Hour,In,Out
0,60thSt,10thAveAmsterdamAve,AutoOccupants,0,0,693
1,60thSt,10thAveAmsterdamAve,AutoOccupants,1,0,432
2,60thSt,10thAveAmsterdamAve,AutoOccupants,2,0,308
3,60thSt,10thAveAmsterdamAve,AutoOccupants,3,0,319
4,60thSt,10thAveAmsterdamAve,AutoOccupants,4,0,350
...,...,...,...,...,...,...
2395,StatenIsland,Ferry_StatenIsland,Ferry,19,772,2310
2396,StatenIsland,Ferry_StatenIsland,Ferry,20,638,1331
2397,StatenIsland,Ferry_StatenIsland,Ferry,21,458,925
2398,StatenIsland,Ferry_StatenIsland,Ferry,22,295,730


#### Add the location information of each PointEntryExit

In [8]:
# Add an 'ALL' pseudo-mode: In/Out summed over every TransMode for each entry and hour
hbd_allmodes = (
    hbd_reshape.groupby(by=['Sector', 'PointEntryExit', 'Hour'])
    .agg({"In": "sum", "Out": "sum"})
    .reset_index()
    .assign(TransMode='ALL')
)

# Stack the per-mode rows and the 'ALL' rows, then derive the helper measures
hbd_viz = pd.concat([hbd_reshape, hbd_allmodes], axis=0).assign(**{
    'In+Out': lambda d: d['In'] + d['Out'],            # total traffic volume
    '|In-Out|': lambda d: (d['In'] - d['Out']).abs(),  # 'net' traffic volume
    'In>Out': lambda d: (d['In'] - d['Out']) > 0,      # just for easier visualization
})

# Entry/exit point geometries; expose the point coordinates as plain columns
entryexit_point = gpd.read_file('./spatial_data/entryexit_point.geojson')
entryexit_point = entryexit_point.assign(
    lon=entryexit_point['geometry'].x,
    lat=entryexit_point['geometry'].y,
)

# Attach the coordinates to every row and drop the columns that are not needed afterwards
hbd_viz = (
    hbd_viz.merge(entryexit_point, on=['Sector', 'PointEntryExit'])
    .drop(columns=['Type', 'geometry'])
)
hbd_viz

Unnamed: 0,Sector,PointEntryExit,TransMode,Hour,In,Out,In+Out,|In-Out|,In>Out,lon,lat
0,60thSt,10thAveAmsterdamAve,AutoOccupants,0,0,693,693,693,False,-73.987987,40.769873
1,60thSt,10thAveAmsterdamAve,AutoOccupants,1,0,432,432,432,False,-73.987987,40.769873
2,60thSt,10thAveAmsterdamAve,AutoOccupants,2,0,308,308,308,False,-73.987987,40.769873
3,60thSt,10thAveAmsterdamAve,AutoOccupants,3,0,319,319,319,False,-73.987987,40.769873
4,60thSt,10thAveAmsterdamAve,AutoOccupants,4,0,350,350,350,False,-73.987987,40.769873
...,...,...,...,...,...,...,...,...,...,...,...
3883,StatenIsland,Ferry_StatenIsland,ALL,19,772,2310,3082,1538,False,-74.012935,40.700181
3884,StatenIsland,Ferry_StatenIsland,ALL,20,638,1331,1969,693,False,-74.012935,40.700181
3885,StatenIsland,Ferry_StatenIsland,ALL,21,458,925,1383,467,False,-74.012935,40.700181
3886,StatenIsland,Ferry_StatenIsland,ALL,22,295,730,1025,435,False,-74.012935,40.700181


#### Now, the dataframe is ready for spatial visualization and analysis.

In [9]:
### to use the time bar of kepler.gl the time column must be a datetime type,
### so the hour number is converted with unit='h'
hbd_viz = hbd_viz.assign(Hour=pd.to_datetime(hbd_viz['Hour'], unit='h'))

### write the table without the row index
hbd_viz.to_csv('hbd_viz.csv', index=False)

In [10]:
### check the totals per sector (original note: they match perfectly);
### the derived 'ALL' rows are left out so that nothing is counted twice
sum_by_sector_dir = (
    hbd_viz.loc[hbd_viz['TransMode'].ne('ALL')]
    .groupby(["Sector"])
    .agg({"In": "sum", "Out": "sum"})
)
sum_by_sector_dir

Unnamed: 0_level_0,In,Out
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
60thSt,1374700,1387208
Brooklyn,1075958,1044430
NewJersey,590217,576048
Queens,776591,762328
StatenIsland,34526,34264
