In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the origin-destination job counts and discard the 'createdate' column
# in a single expression, then preview the first rows.
df = (
    pd.read_csv(r"ny_od_main_JT00_2010.csv")
      .drop(columns=['createdate'])
)
df.head()

Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03
0,360010001001004,360010001001009,1,0,1,0,0,0,1,1,0,0
1,360010001001004,360010001001010,1,0,1,0,0,0,1,1,0,0
2,360010001001004,360010001001033,1,0,1,0,0,0,1,1,0,0
3,360010001001004,360010014004001,1,0,1,0,0,0,1,0,1,0
4,360010001001004,360010015001007,1,0,1,0,0,0,1,0,1,0


In [3]:
# Derive the 11-digit tract code from each 15-digit block geocode by dropping
# the 4 trailing digits.
#
# Integer floor division replaces the previous int -> str -> slice -> int
# round-trip: it is vectorized, and it stays correct if a geocode has fewer
# than 15 digits because a leading zero was lost when it was parsed as an
# integer (slicing the first 11 characters would then cut at the wrong place).
# NOTE(review): assumes the geocode columns hold integers, as the frame
# preview suggests -- confirm if the CSV is ever read with string dtypes.
BLOCK_SUFFIX_DIGITS = 4
df['w_tract'] = df['w_geocode'] // 10 ** BLOCK_SUFFIX_DIGITS
df['h_tract'] = df['h_geocode'] // 10 ** BLOCK_SUFFIX_DIGITS

w_geocode: workplace census block code (15 digits; the first 11 digits identify the census tract)

h_geocode: home census block code (15 digits; the first 11 digits identify the census tract)

S000: total number of jobs

SA01: number of workers age 29 or younger

SA02: number of workers age 30-54

SA03: number of workers age 55 or older

SE01: number of jobs with monthly earnings of 1,250 USD or less

SE02: number of jobs with monthly earnings of 1,251 to 3,333 USD

SE03: number of jobs with monthly earnings above 3,333 USD

SI01: number of jobs in goods producing industry sectors

SI02: number of jobs in Trade, Transportation and Utilities

SI03: number of jobs in All Other Services

In [4]:
# Derive the 5-digit county code from each 11-digit tract code by dropping
# the 6 trailing digits.
#
# Floor division avoids the int -> str -> slice -> int round-trip and does not
# depend on the number of leading digits, so it remains correct when a leading
# zero has been lost in the integer representation.
TRACT_SUFFIX_DIGITS = 6
df['w_county'] = df['w_tract'] // 10 ** TRACT_SUFFIX_DIGITS
df['h_county'] = df['h_tract'] // 10 ** TRACT_SUFFIX_DIGITS
df.head()

Unnamed: 0,w_geocode,h_geocode,S000,SA01,SA02,SA03,SE01,SE02,SE03,SI01,SI02,SI03,w_tract,h_tract,w_county,h_county
0,360010001001004,360010001001009,1,0,1,0,0,0,1,1,0,0,36001000100,36001000100,36001,36001
1,360010001001004,360010001001010,1,0,1,0,0,0,1,1,0,0,36001000100,36001000100,36001,36001
2,360010001001004,360010001001033,1,0,1,0,0,0,1,1,0,0,36001000100,36001000100,36001,36001
3,360010001001004,360010014004001,1,0,1,0,0,0,1,0,1,0,36001000100,36001001400,36001,36001
4,360010001001004,360010015001007,1,0,1,0,0,0,1,0,1,0,36001000100,36001001500,36001,36001


In [5]:
# Aggregate job counts to (workplace county, home county) pairs.
#
# Only the count columns are summed. Block/tract identifiers are excluded up
# front: adding identifiers together is meaningless, sums of 15-digit geocodes
# can silently wrap around int64 for large groups, and the summed tract codes
# would otherwise linger in the result as garbage columns.
county_keys = ['w_county', 'h_county']
identifier_cols = ['w_geocode', 'h_geocode', 'w_tract', 'h_tract']
count_cols = [
    col for col in df.columns
    if col not in county_keys and col not in identifier_cols
]
df_agg = df.groupby(county_keys, as_index=False)[count_cols].sum()

In [7]:
# County codes that define each "core" area, declared once so the workplace
# and home flags cannot drift apart.
# NOTE(review): presumably Albany, Rensselaer, Saratoga and Schenectady
# counties for the Albany core and Onondaga County for Syracuse -- confirm
# against the Census county FIPS list.
ALBANY_CORE_COUNTIES = [36001, 36083, 36091, 36093]
SYRACUSE_CORE_COUNTIES = [36067]

# 1/0 indicators telling whether the workplace (w_) or home (h_) county of a
# flow belongs to a core area. `isin` is vectorized, unlike a per-row `apply`.
df_agg['w_albany_core'] = df_agg['w_county'].isin(ALBANY_CORE_COUNTIES).astype('int64')
df_agg['h_albany_core'] = df_agg['h_county'].isin(ALBANY_CORE_COUNTIES).astype('int64')
df_agg['w_syracuse_core'] = df_agg['w_county'].isin(SYRACUSE_CORE_COUNTIES).astype('int64')
df_agg['h_syracuse_core'] = df_agg['h_county'].isin(SYRACUSE_CORE_COUNTIES).astype('int64')

In [9]:
# Slim table: county keys, the four core-area indicators and the total job
# count (S000) only.
df_agg_s = df_agg.loc[:, [
    'w_county',
    'h_county',
    'w_albany_core',
    'h_albany_core',
    'w_syracuse_core',
    'h_syracuse_core',
    'S000',
]]

In [10]:
# Display the slimmed county-pair table (one row per workplace/home county
# pair; pandas shows a truncated view of long frames).
df_agg_s

Unnamed: 0,w_county,h_county,w_albany_core,h_albany_core,w_syracuse_core,h_syracuse_core,S000
0,36001,36001,1,1,0,0,88217
1,36001,36003,1,0,0,0,50
2,36001,36005,1,0,0,0,836
3,36001,36007,1,0,0,0,983
4,36001,36009,1,0,0,0,96
...,...,...,...,...,...,...,...
3739,36123,36113,0,0,0,0,6
3740,36123,36115,0,0,0,0,4
3741,36123,36117,0,0,0,0,103
3742,36123,36121,0,0,0,0,15


In [11]:
# Total jobs (S000) per workplace county and per home county, each returned as
# a two-column frame: the county key plus its summed S000.
total_worker_per_w_county = df_agg_s.groupby('w_county', as_index=False)['S000'].sum()
total_worker_per_h_county = df_agg_s.groupby('h_county', as_index=False)['S000'].sum()

In [13]:
# Attach the workplace-county total to every county pair. Both inputs carry an
# 'S000' column, so pandas suffixes them: 'S000_x' is the pair-level count and
# 'S000_y' is the workplace-county total.
df_agg_s1 = df_agg_s.merge(total_worker_per_w_county, on='w_county', how='inner')

# Attach the home-county total. No 'S000' clash remains at this point, so the
# home-county total arrives under the plain name 'S000'.
df_agg_s2 = df_agg_s1.merge(total_worker_per_h_county, on='h_county', how='inner')

In [14]:
# Give the merged S000 columns meaningful names. `rename` applies the mapping
# to all columns at once, so 'S000_x' -> 'S000' and 'S000' -> 'S000_h_total'
# do not interfere with each other.
#
# The guard makes the cell safe to re-run: once 'S000_x' is gone the rename has
# already happened, and applying it again would turn the pair-level 'S000' into
# a second 'S000_h_total' column.
if 'S000_x' in df_agg_s2.columns:
    df_agg_s2 = df_agg_s2.rename(columns={
        'S000_x': 'S000',            # jobs for this workplace/home county pair
        'S000_y': 'S000_w_total',    # all jobs located in the workplace county
        'S000': 'S000_h_total',      # all jobs held by residents of the home county
    })

In [16]:
# Share of each county-pair flow relative to its totals:
#   w_share -- fraction of the workplace county's jobs filled from this home county
#   h_share -- fraction of the home county's employed residents working in this workplace county
df_agg_s2 = df_agg_s2.assign(
    w_share=lambda flows: flows['S000'] / flows['S000_w_total'],
    h_share=lambda flows: flows['S000'] / flows['S000_h_total'],
)

In [19]:
# Flows whose home county is in the Albany core and that account for more than
# a quarter of that home county's jobs. In a pandas query string `&` has the
# precedence of `and`, so the two comparisons are evaluated first.
albany_core_major_flows = df_agg_s2.query('h_albany_core==1 & h_share>0.25')
albany_core_major_flows

Unnamed: 0,w_county,h_county,w_albany_core,h_albany_core,w_syracuse_core,h_syracuse_core,S000,S000_w_total,S000_h_total,w_share,h_share
0,36001,36001,1,1,0,0,88217,214915,137366,0.410474,0.642204
41,36001,36083,1,1,0,0,28059,214915,69376,0.130559,0.404448
46,36001,36093,1,1,0,0,23053,214915,70658,0.107266,0.326262
2511,36083,36083,1,1,0,0,21264,45815,69376,0.464127,0.306504
2761,36091,36091,1,1,0,0,39614,72260,101801,0.548215,0.389132
2824,36093,36093,1,1,0,0,27276,61315,70658,0.44485,0.386028
