# Aggregation for employment table
* The employment table is structured differently from the other ACS tables.
* It contains population, labor force participation rate, employment/population ratio, and unemployment rate by race.
* The lf, epr, and unemp portions need to be merged with population; then we can calculate the % in the labor force, % employed, and % unemployed by race at the CD, NC, etc. level.
* When aggregating from tracts up to larger geographies, we need to aggregate the raw counts first and only then compute the percents. A simple average of tract-level percents would not be population-weighted and would therefore be incorrect.
* Dashboard: 
    * pre-aggregate by various geographies, then feed the pre-aggregated df into dashboard.
    * Or, do the aggregation on the fly from 1 giant df. This might take longer to run?

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
# Open the intake catalog(s) matched by the glob pattern below; the crosswalk
# tables used later in this notebook are read from this catalog.
catalog = intake.open_catalog('../catalogs/*.yml')

In [3]:
# Load the cleaned Census tabular data from S3
census = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_cleaned.parquet')
# Restrict to the employment ('emp') table for 2016 and later, all census tracts
from_2016_on = census['year'] >= 2016
is_emp_table = census['table'] == 'emp'
census = census[from_2016_on & is_emp_table]

In [4]:
# Read the tract crosswalk for every boundary type from the catalog.
# Each catalog entry is named "crosswalk_tracts_<boundary>"; the entries are
# read in the order listed and bound to the matching variable names.
(
    council_districts,
    neighborhood_councils,
    zipcodes,
    congressional_districts,
    neighborhoods,
) = (
    getattr(catalog, f"crosswalk_tracts_{boundary}").read()
    for boundary in (
        'council_districts',
        'neighborhood_councils',
        'zipcodes',
        'congressional_districts',
        'neighborhoods',
    )
)

In [5]:
# Map each boundary name to its tract crosswalk.
boundaries = {'council_districts': council_districts, 'neighborhood_councils': neighborhood_councils,
             'zipcodes': zipcodes, 'congressional_districts': congressional_districts, 'neighborhoods': neighborhoods}

# Output: boundary name -> long-format df of summed counts per (ID, year, table, main_var, second_var, new_var).
processed_dfs = {}

# Columns that, together with the geography ID, identify one statistic.
group_cols = ['year', 'table', 'main_var', 'second_var', 'new_var']
# The per-intersection columns (ID1/num1, ID2/num2, ...) are renamed to these so the pieces can be stacked.
uniform_id_col = 'ID'
uniform_num_col = 'num'

for key, value in boundaries.items():
    # Merge the emp table with each boundary
    merged = pd.merge(census, value, on = 'GEOID', how = 'left', validate = 'm:1')
    # Tracts with no match in the crosswalk get NaN after the left merge; count them as 0.
    merged['max_val'] = merged['max_val'].fillna(0)
    # Allocate the num column according to however many CDs, NCs, etc each tract intersects with.
    # Find the sum for num1, num2, ... columns. Then, stack them and take the sum again.
    # NOTE(review): max_val is presumably the number of geographies a tract intersects -- confirm against the crosswalk.
    n = int(merged['max_val'].max())
    pieces = []
    # Depending on the boundary, tract might intersect with 1, 2,...,5 of the larger geographies.
    for i in range(1, n + 1):
        id_col = f"ID{i}"
        num_col = f"num{i}"
        allocate_col = f"allocate{i}"
        # Allocate the num column for the i-th intersection.
        merged[num_col] = merged.num * merged[allocate_col]
        # Take the sum of the allocated num column by CD, NC, etc.
        agg = merged.groupby([id_col] + group_cols).agg({num_col: 'sum'}).reset_index()
        agg = agg.rename(columns = {id_col: uniform_id_col, num_col: uniform_num_col})
        pieces.append(agg)
    # Stack the per-intersection sums in one go. DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every iteration.
    aggregated = pd.concat(pieces)
    # Take the sum again. For each CD, NC, etc, calculate the total pop, # in labor force, # employed, # unemployed.
    aggregated2 = aggregated.groupby([uniform_id_col] + group_cols).agg({uniform_num_col: 'sum'}).reset_index()
    processed_dfs[key] = aggregated2

In [6]:
# Output: boundary name -> wide df with one row per (ID, year, table, second_var) holding
# num_pop plus num_<var> / pct_<var> for each employment variable.
final_dfs = {}

# Keys that line up a population row with its labor force / employment / unemployment rows.
merge_keys = ['ID', 'year', 'table', 'second_var']
emp_vars = ['lf', 'epr', 'unemp']

for key, value in processed_dfs.items():
    # Population rows form the "base"; num_pop is carried once instead of being re-merged
    # for every employment variable and cleaned up via auto-generated _x/_y suffixes.
    final_df = value.loc[value.main_var == 'pop', merge_keys + ['num']].rename(columns = {'num': 'num_pop'})
    for emp_var in emp_vars:
        num_col = f"num_{emp_var}"
        pct_col = f"pct_{emp_var}"
        subset = value.loc[value.main_var == emp_var, merge_keys + ['num']].rename(columns = {'num': num_col})
        # Attach the count for this employment variable to the population base
        final_df = pd.merge(final_df, subset, on = merge_keys, how = 'left', validate = '1:1')
        # Derive the % in labor force, employed, unemployment rate for the new boundary.
        # Computed from the unrounded counts, before the integer conversion below.
        final_df[pct_col] = final_df[num_col] / final_df['num_pop']
    # Pin the column order before saving into dictionary
    final_df = final_df[['ID', 'year', 'table', 'second_var', 'num_pop',
                         'num_lf', 'pct_lf', 'num_epr', 'pct_epr', 'num_unemp', 'pct_unemp']]
    # Round the allocated (fractional) counts to whole numbers. round/astype return a new
    # frame, so no column is assigned on a slice (avoids SettingWithCopyWarning).
    count_cols = ['num_pop'] + [f"num_{emp_var}" for emp_var in emp_vars]
    final_df = final_df.round(dict.fromkeys(count_cols, 0)).astype(dict.fromkeys(count_cols, int))
    final_dfs[key] = final_df

In [7]:
# Preview each aggregated table: show the boundary name, then its first rows
for boundary_name, table in final_dfs.items():
    display(boundary_name)
    display(table.head())

'council_districts'

Unnamed: 0,ID,year,table,second_var,num_pop,num_lf,pct_lf,num_epr,pct_epr,num_unemp,pct_unemp
0,1.0,2016,emp,amerind,2340,1696,0.725022,1518,0.648942,228,0.097318
1,1.0,2016,emp,asian,40077,22184,0.553539,20442,0.510064,3085,0.076974
2,1.0,2016,emp,ba,30139,25916,0.859893,24287,0.805833,1892,0.062768
3,1.0,2016,emp,black,7001,4261,0.608616,3672,0.52448,1081,0.154404
4,1.0,2016,emp,college,26710,21497,0.80486,19819,0.742027,2057,0.077024


'neighborhood_councils'

Unnamed: 0,ID,year,table,second_var,num_pop,num_lf,pct_lf,num_epr,pct_epr,num_unemp,pct_unemp
0,1.0,2016,emp,amerind,179,91,0.510281,79,0.443248,16,0.089377
1,1.0,2016,emp,asian,3576,2325,0.650176,2153,0.602016,249,0.069572
2,1.0,2016,emp,ba,3099,2809,0.906478,2705,0.872931,118,0.03806
3,1.0,2016,emp,black,540,354,0.654566,288,0.533638,102,0.189723
4,1.0,2016,emp,college,4165,3582,0.859964,3345,0.803045,276,0.066191


'zipcodes'

Unnamed: 0,ID,year,table,second_var,num_pop,num_lf,pct_lf,num_epr,pct_epr,num_unemp,pct_unemp
0,90001.0,2016,emp,amerind,228,146,0.641074,142,0.622049,8,0.034244
1,90001.0,2016,emp,asian,59,34,0.571096,22,0.376955,12,0.204967
2,90001.0,2016,emp,ba,954,774,0.811127,763,0.799321,14,0.014909
3,90001.0,2016,emp,black,3848,1658,0.430924,1333,0.346312,753,0.195699
4,90001.0,2016,emp,college,2560,1942,0.758754,1740,0.679923,271,0.106002


'congressional_districts'

Unnamed: 0,ID,year,table,second_var,num_pop,num_lf,pct_lf,num_epr,pct_epr,num_unemp,pct_unemp
0,625.0,2016,emp,amerind,81,49,0.607701,47,0.587299,2,0.020402
1,625.0,2016,emp,asian,4963,3166,0.637909,2975,0.599423,286,0.057665
2,625.0,2016,emp,ba,6852,5622,0.820567,5344,0.779966,343,0.050047
3,625.0,2016,emp,black,492,328,0.665656,273,0.553914,75,0.152376
4,625.0,2016,emp,college,3135,2438,0.777806,2335,0.744665,133,0.042371


'neighborhoods'

Unnamed: 0,ID,year,table,second_var,num_pop,num_lf,pct_lf,num_epr,pct_epr,num_unemp,pct_unemp
0,1.0,2016,emp,amerind,119,119,1.0,72,0.606249,47,0.393751
1,1.0,2016,emp,asian,1521,536,0.352747,497,0.326601,140,0.092301
2,1.0,2016,emp,ba,1490,1100,0.738466,1008,0.676724,116,0.077849
3,1.0,2016,emp,black,3064,1553,0.506762,1163,0.379632,762,0.24881
4,1.0,2016,emp,college,2170,1637,0.754364,1447,0.666824,245,0.11297
