# Aggregation for employment table
* One strategy is to pre-aggregate by each geography and feed those pre-aggregated DataFrames into the dashboard.
* Alternatively, do the aggregation on a single DataFrame inside the dashboard (which might take longer).

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
# Open the intake catalog(s) matching the ../catalogs/*.yml glob
catalog = intake.open_catalog("../catalogs/*.yml")

In [8]:
# Import Census tabular data, then keep only the 'emp' table rows from 2016
# onward for a single GEOID (a small slice to prototype the aggregation on)
census = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_cleaned.parquet')
keep_rows = (
    (census['year'] >= 2016)
    & (census['table'] == 'emp')
    & (census['GEOID'] == '06037109601')
)
census = census[keep_rows]

In [9]:
# Narrow to the 'pop' and 'lf' measures, and to the three second_var groups
# used for this test
main_vars = ['pop', 'lf']
second_vars = ['total_pop16', 'white', 'black']
census = census[census['main_var'].isin(main_vars)]
census = census[census['second_var'].isin(second_vars)]

In [10]:
# Import crosswalks: read each crosswalk_tracts_<name> entry from the catalog,
# in the same order as the names they are unpacked into
crosswalk_names = (
    'council_districts',
    'neighborhood_councils',
    'zipcodes',
    'congressional_districts',
    'neighborhoods',
)
(
    council_districts,
    neighborhood_councils,
    zipcodes,
    congressional_districts,
    neighborhoods,
) = [getattr(catalog, f'crosswalk_tracts_{name}').read() for name in crosswalk_names]

In [11]:
# Attach the council-district crosswalk to every census row; validate='m:1'
# makes the merge fail if the crosswalk has more than one row per GEOID
test1 = census.merge(council_districts, on='GEOID', how='left', validate='m:1')
# Rows without a crosswalk match have no max_val; treat them as 0
test1['max_val'] = test1['max_val'].fillna(0)
# Largest max_val in the frame; used as the upper bound for the num{i} columns
n = test1['max_val'].max().astype(int)

In [12]:
# Split each count by its allocation share: num{k} = num * allocate{k}
for k in range(1, n + 1):
    test1[f"num{k}"] = test1['num'].mul(test1[f"allocate{k}"])

In [13]:
# Inspect the merged frame, including the newly added num{i} columns
test1

Unnamed: 0,GEOID,year,variable,table,main_var,second_var,new_var,num,pct,max_val,ID1,allocate1,ID2,allocate2,ID3,allocate3,num1,num2
0,6037109601,2016,S2301_C01_001E,emp,pop,total_pop16,pop_total_pop16,4285.0,1.0,2,7.0,0.826238,6.0,0.173762,,,3540.430068,744.569932
1,6037109601,2016,S2301_C01_012E,emp,pop,white,pop_white,2365.0,1.0,2,7.0,0.826238,6.0,0.173762,,,1954.053001,410.946999
2,6037109601,2016,S2301_C01_013E,emp,pop,black,pop_black,78.0,1.0,2,7.0,0.826238,6.0,0.173762,,,64.446568,13.553432
3,6037109601,2017,S2301_C01_001E,emp,pop,total_pop16,pop_total_pop16,4220.0,1.0,2,7.0,0.826238,6.0,0.173762,,,3486.724594,733.275406
4,6037109601,2017,S2301_C01_012E,emp,pop,white,pop_white,2442.0,1.0,2,7.0,0.826238,6.0,0.173762,,,2017.673332,424.326668
5,6037109601,2017,S2301_C01_013E,emp,pop,black,pop_black,69.0,1.0,2,7.0,0.826238,6.0,0.173762,,,57.010426,11.989574
6,6037109601,2016,S2301_C02_001E,emp,lf,total_pop16,lf_total_pop16,2764.0,0.645,2,7.0,0.826238,6.0,0.173762,,,2283.721985,480.278015
7,6037109601,2016,S2301_C02_012E,emp,lf,white,lf_white,1391.0,0.588,2,7.0,0.826238,6.0,0.173762,,,1149.297135,241.702865
8,6037109601,2016,S2301_C02_013E,emp,lf,black,lf_black,50.0,0.641,2,7.0,0.826238,6.0,0.173762,,,41.311903,8.688097
9,6037109601,2017,S2301_C02_001E,emp,lf,total_pop16,lf_total_pop16,2743.0,0.65,2,7.0,0.826238,6.0,0.173762,,,2266.370986,476.629014


In [25]:
# Total the allocated counts per district ID, once for each ID/num column pair,
# and give both results the same column names ('CD', 'num') so they can be stacked.
# NOTE(review): only ID1/ID2 are handled here, while n is derived from max_val;
# confirm whether ID3 and beyond need the same treatment.
group_cols = ['year', 'table', 'main_var', 'second_var', 'new_var']

sum1 = (
    test1.groupby(['ID1'] + group_cols)
    .agg({'num1': 'sum'})
    .reset_index()
    .rename(columns={'ID1': 'CD', 'num1': 'num'})
)
sum2 = (
    test1.groupby(['ID2'] + group_cols)
    .agg({'num2': 'sum'})
    .reset_index()
    .rename(columns={'ID2': 'CD', 'num2': 'num'})
)

In [28]:
# Stack the per-ID totals into one long table keyed by CD.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and keeps each frame's original index.
sum_cd = pd.concat([sum1, sum2], sort=False)
sum_cd.head(20)

Unnamed: 0,CD,year,table,main_var,second_var,new_var,num
0,7.0,2016,emp,lf,black,lf_black,41.311903
1,7.0,2016,emp,lf,total_pop16,lf_total_pop16,2283.721985
2,7.0,2016,emp,lf,white,lf_white,1149.297135
3,7.0,2016,emp,pop,black,pop_black,64.446568
4,7.0,2016,emp,pop,total_pop16,pop_total_pop16,3540.430068
5,7.0,2016,emp,pop,white,pop_white,1954.053001
6,7.0,2017,emp,lf,black,lf_black,42.964379
7,7.0,2017,emp,lf,total_pop16,lf_total_pop16,2266.370986
8,7.0,2017,emp,lf,white,lf_white,1228.615989
9,7.0,2017,emp,pop,black,pop_black,57.010426


In [29]:
# Split the stacked totals into separate 'pop' and 'lf' frames by main_var
is_pop = sum_cd['main_var'].eq('pop')
is_lf = sum_cd['main_var'].eq('lf')
pop = sum_cd[is_pop]
lf = sum_cd[is_lf]

In [34]:
# Pair each 'pop' row with the matching 'lf' row (same CD, year, table and
# second_var), then derive the share: pct_lf = num_lf / num_pop
merge_keys = ['CD', 'year', 'table', 'second_var']
m2 = (
    pop.merge(lf, on=merge_keys, how='left', validate='1:1')
    .rename(columns={'num_y': 'num_lf', 'num_x': 'num_pop'})
)
m2['pct_lf'] = m2['num_lf'] / m2['num_pop']
m2

Unnamed: 0,CD,year,table,main_var_x,second_var,new_var_x,num_pop,main_var_y,new_var_y,num_lf,pct_lf
0,7.0,2016,emp,pop,black,pop_black,64.446568,lf,lf_black,41.311903,0.641026
1,7.0,2016,emp,pop,total_pop16,pop_total_pop16,3540.430068,lf,lf_total_pop16,2283.721985,0.645041
2,7.0,2016,emp,pop,white,pop_white,1954.053001,lf,lf_white,1149.297135,0.588161
3,7.0,2017,emp,pop,black,pop_black,57.010426,lf,lf_black,42.964379,0.753623
4,7.0,2017,emp,pop,total_pop16,pop_total_pop16,3486.724594,lf,lf_total_pop16,2266.370986,0.65
5,7.0,2017,emp,pop,white,pop_white,2017.673332,lf,lf_white,1228.615989,0.608927
6,6.0,2016,emp,pop,black,pop_black,13.553432,lf,lf_black,8.688097,0.641026
7,6.0,2016,emp,pop,total_pop16,pop_total_pop16,744.569932,lf,lf_total_pop16,480.278015,0.645041
8,6.0,2016,emp,pop,white,pop_white,410.946999,lf,lf_white,241.702865,0.588161
9,6.0,2017,emp,pop,black,pop_black,11.989574,lf,lf_black,9.035621,0.753623


In [None]:
# Repeat the merge-and-allocate steps for each boundary crosswalk, storing the
# resulting frame under the boundary's name
boundaries = {
    'council_districts': council_districts,
    'neighborhood_councils': neighborhood_councils,
}

merged_dfs = {}

for key, value in boundaries.items():
    # validate='m:1' makes the merge fail if a crosswalk repeats a GEOID
    m1 = census.merge(value, on='GEOID', how='left', validate='m:1')
    # Rows without a crosswalk match have no max_val; treat them as 0
    m1['max_val'] = m1['max_val'].fillna(0)
    n = m1['max_val'].max().astype(int)
    # num{i} = num * allocate{i} for every allocation column up to n
    for i in range(1, n + 1):
        m1[f"num{i}"] = m1['num'] * m1[f"allocate{i}"]
    merged_dfs[key] = m1

In [None]:
for key, value in merged_dfs.items():
    display(key)
    df1 = value.groupby(['GEOID', 'year', 'main_var'])