# Aggregation for income table
* Income table shows number of hh per tract and the median hh income
* When aggregating from tract up to larger geographies, we need to make sure that we're deriving the corrected weighted median hh income.
    * (median hh income for tract) * (# hh in tract)
    * merge in crosswalk to other geography
    * allocate then sum over all tracts within CD, NC, etc
    * sum # hh within CD, NC, etc
    * calculate the weighted median hh income

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
catalog = intake.open_catalog('../catalogs/*.yml')

In [3]:
# Import Census tabular data
census = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_cleaned.parquet')
# Test this on 2016, 2017 for all census tracts
census = census[(census.year>=2016) & (census.table=='income')]

In [4]:
# Import crosswalks
council_districts = catalog.crosswalk_tracts_council_districts.read()
neighborhood_councils = catalog.crosswalk_tracts_neighborhood_councils.read()
zipcodes = catalog.crosswalk_tracts_zipcodes.read()
congressional_districts = catalog.crosswalk_tracts_congressional_districts.read()
neighborhoods = catalog.crosswalk_tracts_neighborhoods.read()

In [5]:
# Merge the hh and income portions of the table to calculate the weighted median hh income by tract.
hh = census[census.main_var == 'hh']
income = census[census.main_var == 'medincome']

m1 = pd.merge(hh, income, on = ['GEOID', 'year', 'table', 'second_var'], how = 'left', validate = 'm:1')
m1.rename(columns = {'num_x': 'hh', 'num_y': 'medincome'}, inplace = True)
m1['num'] = m1.hh * m1.medincome
m1 = m1[['GEOID', 'year', 'table', 'second_var', 'hh', 'medincome', 'num']]

In [6]:
boundaries = {'council_districts': council_districts, 'neighborhood_councils': neighborhood_councils,
             'zipcodes': zipcodes, 'congressional_districts': congressional_districts, 'neighborhoods': neighborhoods}

processed_dfs = {}

for key, value in boundaries.items():
    # Merge the income table table with each boundary 
    merged = pd.merge(m1, value, on = 'GEOID', how = 'left', validate = 'm:1')
    merged.max_val = merged.max_val.fillna(0)
    # Allocate the num column according to however many CDs, NCs, etc each tract intersects with. 
    # Find the sum for num1, num2, ... columns. Then, append and take the sum again.
    n = merged.max_val.max().astype(int)
    uniform_id_col = 'ID'
    uniform_num_col = 'num' 
    aggregated = pd.DataFrame()
    # Depending on the boundary, tract might intersect with 1, 2,...,5 of the larger geographies.
    for i in range(1, n + 1):
        new_col = f"num{i}"
        allocate_col = f"allocate{i}"
        # Allocate the num column for all the various intersections.
        merged[new_col] = merged.num * merged[allocate_col]
        id_col = f"ID{i}"
        num_col = f"num{i}"
        # Take the sum of the num column by CD, NC, etc.
        agg = merged.groupby([id_col, 'year', 'table', 'second_var']).agg({'hh': 'sum', num_col: 'sum'}).reset_index()
        agg.rename(columns = {id_col: uniform_id_col, num_col: uniform_num_col}, inplace = True)
        # Append these sums together
        aggregated = aggregated.append(agg)
    # Take the sum again. For each CD, NC, etc, calculate the total pop, # in labor force, # employed, # unemployed.
    aggregated2 = aggregated.groupby([uniform_id_col, 'year', 'table', 'second_var']).agg({'hh': 'sum', uniform_num_col: 'sum'}).reset_index()
    processed_dfs[key] = aggregated2

In [7]:
final_dfs = {}

for key, value in processed_dfs.items():
    df = value.copy()
    df['medincome'] = df.num / df.hh
    # Clean up final_df before saving into dictionary
    for col in ['hh', 'num']:
        df[col] = df[col].round(0).astype(int)
    df['medincome'] = df.medincome.round(2)
    final_dfs[key] = df

In [8]:
for key, value in final_dfs.items():
    display(key)
    display(value.head())

'council_districts'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,1.0,2016,income,amerind,1053,6312478,5994.76
1,1.0,2016,income,asian,18954,552703129,29160.24
2,1.0,2016,income,black,4197,54748471,13044.67
3,1.0,2016,income,hisp,52763,1638819263,31060.01
4,1.0,2016,income,nonhisp,13054,657148623,50340.79


'neighborhood_councils'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,1.0,2016,income,amerind,61,0,0.0
1,1.0,2016,income,asian,1891,83756431,44292.14
2,1.0,2016,income,black,387,11505321,29729.51
3,1.0,2016,income,hisp,8019,394094314,49145.07
4,1.0,2016,income,nonhisp,2097,47048086,22435.9


'zipcodes'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,90001.0,2016,income,amerind,55,0,0.0
1,90001.0,2016,income,asian,23,0,0.0
2,90001.0,2016,income,black,2557,42081876,16457.52
3,90001.0,2016,income,hisp,8333,238807541,28658.05
4,90001.0,2016,income,nonhisp,67,0,0.0


'congressional_districts'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,625.0,2016,income,amerind,104,0,0.0
1,625.0,2016,income,asian,3272,229655805,70188.2
2,625.0,2016,income,black,424,10249595,24173.57
3,625.0,2016,income,hisp,1131,44317599,39184.44
4,625.0,2016,income,nonhisp,6658,496872097,74627.83


'neighborhoods'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,1.0,2016,income,amerind,41,0,0.0
1,1.0,2016,income,asian,826,7451025,9020.61
2,1.0,2016,income,black,1764,32038517,18162.42
3,1.0,2016,income,hisp,3051,93308979,30583.08
4,1.0,2016,income,nonhisp,481,20612966,42854.4
