# Aggregation for income table
* Income table shows number of hh per tract and the median hh income
* When aggregating from tract up to larger geographies, we need to make sure that we're deriving the correctly weighted median hh income.
    * (median hh income for tract) * (# hh in tract)
    * merge in crosswalk to other geography
    * allocate then sum over all tracts within CD, NC, etc
    * sum # hh within CD, NC, etc
    * calculate the weighted median hh income: (sum of allocated hh-weighted income) / (sum of allocated # hh)
* Need to figure out how to reconstruct median hh income at aggregated level
* [Example of using income range](http://www.dof.ca.gov/Forecasting/Demographics/Census_Data_Center_Network/documents/How_to_Recalculate_a_Median.pdf)

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
# Open every catalog file matching the glob below as one intake catalog.
catalog_glob = '../catalogs/*.yml'
catalog = intake.open_catalog(catalog_glob)

In [3]:
# Load the cleaned Census tabular extract from S3.
census = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_cleaned.parquet')

# Keep only the income table for 2016 onward (intended as a trial run on
# 2016 and 2017 across all census tracts).
from_2016_on = census['year'] >= 2016
is_income_table = census['table'] == 'income'
census = census[from_2016_on & is_income_table]

In [4]:
# Load the tract-to-geography crosswalks from the intake catalog.
def _read_crosswalk(geography):
    """Read the catalog entry named ``crosswalk_tracts_<geography>``."""
    return getattr(catalog, f'crosswalk_tracts_{geography}').read()


council_districts = _read_crosswalk('council_districts')
neighborhood_councils = _read_crosswalk('neighborhood_councils')
zipcodes = _read_crosswalk('zipcodes')
congressional_districts = _read_crosswalk('congressional_districts')
neighborhoods = _read_crosswalk('neighborhoods')

In [5]:
# Pair each tract's household count with its median household income, then
# build the household-weighted income (hh * medincome) used for aggregation.
join_keys = ['GEOID', 'year', 'table', 'second_var']

hh = census[census['main_var'] == 'hh']
income = census[census['main_var'] == 'medincome']

# Both halves carry a `num` column; after the merge the household count is
# `num_x` and the median income is `num_y`, so give them meaningful names.
m1 = (
    hh.merge(income, on=join_keys, how='left', validate='m:1')
      .rename(columns={'num_x': 'hh', 'num_y': 'medincome'})
)
m1['num'] = m1['hh'] * m1['medincome']
m1 = m1[join_keys + ['hh', 'medincome', 'num']]

In [10]:
# Crosswalks keyed by the name of the target geography. Each crosswalk is merged on GEOID and
# must provide max_val plus the ID{i} / allocate{i} columns that are read in the loop below.
boundaries = {'council_districts': council_districts, 'neighborhood_councils': neighborhood_councils,
             'zipcodes': zipcodes, 'congressional_districts': congressional_districts, 'neighborhoods': neighborhoods}

# Output: geography name -> DataFrame with total hh and total hh-weighted income (num)
# per (ID, year, table, second_var).
processed_dfs = {}

for key, value in boundaries.items():
    # Merge the income table with each boundary
    merged = pd.merge(m1, value, on = 'GEOID', how = 'left', validate = 'm:1')
    # Tracts without a crosswalk match get no max_val from the left merge; treat them as 0.
    merged['max_val'] = merged.max_val.fillna(0)
    # Allocate the num column according to however many CDs, NCs, etc each tract intersects with.
    # Find the sum for num1, num2, ... columns. Then, combine them and take the sum again.
    # NOTE(review): presumably max_val is the number of geographies a tract intersects -- confirm.
    n = int(merged.max_val.max())
    uniform_id_col = 'ID'
    uniform_num_col = 'num'
    uniform_pop_col = 'hh'
    # One partial aggregate per intersection slot; combined once after the inner loop.
    partial_sums = []
    # Depending on the boundary, tract might intersect with 1, 2,...,5 of the larger geographies.
    for i in range(1, n + 1):
        num_col = f"num{i}"
        pop_col = f"hh{i}"
        allocate_col = f"allocate{i}"
        id_col = f"ID{i}"
        # Allocate the num and hh columns for all the various intersections.
        merged[num_col] = merged.num * merged[allocate_col]
        merged[pop_col] = merged.hh * merged[allocate_col]
        # Take the sum of the allocated columns by CD, NC, etc.
        agg = merged.groupby([id_col, 'year', 'table', 'second_var']).agg({pop_col: 'sum', num_col: 'sum'}).reset_index()
        # Give every slot the same column names so the partial sums can be stacked.
        agg = agg.rename(columns = {id_col: uniform_id_col, num_col: uniform_num_col, pop_col: uniform_pop_col})
        partial_sums.append(agg)
    # Stack the partial sums. DataFrame.append was removed in pandas 2.0, so use a single
    # pd.concat, which also avoids re-copying the accumulated frame on every iteration.
    aggregated = pd.concat(partial_sums)
    # Take the sum again. For each CD, NC, etc, calculate the total # of hh and the total hh-weighted income
    aggregated2 = aggregated.groupby([uniform_id_col, 'year', 'table', 'second_var']).agg({uniform_pop_col: 'sum', uniform_num_col: 'sum'}).reset_index()
    processed_dfs[key] = aggregated2

In [11]:
final_dfs = {}

for name, summed in processed_dfs.items():
    # Weighted income = total hh-weighted income / total households, computed
    # from the unrounded sums; hh and num are then rounded to whole numbers
    # and the income to cents.
    # NOTE(review): the displayed results contain groups with hh > 0 but
    # num == 0 (medincome 0.0). Presumably those tracts have no reported
    # median income while their households still count in the denominator,
    # which would pull the weighted value down -- confirm before relying on it.
    final_dfs[name] = summed.assign(
        medincome=(summed['num'] / summed['hh']).round(2),
        hh=summed['hh'].round(0).astype(int),
        num=summed['num'].round(0).astype(int),
    )

In [12]:
# Preview the first rows of each aggregated geography, labelled by its name.
for name in final_dfs:
    display(name)
    display(final_dfs[name].head())

'council_districts'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,1.0,2016,income,amerind,995,6312478,6344.73
1,1.0,2016,income,asian,17207,552703129,32121.38
2,1.0,2016,income,black,3935,54748471,13913.38
3,1.0,2016,income,hisp,49094,1638819263,33381.48
4,1.0,2016,income,nonhisp,11868,657148623,55372.26


'neighborhood_councils'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,1.0,2016,income,amerind,32,0,0.0
1,1.0,2016,income,asian,1099,83756431,76221.44
2,1.0,2016,income,black,202,11505321,56986.15
3,1.0,2016,income,hisp,5697,394094314,69170.13
4,1.0,2016,income,nonhisp,1228,47048086,38319.2


'zipcodes'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,90001.0,2016,income,amerind,43,0,0.0
1,90001.0,2016,income,asian,16,0,0.0
2,90001.0,2016,income,black,1864,42081876,22572.66
3,90001.0,2016,income,hisp,6609,238807541,36132.74
4,90001.0,2016,income,nonhisp,49,0,0.0


'congressional_districts'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,625.0,2016,income,amerind,41,0,0.0
1,625.0,2016,income,asian,1875,229655805,122454.96
2,625.0,2016,income,black,273,10249595,37596.67
3,625.0,2016,income,hisp,774,44317599,57243.94
4,625.0,2016,income,nonhisp,4554,496872097,109095.12


'neighborhoods'

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
0,1.0,2016,income,amerind,36,0,0.0
1,1.0,2016,income,asian,477,7451025,15615.52
2,1.0,2016,income,black,1644,32038517,19486.84
3,1.0,2016,income,hisp,2756,93308979,33852.24
4,1.0,2016,income,nonhisp,336,20612966,61317.14


In [15]:
# Spot check: pull the 2017 rows for ID 6 from the council district results.
cd_results = final_dfs['council_districts']
is_2017 = cd_results['year'] == 2017
is_id_6 = cd_results['ID'] == 6
test = cd_results.loc[is_2017 & is_id_6]

In [16]:
# Render the `test` frame as the cell output for visual inspection.
test

Unnamed: 0,ID,year,table,second_var,hh,num,medincome
110,6.0,2017,income,amerind,374,2307109,6164.09
111,6.0,2017,income,asian,8179,494820011,60499.31
112,6.0,2017,income,black,3765,78364380,20812.09
113,6.0,2017,income,hisp,46229,2088154784,45169.5
114,6.0,2017,income,nonhisp,18351,941643896,51312.78
115,6.0,2017,income,other,24037,1019532547,42414.46
116,6.0,2017,income,pacis,81,0,0.0
117,6.0,2017,income,race2,1300,51044729,39279.19
118,6.0,2017,income,total,77264,3787066430,49014.68
119,6.0,2017,income,white,39527,1970238147,49844.8
