# Aggregation for income table
* Income table shows number of hh per tract and the median hh income
* When aggregating from tract up to larger geographies, we need to make sure that we're deriving the correctly weighted median hh income.
    * (median hh income for tract) * (# hh in tract)
    * merge in crosswalk to other geography
    * allocate then sum over all tracts within CD, NC, etc
    * sum # hh within CD, NC, etc
    * calculate the weighted median hh income
* Need to figure out how to reconstruct median hh income at aggregated level
* [Dept of Finance example of using income range](http://www.dof.ca.gov/Forecasting/Demographics/Census_Data_Center_Network/documents/How_to_Recalculate_a_Median.pdf)

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
# Open the intake catalog described by the YAML files matching ../catalogs/*.yml
catalog = intake.open_catalog('../catalogs/*.yml')

## New: Use income ranges, aggregate, then recalculate the median

In [3]:
# Load the cleaned Census tabular data
census = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_cleaned.parquet')

# While testing, keep only rows from 2016 onward (all census tracts)
from_2016 = census['year'] >= 2016
incomerange = census[from_2016 & (census['table'] == 'incomerange')]

# Same filter for the household table, then drop every row whose
# second_var contains the substring 'inc'
hh_rows = census[from_2016 & (census['table'] == 'incomerange_hh')]
incomerange_hh = hh_rows[hh_rows['second_var'].str.find('inc') == -1]

In [4]:
# Read every tract-to-geography crosswalk from the catalog.
# The catalog entries are all named crosswalk_tracts_<geography>.
(
    council_districts,
    neighborhood_councils,
    zipcodes,
    congressional_districts,
    neighborhoods,
) = (
    getattr(catalog, f'crosswalk_tracts_{geography}').read()
    for geography in (
        'council_districts',
        'neighborhood_councils',
        'zipcodes',
        'congressional_districts',
        'neighborhoods',
    )
)

In [5]:
# Crosswalk for each target geography, keyed by the geography's name.
boundaries = {
    'council_districts': council_districts,
    'neighborhood_councils': neighborhood_councils,
    'zipcodes': zipcodes,
    'congressional_districts': congressional_districts,
    'neighborhoods': neighborhoods,
}

# Tract-level tables to aggregate. Both share the same structure, so the
# same steps are applied to each.
income_dfs = {'incomerange': incomerange, 'incomerange_hh': incomerange_hh}

# Columns that identify one income-range row, besides the geography ID.
group_cols = ['year', 'table', 'main_var', 'second_var']
uniform_id_col = 'ID'
uniform_num_col = 'num'

# Result: '<table>_<geography>' -> counts per (ID, year, table, main_var, second_var)
processed_dfs = {}

for key, value in boundaries.items():
    for filename, file in income_dfs.items():
        # Attach the crosswalk to every tract row. validate='m:1' makes pandas
        # raise if a GEOID appears more than once in the crosswalk.
        merged = pd.merge(file, value, on='GEOID', how='left', validate='m:1')
        merged['max_val'] = merged['max_val'].fillna(0)
        # max_val is used as the number of ID{i}/allocate{i} column pairs to
        # process, i.e. how many larger geographies a tract can intersect
        # (1, 2, ..., 5 depending on the boundary).
        n = int(merged['max_val'].max())

        # One partial aggregate per intersection slot; collected in a list and
        # concatenated once (DataFrame.append no longer exists in pandas >= 2.0).
        pieces = []
        for i in range(1, n + 1):
            num_col = f"num{i}"
            allocate_col = f"allocate{i}"
            id_col = f"ID{i}"
            # Allocate the tract's count to the i-th intersecting geography.
            merged[num_col] = merged['num'] * merged[allocate_col]
            # Sum the allocated counts by CD, NC, etc.
            agg = (
                merged.groupby([id_col] + group_cols)
                .agg({num_col: 'sum'})
                .reset_index()
            )
            # Give every slot the same column names so the pieces stack.
            pieces.append(
                agg.rename(columns={id_col: uniform_id_col, num_col: uniform_num_col})
            )
        aggregated = pd.concat(pieces)

        # Sum again: the same geography can show up in several slots.
        aggregated2 = (
            aggregated.groupby([uniform_id_col] + group_cols)
            .agg({uniform_num_col: 'sum'})
            .reset_index()
        )
        # Counts are left fractional (allocation produces decimals); no
        # rounding is applied here.
        final_df = f"{filename}_{key}"
        processed_dfs[final_df] = aggregated2

In [6]:
# Preview the first rows of every aggregated table, labelled by its key
for table_name, table in processed_dfs.items():
    display(table_name)
    display(table.head())

'incomerange_council_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,1.0,2016,incomerange,amerind,gt200,0.0
1,1.0,2016,incomerange,amerind,lt10,128.616755
2,1.0,2016,incomerange,amerind,r100to124,76.306859
3,1.0,2016,incomerange,amerind,r10to14,95.0
4,1.0,2016,incomerange,amerind,r125to149,12.065514


'incomerange_hh_council_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,1.0,2016,incomerange_hh,families,gt200,188.083813
1,1.0,2016,incomerange_hh,families,lt10,620.862927
2,1.0,2016,incomerange_hh,families,r100to149,497.236481
3,1.0,2016,incomerange_hh,families,r10to14,530.133344
4,1.0,2016,incomerange_hh,families,r150to199,154.040598


'incomerange_neighborhood_councils'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,1.0,2016,incomerange,amerind,gt200,0.0
1,1.0,2016,incomerange,amerind,lt10,0.0
2,1.0,2016,incomerange,amerind,r100to124,4.539387
3,1.0,2016,incomerange,amerind,r10to14,0.0
4,1.0,2016,incomerange,amerind,r125to149,0.0


'incomerange_hh_neighborhood_councils'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,1.0,2016,incomerange_hh,families,gt200,20.116843
1,1.0,2016,incomerange_hh,families,lt10,14.389658
2,1.0,2016,incomerange_hh,families,r100to149,105.597044
3,1.0,2016,incomerange_hh,families,r10to14,11.701714
4,1.0,2016,incomerange_hh,families,r150to199,35.317811


'incomerange_zipcodes'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,90001.0,2016,incomerange,amerind,gt200,0.0
1,90001.0,2016,incomerange,amerind,lt10,0.0
2,90001.0,2016,incomerange,amerind,r100to124,6.339701
3,90001.0,2016,incomerange,amerind,r10to14,0.0
4,90001.0,2016,incomerange,amerind,r125to149,0.0


'incomerange_hh_zipcodes'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,90001.0,2016,incomerange_hh,families,gt200,0.621075
1,90001.0,2016,incomerange_hh,families,lt10,82.720078
2,90001.0,2016,incomerange_hh,families,r100to149,41.267252
3,90001.0,2016,incomerange_hh,families,r10to14,46.661046
4,90001.0,2016,incomerange_hh,families,r150to199,13.733813


'incomerange_congressional_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,625.0,2016,incomerange,amerind,gt200,0.0
1,625.0,2016,incomerange,amerind,lt10,0.0
2,625.0,2016,incomerange,amerind,r100to124,0.0
3,625.0,2016,incomerange,amerind,r10to14,15.0
4,625.0,2016,incomerange,amerind,r125to149,20.0


'incomerange_hh_congressional_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,625.0,2016,incomerange_hh,families,gt200,150.008198
1,625.0,2016,incomerange_hh,families,lt10,12.461617
2,625.0,2016,incomerange_hh,families,r100to149,147.314217
3,625.0,2016,incomerange_hh,families,r10to14,10.466922
4,625.0,2016,incomerange_hh,families,r150to199,111.645772


'incomerange_neighborhoods'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,1.0,2016,incomerange,amerind,gt200,0.0
1,1.0,2016,incomerange,amerind,lt10,0.0
2,1.0,2016,incomerange,amerind,r100to124,0.0
3,1.0,2016,incomerange,amerind,r10to14,0.0
4,1.0,2016,incomerange,amerind,r125to149,0.0


'incomerange_hh_neighborhoods'

Unnamed: 0,ID,year,table,main_var,second_var,num
0,1.0,2016,incomerange_hh,families,gt200,7.194837
1,1.0,2016,incomerange_hh,families,lt10,58.33284
2,1.0,2016,incomerange_hh,families,r100to149,37.85088
3,1.0,2016,incomerange_hh,families,r10to14,36.87934
4,1.0,2016,incomerange_hh,families,r150to199,19.389673


### Re-calculate median

In [7]:
# Rank each income-range label (1-based, 'total' first) so rows can be sorted
# from lowest to highest range before building cumulative counts.
incomerange_order = {
    label: rank
    for rank, label in enumerate(
        [
            'total', 'lt10', 'r10to14', 'r15to19', 'r20to24', 'r25to29',
            'r30to34', 'r35to39', 'r40to44', 'r45to49', 'r50to59', 'r60to74',
            'r75to99', 'r100to124', 'r125to149', 'r150to199', 'gt200',
        ],
        start=1,
    )
}

# The household table uses fewer, wider ranges.
incomerange_hh_order = {
    label: rank
    for rank, label in enumerate(
        [
            'total', 'lt10', 'r10to14', 'r15to24', 'r25to34', 'r35to49',
            'r50to74', 'r75to99', 'r100to149', 'r150to199', 'gt200',
        ],
        start=1,
    )
}


# Define functions that will extract the upper and lower bounds for the income range
# Since the width of the range changes depending where you are on the income scale, we need the exact width to re-calculate the new median
def lower_bound(row):
    """Return the lower bound encoded in ``row.second_var``.

    * labels containing 'lt', and the label 'total', give the integer 1
      (for 'total' the value is a placeholder; the bounds do not matter)
    * labels containing 'gt' give the text after 'gt' ('gt200' -> '200')
    * 'rXtoY' labels give the text between the leading character and 'to'
      ('r10to14' -> '10')

    Bounds taken from the label are returned as strings; anything that
    matches none of the patterns yields None.
    """
    label = row.second_var
    if 'lt' in label or label == 'total':
        return 1
    if 'gt' in label:
        return label.split('gt')[1]
    if 'to' in label:
        # Drop the leading 'r' of the 'rXtoY' label
        return label.split('to')[0][1:]
    return None
    
def upper_bound(row):
    """Return the upper bound encoded in ``row.second_var``.

    * the label 'total' gives the integer 1 (placeholder)
    * 'ltX' and 'gtX' labels give the text after the marker
      ('lt10' -> '10', 'gt200' -> '200')
    * 'rXtoY' labels give the text after 'to' ('r10to14' -> '14')

    Bounds taken from the label are returned as strings; anything that
    matches none of the patterns yields None.
    """
    label = row.second_var
    if label == 'total':
        return 1
    # Markers are tried in the same priority order: 'lt', then 'gt', then 'to'
    for marker in ('lt', 'gt', 'to'):
        if marker in label:
            return label.split(marker)[1]
    return None

In [8]:
# Result: same keys as processed_dfs -> one 'total' row per
# (ID, year, table, main_var) carrying the recalculated median.
final_dfs = {}

for key, value in processed_dfs.items():
    df = value.copy()
    # Household tables ('hh' in the key) use their own set of income ranges
    order_map = incomerange_hh_order if 'hh' in key else incomerange_order
    df['order'] = df['second_var'].map(order_map)
    # Sort into the correct order (lowest range first within each group)
    by_group = ['ID', 'year', 'main_var']
    df = df.sort_values(by_group + ['order'])
    is_total = df['second_var'] == 'total'

    # Midpoint in terms of # of ppl (total # of ppl / 2), copied to every row
    # of the group
    df['midpoint'] = (df['num'] / 2).where(is_total)
    df['midpoint'] = df['midpoint'].fillna(
        df.groupby(['ID', 'year', 'table', 'main_var'])['midpoint'].transform('max')
    ).round(2)

    # Cumulative number of people over the ranges ('total' row excluded)
    df['hh'] = df['num'].where(~is_total)
    df['cum_num'] = df.groupby(by_group)['hh'].cumsum(skipna=True)

    # The median range is the first range whose cumulative count reaches the
    # midpoint; tag it by its order number.
    # NOTE(review): if the range rows of a group never accumulate to half of
    # its 'total' row, no range is tagged and the median stays NaN. The
    # incomerange_hh outputs are almost all NaN - confirm that the range rows
    # of that table are counts on the same scale as 'total'.
    df['midpoint_range'] = df['order'].where(df['midpoint'] <= df['cum_num'])
    df['min_range'] = df.groupby(by_group)['midpoint_range'].transform('min')
    in_median_range = df['order'] == df['min_range']

    # Number of people below the median range: cumulative count through the
    # range minus the range's own count. Reading cum_num from the preceding
    # row instead gives NaN when the median falls in the first range, because
    # the preceding row is then 'total'.
    df['prior_cum_num'] = (df['cum_num'] - df['hh']).where(in_median_range)
    df['prior_cum_num'] = df['prior_cum_num'].fillna(
        df.groupby(by_group)['prior_cum_num'].transform('max')
    )

    # Tag the upper and lower bound of the income range (in thousands)
    df['lower'] = df.apply(lower_bound, axis=1).astype(int)
    df['upper'] = df.apply(upper_bound, axis=1).astype(int)
    width = (df['upper'] - df['lower'] + 1) * 1000
    # lower_bound reports 1 for the 'lt' range so that the width above comes
    # out right, but that range starts at $0, not $1,000.
    range_floor = df['lower'].mask(
        df['second_var'].str.startswith('lt', na=False), 0
    ) * 1000

    # Proportional increase within the median range. Skipped when num is 0:
    # for some races (American Indian, etc) there are zero people in certain
    # income ranges.
    df['proportion'] = (
        (df['midpoint'] - df['prior_cum_num']) / df['num'] * width
    ).where(in_median_range & (df['num'] > 0))

    # Add this proportion to the floor of the range, then copy the result to
    # every row of the group
    df['median'] = range_floor + df['proportion']
    df['median'] = df['median'].fillna(
        df.groupby(by_group)['median'].transform('max')
    )

    # Drop columns/rows not needed
    df = df[df['second_var'] == 'total']
    keep = ['ID', 'year', 'table', 'main_var', 'second_var', 'num', 'median']
    df = df[keep].drop_duplicates()
    final_dfs[key] = df

In [9]:
# Preview the first rows of every table of recalculated medians, labelled by its key
for table_name, table in final_dfs.items():
    display(table_name)
    display(table.head())

'incomerange_council_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
16,1.0,2016,incomerange,amerind,total,998.916026,32867.811619
33,1.0,2016,incomerange,asian,total,17210.808489,33564.701279
50,1.0,2016,incomerange,black,total,3933.551736,25469.018895
67,1.0,2016,incomerange,hisp,total,49085.912754,31748.49792
84,1.0,2016,incomerange,nonhisp,total,11868.830912,56867.83505


'incomerange_hh_council_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
10,1.0,2016,incomerange_hh,families,total,50719.193093,
21,1.0,2016,incomerange_hh,hh,total,82851.507414,
32,1.0,2016,incomerange_hh,married,total,27422.350888,
43,1.0,2016,incomerange_hh,nonfamily,total,32132.31432,
54,1.0,2017,incomerange_hh,families,total,51281.876433,


'incomerange_neighborhood_councils'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
16,1.0,2016,incomerange,amerind,total,33.003558,56285.898462
33,1.0,2016,incomerange,asian,total,1098.856608,82890.636852
50,1.0,2016,incomerange,black,total,199.896807,83003.069575
67,1.0,2016,incomerange,hisp,total,5697.620642,69915.173842
84,1.0,2016,incomerange,nonhisp,total,1228.794166,47867.710514


'incomerange_hh_neighborhood_councils'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
10,1.0,2016,incomerange_hh,families,total,6788.045915,
21,1.0,2016,incomerange_hh,hh,total,8195.963131,
32,1.0,2016,incomerange_hh,married,total,4942.39647,
43,1.0,2016,incomerange_hh,nonfamily,total,1407.917216,
54,1.0,2017,incomerange_hh,families,total,7009.60123,


'incomerange_zipcodes'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
16,90001.0,2016,incomerange,amerind,total,43.235167,53178.612233
33,90001.0,2016,incomerange,asian,total,17.097516,23865.716465
50,90001.0,2016,incomerange,black,total,1864.918833,23055.557559
67,90001.0,2016,incomerange,hisp,total,6607.539562,36262.664968
84,90001.0,2016,incomerange,nonhisp,total,47.238073,39934.790963


'incomerange_hh_zipcodes'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
10,90001.0,2016,incomerange_hh,families,total,7160.150627,
21,90001.0,2016,incomerange_hh,hh,total,8585.377667,
32,90001.0,2016,incomerange_hh,married,total,3662.38143,
43,90001.0,2016,incomerange_hh,nonfamily,total,1425.22704,87443.105161
54,90001.0,2017,incomerange_hh,families,total,7168.305811,


'incomerange_congressional_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
16,625.0,2016,incomerange,amerind,total,42.108039,125480.214755
33,625.0,2016,incomerange,asian,total,1875.561038,123549.462836
50,625.0,2016,incomerange,black,total,273.722778,57735.263961
67,625.0,2016,incomerange,hisp,total,775.291438,61845.650095
84,625.0,2016,incomerange,nonhisp,total,4555.05913,107310.185146


'incomerange_hh_congressional_districts'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
10,625.0,2016,incomerange_hh,families,total,6140.356954,
21,625.0,2016,incomerange_hh,hh,total,7716.238065,
32,625.0,2016,incomerange_hh,married,total,5089.817329,
43,625.0,2016,incomerange_hh,nonfamily,total,1575.881111,
54,625.0,2017,incomerange_hh,families,total,6071.261165,


'incomerange_neighborhoods'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
16,1.0,2016,incomerange,amerind,total,37.16902,29603.127978
33,1.0,2016,incomerange,asian,total,477.155003,12142.218242
50,1.0,2016,incomerange,black,total,1644.110163,24463.93476
67,1.0,2016,incomerange,hisp,total,2757.360207,33818.059488
84,1.0,2016,incomerange,nonhisp,total,336.169727,42938.593551


'incomerange_hh_neighborhoods'

Unnamed: 0,ID,year,table,main_var,second_var,num,median
10,1.0,2016,incomerange_hh,families,total,3424.075676,
21,1.0,2016,incomerange_hh,hh,total,5263.187417,
32,1.0,2016,incomerange_hh,married,total,1715.951666,
43,1.0,2016,incomerange_hh,nonfamily,total,1839.111741,
54,1.0,2017,incomerange_hh,families,total,3336.970534,
