# Aggregation for educational attainment table
* Educational attainment table shows 
    * population 25 yrs and above (denominator for all other stats)
    * number and % of ppl by education
    * median income by education
    * number and % poverty by education
* When aggregating, need to make sure:
    * values are allocated, then aggregated, then % are derived for larger geographies
    * median income is weighted and allocated

In [1]:
import numpy as np
import pandas as pd
import intake
import os

In [2]:
catalog = intake.open_catalog('../catalogs/*.yml')

In [3]:
# Import Census tabular data
census = pd.read_parquet('s3://hcid-cdbg-project-ita-data/data/raw/raw_census_cleaned.parquet')
# Test this on 2016, 2017 for all census tracts
census = census[(census.year>=2016) & (census.table=='edu')]

In [4]:
# Import crosswalks
council_districts = catalog.crosswalk_tracts_council_districts.read()
neighborhood_councils = catalog.crosswalk_tracts_neighborhood_councils.read()
zipcodes = catalog.crosswalk_tracts_zipcodes.read()
congressional_districts = catalog.crosswalk_tracts_congressional_districts.read()
neighborhoods = catalog.crosswalk_tracts_neighborhoods.read()

In [5]:
census.second_var.value_counts()

hs9                   4692
hs12                  4692
hs                    4692
lhs_medearning        4692
pov_lhs               4692
pop25_medearning      4692
college_medearning    4692
aa                    4692
ba_medearning         4692
pct_baplus            4692
college_pov           4692
college               4692
ma                    4692
ma_medearning         4692
total_pop25           4692
ba_pov                4692
ba                    4692
pov_hs                4692
hs_medearning         4692
pct_hsplus            4692
Name: second_var, dtype: int64

In [6]:
# Split the df into 2 parts, 
# First: median earnings, which needs to be weighted by the corresponding pop values
earning = census[census.second_var.str.contains('earning')]
pop_values = ['total_pop25', 'hs9', 'hs12', 'hs', 'college', 'aa', 'ba', 'ma']
pop = census[census.second_var.isin(pop_values)]

# Second, all other count values
counts = census[~census.second_var.str.contains('earning')]

In [7]:
# Create new column that groups the various edu levels together to match median earnings
def edu_group(row):
    if row.second_var.find('pop25') != -1:
        return 'all'
    elif (row.second_var == 'hs9') or (row.second_var == 'hs12') or (row.second_var == 'lhs_medearning'):
        return 'lhs'
    elif (row.second_var == 'hs') or (row.second_var == 'hs_medearning'):
        return 'hs'
    elif (row.second_var == 'college') or (row.second_var == 'aa') or (row.second_var == 'college_medearning'):
        return 'college'
    elif (row.second_var == 'ba') or (row.second_var == 'ba_medearning'):
        return 'ba'
    elif (row.second_var == 'ma') or (row.second_var == 'ma_medearning'):
        return 'ma'
    
pop['group'] = pop.apply(edu_group, axis = 1)
earning['group'] = earning.apply(edu_group, axis = 1)

# Aggregate the pop df to these new edu groups
pop2 = pop.groupby(['GEOID', 'year', 'table', 'main_var', 'group']).agg({'num': 'sum'}).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
# Merge the earning and pop weights by tract
m1 = pd.merge(pop2, earning, on = ['GEOID', 'year', 'table', 'main_var', 'group'], how = 'left', validate = '1:1')
m1.rename(columns = {'num_x': 'pop', 'num_y': 'medearning'}, inplace = True)
m1['num'] = m1['pop'] * m1.medearning

In [12]:
m1.head()

Unnamed: 0,GEOID,year,table,main_var,group,pop,variable,second_var,new_var,medearning,pct,num
0,6037101110,2016,edu,pop,all,3158.0,S1501_C01_059E,pop25_medearning,pop_pop25_medearning,37817.0,,119426086.0
1,6037101110,2016,edu,pop,ba,373.0,S1501_C01_063E,ba_medearning,pop_ba_medearning,51207.0,,19100211.0
2,6037101110,2016,edu,pop,college,1196.0,S1501_C01_062E,college_medearning,pop_college_medearning,39821.0,,47625916.0
3,6037101110,2016,edu,pop,hs,656.0,S1501_C01_061E,hs_medearning,pop_hs_medearning,30559.0,,20046704.0
4,6037101110,2016,edu,pop,lhs,678.0,S1501_C01_060E,lhs_medearning,pop_lhs_medearning,25104.0,,17020512.0


In [16]:
boundaries = {'council_districts': council_districts, 'neighborhood_councils': neighborhood_councils,
             'zipcodes': zipcodes, 'congressional_districts': congressional_districts, 'neighborhoods': neighborhoods}

processed_dfs = {}

for key, value in boundaries.items():
    # Merge the med earning table with each boundary 
    merged = pd.merge(m1, value, on = 'GEOID', how = 'left', validate = 'm:1')
    merged.max_val = merged.max_val.fillna(0)
    # Allocate the num column according to however many CDs, NCs, etc each tract intersects with. 
    # Find the sum for num1, num2, ... columns. Then, append and take the sum again.
    n = merged.max_val.max().astype(int)
    uniform_id_col = 'ID'
    uniform_num_col = 'num' 
    aggregated = pd.DataFrame()
    # Depending on the boundary, tract might intersect with 1, 2,...,5 of the larger geographies.
    for i in range(1, n + 1):
        new_col = f"num{i}"
        allocate_col = f"allocate{i}"
        # Allocate the num column for all the various intersections.
        merged[new_col] = merged.num * merged[allocate_col]
        id_col = f"ID{i}"
        num_col = f"num{i}"
        # Take the sum of the num column by CD, NC, etc.
        agg = merged.groupby([id_col, 'year', 'table', 'group']).agg({'pop': 'sum', num_col: 'sum'}).reset_index()
        agg.rename(columns = {id_col: uniform_id_col, num_col: uniform_num_col}, inplace = True)
        # Append these sums together
        aggregated = aggregated.append(agg)
    # Take the sum again. For each CD, NC, etc, calculate the total pop and total pop-weighted median earnings.
    aggregated2 = aggregated.groupby([uniform_id_col, 'year', 'table', 'group']).agg({'pop': 'sum', uniform_num_col: 'sum'}).reset_index()
    processed_dfs[key] = aggregated2

In [18]:
final_dfs = {}

for key, value in processed_dfs.items():
    df = value.copy()
    df['medearning'] = df.num / df['pop']
    # Clean up final_df before saving into dictionary
    for col in ['pop', 'num']:
        df[col] = df[col].round(0).astype(int)
    df['medearning'] = df.medearning.round(2)
    final_dfs[key] = df

In [22]:
test = final_dfs['council_districts']
test[(test.year==2017) & (test.ID==7)]

Unnamed: 0,ID,year,table,group,pop,num,medearning
78,7.0,2017,edu,all,203219,6758499711,33257.22
79,7.0,2017,edu,ba,26894,1258179163,46782.89
80,7.0,2017,edu,college,51595,1941815472,37635.73
81,7.0,2017,edu,hs,53496,1551363412,28999.62
82,7.0,2017,edu,lhs,60547,1287840904,21270.1
83,7.0,2017,edu,ma,10687,704400464,65911.9
