# Extract Census tables
Use 2018 ACS data.
Use the functions in `utils` to grab the tables we want to start with.
We can use `utils` again to grab other outcomes later.

In [1]:
import intake
import numpy as np
import pandas as pd
import pcts_census_utils as utils

In [2]:
# S3 bucket that the final analysis table is written to at the end of the notebook
bucket_name = 'city-planning-entitlements'

In [3]:
# Renter households: the renter count plus the total used as its denominator.
# NOTE(review): the source columns are prefixed `pop_` -- confirm in `utils`
# whether these count people or households.
cols = {
    "pop_renter": "renter",
    "pop_total": "renter_denom",
}
tenure = utils.make_wide(
    utils.grab_census_table('tenure', 2018, 'pop'),
    cols.keys(),
).rename(columns=cols)
tenure.head()

Unnamed: 0,GEOID,renter,renter_denom
0,6037101110,2199,4219
1,6037101122,577,3234
2,6037101210,5247,5987
3,6037101220,2110,3497
4,6037101300,353,4250


In [4]:
# Zero-vehicle count plus the total used as its denominator.
# NOTE(review): the table is pulled for 'workers' and the source columns are
# prefixed `workers_`, so these are presumably workers in zero-vehicle
# households rather than households -- confirm in `utils`.
cols = {
    "workers_veh0": "zero_veh",
    "workers_total": "zero_veh_denom",
}
vehicles = utils.make_wide(
    utils.grab_census_table('vehicles', 2018, 'workers'),
    cols.keys(),
).rename(columns=cols)
vehicles.head()

Unnamed: 0,GEOID,zero_veh_denom,zero_veh
0,6037101110,1927,0
1,6037101122,1907,8
2,6037101210,2770,114
3,6037101220,1513,54
4,6037101300,2041,10


In [5]:
# Race: keep only the white and total counts for now, so non-white can be
# derived later as total minus white. Specific race/ethnicity groups can be
# added afterwards.
race = utils.make_wide(
    utils.grab_census_table('race', 2018, 'pop'),
    ['pop_white', 'pop_total'],
)
race.head()

Unnamed: 0,GEOID,pop_total,pop_white
0,6037101110,4314,3314
1,6037101122,3239,2837
2,6037101210,6052,4771
3,6037101220,3497,2494
4,6037101300,4297,3705


In [6]:
# Commute mode: combine walk / transit / bike into one group and keep it
# alongside the worker total used as its denominator.
transit_options = ['walk', 'transit', 'bike']
cols = {
    "aggregated_group": "commute_transit",
    "workers_total": "commute_denom",
}

commute = utils.aggregate_group(
    utils.grab_census_table('commute', 2018, 'workers'),
    transit_options,
)
commute = utils.make_wide(commute, cols.keys())
commute = commute.rename(columns=cols)
commute.head()

Unnamed: 0,GEOID,commute_transit,commute_denom
0,6037101110,46,1927
1,6037101122,11,1907
2,6037101210,171,2770
3,6037101220,54,1513
4,6037101300,86,2041


In [7]:
# Low-income count for all races: aggregate the income brackets listed in
# `low_income` and keep the result alongside the overall total.
# NOTE(review): the bracket labels run from lt10 to r45to49, presumably
# "under $50k" -- confirm against the bracket definitions in `utils`.
low_income = [
    'lt10', 'r10to14', 'r15to19', 'r20to24', 'r25to29',
    'r30to34', 'r35to39', 'r40to44', 'r45to49',
]
cols = {
    "aggregated_group": "low_income_total",
    "total_total": "income_total",
}

income_total = utils.aggregate_group(
    utils.grab_census_table('incomerange', 2018, 'total'),
    low_income,
)
income_total = utils.make_wide(income_total, cols.keys())
income_total = income_total.rename(columns=cols)
income_total.head()

Unnamed: 0,GEOID,low_income_total,income_total
0,6037101110,792,1596
1,6037101122,232,1256
2,6037101210,1573,2321
3,6037101220,841,1294
4,6037101300,436,1435


In [8]:
# Same low-income aggregation for the 'white' series, reusing the
# `low_income` bracket list defined in the previous cell. Together with the
# all-race totals this allows a white / non-white comparison.
cols = {
    "aggregated_group": "low_income_white",
    "white_total": "income_white",
}

income_white = utils.aggregate_group(
    utils.grab_census_table('incomerange', 2018, 'white'),
    low_income,
)
income_white = utils.make_wide(income_white, cols.keys())
income_white = income_white.rename(columns=cols)
income_white.head()

Unnamed: 0,GEOID,low_income_white,income_white
0,6037101110,660,1309
1,6037101122,207,1122
2,6037101210,1315,1861
3,6037101220,661,923
4,6037101300,413,1276


In [9]:
# Median household income for all races combined. The same table also holds
# other `medincome_*` rows, so medians for individual races can be pulled later.
median_income = utils.grab_census_table('income', 2018, 'medincome')

median_income = (
    # Keep only the all-races row (this filter used to be applied twice)
    median_income[median_income.new_var == 'medincome_total']
    .rename(columns={'num': 'medhhincome'})
    # `new_var` is constant after the filter, so it is no longer needed
    .drop(columns='new_var')
    .sort_values('GEOID')
    .reset_index(drop=True)
)

median_income.head()

Unnamed: 0,GEOID,medhhincome
0,6037101110,53077.0
1,6037101122,88953.0
2,6037101210,32119.0
3,6037101220,41728.0
4,6037101300,86914.0


## Merge Census tables

In [10]:
# Left-join every other table onto the tenure table by GEOID.
# validate='1:1' makes pandas raise if GEOID is duplicated on either side.
census_data = [
    vehicles,
    commute,
    race,
    income_total,
    income_white,
    median_income,
]

df = tenure.copy()

for c in census_data:
    df = df.merge(c, on='GEOID', how='left', validate='1:1')

In [11]:
# Final outcomes to track; every other column is dropped at the end
keep = [
    'GEOID',
    'p_renter',
    'p_zero_veh',
    'p_transit',
    'p_nonwhite',
    'p_low_income_total',
    'p_low_income_nonwhite',
    'medhhincome',
]

df = (
    # Step 1: non-white counts are the all-race totals minus the white counts
    df.assign(
        pop_nonwhite=lambda x: x['pop_total'] - x['pop_white'],
        low_income_nonwhite=lambda x: x['low_income_total'] - x['low_income_white'],
        income_nonwhite=lambda x: x['income_total'] - x['income_white'],
    )
    # Step 2: turn each count into a share of its denominator
    .assign(
        p_renter=lambda x: x['renter'] / x['renter_denom'],
        p_zero_veh=lambda x: x['zero_veh'] / x['zero_veh_denom'],
        p_transit=lambda x: x['commute_transit'] / x['commute_denom'],
        p_nonwhite=lambda x: x['pop_nonwhite'] / x['pop_total'],
        p_low_income_total=lambda x: x['low_income_total'] / x['income_total'],
        p_low_income_nonwhite=lambda x: x['low_income_nonwhite'] / x['income_nonwhite'],
    )
    # Dividing a non-zero count by a zero denominator gives +/-inf; store as missing
    .replace([np.inf, -np.inf], np.nan)
    .sort_values('GEOID')[keep]
)

df.head()

Unnamed: 0,GEOID,p_renter,p_zero_veh,p_transit,p_nonwhite,p_low_income_total,p_low_income_nonwhite,medhhincome
0,6037101110,0.521214,0.0,0.023871,0.231803,0.496241,0.45993,53077.0
1,6037101122,0.178417,0.004195,0.005768,0.124112,0.184713,0.186567,88953.0
2,6037101210,0.876399,0.041155,0.061733,0.211666,0.677725,0.56087,32119.0
3,6037101220,0.603374,0.035691,0.035691,0.286817,0.649923,0.485175,41728.0
4,6037101300,0.083059,0.0049,0.042136,0.137771,0.303833,0.144654,86914.0


In [12]:
# Sanity check: print the maximum of every share column; none should exceed 1.
# NOTE(review): the values are only printed, nothing is asserted, so a bad
# value will not stop the notebook.
check_cols = [
    'p_renter',
    'p_zero_veh',
    'p_transit',
    'p_nonwhite',
    'p_low_income_total',
    'p_low_income_nonwhite',
]

for col in check_cols:
    col_max = df[col].max()
    print(f"{col}: {col_max}")

p_renter: 1.0
p_zero_veh: 0.768280123583934
p_transit: 0.8095238095238095
p_nonwhite: 1.0
p_low_income_total: 1.0
p_low_income_nonwhite: 1.0


In [13]:
# Export and save to S3: write the final table (GEOID, the p_* shares and
# medhhincome) as a parquet file under data/final/ in the `bucket_name` bucket
df.to_parquet(f's3://{bucket_name}/data/final/census_analysis_table.parquet')