# Extract Census tables
Use 2018 ACS data
Use the functions in `utils`, to grab what we want to start. 
Can use `utils` and grab other outcomes later.

In [1]:
import intake
import numpy as np
import pandas as pd
import utils

In [2]:
bucket_name = 'city-planning-entitlements'

In [3]:
# Renter households
tenure = utils.grab_census_table('tenure', 2018, 'pop')
tenure = utils.make_wide(tenure, 'pop_renter', 'pop_total', 'renter', 'hh')
tenure.head()

Unnamed: 0,GEOID,renter,hh
0,6037101110,2199,4219
1,6037101122,577,3234
2,6037101210,5247,5987
3,6037101220,2110,3497
4,6037101300,353,4250


In [4]:
# Zero vehicle households
vehicles = utils.grab_census_table('vehicles', 2018, 'workers')
vehicles = utils.make_wide(vehicles, 'workers_veh0', 'workers_total', 'zero_veh', 'zero_veh_workers')
vehicles.head()

Unnamed: 0,GEOID,zero_veh,zero_veh_workers
0,6037101110,0,1927
1,6037101122,13,1907
2,6037101210,248,2770
3,6037101220,126,1513
4,6037101300,21,2041


In [5]:
# Start with white/non-white -- later can branch into specific race/ethnicity
race = utils.grab_census_table('race', 2018, 'pop')
race = utils.make_wide(race, 'pop_white', 'pop_total', 'pop_white', 'pop_total')
race.head()

Unnamed: 0,GEOID,pop_white,pop_total
0,6037101110,3314,4314
1,6037101122,2837,3239
2,6037101210,4771,6052
3,6037101220,2494,3497
4,6037101300,3705,4297


In [6]:
# Start with commute by public transit / walk / bike
transit_options = ['walk', 'transit', 'bike']

commute = utils.grab_census_table('commute', 2018, 'workers')
commute = utils.aggregate_group(commute, transit_options)
commute = utils.make_wide(commute, 'aggregated_group', 'workers_total', 'commute_transit', 'commute_workers')
commute.head()

Unnamed: 0,GEOID,commute_transit,commute_workers
0,6037101110,104,1927
1,6037101122,19,1907
2,6037101210,376,2770
3,6037101220,126,1513
4,6037101300,180,2041


In [7]:
# Find number of households who are under certain income threshold 
low_income = ['lt10', 'r10to14', 'r15to19', 'r20to24',
             'r25to29', 'r30to34', 'r35to39', 'r40to44', 'r45to49']

income_total = utils.grab_census_table('incomerange', 2018, 'total')
income_total = utils.aggregate_group(income_total, low_income)
income_total = utils.make_wide(income_total, 'aggregated_group', 'total_total', 'low_income_total', 'income_total')
income_total.head()

Unnamed: 0,GEOID,low_income_total,income_total
0,6037101110,792,1596
1,6037101122,232,1256
2,6037101210,1573,2321
3,6037101220,841,1294
4,6037101300,436,1435


In [8]:
# Find number of white households who are under certain income threshold. Can do white/nonwhite comparison.
income_white = utils.grab_census_table('incomerange', 2018, 'white')
income_white = utils.aggregate_group(income_white, low_income)
income_white = utils.make_wide(income_white, 'aggregated_group', 'white_total', 'low_income_white', 'income_white')
income_white.head()

Unnamed: 0,GEOID,low_income_white,income_white
0,6037101110,660,1309
1,6037101122,207,1122
2,6037101210,1315,1861
3,6037101220,661,923
4,6037101300,413,1276


## Merge Census tables

In [9]:
census_data = [vehicles, commute, race, 
               income_total, income_white]

df = tenure.copy()

for c in census_data:
    df = pd.merge(df, c, on = 'GEOID', how = 'left', validate = '1:1')

In [10]:
# Derive any other cols we need
df = df.assign(
    pop_nonwhite = df.pop_total - df.pop_white,
    low_income_nonwhite = df.low_income_total - df.low_income_white,
    income_nonwhite = df.income_total - df.income_white
)

# Now, derive the final outcomes we want to track, and keep those only
keep = ['GEOID',
        'p_renter', 'p_zero_veh', 'p_transit', 
        'p_nonwhite', 'p_low_income_total', 'p_low_income_nonwhite']

df = (df.assign(
        p_renter = df.renter / df.hh,
        p_zero_veh = df.zero_veh_workers / df.zero_veh,
        p_transit = df.commute_transit / df.commute_workers,
        p_nonwhite = df.pop_nonwhite / df.pop_total,
        p_low_income_total = df.low_income_total / df.income_total,
        p_low_income_nonwhite = df.low_income_nonwhite / df.income_nonwhite,
    ).replace([np.inf, -np.inf], np.nan)
      .sort_values('GEOID')[keep]
)

df.head()

Unnamed: 0,GEOID,p_renter,p_zero_veh,p_transit,p_nonwhite,p_low_income_total,p_low_income_nonwhite
0,6037101110,0.521214,,0.05397,0.231803,0.496241,0.45993
1,6037101122,0.178417,146.692308,0.009963,0.124112,0.184713,0.186567
2,6037101210,0.876399,11.169355,0.13574,0.211666,0.677725,0.56087
3,6037101220,0.603374,12.007937,0.083278,0.286817,0.649923,0.485175
4,6037101300,0.083059,97.190476,0.088192,0.137771,0.303833,0.144654


In [12]:
# Export and save to S3
df.to_parquet(f's3://{bucket_name}/data/final/census_analysis_table.parquet')