# Extract Census tables
Uses 2018 ACS data.
Use the functions in `utils` to grab the tables we want to start with.
We can use `utils` to grab other outcomes later.

In [1]:
import intake
import numpy as np
import pandas as pd
import pcts_census_utils as utils

In [2]:
# Name of the S3 bucket the final analysis table is written to (see the export cell).
bucket_name = 'city-planning-entitlements'

In [4]:
# Renter counts: pull the 2018 'tenure' table ('pop' universe) and reshape it so
# each GEOID carries a numerator (`renter`) and a denominator (`renter_denom`).
tenure = utils.make_wide(
    utils.grab_census_table('tenure', 2018, 'pop'),
    'pop_renter', 'pop_total', 'renter', 'renter_denom',
)
tenure.head()

Unnamed: 0,GEOID,renter,renter_denom
0,6037101110,2199,4219
1,6037101122,577,3234
2,6037101210,5247,5987
3,6037101220,2110,3497
4,6037101300,353,4250


In [5]:
# Zero-vehicle counts, reshaped to `zero_veh` / `zero_veh_denom` per GEOID.
# NOTE(review): the table is pulled from the 'workers' universe, so these look
# like workers rather than households -- confirm against `utils`.
vehicles = utils.make_wide(
    utils.grab_census_table('vehicles', 2018, 'workers'),
    'workers_veh0', 'workers_total', 'zero_veh', 'zero_veh_denom',
)
vehicles.head()

Unnamed: 0,GEOID,zero_veh,zero_veh_denom
0,6037101110,0,1927
1,6037101122,13,1907
2,6037101210,248,2770
3,6037101220,126,1513
4,6037101300,21,2041


In [6]:
# White vs. total population per GEOID. Non-white is derived later by
# subtraction; specific race/ethnicity groups can be added afterwards.
race = utils.make_wide(
    utils.grab_census_table('race', 2018, 'pop'),
    'pop_white', 'pop_total', 'pop_white', 'pop_total',
)
race.head()

Unnamed: 0,GEOID,pop_white,pop_total
0,6037101110,3314,4314
1,6037101122,2837,3239
2,6037101210,4771,6052
3,6037101220,2494,3497
4,6037101300,3705,4297


In [7]:
# Commute by walking, public transit, or biking: the three modes are combined
# into one aggregated group, then reshaped to `commute_transit` / `commute_denom`.
transit_options = ['walk', 'transit', 'bike']

commute = utils.make_wide(
    utils.aggregate_group(
        utils.grab_census_table('commute', 2018, 'workers'),
        transit_options,
    ),
    'aggregated_group', 'workers_total', 'commute_transit', 'commute_denom',
)
commute.head()

Unnamed: 0,GEOID,commute_transit,commute_denom
0,6037101110,104,1927
1,6037101122,19,1907
2,6037101210,376,2770
3,6037101220,126,1513
4,6037101300,180,2041


In [8]:
# Households below an income threshold, all households.
# The listed brackets are combined into one aggregated group
# (presumably ranges in $1,000s, i.e. everything under $50k -- TODO confirm).
low_income = [
    'lt10', 'r10to14', 'r15to19', 'r20to24',
    'r25to29', 'r30to34', 'r35to39', 'r40to44', 'r45to49',
]

income_total = utils.make_wide(
    utils.aggregate_group(
        utils.grab_census_table('incomerange', 2018, 'total'),
        low_income,
    ),
    'aggregated_group', 'total_total', 'low_income_total', 'income_total',
)
income_total.head()

Unnamed: 0,GEOID,low_income_total,income_total
0,6037101110,792,1596
1,6037101122,232,1256
2,6037101210,1573,2321
3,6037101220,841,1294
4,6037101300,436,1435


In [9]:
# Same low-income aggregation, restricted to the 'white' income table, so a
# white / non-white comparison can be derived. Reuses the `low_income` bracket list.
income_white = utils.make_wide(
    utils.aggregate_group(
        utils.grab_census_table('incomerange', 2018, 'white'),
        low_income,
    ),
    'aggregated_group', 'white_total', 'low_income_white', 'income_white',
)
income_white.head()

Unnamed: 0,GEOID,low_income_white,income_white
0,6037101110,660,1309
1,6037101122,207,1122
2,6037101210,1315,1861
3,6037101220,661,923
4,6037101300,413,1276


## Merge Census tables

In [10]:
# Tables to attach to the tenure table, one GEOID-keyed left join at a time.
census_data = [
    vehicles,
    commute,
    race,
    income_total,
    income_white,
]

# Start from a copy so `tenure` itself is left untouched.
df = tenure.copy()

for table in census_data:
    # validate='1:1' makes pandas raise if GEOID is duplicated on either side.
    df = df.merge(table, on='GEOID', how='left', validate='1:1')

In [11]:
# Final outcome columns to keep, plus the tract identifier.
keep = [
    'GEOID',
    'p_renter', 'p_zero_veh', 'p_transit',
    'p_nonwhite', 'p_low_income_total', 'p_low_income_nonwhite',
]

df = (
    df
    # Step 1: non-white counts are total minus white.
    .assign(
        pop_nonwhite=lambda x: x['pop_total'] - x['pop_white'],
        low_income_nonwhite=lambda x: x['low_income_total'] - x['low_income_white'],
        income_nonwhite=lambda x: x['income_total'] - x['income_white'],
    )
    # Step 2: turn each numerator/denominator pair into a share.
    .assign(
        p_renter=lambda x: x['renter'] / x['renter_denom'],
        p_zero_veh=lambda x: x['zero_veh'] / x['zero_veh_denom'],
        p_transit=lambda x: x['commute_transit'] / x['commute_denom'],
        p_nonwhite=lambda x: x['pop_nonwhite'] / x['pop_total'],
        p_low_income_total=lambda x: x['low_income_total'] / x['income_total'],
        p_low_income_nonwhite=lambda x: x['low_income_nonwhite'] / x['income_nonwhite'],
    )
    # Division by a zero denominator yields +/-inf; store those as missing.
    .replace([np.inf, -np.inf], np.nan)
    .sort_values('GEOID')[keep]
)

df.head()

Unnamed: 0,GEOID,p_renter,p_zero_veh,p_transit,p_nonwhite,p_low_income_total,p_low_income_nonwhite
0,6037101110,0.521214,0.0,0.05397,0.231803,0.496241,0.45993
1,6037101122,0.178417,0.006817,0.009963,0.124112,0.184713,0.186567
2,6037101210,0.876399,0.089531,0.13574,0.211666,0.677725,0.56087
3,6037101220,0.603374,0.083278,0.083278,0.286817,0.649923,0.485175
4,6037101300,0.083059,0.010289,0.088192,0.137771,0.303833,0.144654


In [14]:
# Sanity-check the derived shares. Each column is a proportion, so any value
# outside [0, 1] means a numerator and denominator disagree upstream.
# The check reports instead of asserting: p_zero_veh and p_transit are known to
# exceed 1 for some tracts, and a hard assert would stop the notebook before
# the problem tracts are inspected below.
check_cols = ['p_renter', 'p_zero_veh', 'p_transit', 'p_nonwhite', 
            'p_low_income_total', 'p_low_income_nonwhite']

for col in check_cols:
    # NaN compares False on both sides, so missing shares are not counted.
    out_of_range = int(((df[col] < 0) | (df[col] > 1)).sum())
    print(
        f"{col}: min={df[col].min()}, max={df[col].max()}, "
        f"rows outside [0, 1]={out_of_range}"
    )

p_renter: 1.0
p_zero_veh: 4.5097837281153454
p_transit: 9.666666666666666
p_nonwhite: 1.0
p_low_income_total: 1.0
p_low_income_nonwhite: 1.0


In [16]:
# Add the transit share directly onto the wide commute table (in place) so the
# tracts with shares above 1 can be inspected.
commute['p_transit'] = commute['commute_transit'].div(commute['commute_denom'])


In [18]:
# Two tract GEOIDs (as strings with a leading zero) used to inspect the raw commute rows;
# both show up with a transit share above 1 in this notebook's `p_transit > 1.0` listing.
commute_problem_tracts = ['06037115103', '06037203300']

In [19]:
# Load the raw (long) commute table under its own name. Reassigning `commute`
# here would replace the wide table that carries `p_transit`, and the later
# `commute.p_transit > 1.0` filter would fail on Restart & Run All.
commute_raw = utils.grab_census_table('commute', 2018, 'workers')

# Raw component rows for the problem tracts, to compare against `workers_total`.
commute_raw[commute_raw.GEOID.isin(commute_problem_tracts)]

Unnamed: 0,GEOID,new_var,num
184512,6037115103,workers_total,1161.0
184513,6037115103,workers_car1,1255.0
184514,6037115103,workers_transit,285.0
184515,6037115103,workers_walk,1401.0
184516,6037115103,workers_bike,71.0
902183,6037203300,workers_total,466.0
902184,6037203300,workers_car1,1263.0
902185,6037203300,workers_transit,186.0
902186,6037203300,workers_walk,281.0
902187,6037203300,workers_bike,77.0


In [17]:
# Rows of `commute` whose transit share exceeds 1; relies on the `p_transit`
# column added to `commute` earlier in the notebook.
commute.loc[commute['p_transit'].gt(1.0)]

Unnamed: 0,GEOID,commute_transit,commute_denom,p_transit
104,6037115103,1757,1161,1.513351
509,6037203300,544,466,1.167382
531,6037206020,2392,424,5.641509
536,6037206300,4248,971,4.374871
538,6037207102,1289,1144,1.126748
548,6037208301,1118,1110,1.007207
557,6037208720,2273,2190,1.0379
558,6037208801,1648,1545,1.066667
560,6037208902,2100,1613,1.301922
561,6037208903,2586,2232,1.158602


In [21]:
# Write the final tract-level table to S3 as parquet.
# NOTE(review): per the checks above, p_zero_veh and p_transit contain values
# above 1 for some tracts; they are exported unchanged.
output_path = f's3://{bucket_name}/data/final/census_analysis_table.parquet'
df.to_parquet(output_path)