# Create census tract stats table
* Create a tract-level dataframe that contains counts and percents for various census outcomes
* Use functions in `pcts_census_utils.py` to create this table
* Save this table to S3 for use in other notebooks

In [1]:
import boto3
import geopandas as gpd
import intake
import os
import pandas as pd
import pcts_census_utils

In [2]:
# S3 destination for the finished table, plus a client for direct uploads.
bucket_name = 'city-planning-entitlements'
s3 = boto3.client('s3')

# Project data catalog(s); the census tract geometry is read from here later on.
catalog = intake.open_catalog("../catalogs/*.yml")

## A1. Read in Census tables

In [3]:
# Commute mode: workers who get to work by transit, walking, or biking.
# NOTE(review): presumably the helper sums the listed modes into `non_car_workers`
# and divides by `workers_total` to get `pct_non_car_workers` -- confirm in
# pcts_census_utils.transform_census_percent.
commute_modes = ["workers_transit", "workers_walk", "workers_bike"]

commute = pcts_census_utils.transform_census_percent(
    "commute", 2018, "workers",
    commute_modes, "non_car_workers",
    "non_car_workers", "workers_total",
)

commute.head(2)

Unnamed: 0,GEOID,non_car_workers,workers_total,pct_non_car_workers
0,6037101110,46,1927,0.023871
1,6037101122,11,1907,0.005768


In [4]:
# Renter occupied: a single group, so the numerator keeps the name `pop_renter`.
# The resulting table carries pop_renter, pop_total and pct_pop_renter per GEOID.
tenure_group = ["pop_renter"]

tenure = pcts_census_utils.transform_census_percent(
    "tenure", 2018, "pop",
    tenure_group, "pop_renter",
    "pop_renter", "pop_total",
)

tenure.head(2)

Unnamed: 0,GEOID,pop_renter,pop_total,pct_pop_renter
0,6037101110,2199,4219,0.521214
1,6037101122,577,3234,0.178417


In [5]:
# Zero vehicle: workers with no vehicle available, as a share of all workers.
# The resulting table carries workers_total, zero_veh_workers and
# pct_zero_veh_workers per GEOID.
vehicle_group = ["workers_veh0"]

vehicle = pcts_census_utils.transform_census_percent(
    "vehicles", 2018, "workers",
    vehicle_group, "zero_veh_workers",
    "zero_veh_workers", "workers_total",
)

vehicle.head(2)

Unnamed: 0,GEOID,workers_total,zero_veh_workers,pct_zero_veh_workers
0,6037101110,1927,0,0.0
1,6037101122,1907,8,0.004195


In [6]:
# Median household income
# Pull the long-format income table, keep only the "medincome_total" rows,
# and expose their `num` value under the name `medhhincome`.
income_long = pcts_census_utils.grab_census_table("income", 2018, "medincome")
is_median_row = income_long["new_var"] == "medincome_total"

medincome = (
    income_long[is_median_row]
    # `num` comes from the unfiltered table; assign aligns it on the row index.
    .assign(medhhincome=income_long["num"])
    .drop(columns=["new_var", "num"])
)

medincome.head(2)

Unnamed: 0,GEOID,medhhincome
694416,6037101110,53077.0
694417,6037101122,88953.0


In [7]:
# Race
def transform_census_race():
    """
    Build a tract-level table of total and white non-Hispanic population.

    The two counts live in separate census tables (different main_var), so each
    is pulled on its own for 2018, reduced to GEOID plus a nullable-integer
    count column, and then joined on GEOID.

    Returns
    -------
    DataFrame with columns GEOID, pop_total, pop_whitenonhisp and
    pct_whitenonhisp (= pop_whitenonhisp / pop_total).
    """
    def _load_counts(main_var, count_col):
        # One race/ethnicity table -> two columns: GEOID and the renamed count.
        table = pcts_census_utils.grab_census_table("raceethnicity", 2018, main_var)
        table = table.assign(**{count_col: table["num"].astype("Int64")})
        return table[["GEOID", count_col]]

    totals = _load_counts("total", "pop_total")
    white = _load_counts("whitenonhisp", "pop_whitenonhisp")

    # Left join keeps every tract that has a total; validate guards against
    # duplicate GEOIDs on either side.
    merged = pd.merge(totals, white, on="GEOID", how="left", validate="1:1")
    merged["pct_whitenonhisp"] = merged["pop_whitenonhisp"] / merged["pop_total"]

    return merged

# Build the tract-level race table and preview its first two rows.
race = transform_census_race()
race.head(2)

Unnamed: 0,GEOID,pop_total,pop_whitenonhisp,pct_whitenonhisp
0,6037101110,4314,2516,0.583217
1,6037101122,3239,2755,0.850571


In [8]:
# Income ranges (used to calculate median)
# There are cases where median income has to be calculated after tracts are aggregated.
income_ranges = pcts_census_utils.income_ranges

# The new_var columns to keep all have prefix "total_".
# Can switch out if we're interested in other races' income ranges
# NOTE(review): `keep` is not referenced by any of the cells below.
keep = ["total_" + bracket for bracket in income_ranges]

def process_income_before_iqr():
    """
    Reshape the 2018 income-range table from long to wide.

    The source table has one row per (GEOID, new_var) with the value in `num`.
    The result has one row per GEOID and one nullable-integer column per
    new_var value.

    Returns
    -------
    DataFrame with a GEOID column followed by one column per income range.
    """
    long_table = pcts_census_utils.grab_census_table("incomerange", 2018, "total")

    wide = long_table.pivot(index="GEOID", columns="new_var", values="num")
    # Blank out the leftover "new_var" label on the column axis.
    wide.columns.name = ""
    wide = wide.reset_index()

    # Everything except the tract identifier is a count; store it as Int64.
    count_cols = [col for col in wide.columns if col != "GEOID"]
    wide[count_cols] = wide[count_cols].astype("Int64")

    return wide

# Build the wide income-range table (one row per tract) and preview it.
income = process_income_before_iqr()
income.head(2)

Unnamed: 0,GEOID,total_gt200,total_lt10,total_r100to124,total_r10to14,total_r125to149,total_r150to199,total_r15to19,total_r20to24,total_r25to29,total_r30to34,total_r35to39,total_r40to44,total_r45to49,total_r50to59,total_r60to74,total_r75to99,total_total
0,6037101110,44,110,91,97,93,109,70,76,123,96,54,78,88,64,126,277,1596
1,6037101122,192,52,114,18,105,133,0,15,39,38,7,40,23,62,186,232,1256


## A2. Merge Census tables

In [9]:
def merge_census_tables(commute, vehicle, tenure, race, medhhincome, income,
                        census_tracts=None):
    """
    Combine the tract-level census tables into one table with tract geometry.

    Parameters
    ----------
    commute, vehicle : DataFrame
        Keyed on GEOID; both carry `workers_total`, which is used as a second
        merge key so the column is not duplicated.
    tenure : DataFrame
        Keyed on GEOID; supplies pop_renter, pop_total and pct_pop_renter.
    race : DataFrame
        Keyed on GEOID; its own `pop_total` is dropped before merging.
    medhhincome : DataFrame
        Keyed on GEOID; supplies medhhincome.
    income : DataFrame
        Keyed on GEOID; supplies the total_* income-range counts.
    census_tracts : DataFrame or GeoDataFrame, optional
        Tract geometry with columns GEOID10, Shape_STAr and geometry.
        When omitted it is read from the notebook-level `catalog`.

    Returns
    -------
    One row per tract present in BOTH `census_tracts` and the census tables,
    sorted by GEOID, with the columns listed in `col_order` below.
    Shape_STAr itself is not part of the output.

    Raises
    ------
    pandas.errors.MergeError
        If any merge key is duplicated (every merge uses validate="1:1").
    """
    # Grab the geometry for tracts
    if census_tracts is None:
        census_tracts = catalog.census_tracts.read()
    tracts = (census_tracts[["GEOID10", "Shape_STAr", "geometry"]]
              .rename(columns = {"GEOID10": "GEOID"})
             )

    # Merge the census pieces together
    c1 = pd.merge(commute, vehicle, on = ["GEOID", "workers_total"], how = "left", validate = "1:1")
    # If we don't drop pop_total in one of the dfs, it'll result in a lot of NaNs once merged
    c2 = pd.merge(tenure, race.drop(columns = "pop_total"), on = ["GEOID"], how = "left", validate = "1:1")
    c3 = pd.merge(c1, c2, on = "GEOID", how = "left", validate = "1:1")
    c4 = pd.merge(c3, medhhincome, on = "GEOID", how = "left", validate = "1:1")
    c5 = pd.merge(c4, income, on = "GEOID", how = "left", validate = "1:1")

    # Merge in geometry.
    # census_tracts is clipped to City of LA, so we lose all the other tracts in LA County here.
    df = pd.merge(tracts, c5, on = "GEOID", how = "inner", validate = "1:1")

    # Denominators
    #   zero_veh_workers / workers_total
    #   non_car_workers  / workers_total
    #   pop_renter       / pop_total
    #   pop_whitenonhisp / pop_total
    # NOTE(review): the pop_total kept in this table is tenure's; race's pop_total
    # (the one pct_whitenonhisp was computed with upstream) is dropped above, so
    # pct_whitenonhisp need not equal pop_whitenonhisp / pop_total here.

    # Calculate pop density again (with 2018 pop total).
    # Use the MERGED frame's area column: the inner merge gives df a fresh
    # 0..n-1 index, so dividing by the pre-merge census_tracts.Shape_STAr would
    # align on index and pair tracts with the wrong area whenever a tract was
    # dropped by the merge.
    # Dividing by 5280 twice presumably converts square feet to square miles --
    # confirm the units of Shape_STAr.
    df = df.assign(
        density = df.pop_total.astype(int) / (df.Shape_STAr/5280./5280.)
    )

    col_order = ['GEOID',
                 'non_car_workers', 'workers_total', 'pct_non_car_workers',
                 'zero_veh_workers', 'pct_zero_veh_workers',
                 'pop_renter', 'pop_total', 'pct_pop_renter',
                 'pop_whitenonhisp', 'pct_whitenonhisp',
                 'medhhincome',
                 'total_lt10', 'total_r10to14', 'total_r15to19',
                 'total_r20to24', 'total_r25to29', 'total_r30to34', 'total_r35to39',
                 'total_r40to44', 'total_r45to49', 'total_r50to59', 'total_r60to74',
                 'total_r75to99', 'total_r100to124', 'total_r125to149',
                 'total_r150to199', 'total_gt200', 'total_total',
                 'density', 'geometry'
                ]

    # reindex both orders the columns and drops anything not listed (e.g. Shape_STAr).
    df = (df.reindex(columns = col_order)
          .sort_values('GEOID')
          .reset_index(drop=True)
         )

    return df

In [10]:
# Note the argument order: the median-income table (`medincome`) fills the
# `medhhincome` parameter and the income-range table (`income`) fills `income`.
df = merge_census_tables(commute, vehicle, tenure, race, medincome, income)
df.head(2)

Unnamed: 0,GEOID,non_car_workers,workers_total,pct_non_car_workers,zero_veh_workers,pct_zero_veh_workers,pop_renter,pop_total,pct_pop_renter,pop_whitenonhisp,...,total_r50to59,total_r60to74,total_r75to99,total_r100to124,total_r125to149,total_r150to199,total_gt200,total_total,density,geometry
0,6037101110,46,1927,0.023871,0,0.0,2199,4219,0.521214,2516,...,64,126,277,91,93,109,44,1596,9565.898824,"MULTIPOLYGON (((6471651.068 1918306.964, 64719..."
1,6037101122,11,1907,0.005768,8,0.004195,577,3234,0.178417,2755,...,62,186,232,114,105,133,192,1256,3168.165623,"MULTIPOLYGON (((6477840.844 1917081.399, 64778..."


### Export as parquet and upload to S3

In [11]:
# Drop the geometry so the table can be written as a plain (non-geo) parquet file.
# errors = "ignore" keeps this cell re-runnable: after the first run `df` no longer
# has a geometry column, and a bare drop would raise a KeyError on the second run.
df = pd.DataFrame(df.drop(columns = "geometry", errors = "ignore"))
df.to_parquet(f's3://{bucket_name}/data/final/census_analysis_table.parquet')

In [12]:
""" Use geoparquet if catalog can read it in directly. Otherwise, just use parquet.
# Export as geoparquet and upload to S3
file_name = "census_analysis_table"

# Write locally first, then upload to S3
df.to_parquet(f'../processed/{file_name}.parquet')

s3.upload_file(f'../processed/{file_name}.parquet', bucket_name, 
               f'data/final/{file_name}.parquet')

# Remove local version
os.remove(f'../processed/{file_name}.parquet')
"""

' Use geoparquet if catalog can read it in directly. Otherwise, just use parquet.\n# Export as geoparquet and upload to S3\nfile_name = "census_analysis_table"\n\n# Write locally first, then upload to S3\ndf.to_parquet(f\'../processed/{file_name}.parquet\')\n\ns3.upload_file(f\'../processed/{file_name}.parquet\', bucket_name, \n               f\'data/final/{file_name}.parquet\')\n\n# Remove local version\nos.remove(f\'../processed/{file_name}.parquet\')\n'