## Make PCTS data tract level

In [1]:
import intake
import numpy as np
import pandas as pd

In [2]:
# Open every intake catalog YAML found under ../catalogs
# NOTE(review): `catalog` is not referenced again in the visible cells -- confirm it is still needed
catalog = intake.open_catalog("../catalogs/*.yml")

# S3 bucket name interpolated into the s3:// paths read below
bucket_name = 'city-planning-entitlements'

## Import data

In [3]:
# Master PCTS table and the parent-case table; make_tract_level() joins them
# many-to-one on PARENT_CASE
pcts = pd.read_parquet(f's3://{bucket_name}/data/final/master_pcts.parquet')
parents = pd.read_parquet(f's3://{bucket_name}/data/final/parents_with_suffix.parquet')

# Parcel (AIN) to tract (GEOID) crosswalk; keep only the three columns used below
crosswalk_parcels_tracts = pd.read_parquet(f's3://{bucket_name}/data/crosswalk_parcels_tracts.parquet')
crosswalk_parcels_tracts = crosswalk_parcels_tracts[['AIN', 'GEOID', 'pop']]

## Make PCTS tract-level
* Aggregate by tract: count the number of unique parent cases, the number of parcels with an entitlement (ENT), and the number of entitlements with each suffix

In [4]:
def make_tract_level():
    """Aggregate PCTS parent cases and suffix counts to the census-tract level.

    Takes no arguments; reads the notebook-level frames ``pcts``, ``parents``
    and ``crosswalk_parcels_tracts``.

    Returns
    -------
    pandas.DataFrame
        One row per (``GEOID``, ``pop``) pair with:

        * ``num_cases`` -- case rows summed over the tract's parcels
        * ``AIN`` -- number of parcel rows in the tract (a count, not an ID)
        * one column per suffix taken from the ``'1A'``..``'ZV'`` column
          range, summed over the tract's parcels

        Every column except ``GEOID``, ``pop`` and ``AIN`` has NaN replaced
        by 0 and is cast to int.
    """
    # (1a) Keep only the PCTS rows whose PARENT_CASE appears in `parents`
    m1 = pd.merge(pcts, parents, on = 'PARENT_CASE', how = 'inner', validate = 'm:1')

    # (1b) Make cases parcel-level: count non-null PARENT_CASE rows per AIN
    parents_by_parcel = (m1.groupby(['AIN'])
                         .agg({'PARENT_CASE':'count'})
                         .reset_index()
                         .rename(columns = {'PARENT_CASE':'num_cases'})
                        )

    # (1c) Merge in tract info and aggregate to tract-level
    m2 = pd.merge(parents_by_parcel, crosswalk_parcels_tracts,
                  on = 'AIN', how = 'inner', validate = '1:1')

    # Even though the column num_AIN shows there are some parcels with more than 1 obs,
    # once we merged in parent cases, no AIN shows up more than once
    # (validate = '1:1' above raises if that ever stops being true).

    # After this step the `AIN` column holds the number of parcels per tract
    ent_by_tract = (m2.groupby(['GEOID', 'pop'])
                    .agg({'num_cases':'sum', 'AIN':'count'})
                    .reset_index()
                   )

    # (2a) Only keep suffixes (label slice: depends on the column order of m1)
    suffix = m1.loc[:, '1A':'ZV']

    m3 = pd.merge(m1[['AIN']], suffix, left_index = True, right_index = True)

    # (2b) Make suffixes parcel-level
    suffix_by_parcel = (m3.pivot_table(index = 'AIN', aggfunc = 'sum')
                        .reset_index()
                       )

    # (2c) Merge in tract info and aggregate to tract-level.
    # Inner join, as in (1c): parcels without a tract cannot contribute to any
    # tract total, and keeping them would introduce NaN keys that upcast `pop`.
    m4 = pd.merge(suffix_by_parcel, crosswalk_parcels_tracts,
                  on = 'AIN', how = 'inner', validate = '1:1')

    # Aggregate the number of suffixes by tract.
    # Drop the parcel identifier first: summing AIN is meaningless, and if it
    # survives the pivot the merge below produces AIN_x / AIN_y, which breaks
    # the column bookkeeping in step (4).
    suffix_by_tract = (m4.drop(columns = 'AIN')
                       .pivot_table(index = ['GEOID', 'pop'], aggfunc = 'sum')
                       .reset_index()
                      )

    # (3) Merge number of cases and suffixes by tract
    df = pd.merge(ent_by_tract, suffix_by_tract,
                  on = ['GEOID', 'pop'], how = 'left', validate = '1:1')

    # (4) Make sure everything returns as integers and not floats
    colnames = list(df.columns)

    for r in ['GEOID', 'pop', 'AIN']:
        colnames.remove(r)

    df[colnames] = df[colnames].fillna(0).astype(int)

    return df

In [5]:
# Build the tract-level PCTS table.
# NOTE(review): `df` is rebound to the merged census table further down and this
# result is not written anywhere in the visible cells -- confirm that is intended.
df = make_tract_level()

## Extract census data and make wide
Grab 2018 ACS values

In [6]:
# A census table that holds only numeric values is the most straightforward to convert from long to wide
def grab_census_table(table_name, year, main_var):
    """Return the long-format rows of one table from the cleaned census file.

    Loads ``census_cleaned.parquet`` from the project S3 bucket and keeps the
    rows whose ``year``, ``table`` and ``main_var`` columns equal the
    arguments. The result has the columns ``GEOID``, ``new_var`` and ``num``.
    """
    census = pd.read_parquet(f's3://{bucket_name}/data/final/census_cleaned.parquet')

    # Build the row filter one condition at a time
    wanted = census.year == year
    wanted = wanted & (census.table == table_name)
    wanted = wanted & (census.main_var == main_var)

    return census[wanted][['GEOID', 'new_var', 'num']]


def make_wide(df, numerator_var, denominator_var, numerator_renamed, denominator_renamed):
    """Reshape a long census table into one numerator/denominator row per GEOID.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with the columns ``GEOID``, ``new_var`` and ``num``.
    numerator_var, denominator_var : str
        Values of ``new_var`` whose ``num`` supplies the numerator and the
        denominator respectively.
    numerator_renamed, denominator_renamed : str
        Names given to the two output columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``GEOID``, ``numerator_renamed`` and ``denominator_renamed``,
        sorted by ``GEOID`` with a fresh 0..n-1 index. A GEOID that has no
        row for a variable gets 0 in that column; both value columns are int.
        The input frame is not modified.
    """
    # Keep `num` only on the rows that carry the requested variable; every
    # other row becomes NaN (vectorised, no per-row Python call).
    df = df.assign(
        numerator = df['num'].where(df['new_var'] == numerator_var),
        denominator = df['num'].where(df['new_var'] == denominator_var),
    )

    # Spread each GEOID's value onto all of its rows so that the rows become
    # identical and collapse under drop_duplicates() below.
    by_geoid = df.groupby('GEOID')
    df = df.assign(
        numerator = df['numerator'].fillna(by_geoid['numerator'].transform('max')),
        denominator = df['denominator'].fillna(by_geoid['denominator'].transform('max')),
    )

    wide = (df[['GEOID', 'numerator', 'denominator']]
            .drop_duplicates()
            .sort_values('GEOID')
            # If the max by GEOID was still NaN, fill it in now with 0.
            # The callables read the de-duplicated frame itself, so no
            # cross-frame index alignment is involved.
            .assign(
                numerator = lambda d: d['numerator'].fillna(0).astype(int),
                denominator = lambda d: d['denominator'].fillna(0).astype(int),
            )
            .rename(columns = {'numerator': f'{numerator_renamed}',
                               'denominator': f'{denominator_renamed}'})
            .reset_index(drop=True)
           )

    return wide


def aggregate_group(df, aggregate_me):
    """Collapse several ``new_var`` categories into one and re-sum ``num``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with the columns ``GEOID``, ``new_var`` and ``num``.
    aggregate_me : iterable of str
        Fragments matched as substrings of ``new_var``; any row whose
        ``new_var`` contains at least one fragment is relabelled
        ``'aggregated_group'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``GEOID``, ``new_var`` and ``num``, with ``num`` summed per
        (GEOID, relabelled new_var) pair. An empty input gives an empty
        frame with the same three columns.
    """
    # Materialise once so a one-shot iterable is not exhausted by the first row
    fragments = tuple(aggregate_me)

    # One pass over the column instead of a row-wise DataFrame.apply, which
    # returns a DataFrame (not a Series) when the input has no rows.
    relabelled = [
        'aggregated_group' if any(fragment in name for fragment in fragments) else name
        for name in df['new_var']
    ]

    df = (df.assign(new_var2 = relabelled)
          .groupby(['GEOID', 'new_var2'])
          .agg({'num':'sum'})
          .reset_index()
          .rename(columns = {'new_var2':'new_var'})
    )

    return df

In [7]:
# Renter households: 2018 'tenure' table.
# Numerator pop_renter -> `renter`, denominator pop_total -> `hh`.
tenure = grab_census_table('tenure', 2018, 'pop')

tenure2 = make_wide(tenure, 'pop_renter', 'pop_total', 'renter', 'hh')
tenure2.head()

Unnamed: 0,GEOID,renter,hh
0,6037101110,2199,4219
1,6037101122,577,3234
2,6037101210,5247,5987
3,6037101220,2110,3497
4,6037101300,353,4250


In [8]:
# Zero vehicle households: 2018 'vehicles' table.
# NOTE(review): the variables used are workers_veh0 / workers_total, so these
# look like worker counts rather than household counts -- confirm.
# NOTE(review): the denominator column is named `zero_veh_workers` but it is
# filled from workers_total, not from the zero-vehicle group.
vehicles = grab_census_table('vehicles', 2018, 'workers')

vehicles2 = make_wide(vehicles, 'workers_veh0', 'workers_total', 'zero_veh', 'zero_veh_workers')
vehicles2.head()

Unnamed: 0,GEOID,zero_veh,zero_veh_workers
0,6037101110,0,1927
1,6037101122,13,1907
2,6037101210,248,2770
3,6037101220,126,1513
4,6037101300,21,2041


In [9]:
# Start with white/non-white -- later can branch into specific race/ethnicity.
# Output columns keep the source names: pop_white (numerator), pop_total (denominator).
race = grab_census_table('race', 2018, 'pop')

race2 = make_wide(race, 'pop_white', 'pop_total', 'pop_white', 'pop_total')
race2.head()

Unnamed: 0,GEOID,pop_white,pop_total
0,6037101110,3314,4314
1,6037101122,2837,3239
2,6037101210,4771,6052
3,6037101220,2494,3497
4,6037101300,3705,4297


In [10]:
# Start with commute by public transit / walk / bike
commute = grab_census_table('commute', 2018, 'workers')

# Matched as substrings of new_var by aggregate_group()
transit_options = ['walk', 'transit', 'bike']

# Sum the matching modes into a single 'aggregated_group' row per tract
commute = aggregate_group(commute, transit_options)

# Numerator: the combined modes; denominator: workers_total
commute2 = make_wide(commute, 'aggregated_group', 'workers_total', 'commute_transit', 'commute_workers')
commute2.head()

Unnamed: 0,GEOID,commute_transit,commute_workers
0,6037101110,104,1927
1,6037101122,19,1907
2,6037101210,376,2770
3,6037101220,126,1513
4,6037101300,180,2041


In [11]:
# Find number of households who are under certain income threshold 
income_total = grab_census_table('incomerange', 2018, 'total')

# Bracket labels matched as substrings of new_var by aggregate_group()
# (presumably income ranges in $1,000s, i.e. everything under $50k -- confirm)
low_income = ['lt10', 'r10to14', 'r15to19', 'r20to24',
             'r25to29', 'r30to34', 'r35to39', 'r40to44', 'r45to49']

# Sum the low-income brackets into a single 'aggregated_group' row per tract
income_total = aggregate_group(income_total, low_income)

# Numerator: the combined brackets; denominator: total_total
income_total2 = make_wide(income_total, 'aggregated_group', 'total_total', 'low_income_total', 'income_total')
income_total2.head()

Unnamed: 0,GEOID,low_income_total,income_total
0,6037101110,792,1596
1,6037101122,232,1256
2,6037101210,1573,2321
3,6037101220,841,1294
4,6037101300,436,1435


In [12]:
# Find number of white households who are under certain income threshold. Can do white/nonwhite comparison.
income_white = grab_census_table('incomerange', 2018, 'white')

# Reuses the `low_income` bracket list defined in the previous cell
income_white = aggregate_group(income_white, low_income)

# Numerator: the combined brackets; denominator: white_total
income_white2 = make_wide(income_white, 'aggregated_group', 'white_total', 'low_income_white', 'income_white')
income_white2.head()

Unnamed: 0,GEOID,low_income_white,income_white
0,6037101110,660,1309
1,6037101122,207,1122
2,6037101210,1315,1861
3,6037101220,661,923
4,6037101300,413,1276


## Merge census tables

In [15]:
# Tables to join onto the tenure table, one row per GEOID each.
# NOTE(review): commute2 is built above but is not in this list -- confirm
# whether leaving it out of the merged file is intentional.
census_data = [vehicles2, race2, 
               income_total2, income_white2]

# Start from tenure; this rebinds `df`, replacing the tract-level PCTS table
df = tenure2.copy()

# Left-join each table on GEOID; validate = '1:1' raises if a GEOID repeats on either side
for c in census_data:
    df = pd.merge(df, c, on = 'GEOID', how = 'left', validate = '1:1')

In [17]:
# Not the last expression in the cell, so the notebook does not display this preview
df.head()
# Written to a local relative path (the inputs above were read from S3)
df.to_parquet('../data/census_merged.parquet')