## Make PCTS data tract level

In [1]:
import intake
import numpy as np
import pandas as pd

In [2]:
# Open the intake catalog(s) matching the ../catalogs/*.yml glob
catalog = intake.open_catalog("../catalogs/*.yml")

# S3 bucket name interpolated into the s3:// paths used in this notebook
bucket_name = 'city-planning-entitlements'

## Import data

In [3]:
# PCTS master table and the parent-case table (parents_with_suffix)
pcts = pd.read_parquet(
    f's3://{bucket_name}/data/final/master_pcts.parquet'
)
parents = pd.read_parquet(
    f's3://{bucket_name}/data/final/parents_with_suffix.parquet'
)

# Parcel-to-tract crosswalk, trimmed to the parcel ID (AIN), tract GEOID and pop columns
crosswalk_parcels_tracts = pd.read_parquet(
    f's3://{bucket_name}/data/crosswalk_parcels_tracts.parquet'
)[['AIN', 'GEOID', 'pop']]

## Make PCTS tract-level
* Aggregate to the tract level: count the number of unique parent cases, the number of parcels with entitlements (ENT), and the number of entitlements with each suffix.

In [4]:
def make_tract_level():
    """Aggregate parent PCTS cases and their suffixes to the tract level.

    Reads the module-level ``pcts``, ``parents`` and
    ``crosswalk_parcels_tracts`` DataFrames; takes no arguments.

    Returns
    -------
    pandas.DataFrame
        One row per (``GEOID``, ``pop``) pair with:

        * ``num_cases`` -- per-parcel case counts summed over the tract
        * ``AIN`` -- number of parcels in the tract that have a case
        * one column per suffix (columns ``'1A'`` through ``'ZV'`` of the
          merged table), summed over the tract

        ``num_cases`` and the suffix columns are returned as integers.
    """
    # (1a) Only keep parent cases
    m1 = pd.merge(pcts, parents, on = 'PARENT_CASE', how = 'inner', validate = 'm:1')

    # (1b) Make cases parcel-level
    parents_by_parcel = (m1.groupby(['AIN'])
                         .agg({'PARENT_CASE':'count'})
                         .reset_index()
                         .rename(columns = {'PARENT_CASE':'num_cases'})
                        )

    # (1c) Merge in tract info and aggregate to tract-level
    # validate = '1:1' makes the merge fail loudly if any AIN appears more
    # than once on either side, so each parcel is counted exactly once below.
    m2 = pd.merge(parents_by_parcel, crosswalk_parcels_tracts, on = 'AIN', how = 'inner', validate = '1:1')

    # Sum of cases and count of parcels per tract
    ent_by_tract = m2.groupby(['GEOID', 'pop']).agg({'num_cases':'sum', 'AIN':'count'}).reset_index()


    # (2a) Only keep suffixes
    suffix = m1.loc[:, '1A':'ZV']

    # m1 carries the fresh index produced by pd.merge, so an index-on-index
    # merge simply re-attaches each row's AIN to its suffix columns.
    m3 = pd.merge(m1[['AIN']], suffix, left_index = True, right_index = True)

    # (2b) Make suffixes parcel-level
    suffix_by_parcel = (m3.pivot_table(index = 'AIN', aggfunc = 'sum')
                        .reset_index()
                       )

    # (2c) Merge in tract info and aggregate to tract-level
    # Parcels without a tract get a missing GEOID here and fall out of the
    # aggregation below, because missing group keys are dropped.
    m4 = pd.merge(suffix_by_parcel, crosswalk_parcels_tracts,
                  on = 'AIN', how = 'left', validate = '1:1')

    # Aggregate the number of suffixes by tract.
    # AIN is an identifier, not a quantity: drop it explicitly instead of
    # relying on pandas to discard non-numeric columns during 'sum'. Newer
    # pandas versions concatenate strings rather than dropping them, which
    # would leak a second AIN column into the final merge.
    suffix_by_tract = (m4.drop(columns = 'AIN')
                       .pivot_table(index = ['GEOID', 'pop'], aggfunc = 'sum')
                       .reset_index()
                      )


    # (3) Merge number of cases and suffixes by tract
    df = pd.merge(ent_by_tract, suffix_by_tract, on = ['GEOID', 'pop'], how = 'left', validate = '1:1')

    # (4) Make sure everything returns as integers and not floats
    # (the left merge above can introduce NaN, which turns columns into floats)
    id_cols = ['GEOID', 'pop', 'AIN']
    count_cols = [c for c in df.columns if c not in id_cols]

    df[count_cols] = df[count_cols].fillna(0).astype(int)

    return df

In [5]:
# Build the tract-level PCTS table from the DataFrames loaded above
df = make_tract_level()

## Merge with census data

In [7]:
# With a census table that only has numeric values, this is most straightforward to convert from long to wide
def grab_census_table(table_name, year):
    """Return the long-format rows of one census table for one year.

    Reads the cleaned census parquet from the project bucket, keeps the rows
    whose ``year`` equals *year* and whose ``table`` equals *table_name*, and
    returns only the ``GEOID``, ``new_var`` and ``num`` columns.
    """
    census = pd.read_parquet(f's3://{bucket_name}/data/final/census_cleaned.parquet')

    in_year = census['year'] == year
    in_table = census['table'] == table_name

    return census.loc[in_year & in_table, ['GEOID', 'new_var', 'num']]


def make_wide(df, numerator_var, denominator_var, numerator_renamed, denominator_renamed):
    """Reshape a long census table into one row per GEOID with two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with columns ``GEOID``, ``new_var`` and ``num``.
    numerator_var, denominator_var
        Values of ``new_var`` whose ``num`` becomes the numerator and the
        denominator, respectively.
    numerator_renamed, denominator_renamed
        Names of the two output columns (coerced to ``str``).

    Returns
    -------
    pandas.DataFrame
        Columns ``GEOID``, *numerator_renamed* and *denominator_renamed*
        (both cast to int), de-duplicated, sorted by ``GEOID`` and with a
        fresh 0..n-1 index. The input frame is not modified.

    Raises
    ------
    ValueError
        From the integer cast, if some GEOID has no row for
        *numerator_var* or *denominator_var* (its value stays NaN).
    """
    numerator_renamed = str(numerator_renamed)
    denominator_renamed = str(denominator_renamed)

    # Keep `num` only on the rows of the wanted variable, NaN everywhere else.
    # Series.where does this in one vectorised pass instead of calling a
    # Python lambda once per row.
    df = df.assign(
        numerator = df['num'].where(df['new_var'] == numerator_var),
        denominator = df['num'].where(df['new_var'] == denominator_var),
    )

    # Spread each GEOID's value onto all of that GEOID's rows so that the
    # rows become identical and collapse under drop_duplicates below.
    by_geoid = df.groupby('GEOID')
    df = (df.assign(
        numerator = df['numerator'].fillna(by_geoid['numerator'].transform('max')).astype(int),
        denominator = df['denominator'].fillna(by_geoid['denominator'].transform('max')).astype(int),
        ).rename(columns = {'numerator': numerator_renamed,
                            'denominator': denominator_renamed})
    )

    keep_col = ['GEOID', numerator_renamed, denominator_renamed]
    df = (df[keep_col].drop_duplicates()
          .sort_values('GEOID')
          .reset_index(drop=True)
         )

    return df

In [8]:
# Long-format rows of the 'tenure' census table for 2018
tenure = grab_census_table('tenure', 2018)

# One row per GEOID: 'pop_renter' becomes `renter`, 'pop_total' becomes `hh`
tenure2 = make_wide(tenure, 'pop_renter', 'pop_total', 'renter', 'hh')
tenure2.head()

Unnamed: 0,GEOID,renter,hh
0,6037101110,2199,4219
1,6037101122,577,3234
2,6037101210,5247,5987
3,6037101220,2110,3497
4,6037101300,353,4250
