# Entitlements in TOC-eligible parcels

In [1]:
import boto3
import geopandas as gpd
import intake
import numpy as np
import os
import pandas as pd
import utils
import laplan

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = 'city-planning-entitlements'

## Process PCTS
* Won't know which AINs are used in PCTS. Keep all the CASE_NBR-AINs but have a way to identify how many obs to drop later on
* Join parcels to zoning eligible zones
* Want all the entitlements (TOC or non-TOC) after 10/2017 in the TOC-eligible parcels, and then we can see what activity has occurred

In [3]:
def subset_pcts():   
    # Import PCTS - use function to subset
    pcts = catalog.pcts2.read()
    
    FULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)
    remove_prefix = ["ENV", "PAR", "ADM"]
    prefix = [x for x in FULL_PREFIX if x not in remove_prefix]

    pcts = laplan.pcts.subset_pcts(
        pcts,
        start_date="2017-10-01",
        prefix_list=prefix,
        get_dummies=True,
    )
    
    pcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements = True)
    
    dropme = prefix_suffix_cols = prefix + list(laplan.pcts.VALID_PCTS_SUFFIX)
    pcts = pcts.drop(columns = dropme)
    
    # Import parcels
    parcels = catalog.toc_parcels.read().to_crs("EPSG:2229")
    
    # Grab the centroids and count number of duplicate obs
    parcels2 = utils.get_centroid(parcels)
    
    # geoparquets can't be read from S3 directly. Download and read locally.
    zoning_file = "parsed_zoning.parquet"
    s3.download_file(f'{bucket_name}',
                     f'gis/raw/{zoning_file}', f'../gis/{zoning_file}')

    zoning = gpd.read_parquet(f'../gis/{zoning_file}').to_crs("EPSG:2229")
    os.remove(f'../gis/{zoning_file}')
    
    eligible_zones = ['R2', 'R3', 'RAS3', 'R4', 'RAS4', 'R5', 
                  'RD1.5', 'RD2', 'RD3', 'RD4', 'RD5', 'RD6', 
                  'C1', 'C2', 'C4', 'C5']

    eligible_zoning = zoning[zoning.zone_class.isin(eligible_zones)]

        
    # Merge PCTS with parcel info to see which TOC Tier it falls within
    m1 = pd.merge(parcels2, pcts, on = 'AIN', how = 'inner', validate = '1:m')   

    # Spatial join with eligible zones and attach the zoning info
    m2 = gpd.sjoin(m1, eligible_zoning, how = 'inner', op = 'intersects').drop(columns = ['index_right'])

    m2 = m2.drop_duplicates()
        
    return m2

In [4]:
def more_pcts_processing(df): 
    # We care about TOC vs non-TOC entitlements
    df = df.assign(
        is_TOC = df.CASE_NBR.str.contains("TOC").astype(int),
    )
    
    # Subset by CASE_ACTION_ID -- let's use all cases for now (but approved cases are 1, 2, 11)
    # We have some NaN CASE_ACTION_IDs, so we won't subset at all
    
    # At this point, no more duplicates by PARENT_CASE - AIN combination
    
    # But, there are cases that apply to lots of parcels
    # Drop cases that apply to 20+ parcels (6 cases, which are all non-TOC cases)
    big_cases = (df.groupby('CASE_ID')
                 .agg({'id': 'count'})
                 .reset_index()
                )
    
    big_cases = big_cases[big_cases['id'] >= 20]
    df = df[~df.CASE_ID.isin(big_cases.CASE_ID)]
    
    # Subset and keep colums we need
    keep = ['CASE_ID', 'AIN', 'TOC_Tier', 
            'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'id', 
            'CASE_ACTION_ID', 'CASE_FILE_RCV_DT', 'CASE_FILE_DATE', 
            'PARENT_CASE', 'PROJ_DESC_TXT',
            'zone_class', 'centroid', 'is_TOC']
    
    return df[keep]

In [5]:
def make_parcel_level_df(df):
    """
    Ignore the fact that the same case can touch multiple parcels...
    TOC Tiers analysis will naturally count multiple times, because tiers coming from
    bus stops or rail lines will all overlap each other. 
    That analysis uses parcel-level PCTS combined with transit-stop-level TOC Tiers info.
    
    Reshape and make a parcel-level df to be used in later notebooks.
    Create the toc_parcels_with_entitlements.geojson file.
    """
    # Save the geometry of the parcels, just use centroids
    parcel_centroids = df[['AIN', 'centroid']].drop_duplicates()
    
    keep_col = ['CASE_NBR', 'CASE_ID', 'CASE_ACTION_ID', 'CASE_FILE_DATE', 
                'AIN', 'TOC_Tier', 'zone_class', 'is_TOC']
    
    df = df[keep_col]
         
    # Make into parcel-level df
    df2 = (df.groupby(['AIN', 'TOC_Tier', 'zone_class', 'is_TOC'])
           .agg({'CASE_ID':'count'})
           .rename(columns = {'CASE_ID': 'num_cases'})
           .reset_index()) 

    # Make wide
    df2 = df2.assign(
        num_TOC = df2.apply(lambda row: row.num_cases if row.is_TOC == 1 else np.nan, axis = 1),
        num_nonTOC = df2.apply(lambda row: row.num_cases if row.is_TOC == 0 else np.nan, axis = 1)
    )
    
    
    # If there are multiple obs for the same AIN, fill the NaNs with the max from the other column 
    # Then, drop duplicates
    df2 = df2.assign(
        num_TOC = df2.num_TOC.fillna(df2.groupby('AIN')['num_TOC'].transform('max')),
        num_nonTOC = df2.num_nonTOC.fillna(df2.groupby('AIN')['num_nonTOC'].transform('max'))
    )
    
    df3 = df2.drop_duplicates(subset = ['AIN', 'TOC_Tier', 'zone_class', 'num_TOC', 'num_nonTOC'])

    df3 = (df3.assign(
            num_TOC = df3.num_TOC.fillna(0).astype(int),
            num_nonTOC = df3.num_nonTOC.fillna(0).astype(int)
        ).drop(columns = ['is_TOC', 'num_cases'])
    )

    # Merge geometry back on
    df4 = pd.merge(parcel_centroids, df3, on = 'AIN', how = 'inner', validate = '1:m')
    
    return df4

In [6]:
def unique_to_tract_tier_zone(df):
    crosswalk_parcels_tracts = catalog.crosswalk_parcels_tracts.read()
    
    with_tract = pd.merge(df, crosswalk_parcels_tracts[["AIN", "GEOID"]], 
         on = ["AIN"], how = "left", validate = "m:1")
    
    # Tag TOC entitlements, but make sure it's unique cases at the tract-tier-zone level
    with_tract['obs'] = with_tract.groupby(["CASE_ID", "GEOID", "TOC_Tier"]).cumcount() + 1
    with_tract['max_obs'] = with_tract.groupby(["CASE_ID", "GEOID", "TOC_Tier"])["obs"].transform("max")
    
    # So a CASE_ID can be applied up to 9 different AINs within the same tract-tier-zone.
    print('# parcels applied for each CASE_ID')
    print(with_tract.max_obs.value_counts())
    
    # Only keep 1 case for each tract-tier-zone
    keep_cols = ["CASE_ID", "TOC_Tier", "GEOID", "CASE_NBR", 
                 "CASE_SEQ_NBR", "CASE_YR_NBR", 
                 "CASE_ACTION_ID", "CASE_FILE_RCV_DT", "CASE_FILE_DATE", 
                 "PARENT_CASE", "PROJ_DESC_TXT", "zone_class", "is_TOC"]
    
    unique_case = (with_tract[keep_cols].drop_duplicates()
                   .reset_index(drop=True)
                  )
    
    unique_case['obs'] = unique_case.groupby(["CASE_ID", "GEOID"]).cumcount() + 1
    unique_case['max_obs'] = unique_case.groupby(["CASE_ID", "GEOID"])["obs"].transform("max")
    
    # So a CASE_ID can be applied up to 3 different tiers within the same tract.
    print('# tiers applied for each CASE_ID')
    print(unique_case.max_obs.value_counts())    
    
    unique_case = unique_case.drop(columns = ['obs', 'max_obs'])
    
    
    # Get counts of num_TOC and num_nonTOC by tract-tiers-zone
    df1 = (unique_case.groupby(['GEOID', 'TOC_Tier', 'is_TOC', 'zone_class'])
                .agg({'CASE_ID': 'count'})
                .rename(columns = {'CASE_ID': 'num_cases'})
                .reset_index()
               )

    # Make wide
    df2 = df1.assign(
        num_TOC = df1.apply(lambda row: row.num_cases if row.is_TOC == 1 else np.nan, axis = 1),
        num_nonTOC = df1.apply(lambda row: row.num_cases if row.is_TOC == 0 else np.nan, axis = 1)
    )
    
    # If there are multiple obs for the same tract-tier-zone, fill the NaNs with the max from the other column 
    # Then, drop duplicates
    group_cols = ['GEOID', 'TOC_Tier', 'zone_class']
    df2 = df2.assign(
        num_TOC = df2.num_TOC.fillna(df2.groupby(group_cols)['num_TOC'].transform('max')),
        num_nonTOC = df2.num_nonTOC.fillna(df2.groupby(group_cols)['num_nonTOC'].transform('max'))
    )
    
    df3 = df2.drop_duplicates(subset = ['GEOID', 'TOC_Tier', 'zone_class', 'num_TOC', 'num_nonTOC'])

    df3 = (df3.assign(
            num_TOC = df3.num_TOC.fillna(0).astype(int),
            num_nonTOC = df3.num_nonTOC.fillna(0).astype(int)
        ).drop(columns = ['is_TOC', 'num_cases'])
    )    
    
    return df3

In [7]:
df1 = subset_pcts()   
df2 = more_pcts_processing(df1)



In [8]:
toc_parcels_with_entitlements = make_parcel_level_df(df2)

toc_parcels_with_entitlements.to_file(driver = 'GeoJSON', 
           filename = '../gis/intermediate/toc_eligible_parcels_with_entitlements.geojson')

s3.upload_file('../gis/intermediate/toc_eligible_parcels_with_entitlements.geojson', 
               bucket_name, 
               'gis/intermediate/toc_eligible_parcels_with_entitlements.geojson')

In [9]:
df = unique_to_tract_tier_zone(df2)
df.head()

# parcels applied for each CASE_ID
1    1328
2     382
3     162
4     104
6      84
5      60
8      24
9       9
7       7
Name: max_obs, dtype: int64
# tiers applied for each CASE_ID
1    1559
2      88
3       9
Name: max_obs, dtype: int64


Unnamed: 0,GEOID,TOC_Tier,zone_class,num_TOC,num_nonTOC
0,6037104404,0,C2,0,1
1,6037104404,1,C2,0,1
2,6037113212,1,C2,0,1
3,6037113234,1,C2,0,1
4,6037113237,1,C2,0,2


## Summary stats
Redo this section to be at case-tract-tier level, not at case-parcel level.

In [10]:
toc_parcels = toc_parcels_with_entitlements[toc_parcels_with_entitlements.num_TOC > 0]
non_toc_parcels = toc_parcels_with_entitlements[toc_parcels_with_entitlements.num_nonTOC > 0]
have_both_parcels = toc_parcels_with_entitlements[(toc_parcels_with_entitlements.num_TOC > 0) & 
                                                  (toc_parcels_with_entitlements.num_nonTOC > 0)]

print(f'# parcels: {len(toc_parcels_with_entitlements)}')
print(f'# parcels with TOC entitlements: {len(toc_parcels)}')
print(f'# parcels with non TOC entitlements: {len(non_toc_parcels)}')
print(f'# parcels with both TOC and non TOC entitlements: {len(have_both_parcels)}')
print(f'double check sum: {len(toc_parcels) + len(non_toc_parcels) - len(have_both_parcels)}')

# parcels: 1912
# parcels with TOC entitlements: 472
# parcels with non TOC entitlements: 1467
# parcels with both TOC and non TOC entitlements: 27
double check sum: 1912


In [11]:
print(f'% parcels with TOC entitlements: {len(toc_parcels) / len(df)}')
print(f'% parcels with non TOC entitlements: {len(non_toc_parcels) / len(df)}')
print(f'% parcels with both entitlements: {len(have_both_parcels) / len(df)}')

% parcels with TOC entitlements: 0.5462962962962963
% parcels with non TOC entitlements: 1.6979166666666667
% parcels with both entitlements: 0.03125


In [12]:
toc_parcels.zone_class.value_counts()

C2       182
R3       169
R4        82
C4        26
RD1.5      7
R5         3
RAS4       2
RD2        1
Name: zone_class, dtype: int64

In [13]:
non_toc_parcels.zone_class.value_counts()

C2       628
C4       221
R3       145
RD1.5    117
R2       108
RD2       83
R4        71
C5        28
R5        25
C1        15
RD3       13
RAS4      10
RD5        1
RAS3       1
RD4        1
Name: zone_class, dtype: int64

## Breakdown by TOC Tiers
Observations are at the case-tract-tier level

In [14]:
def summarize_by_tiers(df):
    df2 = (df.groupby(['TOC_Tier'])
           .agg({'GEOID': 'nunique', 
                 'num_TOC':'sum', 
                 'num_nonTOC':'sum'})
           .rename(columns = {'GEOID': 'num_TOC_eligible_tracts'})
           .reset_index()
          )
    
    for i in ['TOC', 'nonTOC']:
        new_col = f'pct_{i}'
        numerator = f'num_{i}'
        df2[new_col] = df2[numerator] / (df2.num_TOC + df2.num_nonTOC)
        
    return df2

by_tiers = summarize_by_tiers(df)
by_tiers

Unnamed: 0,TOC_Tier,num_TOC_eligible_tracts,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC
0,0,14,3,15,0.166667,0.833333
1,1,160,51,350,0.127182,0.872818
2,2,144,58,239,0.195286,0.804714
3,3,248,177,611,0.224619,0.775381
4,4,45,20,132,0.131579,0.868421


## Breakdown by Zone Class

In [15]:
def summarize_by_zones(df):
    df2 = (df.groupby('zone_class')
           .agg({'GEOID':'nunique', 
                 'num_TOC':'sum', 
                 'num_nonTOC':'sum'})
           .rename(columns = {'GEOID': 'num_TOC_eligible_tracts'})
           .reset_index()
          )
    
    for i in ['TOC', 'nonTOC']:
        new_col = f'pct_{i}'
        numerator = f'num_{i}'
        df2[new_col] = df2[numerator] / (df2.num_TOC + df2.num_nonTOC)
        
    return df2

by_zones = summarize_by_zones(df)
by_zones

Unnamed: 0,zone_class,num_TOC_eligible_tracts,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC
0,C1,10,0,14,0.0,1.0
1,C2,266,107,531,0.167712,0.832288
2,C4,83,17,223,0.070833,0.929167
3,C5,2,0,33,0.0,1.0
4,R2,34,0,109,0.0,1.0
5,R3,124,121,126,0.489879,0.510121
6,R4,59,52,54,0.490566,0.509434
7,R5,9,3,22,0.12,0.88
8,RAS3,1,0,1,0.0,1.0
9,RAS4,7,1,7,0.125,0.875


In [16]:
writer = pd.ExcelWriter('../outputs/toc_charts.xlsx', engine = 'xlsxwriter')

by_tiers.to_excel(writer, sheet_name = 'entitlements_by_tier')
by_zones.to_excel(writer, sheet_name = 'entitlements_by_zone')

writer.save()