# Entitlements in TOC-eligible parcels

In [1]:
import boto3
import geopandas as gpd
import intake
import numpy as np
import os
import pandas as pd
import utils
import laplan

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = 'city-planning-entitlements'

## Process PCTS
* Won't know which AINs are used in PCTS. Keep all the CASE_NBR-AINs but have a way to identify how many obs to drop later on
* Join parcels to zoning eligible zones
* Want all the entitlements (TOC or non-TOC) after 10/2017 in the TOC-eligible parcels, and then we can see what activity has occurred

In [3]:
def subset_pcts():   
    # Import PCTS - use function to subset
    pcts = catalog.pcts2.read()
    
    FULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)
    remove_prefix = ["ENV", "PAR", "ADM"]
    prefix = [x for x in FULL_PREFIX if x not in remove_prefix]

    pcts = laplan.pcts.subset_pcts(
        pcts,
        start_date="2017-10-01",
        prefix_list=prefix,
        get_dummies=True,
    )
    
    pcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements = True)
    
    dropme = prefix_suffix_cols = prefix + list(laplan.pcts.VALID_PCTS_SUFFIX)
    pcts = pcts.drop(columns = dropme)
    
    # Import parcels
    parcels = catalog.toc_parcels.read().to_crs("EPSG:2229")
    
    # Grab the centroids and count number of duplicate obs
    parcels2 = utils.get_centroid(parcels)
    
    # geoparquets can't be read from S3 directly. Download and read locally.
    zoning_file = "parsed_zoning.parquet"
    s3.download_file(f'{bucket_name}',
                     f'gis/raw/{zoning_file}', f'../gis/{zoning_file}')

    zoning = gpd.read_parquet(f'../gis/{zoning_file}').to_crs("EPSG:2229")
    os.remove(f'../gis/{zoning_file}')
    
    eligible_zones = ['R2', 'R3', 'RAS3', 'R4', 'RAS4', 'R5', 
                  'RD1.5', 'RD2', 'RD3', 'RD4', 'RD5', 'RD6', 
                  'C1', 'C2', 'C4', 'C5']

    eligible_zoning = zoning[zoning.zone_class.isin(eligible_zones)]

        
    # Merge PCTS with parcel info to see which TOC Tier it falls within
    m1 = pd.merge(parcels2, pcts, on = 'AIN', how = 'inner', validate = '1:m')   

    # Spatial join with eligible zones and attach the zoning info
    m2 = gpd.sjoin(m1, eligible_zoning, how = 'inner', op = 'intersects').drop(columns = ['index_right'])

    m2 = m2.drop_duplicates()
        
    return m2

In [4]:
def more_pcts_processing(df): 
    # We care about TOC vs non-TOC entitlements
    df = df.assign(
        is_TOC = df.CASE_NBR.str.contains("TOC").astype(int),
    )
    
    # Subset by CASE_ACTION_ID -- let's use all cases for now (but approved cases are 1, 2, 11)
    # We have some NaN CASE_ACTION_IDs, so we won't subset at all
    
    # At this point, no more duplicates by PARENT_CASE - AIN combination
    
    # But, there are cases that apply to lots of parcels
    # Drop cases that apply to 20+ parcels (6 cases, which are all non-TOC cases)
    big_cases = (df.groupby('CASE_ID')
                 .agg({'id': 'count'})
                 .reset_index()
                )
    
    big_cases = big_cases[big_cases['id'] >= 20]
    df = df[~df.CASE_ID.isin(big_cases.CASE_ID)]
    
    # Subset and keep colums we need
    keep = ['CASE_ID', 'AIN', 'TOC_Tier', 
            'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'id', 
            'CASE_ACTION_ID', 'CASE_FILE_RCV_DT', 'CASE_FILE_DATE', 
            'PARENT_CASE', 'PROJ_DESC_TXT',
            'zone_class', 'centroid', 'is_TOC']
   
    df = df[keep]
    
    # Merge in tract info
    crosswalk_parcels_tracts = catalog.crosswalk_parcels_tracts.read()

    df = pd.merge(df, crosswalk_parcels_tracts[["AIN", "GEOID"]], 
        on = ["AIN"], how = "left", validate = "m:1")
    
    return df

In [5]:
df1 = subset_pcts()   
df2 = more_pcts_processing(df1)



## Create parcel-level df to use for TOC analysis
Use a df where each observation is case-AIN for TOC analysis. The TOC analysis naturally will count a case multiple times, depending on whether that case overlaps with a bus stop boundary, rail station boundary, etc. Bus stops and rail stations are naturally located near each other for transfer points, so counting a TOC case toward a particular bus line, rail line, or rail station naturally counts each TOC case multiple times over.

In [6]:
# This sub-function is used to reshape our df to parcel-level or tract-tier-level.
def groupby_make_wide(df, group_cols):
    # Group by a list of columns and count unique cases 
    # Group by case-tract-tier, case-AIN, etc
    group_cols2 = group_cols + ['is_TOC']
    
    df2 = (df.groupby(group_cols2)
           .agg({'CASE_ID':'count'})
           .rename(columns = {'CASE_ID': 'num_cases'})
           .reset_index()) 

    # Make wide
    df2 = df2.assign(
        num_TOC = df2.apply(lambda row: row.num_cases if row.is_TOC == 1 else np.nan, axis = 1),
        num_nonTOC = df2.apply(lambda row: row.num_cases if row.is_TOC == 0 else np.nan, axis = 1)
    )
    
    # If there are multiple obs for the same AIN, fill the NaNs with the max from the other column 
    # Then, drop duplicates
    df2 = df2.assign(
        num_TOC = df2.num_TOC.fillna(df2.groupby(group_cols)['num_TOC'].transform('max')),
        num_nonTOC = df2.num_nonTOC.fillna(df2.groupby(group_cols)['num_nonTOC'].transform('max'))
    )
    
    # Drop duplicates and clean up
    group_cols3 = group_cols + ['num_TOC', 'num_nonTOC']
    df3 = df2.drop_duplicates(subset = group_cols3)
    
    df3 = (df3.assign(
            num_TOC = df3.num_TOC.fillna(0).astype(int),
            num_nonTOC = df3.num_nonTOC.fillna(0).astype(int)
        ).drop(columns = ['is_TOC', 'num_cases'])
    )
    
    return df3

In [7]:
def make_parcel_level_df(df):
    """
    Ignore the fact that the same case can touch multiple parcels...
    TOC Tiers analysis will naturally count multiple times, because tiers coming from
    bus stops or rail lines will all overlap each other. 
    That analysis uses parcel-level PCTS combined with transit-stop-level TOC Tiers info.
    
    Reshape and make a parcel-level df to be used in later notebooks.
    Create the toc_parcels_with_entitlements.geojson file.
    """
    # Save the geometry of the parcels, just use centroids
    parcel_centroids = df[['AIN', 'centroid']].drop_duplicates()
    
    keep_col = ['CASE_NBR', 'CASE_ID', 'CASE_ACTION_ID', 'CASE_FILE_DATE', 
                'AIN', 'TOC_Tier', 'zone_class', 'is_TOC']
    
    df = df[keep_col]
         
    # Make into parcel-level df and make wide
    parcel_level = ['AIN', 'TOC_Tier', 'zone_class']
    df2 = groupby_make_wide(df, parcel_level)
    
    # Merge geometry back on
    df3 = pd.merge(parcel_centroids, df2, on = 'AIN', how = 'inner', validate = '1:m')
    
    return df3

In [8]:
toc_parcels_with_entitlements = make_parcel_level_df(df2)

toc_parcels_with_entitlements.to_file(driver = 'GeoJSON', 
           filename = '../gis/intermediate/toc_eligible_parcels_with_entitlements.geojson')

s3.upload_file('../gis/intermediate/toc_eligible_parcels_with_entitlements.geojson', 
               bucket_name, 
               'gis/intermediate/toc_eligible_parcels_with_entitlements.geojson')

## Check if cases span multiple tracts / tiers / zone_classes 
For summary stats, we don't want a parcel-level df. We want to try and get as close as possible to the unique case. If we count a case multiple times simply because it touches multiple parcels, we'll overcount by a lot. 

**Outputs below show that a case can span multiple zone_classes within a tract-tier. Infrequent, but can happen.
A case will not span multiple tiers within a tract.**

### Pare down case to be unique to tract-tier-zone_class for descriptive stats
For descriptive summary stats:
* A row should be unique to case-tract-tier-zone_class
* Should we store how many parcels a case (case-tract-tier-zone) applies to by TOC case or not? Let's skip for now, unless there's a real need, we can add it later.
* When this df gets aggregated to tier or zone, a case that does span multiple tiers or multiple zones will be counted multpile times. This is ok. 

In [9]:
def count_obs(df, group_cols):
    df['obs'] = df.groupby(group_cols).cumcount() + 1
    df['max_obs'] = df.groupby(group_cols)["obs"].transform("max")
    print(df.max_obs.value_counts())

In [10]:
group_cols = ["CASE_ID", "GEOID", "TOC_Tier", "zone_class"]
print("# parcels for each CASE_ID")
count_obs(df2, group_cols)
# So a CASE_ID can be applied up to 8 different AINs within the same tract-tier-zone.

display(df2[df2.max_obs==8][["CASE_NBR", "AIN", "TOC_Tier", "zone_class", "GEOID"]].head(8))

# parcels for each CASE_ID
1    1364
2     374
3     147
4     116
6      66
5      55
8      24
7      14
Name: max_obs, dtype: int64


Unnamed: 0,CASE_NBR,AIN,TOC_Tier,zone_class,GEOID
525,ZA-2017-4225-CUW,6069007008,3,R2,6037242700
526,ZA-2017-4225-CUW,6069007010,3,R2,6037242700
527,ZA-2017-4225-CUW,6069007012,3,R2,6037242700
528,ZA-2017-4225-CUW,6069007014,3,R2,6037242700
529,ZA-2017-4225-CUW,6069007016,3,R2,6037242700
530,ZA-2017-4225-CUW,6069012012,3,R2,6037242700
531,ZA-2017-4225-CUW,6069012013,3,R2,6037242700
532,ZA-2017-4225-CUW,6069012023,3,R2,6037242700


In [11]:
keep = ["CASE_ID", "CASE_NBR", "TOC_Tier", "GEOID", "zone_class", "is_TOC"]
no_dups_by_tract_tier_zone = (df2[keep].drop_duplicates()
                              .sort_values(["GEOID", "TOC_Tier", "zone_class"])
                              .reset_index(drop=True)
                             )

group_cols = ["CASE_ID", "GEOID", "TOC_Tier"]
print("# zone_classes for each CASE_ID")
count_obs(no_dups_by_tract_tier_zone, group_cols)
# A couple of cases touch 2 different zone_classes within same tract

display(no_dups_by_tract_tier_zone[no_dups_by_tract_tier_zone.max_obs==3])

# zone_classes for each CASE_ID
1    1605
2      48
3       3
Name: max_obs, dtype: int64


Unnamed: 0,CASE_ID,CASE_NBR,TOC_Tier,GEOID,zone_class,is_TOC,obs,max_obs
932,218745.0,CPC-2018-617-DB-SPR,3,6037213100,C2,0,1,3
934,218745.0,CPC-2018-617-DB-SPR,3,6037213100,R3,0,2,3
936,218745.0,CPC-2018-617-DB-SPR,3,6037213100,R4,0,3,3


In [12]:
keep = ["CASE_ID", "CASE_NBR", "TOC_Tier", "GEOID", "is_TOC"]
no_dups_by_tract_tier = (df2[keep].drop_duplicates()
                              .sort_values(["GEOID", "TOC_Tier"])
                              .reset_index(drop=True)
                             )

group_cols = ["CASE_ID", "GEOID", "TOC_Tier"]
print("# tiers for each CASE_ID")
count_obs(no_dups_by_tract_tier, group_cols)
# Cases always fall within the same tract-tier.

# tiers for each CASE_ID
1    1630
Name: max_obs, dtype: int64


In [13]:
keep = ["CASE_ID", "CASE_NBR", "GEOID", "is_TOC"]
no_dups_by_tract = (df2[keep].drop_duplicates()
                              .sort_values(["GEOID"])
                              .reset_index(drop=True)
                             )

group_cols = ["CASE_ID", "GEOID"]
print("# tracts for each CASE_ID")
count_obs(no_dups_by_tract, group_cols)
# Cases always fall within the same tract.

# tracts for each CASE_ID
1    1606
Name: max_obs, dtype: int64


## Create tract-tier-zone_class-level df
Now that the data exploration made clear just how CASE_ID can span these various geographies, we can get our df ready for summary stats.

In [14]:
def unique_to_tract_tier_zone(df):    
    # Only keep 1 case for each tract-tier-zone
    keep_cols = ["CASE_ID", "TOC_Tier", "GEOID", "CASE_NBR", 
                 "CASE_SEQ_NBR", "CASE_YR_NBR", 
                 "CASE_ACTION_ID", "CASE_FILE_RCV_DT", "CASE_FILE_DATE", 
                 "PARENT_CASE", "PROJ_DESC_TXT", "zone_class", "is_TOC"]
    
    df2 = (df[keep_cols].drop_duplicates()
                   .reset_index(drop=True)
                  )
        
    # Get counts of num_TOC and num_nonTOC by tract-tiers-zone
    tract_tier_level = ['GEOID', 'TOC_Tier', 'zone_class']
    df3 = groupby_make_wide(df2, tract_tier_level)
    
    return df3

In [15]:
df = unique_to_tract_tier_zone(df2)
df.head()

Unnamed: 0,GEOID,TOC_Tier,zone_class,num_TOC,num_nonTOC
0,6037104404,0,C2,0,1
1,6037104404,1,C2,0,1
2,6037113212,1,C2,0,1
3,6037113234,1,C2,0,1
4,6037113237,1,C2,0,2


## Summary stats by parcels

In [16]:
toc_parcels = toc_parcels_with_entitlements[toc_parcels_with_entitlements.num_TOC > 0]
non_toc_parcels = toc_parcels_with_entitlements[toc_parcels_with_entitlements.num_nonTOC > 0]
have_both_parcels = toc_parcels_with_entitlements[(toc_parcels_with_entitlements.num_TOC > 0) & 
                                                  (toc_parcels_with_entitlements.num_nonTOC > 0)]

print(f'# parcels: {len(toc_parcels_with_entitlements)}')
print(f'# parcels with TOC entitlements: {len(toc_parcels)}')
print(f'# parcels with non TOC entitlements: {len(non_toc_parcels)}')
print(f'# parcels with both TOC and non TOC entitlements: {len(have_both_parcels)}')
print(f'double check sum: {len(toc_parcels) + len(non_toc_parcels) - len(have_both_parcels)}')

# parcels: 1912
# parcels with TOC entitlements: 472
# parcels with non TOC entitlements: 1467
# parcels with both TOC and non TOC entitlements: 27
double check sum: 1912


In [17]:
print(f'% parcels with TOC entitlements: {len(toc_parcels) / len(toc_parcels_with_entitlements)}')
print(f'% parcels with non TOC entitlements: {len(non_toc_parcels) / len(toc_parcels_with_entitlements)}')
print(f'% parcels with both entitlements: {len(have_both_parcels) / len(toc_parcels_with_entitlements)}')

% parcels with TOC entitlements: 0.24686192468619247
% parcels with non TOC entitlements: 0.7672594142259415
% parcels with both entitlements: 0.014121338912133892


In [18]:
toc_parcels.zone_class.value_counts()

C2       182
R3       169
R4        82
C4        26
RD1.5      7
R5         3
RAS4       2
RD2        1
Name: zone_class, dtype: int64

In [19]:
non_toc_parcels.zone_class.value_counts()

C2       628
C4       221
R3       145
RD1.5    117
R2       108
RD2       83
R4        71
C5        28
R5        25
C1        15
RD3       13
RAS4      10
RD5        1
RAS3       1
RD4        1
Name: zone_class, dtype: int64

## Breakdown by TOC Tiers
Observations are at the case-tract-tier level.

Tracts are not mutually exclusive! A tract can span multiple tiers (ex: tract is partly in tier 3, partly in tier 4). If there are TOC or Non-TOC entitlements happening, that same tract would show up as an eligible tract in tier 3, and again in tier 4.

In [20]:
def summarize_by_tiers(df):
    df2 = (df.groupby(['TOC_Tier'])
           .agg({'GEOID': 'nunique', 
                 'num_TOC':'sum', 
                 'num_nonTOC':'sum'})
           .rename(columns = {'GEOID': 'num_TOC_eligible_tracts'})
           .reset_index()
          )
    
    for i in ['TOC', 'nonTOC']:
        new_col = f'pct_{i}'
        numerator = f'num_{i}'
        df2[new_col] = df2[numerator] / (df2.num_TOC + df2.num_nonTOC)
        
    return df2

by_tiers = summarize_by_tiers(df)
by_tiers

Unnamed: 0,TOC_Tier,num_TOC_eligible_tracts,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC
0,0,14,3,15,0.166667,0.833333
1,1,160,51,350,0.127182,0.872818
2,2,144,58,239,0.195286,0.804714
3,3,248,177,611,0.224619,0.775381
4,4,45,20,132,0.131579,0.868421


## Breakdown by Zone Class
Observations are at the case-tract-tier-zone_class level. Recall: there are some cases, same CASE_ID, but span different zone_class within the tract. That case would show up multiple times in our aggregation.

Tracts are not mutually exclusive! A tract can obviously have multiple zone classes (ex: tract is partly R3, partly R4). If there are TOC or Non-TOC entitlements happening, that same tract would show up as an eligible tract in R3 and R4.

In [21]:
def summarize_by_zones(df):
    df2 = (df.groupby('zone_class')
           .agg({'GEOID':'nunique', 
                 'num_TOC':'sum', 
                 'num_nonTOC':'sum'})
           .rename(columns = {'GEOID': 'num_TOC_eligible_tracts'})
           .reset_index()
          )
    
    for i in ['TOC', 'nonTOC']:
        new_col = f'pct_{i}'
        numerator = f'num_{i}'
        df2[new_col] = df2[numerator] / (df2.num_TOC + df2.num_nonTOC)
        
    return df2

by_zones = summarize_by_zones(df)
by_zones

Unnamed: 0,zone_class,num_TOC_eligible_tracts,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC
0,C1,10,0,14,0.0,1.0
1,C2,266,107,531,0.167712,0.832288
2,C4,83,17,223,0.070833,0.929167
3,C5,2,0,33,0.0,1.0
4,R2,34,0,109,0.0,1.0
5,R3,124,121,126,0.489879,0.510121
6,R4,59,52,54,0.490566,0.509434
7,R5,9,3,22,0.12,0.88
8,RAS3,1,0,1,0.0,1.0
9,RAS4,7,1,7,0.125,0.875


In [22]:
writer = pd.ExcelWriter('../outputs/toc_charts.xlsx', engine = 'xlsxwriter')

by_tiers.to_excel(writer, sheet_name = 'entitlements_by_tier')
by_zones.to_excel(writer, sheet_name = 'entitlements_by_zone')

writer.save()