# Entitlements in TOC-eligible parcels

In [1]:
import boto3
import geopandas as gpd
import intake
import numpy as np
import os
import pandas as pd
import pcts_census_utils
import pcts_parser
import utils

In [2]:
# Shared configuration for the rest of the notebook: the S3 bucket that holds
# the project data, a boto3 client for it, and the intake data catalog.
bucket_name = 'city-planning-entitlements'
s3 = boto3.client('s3')

catalog = intake.open_catalog("../catalogs/*.yml")

## Process PCTS
* We won't know in advance which AINs are used in PCTS. Keep all the CASE_NBR-AIN pairs, but keep a way to identify how many observations to drop later on
* Join parcels to the zoning polygons of the eligible zone classes
* Want all the entitlements (TOC or non-TOC) after 10/2017 in the TOC-eligible parcels, and then we can see what activity has occurred

In [3]:
def subset_pcts():
    """Build the table of PCTS cases on TOC parcels located in eligible zones.

    Uses the module-level ``s3`` client and ``bucket_name``. The parsed zoning
    parquet is downloaded to ``../gis/`` and deleted again once it has been
    read (or if reading it fails).

    Returns:
        The spatial join (inner, intersects) of the parcel/PCTS merge with the
        zoning polygons whose ``zone_class`` is in the eligible list, with
        fully duplicated rows dropped.
    """
    # Import PCTS - use function to subset
    prefix_suffix_list = ["TOC"]
    start_date = "2017-10"
    pcts = pcts_census_utils.subset_pcts(start_date, prefix_suffix_list)

    # Import parcels
    parcels = gpd.read_file(f'zip+s3://{bucket_name}/gis/intermediate/TOC_Parcels.zip')

    # Grab the centroids and count number of duplicate obs
    # NOTE(review): the exact columns added by utils.get_centroid are not
    # visible here; later steps rely on it providing `centroid` -- confirm.
    parcels2 = utils.get_centroid(parcels)

    # geoparquets can't be read from S3 directly. Download and read locally.
    zoning_file = "parsed_zoning.parquet"
    local_zoning_path = f'../gis/{zoning_file}'
    s3.download_file(bucket_name, f'gis/raw/{zoning_file}', local_zoning_path)
    try:
        zoning = gpd.read_parquet(local_zoning_path)
    finally:
        # Always clean up the temporary download, even if the read fails.
        os.remove(local_zoning_path)

    eligible_zones = ['R2', 'R3', 'RAS3', 'R4', 'RAS4', 'R5',
                      'RD1.5', 'RD2', 'RD3', 'RD4', 'RD5', 'RD6',
                      'C1', 'C2', 'C4', 'C5']

    eligible_zoning = zoning[zoning.zone_class.isin(eligible_zones)]

    # Merge PCTS with parcel info to see which TOC Tier it falls within.
    # validate='1:m' asserts that AIN is unique on the parcel side.
    m1 = pd.merge(parcels2, pcts, on = 'AIN', how = 'inner', validate = '1:m')

    # Spatial join with eligible zones and attach the zoning info
    # NOTE(review): newer geopandas releases rename `op` to `predicate`;
    # switch the keyword once the environment's geopandas version is confirmed.
    m2 = gpd.sjoin(m1, eligible_zoning, how = 'inner', op = 'intersects').drop(columns = ['index_right'])

    m2 = m2.drop_duplicates()

    return m2

In [4]:
def zoning_pcts_processing(df):
    """Add the parsed PCTS prefix, drop ENV/ADM/PAR cases and trim columns.

    Args:
        df: table with a ``CASE_NBR`` column and every column named in
            ``keep`` below (other than ``prefix``, which is added here).

    Returns:
        The rows of ``df`` whose parsed prefix is not ENV, ADM or PAR,
        restricted to the ``keep`` columns. Case numbers the parser rejects
        with ``ValueError`` get a missing prefix and are therefore kept.
    """
    def parse_prefix(case_nbr):
        # Parse a single PCTS case number and return its prefix.
        try:
            return pcts_parser.PCTSCaseNumber(case_nbr).prefix
        except ValueError:
            # Nothing was parsed, so there is no prefix to report. (The
            # previous fallback referenced the unbound parse result and
            # crashed with UnboundLocalError instead.)
            return None

    # Only CASE_NBR is needed for parsing, so apply over that column alone.
    df2 = df.assign(prefix = df.CASE_NBR.apply(parse_prefix))

    # Subset by PCTS prefix, drop ENV/ADM/PAR cases
    drop_prefix = ['ENV', 'ADM', 'PAR']
    df3 = df2.loc[~df2.prefix.isin(drop_prefix)]

    # Subset by CASE_ACTION_ID -- let's use all cases for now (but approved cases are 1, 2, 11)
    # We have some NaN CASE_ACTION_IDs, so we won't subset at all

    # At this point, no more duplicates by PARENT_CASE - AIN combination

    # Subset and keep columns we need
    keep = ['CASE_ID', 'AIN', 'TOC_Tier',
            'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'id',
            'CASE_ACTION_ID', 'CASE_FILE_RCV_DT', 'CASE_FILE_DATE',
            'PARENT_CASE', 'PROJ_DESC_TXT',
            'prefix', 'zone_class', 'centroid', 'TOC']

    return df3[keep]

In [5]:
def tag_toc_entitlements(df):
    """Collapse case-level rows into per-parcel TOC / non-TOC entitlement counts.

    Args:
        df: case-level table with the columns ``CASE_NBR``, ``id``,
            ``CASE_ACTION_ID``, ``CASE_FILE_DATE``, ``AIN``, ``TOC_Tier``,
            ``zone_class``, ``TOC`` (castable to int) and ``centroid``.

    Returns:
        A table with the columns ``AIN``, ``centroid``, ``TOC_Tier``,
        ``zone_class``, ``num_TOC`` and ``num_nonTOC``, where the two count
        columns are integers (0 when the parcel has no such entitlement).

    Raises:
        pandas.errors.MergeError: if an AIN has more than one distinct
            centroid (the final merge is validated as one-to-many).
    """
    # Save the geometry of the parcels, just use centroids
    parcel_centroids = df[['AIN', 'centroid']].drop_duplicates()

    keep_col = ['CASE_NBR', 'id', 'CASE_ACTION_ID', 'CASE_FILE_DATE',
                'AIN', 'TOC_Tier', 'zone_class']

    # Replace the TOC flag with a 0/1 indicator.
    cases = df[keep_col].assign(is_TOC = df['TOC'].astype(int))

    # Make into parcel-level df: number of cases per parcel and TOC flag
    counts = (cases.groupby(['AIN', 'TOC_Tier', 'zone_class', 'is_TOC'])
              .agg({'id': 'count'})
              .reset_index())

    # Make wide: the count goes to num_TOC or num_nonTOC depending on the
    # flag, NaN in the other column. Vectorised instead of a row-wise apply.
    counts = counts.assign(
        num_TOC = counts['id'].where(counts['is_TOC'] == 1),
        num_nonTOC = counts['id'].where(counts['is_TOC'] == 0),
    )

    # If there are multiple obs for the same AIN, fill the NaNs with the max
    # from the other rows of that AIN. Then, drop duplicates.
    by_ain = counts.groupby('AIN')
    counts = counts.assign(
        num_TOC = counts['num_TOC'].fillna(by_ain['num_TOC'].transform('max')),
        num_nonTOC = counts['num_nonTOC'].fillna(by_ain['num_nonTOC'].transform('max')),
    )

    parcel_counts = counts.drop_duplicates(
        subset = ['AIN', 'TOC_Tier', 'zone_class', 'num_TOC', 'num_nonTOC'])

    # Anything still NaN means the parcel has no entitlement of that kind.
    parcel_counts = (parcel_counts.assign(
            num_TOC = parcel_counts['num_TOC'].fillna(0).astype(int),
            num_nonTOC = parcel_counts['num_nonTOC'].fillna(0).astype(int),
        ).drop(columns = ['is_TOC', 'id'])
    )

    # Merge geometry back on
    return pd.merge(parcel_centroids, parcel_counts, on = 'AIN',
                    how = 'inner', validate = '1:m')

In [6]:
# Run the three processing steps in order: build the case/parcel/zoning
# table, parse and filter the cases, then aggregate to per-parcel counts.
df1 = subset_pcts()   
df2 = zoning_pcts_processing(df1)
df = tag_toc_entitlements(df2)

In [15]:
# Preview the first rows of the per-parcel entitlement counts.
df.head()

Unnamed: 0,AIN,centroid,TOC_Tier,zone_class,num_TOC,num_nonTOC
0,2010004040,POINT (6378795.418 1908220.195),1,R3,0,1
1,2010004045,POINT (6378721.467 1908496.944),1,C4,0,1
2,2010004047,POINT (6378828.013 1908358.350),1,C4,0,1
3,2021012016,POINT (6373060.999 1898646.494),1,C2,0,1
4,2021012017,POINT (6373015.575 1898424.105),1,C2,0,1


## Summary stats

In [7]:
# Row masks for having at least one TOC / non-TOC entitlement.
has_toc = df.num_TOC > 0
has_non_toc = df.num_nonTOC > 0

toc_parcels = df[has_toc]
non_toc_parcels = df[has_non_toc]
have_both_parcels = df[has_toc & has_non_toc]

n_toc = len(toc_parcels)
n_non_toc = len(non_toc_parcels)
n_both = len(have_both_parcels)

print(f'# parcels: {len(df)}')
print(f'# parcels with TOC entitlements: {n_toc}')
print(f'# parcels with non TOC entitlements: {n_non_toc}')
print(f'# parcels with both TOC and non TOC entitlements: {n_both}')
# Inclusion-exclusion: TOC + non-TOC - both is compared against the total above.
print(f'double check sum: {n_toc + n_non_toc - n_both}')

# parcels: 7114
# parcels with TOC entitlements: 472
# parcels with non TOC entitlements: 6680
# parcels with both TOC and non TOC entitlements: 38
double check sum: 7114


In [8]:
# Share of rows in each group. The values printed are fractions of len(df),
# not numbers scaled to 100.
n_rows = len(df)
toc_share = len(toc_parcels) / n_rows
non_toc_share = len(non_toc_parcels) / n_rows
both_share = len(have_both_parcels) / n_rows

print(f'% parcels with TOC entitlements: {toc_share}')
print(f'% parcels with non TOC entitlements: {non_toc_share}')
print(f'% parcels with both entitlements: {both_share}')

% parcels with TOC entitlements: 0.06634804610626933
% parcels with non TOC entitlements: 0.9389935338768626
% parcels with both entitlements: 0.005341579983131853


In [9]:
# Number of rows with at least one TOC entitlement, by zone class.
toc_parcels.zone_class.value_counts()

C2       182
R3       169
R4        82
C4        26
RD1.5      7
R5         3
RAS4       2
RD2        1
Name: zone_class, dtype: int64

In [10]:
# Number of rows with at least one non-TOC entitlement, by zone class.
non_toc_parcels.zone_class.value_counts()

RD1.5    1688
R2       1611
C2       1546
R3        915
C4        307
RD2       231
C1        103
R4         94
RD3        57
RD5        35
R5         30
C5         28
RAS4       26
RAS3        8
RD4         1
Name: zone_class, dtype: int64

In [11]:
# Write the per-parcel table to a local GeoJSON file, then copy that file to
# the same relative key in the project bucket.
local_geojson = '../gis/intermediate/toc_eligible_parcels_with_entitlements.geojson'
s3_key = 'gis/intermediate/toc_eligible_parcels_with_entitlements.geojson'

df.to_file(filename = local_geojson, driver = 'GeoJSON')
s3.upload_file(local_geojson, bucket_name, s3_key)

## Breakdown by TOC Tiers

In [12]:
def summarize_by_tiers(df):
    """Summarize row and entitlement counts for each TOC tier.

    Args:
        df: table with ``TOC_Tier``, ``AIN``, ``num_TOC`` and ``num_nonTOC``.

    Returns:
        One row per ``TOC_Tier`` with the number of AIN rows, the summed TOC
        and non-TOC counts, each count's share of the tier's combined count
        (``pct_TOC``, ``pct_nonTOC``), the overall AIN row total (``all_AIN``)
        and the tier's share of it (``pct_AIN``).
    """
    tier_totals = (
        df.groupby('TOC_Tier')
        .agg({'AIN': 'count', 'num_TOC': 'sum', 'num_nonTOC': 'sum'})
        .reset_index()
    )

    # Combined entitlement count per tier is the denominator for both shares.
    entitlements = tier_totals.num_TOC + tier_totals.num_nonTOC
    tier_totals = tier_totals.assign(
        pct_TOC = tier_totals.num_TOC / entitlements,
        pct_nonTOC = tier_totals.num_nonTOC / entitlements,
        all_AIN = tier_totals.AIN.sum(),
    )
    tier_totals['pct_AIN'] = tier_totals.AIN / tier_totals.all_AIN

    return tier_totals

# Tier-level summary of the per-parcel table; displayed as the cell output.
by_tiers = summarize_by_tiers(df)
by_tiers

Unnamed: 0,TOC_Tier,AIN,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC,all_AIN,pct_AIN
0,0,120,3,123,0.02381,0.97619,7114,0.016868
1,1,4092,75,4203,0.017532,0.982468,7114,0.575204
2,2,1569,102,1533,0.062385,0.937615,7114,0.220551
3,3,1151,257,1026,0.200312,0.799688,7114,0.161794
4,4,182,37,194,0.160173,0.839827,7114,0.025583


## Breakdown by Zone Class

In [13]:
def summarize_by_zones(df):
    """Summarize row and entitlement counts for each zone class.

    Args:
        df: table with ``zone_class``, ``AIN``, ``num_TOC`` and ``num_nonTOC``.

    Returns:
        One row per ``zone_class`` with the number of AIN rows, the summed TOC
        and non-TOC counts, each count's share of the zone's combined count
        (``pct_TOC``, ``pct_nonTOC``), the overall AIN row total (``all_AIN``)
        and the zone's share of it (``pct_AIN``).
    """
    aggregations = {'AIN': 'count', 'num_TOC': 'sum', 'num_nonTOC': 'sum'}
    zone_totals = df.groupby('zone_class').agg(aggregations).reset_index()

    # Both shares use the zone's combined entitlement count as denominator.
    entitlements = zone_totals.num_TOC + zone_totals.num_nonTOC
    shares = {
        f'pct_{kind}': zone_totals[f'num_{kind}'] / entitlements
        for kind in ('TOC', 'nonTOC')
    }
    zone_totals = zone_totals.assign(**shares)

    zone_totals['all_AIN'] = zone_totals.AIN.sum()
    zone_totals['pct_AIN'] = zone_totals.AIN / zone_totals.all_AIN

    return zone_totals

# Zone-level summary of the per-parcel table; displayed as the cell output.
by_zones = summarize_by_zones(df)
by_zones

Unnamed: 0,zone_class,AIN,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC,all_AIN,pct_AIN
0,C1,103,0,111,0.0,1.0,7114,0.014478
1,C2,1710,184,1706,0.097354,0.902646,7114,0.240371
2,C4,330,26,353,0.068602,0.931398,7114,0.046387
3,C5,28,0,34,0.0,1.0,7114,0.003936
4,R2,1611,0,1661,0.0,1.0,7114,0.226455
5,R3,1071,169,940,0.15239,0.84761,7114,0.150548
6,R4,173,82,98,0.455556,0.544444,7114,0.024318
7,R5,32,3,33,0.083333,0.916667,7114,0.004498
8,RAS3,8,0,8,0.0,1.0,7114,0.001125
9,RAS4,28,2,31,0.060606,0.939394,7114,0.003936


In [14]:
# Write both summary tables to one workbook, one sheet each. The context
# manager saves and closes the file on exit (also if a write fails), which
# replaces the explicit writer.save() call that newer pandas no longer has.
with pd.ExcelWriter('../outputs/toc_charts.xlsx', engine = 'xlsxwriter') as writer:
    by_tiers.to_excel(writer, sheet_name = 'entitlements_by_tier')
    by_zones.to_excel(writer, sheet_name = 'entitlements_by_zone')