# Entitlements in TOC-eligible parcels

In [1]:
import boto3
import geopandas as gpd
import intake
import numpy as np
import os
import pandas as pd

import utils
import laplan

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = 'city-planning-entitlements'

## Process PCTS
* Won't know which AINs are used in PCTS. Keep all the CASE_NBR-AINs but have a way to identify how many obs to drop later on
* Join parcels to zoning eligible zones
* Want all the entitlements (TOC or non-TOC) after 10/2017 in the TOC-eligible parcels, and then we can see what activity has occurred

In [3]:
def subset_pcts():   
    # Import PCTS - use function to subset
    pcts = catalog.pcts2.read()
    
    FULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)
    remove_prefix = ["ENV", "PAR", "ADM"]
    prefix = [x for x in FULL_PREFIX if x not in remove_prefix]

    pcts = laplan.pcts.subset_pcts(
        pcts,
        start_date="2017-10-01",
        prefix_list=prefix,
        get_dummies=True,
    )
    
    pcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements = True)
    
    dropme = prefix_suffix_cols = prefix + list(laplan.pcts.VALID_PCTS_SUFFIX)
    pcts = pcts.drop(columns = dropme)
    
    # Import parcels
    parcels = catalog.crosswalk_parcels_tracts.read()
    
    # Import parcels with zoning crosswalk
    parcels_with_zoning = utils.download_geoparquet("parcels_joined_zones.parquet", 
                                                    S3_path = "gis/intermediate/")

    # Import parcels with zoning
    eligible_zones = ['R2', 'R3', 'RAS3', 'R4', 'RAS4', 'R5', 
                  'RD1.5', 'RD2', 'RD3', 'RD4', 'RD5', 'RD6', 
                  'C1', 'C2', 'C4', 'C5']

    eligible_zoning = (parcels_with_zoning[parcels_with_zoning.zone_class.isin(eligible_zones)]
                        [["uuid", "zone_class"]]
                      )
    
    # There are duplicates in eligible_zoning, which means same uuid associated with 2 zone_classes
    # Pick more restrictive zone to keep
    eligible_zoning = eligible_zoning.assign(
        zone_order = eligible_zoning.zone_class.map(utils.ZONE_CLASS_ORDER)
    )

    eligible_zoning = (eligible_zoning.sort_values(["uuid", "zone_order"])
                       .drop_duplicates(subset = "uuid", keep = "first")
                       .drop(columns = "zone_order")
                      )
    
    # Merge parcel crosswalk with parcels in eligible zones
    # Inner join: drop those that aren't in eligible zones
    parcels_eligible_zones = pd.merge(parcels, eligible_zoning, 
                    on = "uuid", how = "inner", validate = "m:1")
    
    
    # Merge PCTS with parcel info to see which TOC Tier and zone class is associated with each 
    m1 = pd.merge(pcts, parcels_eligible_zones, on = 'AIN', how = 'inner', validate = 'm:1')   
        
    # There are duplicates on uuid-CASE_ID...leave for now? Affects <1% of obs.
    # If it touches more than 20 parcels...it'll get dropped as outlier?
    # We are also only looking at unique CASE_IDs in analysis.
    m2 = m1.copy()
    m2["obs"] = m2.groupby(["CASE_ID", "uuid"]).cumcount() + 1
    m2["max_obs"] = m2.groupby(["CASE_ID", "uuid"])["obs"].transform("max")
    print("Duplicates on CASE_ID-uuid")
    print(m2.max_obs.value_counts())
    
    return m1

In [4]:
def more_pcts_processing(df): 
    # We care about TOC vs non-TOC entitlements
    df = df.assign(
        is_TOC = df.CASE_NBR.str.contains("TOC").astype(int),
    )
    
    # Subset by CASE_ACTION_ID -- let's use all cases for now (but approved cases are 1, 2, 11)
    # We have some NaN CASE_ACTION_IDs, so we won't subset at all
    
    # At this point, no more duplicates by PARENT_CASE - AIN combination
    
    # But, there are cases that apply to lots of parcels
    # Drop cases that apply to 20+ parcels (6 cases, which are all non-TOC cases)
    big_cases = (df.groupby('CASE_ID')
                 .agg({'uuid': 'count'})
                 .reset_index()
                )
    
    big_cases = big_cases[big_cases['uuid'] >= 20]
    df = df[~df.CASE_ID.isin(big_cases.CASE_ID)]
        
    return df

In [5]:
df1 = subset_pcts()   
df2 = more_pcts_processing(df1)

Duplicates on CASE_ID-uuid
1     14841
79       79
3        18
2        14
9         9
7         7
Name: max_obs, dtype: int64


## Create parcel-level df to use for TOC analysis
Use a df where each observation is case-AIN for TOC analysis. The TOC analysis naturally will count a case multiple times, depending on whether that case overlaps with a bus stop boundary, rail station boundary, etc. Bus stops and rail stations are naturally located near each other for transfer points, so counting a TOC case toward a particular bus line, rail line, or rail station naturally counts each TOC case multiple times over.

In [6]:
# This sub-function is used to reshape our df to parcel-level or tract-tier-level.
def groupby_make_wide(df, group_cols):
    # Group by a list of columns and count unique cases 
    # Group by case-tract-tier, case-AIN, etc
    group_cols2 = group_cols + ['is_TOC']
    
    df2 = (df.groupby(group_cols2)
           .agg({'CASE_ID':'count'})
           .rename(columns = {'CASE_ID': 'num_cases'})
           .reset_index()) 

    # Make wide
    df2 = df2.assign(
        num_TOC = df2.apply(lambda row: row.num_cases if row.is_TOC == 1 else np.nan, axis = 1),
        num_nonTOC = df2.apply(lambda row: row.num_cases if row.is_TOC == 0 else np.nan, axis = 1)
    )
    
    # If there are multiple obs for the same AIN, fill the NaNs with the max from the other column 
    # Then, drop duplicates
    df2 = df2.assign(
        num_TOC = df2.num_TOC.fillna(df2.groupby(group_cols)['num_TOC'].transform('max')),
        num_nonTOC = df2.num_nonTOC.fillna(df2.groupby(group_cols)['num_nonTOC'].transform('max'))
    )
    
    # Drop duplicates and clean up
    group_cols3 = group_cols + ['num_TOC', 'num_nonTOC']
    df3 = df2.drop_duplicates(subset = group_cols3)
    
    df3 = (df3.assign(
            num_TOC = df3.num_TOC.fillna(0).astype(int),
            num_nonTOC = df3.num_nonTOC.fillna(0).astype(int)
        ).drop(columns = ['is_TOC', 'num_cases'])
    )
    
    return df3

In [7]:
df2.head(2)

Unnamed: 0,CASE_ID,APLC_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,CASE_ACTION_ID,CASE_FILE_RCV_DT,CASE_FILE_DATE,PARNT_CASE_ID,PARENT_CASE,...,x,y,num_AIN,TOC_Tier,GEOID,total_AIN,pct_toc_AIN,toc_AIN,zone_class,is_TOC
0,210270.0,174665.0,ZA-2016-3724-CU-ZAA-SPP,3724.0,2016.0,4.0,2018-03-21,2018-03,,210270.0,...,-118.389601,34.15731,1,0,6037143300,1277,0.0,0,RD1.5,0
1,210270.0,174665.0,ZA-2016-3724-CU-ZAA-SPP,3724.0,2016.0,4.0,2018-03-21,2018-03,,210270.0,...,-118.389435,34.15731,1,0,6037143300,1277,0.0,0,RD1.5,0


In [8]:
def make_parcel_level_df(df):
    """
    Ignore the fact that the same case can touch multiple parcels...
    TOC Tiers analysis will naturally count multiple times, because tiers coming from
    bus stops or rail lines will all overlap each other. 
    That analysis uses parcel-level PCTS combined with transit-stop-level TOC Tiers info.
    
    Reshape and make a parcel-level df to be used in later notebooks.
    But, we should drop duplicates based on uuid, so that it is truly the unique physical parcel.
    Create the toc_parcels_with_entitlements.geojson file.
    """
    
    keep_col = ['CASE_NBR', 'CASE_ID', 'CASE_ACTION_ID', 'CASE_FILE_DATE', 
                'uuid', 'AIN', 'x', 'y', 'TOC_Tier', 'zone_class', 'is_TOC']
    
    # Drop duplicates based on uuid
    df = (df[keep_col]
          .sort_values(["uuid", "AIN"])
          .drop_duplicates(subset = "uuid", keep = "first")
          .reset_index(drop=True)
         )
         
    # Make into parcel-level df and make wide
    parcel_level = ['uuid', 'AIN', 'x', 'y', 'TOC_Tier', 'zone_class']
    df2 = groupby_make_wide(df, parcel_level)
        
    # Make gdf
    df3 = gpd.GeoDataFrame(
        df2,
        geometry = gpd.points_from_xy(df2.x, df2.y), 
        crs = "EPSG:4326"
    ).drop(columns = ["x", "y"])
    
    return df3

In [9]:
toc_parcels_with_entitlements = make_parcel_level_df(df2)

file_name = "toc_eligible_parcels_with_entitlements.geojson"

toc_parcels_with_entitlements.to_file(driver = 'GeoJSON', filename = f'../gis/intermediate/{file_name}')

s3.upload_file(f'../gis/intermediate/{file_name}', bucket_name, f'gis/intermediate/{file_name}')

## Check if cases span multiple tracts / tiers / zone_classes 
For summary stats, we don't want a parcel-level df. We want to try and get as close as possible to the unique case. If we count a case multiple times simply because it touches multiple parcels, we'll overcount by a lot. 

**Outputs below show that a case can span multiple zone_classes within a tract-tier. Infrequent, but can happen.
A case will not span multiple tiers within a tract.**

### Pare down case to be unique to tract-tier-zone_class for descriptive stats
For descriptive summary stats:
* A row should be unique to case-tract-tier-zone_class
* Should we store how many parcels a case (case-tract-tier-zone) applies to by TOC case or not? Let's skip for now, unless there's a real need, we can add it later.
* When this df gets aggregated to tier or zone, a case that does span multiple tiers or multiple zones will be counted multiple times. This is ok. 

In [10]:
def count_obs(df, group_cols):
    df['obs'] = df.groupby(group_cols).cumcount() + 1
    df['max_obs'] = df.groupby(group_cols)["obs"].transform("max")
    print(df.max_obs.value_counts())

In [11]:
group_cols = ["CASE_ID", "GEOID", "TOC_Tier", "zone_class"]
print("# parcels for each CASE_ID")
count_obs(df2, group_cols)
# So a CASE_ID can be applied up to 8 different AINs within the same tract-tier-zone.

display(df2[df2.max_obs==8][["CASE_NBR", "AIN", "TOC_Tier", "zone_class", "GEOID"]].head(8))

# parcels for each CASE_ID
1     2039
2      586
3      216
4      192
5      130
6       78
7       35
11      33
8       24
9        9
Name: max_obs, dtype: int64


Unnamed: 0,CASE_NBR,AIN,TOC_Tier,zone_class,GEOID
133,ZA-2017-4225-CUW,6069007008,3,R2,6037242700
134,ZA-2017-4225-CUW,6069007010,3,R2,6037242700
135,ZA-2017-4225-CUW,6069007012,3,R2,6037242700
136,ZA-2017-4225-CUW,6069007014,3,R2,6037242700
137,ZA-2017-4225-CUW,6069007016,3,R2,6037242700
138,ZA-2017-4225-CUW,6069012012,3,R2,6037242700
139,ZA-2017-4225-CUW,6069012013,3,R2,6037242700
140,ZA-2017-4225-CUW,6069012023,3,R2,6037242700


In [12]:
keep = ["CASE_ID", "CASE_NBR", "TOC_Tier", "GEOID", "zone_class", "is_TOC"]
no_dups_by_tract_tier_zone = (df2[keep].drop_duplicates()
                              .sort_values(["GEOID", "TOC_Tier", "zone_class"])
                              .reset_index(drop=True)
                             )

group_cols = ["CASE_ID", "GEOID", "TOC_Tier"]
print("# zone_classes for each CASE_ID")
count_obs(no_dups_by_tract_tier_zone, group_cols)
# A couple of cases touch 2 different zone_classes within same tract

display(no_dups_by_tract_tier_zone[no_dups_by_tract_tier_zone.max_obs==3])

# zone_classes for each CASE_ID
1    2430
2      70
3       3
Name: max_obs, dtype: int64


Unnamed: 0,CASE_ID,CASE_NBR,TOC_Tier,GEOID,zone_class,is_TOC,obs,max_obs
1440,218745.0,CPC-2018-617-DB-SPR,3,6037213100,C2,0,1,3
1442,218745.0,CPC-2018-617-DB-SPR,3,6037213100,R3,0,2,3
1443,218745.0,CPC-2018-617-DB-SPR,3,6037213100,R4,0,3,3


In [13]:
keep = ["CASE_ID", "CASE_NBR", "TOC_Tier", "GEOID", "is_TOC"]
no_dups_by_tract_tier = (df2[keep].drop_duplicates()
                              .sort_values(["GEOID", "TOC_Tier"])
                              .reset_index(drop=True)
                             )

group_cols = ["CASE_ID", "GEOID", "TOC_Tier"]
print("# tiers for each CASE_ID")
count_obs(no_dups_by_tract_tier, group_cols)
# Cases always fall within the same tract-tier.

# tiers for each CASE_ID
1    2466
Name: max_obs, dtype: int64


In [14]:
keep = ["CASE_ID", "CASE_NBR", "GEOID", "is_TOC"]
no_dups_by_tract = (df2[keep].drop_duplicates()
                              .sort_values(["GEOID"])
                              .reset_index(drop=True)
                             )

group_cols = ["CASE_ID", "GEOID"]
print("# tracts for each CASE_ID")
count_obs(no_dups_by_tract, group_cols)
# Cases always fall within the same tract.

# tracts for each CASE_ID
1    2356
Name: max_obs, dtype: int64


## Create tract-tier-zone_class-level df
Now that the data exploration made clear just how CASE_ID can span these various geographies, we can get our df ready for summary stats.

In [15]:
def unique_to_tract_tier_zone(df):    
    # Only keep 1 case for each tract-tier-zone
    keep_cols = ["CASE_ID", "TOC_Tier", "GEOID", "CASE_NBR", 
                 "CASE_SEQ_NBR", "CASE_YR_NBR", 
                 "CASE_ACTION_ID", "CASE_FILE_RCV_DT", "CASE_FILE_DATE", 
                 "PARENT_CASE", "PROJ_DESC_TXT", "zone_class", "is_TOC"]
    
    df2 = (df[keep_cols].drop_duplicates()
                   .reset_index(drop=True)
                  )
        
    # Get counts of num_TOC and num_nonTOC by tract-tiers-zone
    tract_tier_level = ['GEOID', 'TOC_Tier', 'zone_class']
    df3 = groupby_make_wide(df2, tract_tier_level)
    
    return df3

In [16]:
df = unique_to_tract_tier_zone(df2)
df.head()

Unnamed: 0,GEOID,TOC_Tier,zone_class,num_TOC,num_nonTOC
0,6037101210,0,C2,0,5
1,6037101210,0,R3,0,2
2,6037101210,0,RD1.5,0,1
3,6037101220,0,C2,0,1
4,6037101300,0,C2,0,2


## Summary stats by parcels

In [17]:
toc_parcels = toc_parcels_with_entitlements[toc_parcels_with_entitlements.num_TOC > 0]
non_toc_parcels = toc_parcels_with_entitlements[toc_parcels_with_entitlements.num_nonTOC > 0]
have_both_parcels = toc_parcels_with_entitlements[(toc_parcels_with_entitlements.num_TOC > 0) & 
                                                  (toc_parcels_with_entitlements.num_nonTOC > 0)]

print(f'# parcels: {len(toc_parcels_with_entitlements)}')
print(f'# parcels with TOC entitlements: {len(toc_parcels)}')
print(f'# parcels with non TOC entitlements: {len(non_toc_parcels)}')
print(f'# parcels with both TOC and non TOC entitlements: {len(have_both_parcels)}')
print(f'double check sum: {len(toc_parcels) + len(non_toc_parcels) - len(have_both_parcels)}')

# parcels: 2890
# parcels with TOC entitlements: 522
# parcels with non TOC entitlements: 2368
# parcels with both TOC and non TOC entitlements: 0
double check sum: 2890


In [18]:
print(f'% parcels with TOC entitlements: {len(toc_parcels) / len(toc_parcels_with_entitlements)}')
print(f'% parcels with non TOC entitlements: {len(non_toc_parcels) / len(toc_parcels_with_entitlements)}')
print(f'% parcels with both entitlements: {len(have_both_parcels) / len(toc_parcels_with_entitlements)}')

% parcels with TOC entitlements: 0.1806228373702422
% parcels with non TOC entitlements: 0.8193771626297578
% parcels with both entitlements: 0.0


In [19]:
toc_parcels.zone_class.value_counts()

C2       212
R3       179
R4        90
C4        23
RD1.5      9
R5         3
RD2        3
RAS4       2
R2         1
Name: zone_class, dtype: int64

In [20]:
non_toc_parcels.zone_class.value_counts()

C2       1063
C4        357
R3        208
RD1.5     200
R2        166
RD2       119
R4         81
C1         54
R5         34
C5         32
RD3        26
RAS4       19
RD5         3
RD6         2
RAS3        2
RD4         2
Name: zone_class, dtype: int64

## Breakdown by TOC Tiers
Observations are at the case-tract-tier level.

Tracts are not mutually exclusive! A tract can span multiple tiers (ex: tract is partly in tier 3, partly in tier 4). If there are TOC or Non-TOC entitlements happening, that same tract would show up as an eligible tract in tier 3, and again in tier 4.

In [21]:
def summarize_by_tiers(df):
    df2 = (df.groupby(['TOC_Tier'])
           .agg({'GEOID': 'nunique', 
                 'num_TOC':'sum', 
                 'num_nonTOC':'sum'})
           .rename(columns = {'GEOID': 'num_TOC_eligible_tracts'})
           .reset_index()
          )
    
    for i in ['TOC', 'nonTOC']:
        new_col = f'pct_{i}'
        numerator = f'num_{i}'
        df2[new_col] = df2[numerator] / (df2.num_TOC + df2.num_nonTOC)
        
    return df2

by_tiers = summarize_by_tiers(df)
by_tiers

Unnamed: 0,TOC_Tier,num_TOC_eligible_tracts,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC
0,0,344,47,809,0.054907,0.945093
1,1,162,52,353,0.128395,0.871605
2,2,144,58,239,0.195286,0.804714
3,3,248,180,612,0.227273,0.772727
4,4,45,20,133,0.130719,0.869281


## Breakdown by Zone Class
Observations are at the case-tract-tier-zone_class level. Recall: there are some cases, same CASE_ID, but span different zone_class within the tract. That case would show up multiple times in our aggregation.

Tracts are not mutually exclusive! A tract can obviously have multiple zone classes (ex: tract is partly R3, partly R4). If there are TOC or Non-TOC entitlements happening, that same tract would show up as an eligible tract in R3 and R4.

In [22]:
def summarize_by_zones(df):
    df2 = (df.groupby('zone_class')
           .agg({'GEOID':'nunique', 
                 'num_TOC':'sum', 
                 'num_nonTOC':'sum'})
           .rename(columns = {'GEOID': 'num_TOC_eligible_tracts'})
           .reset_index()
          )
    
    for i in ['TOC', 'nonTOC']:
        new_col = f'pct_{i}'
        numerator = f'num_{i}'
        df2[new_col] = df2[numerator] / (df2.num_TOC + df2.num_nonTOC)
        
    return df2

by_zones = summarize_by_zones(df)
by_zones

Unnamed: 0,zone_class,num_TOC_eligible_tracts,num_TOC,num_nonTOC,pct_TOC,pct_nonTOC
0,C1,26,0,41,0.0,1.0
1,C2,386,135,881,0.132874,0.867126
2,C4,112,18,345,0.049587,0.950413
3,C5,2,0,37,0.0,1.0
4,R2,47,1,171,0.005814,0.994186
5,R3,159,129,190,0.404389,0.595611
6,R4,69,59,67,0.468254,0.531746
7,R5,10,3,33,0.083333,0.916667
8,RAS3,2,0,2,0.0,1.0
9,RAS4,14,1,17,0.055556,0.944444


In [23]:
writer = pd.ExcelWriter('../outputs/toc_charts.xlsx', engine = 'xlsxwriter')

by_tiers.to_excel(writer, sheet_name = 'entitlements_by_tier')
by_zones.to_excel(writer, sheet_name = 'entitlements_by_zone')

writer.save()