# PCTS Validate Counts
* Parse PCTS case number, and use prefix and suffix to validate counts

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import intake
import pcts_parser

In [2]:
# Name of the S3 bucket that holds this project's data
bucket_name = 'city-planning-entitlements'

# Intake catalog built from every YAML file under ../catalogs
catalog = intake.open_catalog("../catalogs/*.yml")

### Sort out parent-child relationship and generate a case's entire history before dropping duplicates
* Final decision: use PARNT_CASE_ID, and fill it in whenever it's missing, because those are the parent cases themselves
* Parse case string for some big groups: PAR, ENV, APPEAL, ADM, ENTITLEMENT
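* Group by parent case and take the max of each of the PAR, ENV, APPEAL, ADM flags (Stata shorthand: `bys parent_case: egen max`)
* Keep only the parent row, because it is the ENTITLEMENT case, but have it carry the flags found on its child cases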

In [3]:
# Read the master PCTS table from the project's S3 bucket
master_pcts_path = f's3://{bucket_name}/data/final/master_pcts.parquet'
df = pd.read_parquet(master_pcts_path)

In [4]:
# Application-type indicator columns that get rolled up to the parent case
type_flags = ['env', 'pre_application_review', 'admin', 'appeal']

# For every PARENT_CASE, take the max of each indicator across all rows
# sharing that parent, so a flag set on any child shows up for the parent.
aggregated = df.pivot_table(
    index=['PARENT_CASE'],
    values=type_flags,
    aggfunc='max',
).reset_index()

# Swap the row-level indicators for the parent-level maxima.
# validate='m:1' makes the merge fail if `aggregated` has duplicate parents.
without_flags = df.drop(columns=type_flags)
df = pd.merge(
    without_flags,
    aggregated,
    on='PARENT_CASE',
    how='left',
    validate='m:1',
)

# Keep only rows where the case is its own parent; the rolled-up flags
# already carry the child-case history. Then drop exact duplicates.
keep = ['CASE_ID', 'CASE_YR_NBR', 'PARENT_CASE',
        'admin', 'appeal', 'env', 'pre_application_review']

is_parent_row = df['CASE_ID'] == df['PARENT_CASE']
df = df.loc[is_parent_row, keep].drop_duplicates()

## Get counts for each category
* 2010-2019
* 2015-2019
* 2017, 2018, 2019 individual years

In [5]:
def count_cases(row):
    """Convert one case's application-type indicators into five 0/1 flags.

    Parameters
    ----------
    row : object with attributes ``admin``, ``env``, ``appeal`` and
        ``pre_application_review`` (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pd.Series
        Indexed by ``is_entitlement``, ``is_env``, ``is_admin``,
        ``is_appeal``, ``is_par``. A case is flagged as an entitlement only
        when ``admin``, ``env`` and ``pre_application_review`` are all 0;
        ``appeal`` plays no part in that test. Each remaining flag is 1
        exactly when its source indicator equals 1.
    """
    # Entitlement is defined by the absence of the three other types
    no_admin = (row.admin == 0)
    no_env = (row.env == 0)
    no_par = (row.pre_application_review == 0)

    labels = ['is_entitlement', 'is_env', 'is_admin', 'is_appeal', 'is_par']
    flags = [
        int(bool(no_admin and no_env and no_par)),
        int(bool(row.env == 1)),
        int(bool(row.admin == 1)),
        int(bool(row.appeal == 1)),
        int(bool(row.pre_application_review == 1)),
    ]

    return pd.Series(flags, index=labels)

# Classify every row, then attach the five is_* flags beside the original columns
counts = df.apply(count_cases, axis='columns')
df2 = pd.concat([df, counts], axis='columns')

In [6]:
# Preview the first five rows to eyeball the is_* flag columns
df2.head(n=5)

Unnamed: 0,CASE_ID,CASE_YR_NBR,PARENT_CASE,admin,appeal,env,pre_application_review,is_entitlement,is_env,is_admin,is_appeal,is_par
0,193546.0,2013.0,193546.0,1,0,0,0,0,0,1,0,0
1,234299.0,2019.0,234299.0,0,0,0,0,1,0,0,0,0
2,193547.0,2013.0,193547.0,0,0,0,0,1,0,0,0,0
3,193548.0,2013.0,193548.0,0,0,1,0,0,1,0,0,0
4,193549.0,2013.0,193549.0,0,0,0,0,1,0,0,0,0


In [7]:
case_year = df2.CASE_YR_NBR

# Multi-year windows; both endpoints are included
df2010 = df2[case_year.between(2010, 2019)]
df2015 = df2[case_year.between(2015, 2019)]

# Single-year slices
df2017 = df2[case_year == 2017]
df2018 = df2[case_year == 2018]
df2019 = df2[case_year == 2019]

In [8]:
# Each subset is labelled by the first year it covers
dataframes = {
    '2010': df2010,
    '2015': df2015,
    '2017': df2017,
    '2018': df2018,
    '2019': df2019,
}

# Sum every is_* flag to get the number of cases in each category
flag_columns = ['is_entitlement', 'is_env', 'is_admin', 'is_appeal', 'is_par']
sum_spec = dict.fromkeys(flag_columns, 'sum')

for label, subset in dataframes.items():
    display(label)
    display(subset.agg(sum_spec).reset_index())

# NOTE: the counts for the individual years 2017-2019 still don't line up exactly

'2010'

Unnamed: 0,index,0
0,is_entitlement,16683
1,is_env,13398
2,is_admin,13833
3,is_appeal,1243
4,is_par,1549


'2015'

Unnamed: 0,index,0
0,is_entitlement,9303
1,is_env,6520
2,is_admin,9411
3,is_appeal,610
4,is_par,1549


'2017'

Unnamed: 0,index,0
0,is_entitlement,2008
1,is_env,1355
2,is_admin,1272
3,is_appeal,118
4,is_par,0


'2018'

Unnamed: 0,index,0
0,is_entitlement,1905
1,is_env,1284
2,is_admin,2820
3,is_appeal,98
4,is_par,715


'2019'

Unnamed: 0,index,0
0,is_entitlement,1664
1,is_env,1126
2,is_admin,3357
3,is_appeal,20
4,is_par,834
