# PCTS Validate Counts
* Parse PCTS case number, and use prefix and suffix to validate counts

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import intake
import boto3
import utils

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = 'city-planning-entitlements'

## Sort out parent-child relationship and generate a case's entire history before dropping duplicates
* APLC_ID or PARNT_CASE_ID
* APLC_ID can be the same across cases with different prefixes. But, they were part of the same application.
* Treat those as separate cases.
* Same prefix, same case. APLC_ID isn't a perfect match, maybe APLC_ID + CASE_SEQ_NBR + CASE_YR_NBR?
* Final decision: use PARNT_CASE_ID, and fill it in whenever it's missing, because those are the parent cases themselves
* Parse case string for some big groups: PAR, ENV, APPEAL, ADM, ENTITLEMENT
* bys parent_case: egen max for PAR, ENV, APPEAL, ADM
* keep parent because it's ENTITLEMENT, but stores stuff from child cases

In [3]:
cases = catalog.pcts.tCASE.read()
app = catalog.pcts.tAPLC.read()

In [4]:
cases1 = cases[['CASE_ID', 'APLC_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'CASE_ACTION_ID', 'ADM_ACTION_DT', 'PARNT_CASE_ID']]
app1 = app[['APLC_ID', 'PROJ_DESC_TXT']]

In [5]:
keep_col = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID', 'PARNT_CASE_ID',
            'CASE_ACTION_ID', 'ADM_ACTION_DT']

cases2 = cases1[(cases1.CASE_YR_NBR >= 2010) & (cases1.CASE_YR_NBR <= 2020)][keep_col]

In [6]:
just_cases = pd.merge(cases2, app1, on = 'APLC_ID', how = 'inner', validate = 'm:1')

In [7]:
view_cols = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID', 'PARNT_CASE_ID', 'CASE_ACTION_ID']

just_cases = just_cases[view_cols].drop_duplicates()

just_cases['obs'] = just_cases.groupby(['CASE_SEQ_NBR', 'CASE_YR_NBR']).cumcount() + 1
just_cases['max_obs'] = just_cases.groupby(['CASE_SEQ_NBR', 'CASE_YR_NBR'])['obs'].transform('max')

just_cases.max_obs.value_counts()

1    47101
2     4032
3      696
4      160
5       35
9        9
7        7
6        6
Name: max_obs, dtype: int64

In [8]:
just_cases['parent_is_null'] = just_cases.PARNT_CASE_ID.isna()

just_cases.parent_is_null.value_counts()

True     49418
False     2628
Name: parent_is_null, dtype: int64

In [9]:
just_cases['PARENT_CASE'] = just_cases.apply(lambda row: row.CASE_ID if row.parent_is_null == True 
                                             else row.PARNT_CASE_ID, axis = 1)

In [10]:
sample_parents = [195906, 177239, 181967]

just_cases[just_cases.PARENT_CASE.isin(sample_parents)].sort_values(['PARENT_CASE', 'CASE_ID'])

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,PARNT_CASE_ID,CASE_ACTION_ID,obs,max_obs,parent_is_null,PARENT_CASE
902,177239.0,ZA-2010-28-CU-CUB,28.0,2010.0,107560.0,,11.0,1,3,True,177239.0
7867,185587.0,ZA-2010-28-CU-CUB-1A,28.0,2010.0,113450.0,177239.0,,2,3,False,177239.0
23076,201852.0,ZA-2010-28-CU-CUB-EXT,28.0,2010.0,168927.0,177239.0,,3,3,False,177239.0
1331,181967.0,ENV-2011-328-MND,328.0,2011.0,110982.0,,,1,2,True,181967.0
12652,188354.0,ENV-2011-328-MND-REC1,328.0,2011.0,115327.0,181967.0,,2,2,False,181967.0
15566,195906.0,DIR-2014-886-SPP-SPPA,886.0,2014.0,120419.0,,2.0,1,4,True,195906.0
20302,200623.0,DIR-2014-886-SPP-SPPA-1A,886.0,2014.0,123459.0,195906.0,,2,4,False,195906.0
22541,201290.0,DIR-2014-886-SPP-SPPA-2A,886.0,2014.0,168552.0,195906.0,,3,4,False,195906.0


In [11]:
parsed_col_names = ['prefix', 'suffix', 'invalid_prefix']

def parse_pcts(row):
    try:
        z = utils.PCTSCaseNumber(row.CASE_NBR)
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)
    except ValueError:
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)

parsed = just_cases.apply(parse_pcts, axis = 1)

just_cases = pd.concat([just_cases, parsed], axis = 1)

just_cases.head()

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,PARNT_CASE_ID,CASE_ACTION_ID,obs,max_obs,parent_is_null,PARENT_CASE,prefix,suffix,invalid_prefix
0,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,118838.0,,1.0,1,1,True,193546.0,ZA,[CEX],
1,193547.0,AA-2013-3080-PMLA-SL,3080.0,2013.0,118839.0,,2.0,1,1,True,193547.0,AA,"[PMLA, SL]",
2,193548.0,ENV-2013-3081-MND,3081.0,2013.0,118839.0,,,1,1,True,193548.0,ENV,[MND],
3,193549.0,ZA-2013-3082-ZAA,3082.0,2013.0,118839.0,,2.0,1,1,True,193549.0,ZA,[ZAA],
4,179486.0,CHC-2010-1806-MA,1806.0,2010.0,109184.0,,,1,1,True,179486.0,CHC,[MA],


In [12]:
# Nones are throwing up error when we loop through to tag different suffixes
just_cases.suffix = just_cases.suffix.fillna('')

In [13]:
# Loop through possible suffixes we want to create dummies for
# if case contains 1 or more of them, tag appeal as 1. doesn't matter if there's multiple appeals. 
just_cases['APPEAL'] = just_cases.apply(lambda row: True if (len((set(row.suffix) & {'1A', '2A', '5A'})) > 0) else False, axis = 1)

 
for s in ['ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP']:
    just_cases[s] = just_cases.apply(lambda row: True if s in row.suffix else False, axis = 1)
    
for p in ['ENV', 'PAR']:
    just_cases[p] = just_cases.apply(lambda row: True if p in row.prefix else False, axis = 1)    

    
def adm_groups(row):
    # Define all the ADM cases, which are a combination of prefix-suffix (sometime in the past) or just prefix (more recent years)
    cond1 = row.prefix == 'AA' and 'WIM' in row.suffix
    cond2 = row.prefix == 'DIR' and (len((set(row.suffix) & {'ACI', 'CEX', 'CWC', 'CWNC', 'HPM', 'VSO'})) > 0)
    cond3 = row.prefix == 'PS' and 'A' in row.suffix
    cond4 = row.prefix == 'ZA' and (len((set(row.suffix) & {'AIC', 'CEX'})) > 0)
    
    if (cond1) or (cond2) or (cond3) or (cond4):
        return True
    if 'ADM' in row.prefix:
        return True
    else:
        return False    
    
just_cases['ADM'] = just_cases.apply(adm_groups, axis = 1)

In [14]:
all_dummies = ['APPEAL', 'ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP', 'ENV', 'PAR', 'ADM']

# Groupby the parent CASE_ID and then find the max for all the prefix and suffix dummies
for col in all_dummies:
    just_cases[col] = just_cases[col].astype(int)
    display(col)
    display(just_cases[col].value_counts())
    just_cases[col] = just_cases.groupby('PARENT_CASE')[col].transform('max')

'APPEAL'

0    50589
1     1457
Name: APPEAL, dtype: int64

'ADD'

0    52046
Name: ADD, dtype: int64

'CC'

0    51992
1       54
Name: CC, dtype: int64

'EXT'

0    51783
1      263
Name: EXT, dtype: int64

'M'

0    52046
Name: M, dtype: int64

'PA'

0    52046
Name: PA, dtype: int64

'REC'

0    52046
Name: REC, dtype: int64

'SUP'

0    52046
Name: SUP, dtype: int64

'ENV'

0    35750
1    16296
Name: ENV, dtype: int64

'PAR'

0    50446
1     1600
Name: PAR, dtype: int64

'ADM'

0    37604
1    14442
Name: ADM, dtype: int64

In [15]:
all_suffix = ['APPEAL', 'ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP']
all_prefix = ['ENV', 'PAR', 'ADM']

just_cases['num_suffix'] = just_cases[all_suffix].sum(axis = 1)
just_cases['num_prefix'] = just_cases[all_prefix].sum(axis = 1)

display(just_cases.num_suffix.value_counts())
display(just_cases.num_prefix.value_counts())

0    48721
1     3179
2      142
3        4
Name: num_suffix, dtype: int64

1    32338
0    19708
Name: num_prefix, dtype: int64

In [16]:
# Only keep parent cases
just_parent = just_cases[just_cases.parent_is_null == True]

# Rename columns
just_parent.rename(columns = {'ADD': 'addendum', 'CC': 'conditional_clearance',
                            'EXT': 'extension', 'M': 'modification', 
                            'PA': 'plan_approval', 'REC': 'reconsideration', 'SUP': 'supplemental',
                            'APPEAL': 'appeal', 'ADM': 'admin', 'ENV': 'env', 'PAR': 'pre_application_review'}, inplace = True)

# Drop columns
drop = ['obs', 'max_obs', 'parent_is_null', 'PARNT_CASE_ID']

just_parent = just_parent.drop(columns = drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [17]:
just_parent.columns

Index(['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID',
       'CASE_ACTION_ID', 'PARENT_CASE', 'prefix', 'suffix', 'invalid_prefix',
       'appeal', 'addendum', 'conditional_clearance', 'extension',
       'modification', 'plan_approval', 'reconsideration', 'supplemental',
       'env', 'pre_application_review', 'admin', 'num_suffix', 'num_prefix'],
      dtype='object')

In [18]:
cols_to_export = ['CASE_ID', 'PARENT_CASE', 'appeal', 'addendum', 'conditional_clearance',
                  'extension', 'modification', 'plan_approval', 'reconsideration', 'supplemental', 
                  'env', 'pre_application_review', 'admin']

just_parent[cols_to_export].to_parquet('../data/parent_cases.parquet')

## Get counts for each category
* 2010-2019
* 2015-2019

In [19]:
df2 = just_parent[just_parent.CASE_YR_NBR <= 2019]

In [20]:
entitlement_prefix = ['AA', 'APCC', 'APCE', 'APCH', 'APCNV', 
                  'APCS', 'APCSV', 'APCW', 'CPC', 'DIR', 'PS', 'TT', 'VTT', 'ZA']

In [21]:
# Entitlements should exclude the other categories, but can include appeals, because appeals need an entitlement to appeal
cond1 = (df2.admin == 0)
cond2 = (df2.env == 0)
cond3 = (df2.pre_application_review == 0)

print(f'# ENTITLE 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.prefix.isin(entitlement_prefix)) & cond1 & cond2 & cond3])}')
print(f'# ENTITLE 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.prefix.isin(entitlement_prefix)) & cond1 & cond2 & cond3])}')

# ENTITLE 2010-2019: 16513
# ENTITLE 2015-2019: 9386


In [22]:
print(f'# ENV 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.env == 1)])}')
print(f'# ENV 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.env == 1)])}')

# ENV 2010-2019: 15812
# ENV 2015-2019: 8588


In [23]:
print(f'# ADM 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.admin == 1)])}')
print(f'# ADM 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.admin == 1)])}')

# ADM 2010-2019: 14210
# ADM 2015-2019: 9746


In [24]:
print(f'# APPEAL 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.appeal == 1)])}')
print(f'# APPEAL 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.appeal == 1)])}')

# APPEAL 2010-2019: 1296
# APPEAL 2015-2019: 646


In [25]:
print(f'# PAR 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.pre_application_review == 1)])}')
print(f'# PAR 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.pre_application_review == 1)])}')

# PAR 2010-2019: 1563
# PAR 2015-2019: 1563
