# PCTS Validate Counts
* Parse PCTS case number, and use prefix and suffix to validate counts

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import intake
import boto3
import utils

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = 'city-planning-entitlements'

## Sort out parent-child relationship and generate a case's entire history before dropping duplicates
* APLC_ID or PARNT_CASE_ID
* APLC_ID can be the same across cases with different prefixes. But, they were part of the same application.
* Treat those as separate cases.
* Same prefix, same case. APLC_ID isn't a perfect match, maybe APLC_ID + CASE_SEQ_NBR + CASE_YR_NBR?
* Final decision: use PARNT_CASE_ID, and fill it in whenever it's missing, because those are the parent cases themselves
* Parse case string for some big groups: PAR, ENV, APPEAL, ADM, ENTITLEMENT
* bys parent_case: egen max for PAR, ENV, APPEAL, ADM
* keep parent because it's ENTITLEMENT, but stores stuff from child cases

In [3]:
cases = pd.read_parquet('../data/tCASE.parquet')
app = pd.read_parquet('../data/tAPLC.parquet')
#cases = catalog.pcts.tCASE.read()
#app = catalog.pcts.tAPLC.read()

In [4]:
possible_dates = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 
                  'CASE_FILE_SENT_DT', 'CASE_FILE_RCV_DT', 
                 'TIMESTAMP', 'CRTN_DT']
cases[cases.CASE_YR_NBR >= 2010][possible_dates].head(3)

# Let's use CASE_FILE_RCV_DT

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,CASE_FILE_SENT_DT,CASE_FILE_RCV_DT,TIMESTAMP,CRTN_DT
451,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,2013-10-03,2013-10-03,2016-04-11 14:06:47,2013-10-03 10:15:35
453,193547.0,AA-2013-3080-PMLA-SL,3080.0,2013.0,2013-10-04,2013-10-03,2017-07-05 08:11:40,2013-10-03 10:46:12
956,179486.0,CHC-2010-1806-MA,1806.0,2010.0,NaT,2010-07-01,2016-07-06 11:46:44,2010-07-01 10:29:48


In [5]:
cases1 = cases[['CASE_ID', 'APLC_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'CASE_FILE_RCV_DT', 
                'CASE_ACTION_ID', 'ADM_ACTION_DT', 'PARNT_CASE_ID']]
app1 = app[['APLC_ID', 'PROJ_DESC_TXT']]

In [6]:
keep_col = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'CASE_FILE_RCV_DT', 
            'APLC_ID', 'PARNT_CASE_ID',
            'CASE_ACTION_ID', 'ADM_ACTION_DT']

cases2 = cases1[(cases1.CASE_YR_NBR >= 2010) & (cases1.CASE_YR_NBR <= 2020)][keep_col]

In [7]:
just_cases = pd.merge(cases2, app1, on = 'APLC_ID', how = 'inner', validate = 'm:1')

In [8]:
view_cols = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID', 'PARNT_CASE_ID', 
             'CASE_ACTION_ID', 'CASE_FILE_RCV_DT']

just_cases = just_cases[view_cols].drop_duplicates()

just_cases['obs'] = just_cases.groupby(['CASE_SEQ_NBR', 'CASE_YR_NBR']).cumcount() + 1
just_cases['max_obs'] = just_cases.groupby(['CASE_SEQ_NBR', 'CASE_YR_NBR'])['obs'].transform('max')

just_cases.max_obs.value_counts()

1    47101
2     4032
3      696
4      160
5       35
9        9
7        7
6        6
Name: max_obs, dtype: int64

In [9]:
just_cases['parent_is_null'] = just_cases.PARNT_CASE_ID.isna()

just_cases.parent_is_null.value_counts()

True     49418
False     2628
Name: parent_is_null, dtype: int64

In [10]:
just_cases['PARENT_CASE'] = just_cases.apply(lambda row: row.CASE_ID if row.parent_is_null == True 
                                             else row.PARNT_CASE_ID, axis = 1)

In [11]:
sample_parents = [195906, 177239, 181967]

just_cases[just_cases.PARENT_CASE.isin(sample_parents)].sort_values(['PARENT_CASE', 'CASE_ID'])

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,PARNT_CASE_ID,CASE_ACTION_ID,CASE_FILE_RCV_DT,obs,max_obs,parent_is_null,PARENT_CASE
902,177239.0,ZA-2010-28-CU-CUB,28.0,2010.0,107560.0,,11.0,2010-01-06 00:00:00,1,3,True,177239.0
7867,185587.0,ZA-2010-28-CU-CUB-1A,28.0,2010.0,113450.0,177239.0,,2011-12-29 15:42:21,2,3,False,177239.0
23076,201852.0,ZA-2010-28-CU-CUB-EXT,28.0,2010.0,168927.0,177239.0,,2015-04-01 17:23:53,3,3,False,177239.0
1331,181967.0,ENV-2011-328-MND,328.0,2011.0,110982.0,,,2011-02-08 00:00:00,1,2,True,181967.0
12652,188354.0,ENV-2011-328-MND-REC1,328.0,2011.0,115327.0,181967.0,,2012-08-23 00:00:00,2,2,False,181967.0
15566,195906.0,DIR-2014-886-SPP-SPPA,886.0,2014.0,120419.0,,2.0,2014-03-17 00:00:00,1,4,True,195906.0
20302,200623.0,DIR-2014-886-SPP-SPPA-1A,886.0,2014.0,123459.0,195906.0,,2015-01-06 00:00:00,2,4,False,195906.0
22541,201290.0,DIR-2014-886-SPP-SPPA-2A,886.0,2014.0,168552.0,195906.0,,2015-02-19 00:00:00,3,4,False,195906.0


In [12]:
parsed_col_names = ['prefix', 'suffix', 'invalid_prefix']

def parse_pcts(row):
    try:
        z = utils.PCTSCaseNumber(row.CASE_NBR)
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)
    except ValueError:
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)

parsed = just_cases.apply(parse_pcts, axis = 1)

just_cases = pd.concat([just_cases, parsed], axis = 1)

just_cases.head()

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,PARNT_CASE_ID,CASE_ACTION_ID,CASE_FILE_RCV_DT,obs,max_obs,parent_is_null,PARENT_CASE,prefix,suffix,invalid_prefix
0,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,118838.0,,1.0,2013-10-03,1,1,True,193546.0,ZA,[CEX],
1,193547.0,AA-2013-3080-PMLA-SL,3080.0,2013.0,118839.0,,2.0,2013-10-03,1,1,True,193547.0,AA,"[PMLA, SL]",
2,193548.0,ENV-2013-3081-MND,3081.0,2013.0,118839.0,,,2013-10-03,1,1,True,193548.0,ENV,[MND],
3,193549.0,ZA-2013-3082-ZAA,3082.0,2013.0,118839.0,,2.0,2013-10-03,1,1,True,193549.0,ZA,[ZAA],
4,179486.0,CHC-2010-1806-MA,1806.0,2010.0,109184.0,,,2010-07-01,1,1,True,179486.0,CHC,[MA],


In [13]:
# Nones are throwing up error when we loop through to tag different suffixes
just_cases.suffix = just_cases.suffix.fillna('')

In [14]:
# Loop through possible suffixes we want to create dummies for
# if case contains 1 or more of them, tag appeal as 1. doesn't matter if there's multiple appeals. 
just_cases['APPEAL'] = just_cases.apply(lambda row: True if (len((set(row.suffix) & {'1A', '2A', '5A'})) > 0) else False, axis = 1)

 
for s in ['ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP']:
    just_cases[s] = just_cases.apply(lambda row: True if s in row.suffix else False, axis = 1)
    
for p in ['ENV', 'PAR']:
    just_cases[p] = just_cases.apply(lambda row: True if p in row.prefix else False, axis = 1)    

    
def adm_groups(row):
    # Define all the ADM cases, which are a combination of prefix-suffix (sometime in the past) or just prefix (more recent years)
    cond1 = row.prefix == 'AA' and 'WIM' in row.suffix
    cond2 = row.prefix == 'DIR' and (len((set(row.suffix) & {'ACI', 'CEX', 'CWC', 'CWNC', 'HPM', 'VSO'})) > 0)
    cond3 = row.prefix == 'PS' and 'A' in row.suffix
    cond4 = row.prefix == 'ZA' and (len((set(row.suffix) & {'AIC', 'CEX'})) > 0)
    
    if (cond1) or (cond2) or (cond3) or (cond4):
        return True
    if 'ADM' in row.prefix:
        return True
    else:
        return False    
    
just_cases['ADM'] = just_cases.apply(adm_groups, axis = 1)

In [15]:
all_dummies = ['APPEAL', 'ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP', 'ENV', 'PAR', 'ADM']

# Groupby the parent CASE_ID and then find the max for all the prefix and suffix dummies
for col in all_dummies:
    just_cases[col] = just_cases[col].astype(int)
    display(col)
    display(just_cases[col].value_counts())
    just_cases[col] = just_cases.groupby('PARENT_CASE')[col].transform('max')

'APPEAL'

0    50589
1     1457
Name: APPEAL, dtype: int64

'ADD'

0    52046
Name: ADD, dtype: int64

'CC'

0    51992
1       54
Name: CC, dtype: int64

'EXT'

0    51783
1      263
Name: EXT, dtype: int64

'M'

0    52046
Name: M, dtype: int64

'PA'

0    52046
Name: PA, dtype: int64

'REC'

0    52046
Name: REC, dtype: int64

'SUP'

0    52046
Name: SUP, dtype: int64

'ENV'

0    35750
1    16296
Name: ENV, dtype: int64

'PAR'

0    50446
1     1600
Name: PAR, dtype: int64

'ADM'

0    37604
1    14442
Name: ADM, dtype: int64

In [16]:
all_suffix = ['APPEAL', 'ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP']
all_prefix = ['ENV', 'PAR', 'ADM']

just_cases['num_suffix'] = just_cases[all_suffix].sum(axis = 1)
just_cases['num_prefix'] = just_cases[all_prefix].sum(axis = 1)

display(just_cases.num_suffix.value_counts())
display(just_cases.num_prefix.value_counts())

0    48721
1     3179
2      142
3        4
Name: num_suffix, dtype: int64

1    32338
0    19708
Name: num_prefix, dtype: int64

In [17]:
# Only keep parent cases
just_parent = just_cases[just_cases.parent_is_null == True]

# Rename columns
just_parent.rename(columns = {'ADD': 'addendum', 'CC': 'conditional_clearance',
                            'EXT': 'extension', 'M': 'modification', 
                            'PA': 'plan_approval', 'REC': 'reconsideration', 'SUP': 'supplemental',
                            'APPEAL': 'appeal', 'ADM': 'admin', 'ENV': 'env', 'PAR': 'pre_application_review'}, inplace = True)

# Drop columns
drop = ['obs', 'max_obs', 'parent_is_null', 'PARNT_CASE_ID']

just_parent = just_parent.drop(columns = drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [18]:
just_parent.columns

Index(['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID',
       'CASE_ACTION_ID', 'CASE_FILE_RCV_DT', 'PARENT_CASE', 'prefix', 'suffix',
       'invalid_prefix', 'appeal', 'addendum', 'conditional_clearance',
       'extension', 'modification', 'plan_approval', 'reconsideration',
       'supplemental', 'env', 'pre_application_review', 'admin', 'num_suffix',
       'num_prefix'],
      dtype='object')

In [19]:
cols_to_export = ['CASE_ID', 'PARENT_CASE', 'appeal', 'addendum', 'conditional_clearance',
                  'extension', 'modification', 'plan_approval', 'reconsideration', 'supplemental', 
                  'env', 'pre_application_review', 'admin']

just_parent[cols_to_export].to_parquet('../data/parent_cases.parquet')
just_parent[cols_to_export].to_parquet(f's3://{bucket_name}/data/intermediate/parent_cases.parquet')

## Get counts for each category
* 2010-2019
* 2015-2019
* 2017, 2018, 2019 individual years

In [20]:
def count_cases(row):
    entitlement = 0
    env = 0
    admin = 0
    appeal = 0
    par = 0
    
    cond1 = (row.admin == 0)
    cond2 = (row.env == 0)
    cond3 = (row.pre_application_review == 0)

    entitlement_prefix = ['AA', 'APCC', 'APCE', 'APCH', 'APCNV', 
                  'APCS', 'APCSV', 'APCW', 'CPC', 'DIR', 'PS', 'TT', 'VTT', 'ZA']
    
    if any(prefix in row.prefix for prefix in entitlement_prefix) and cond1 and cond2 and cond3:
        entitlement = 1
        
    if row.env == 1:
        env = 1
    
    if row.admin == 1:
        admin = 1
    
    if row.appeal == 1:
        appeal = 1
    
    if row.pre_application_review == 1:
        par = 1
    
    return pd.Series([entitlement, env, admin, appeal, par], 
                     index=['is_entitlement', 'is_env', 'is_admin', 'is_appeal', 'is_par'])

counts = just_parent.apply(count_cases, axis = 1)
df2 = pd.concat([just_parent, counts], axis = 1)

In [21]:
df2.head()

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,CASE_ACTION_ID,CASE_FILE_RCV_DT,PARENT_CASE,prefix,suffix,...,env,pre_application_review,admin,num_suffix,num_prefix,is_entitlement,is_env,is_admin,is_appeal,is_par
0,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,118838.0,1.0,2013-10-03,193546.0,ZA,[CEX],...,0,0,1,0,1,0,0,1,0,0
1,193547.0,AA-2013-3080-PMLA-SL,3080.0,2013.0,118839.0,2.0,2013-10-03,193547.0,AA,"[PMLA, SL]",...,0,0,0,0,0,1,0,0,0,0
2,193548.0,ENV-2013-3081-MND,3081.0,2013.0,118839.0,,2013-10-03,193548.0,ENV,[MND],...,1,0,0,0,1,0,1,0,0,0
3,193549.0,ZA-2013-3082-ZAA,3082.0,2013.0,118839.0,2.0,2013-10-03,193549.0,ZA,[ZAA],...,0,0,0,0,0,1,0,0,0,0
4,179486.0,CHC-2010-1806-MA,1806.0,2010.0,109184.0,,2010-07-01,179486.0,CHC,[MA],...,0,0,0,0,0,0,0,0,0,0


In [22]:
df2010 = df2[(df2.CASE_YR_NBR >= 2010) & (df2.CASE_YR_NBR <= 2019)]
df2015 = df2[(df2.CASE_YR_NBR >= 2015) & (df2.CASE_YR_NBR <= 2019)]
df2017 = df2[df2.CASE_YR_NBR == 2017]
df2018 = df2[df2.CASE_YR_NBR == 2018]
df2019 = df2[df2.CASE_YR_NBR == 2019]

In [23]:
dataframes = {'2010': df2010, '2015': df2015, 
             '2017': df2017, '2018': df2018, '2019': df2019}

for key, value in dataframes.items():
    display(key)
    display(value.agg({'is_entitlement':'sum', 'is_env':'sum', 'is_admin':'sum', 
               'is_appeal':'sum', 'is_par':'sum'}).reset_index())

# Ugh, 2017-2019 individual years don't exactly line up still

'2010'

Unnamed: 0,index,0
0,is_entitlement,16513
1,is_env,15812
2,is_admin,14210
3,is_appeal,1296
4,is_par,1563


'2015'

Unnamed: 0,index,0
0,is_entitlement,9386
1,is_env,8588
2,is_admin,9746
3,is_appeal,646
4,is_par,1563


'2017'

Unnamed: 0,index,0
0,is_entitlement,2134
1,is_env,1844
2,is_admin,1375
3,is_appeal,125
4,is_par,0


'2018'

Unnamed: 0,index,0
0,is_entitlement,1954
1,is_env,1770
2,is_admin,2965
3,is_appeal,104
4,is_par,724


'2019'

Unnamed: 0,index,0
0,is_entitlement,1655
1,is_env,1576
2,is_admin,3437
3,is_appeal,21
4,is_par,839
