# PCTS Validate Counts
* Parse PCTS case number, and use prefix and suffix to validate counts

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import intake
import boto3
import utils

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = 'city-planning-entitlements'

## Grab tables from PCTS and merge

In [3]:
cases = catalog.pcts.tCASE.read()
app = catalog.pcts.tAPLC.read()
geo_info = catalog.pcts.tPROP_GEO_INFO.read()
la_prop = catalog.pcts.tLA_PROP.read()
parcels = gpd.read_file(f'zip+s3://{bucket_name}/gis/intermediate/la_parcels_toc.zip')

In [4]:
cases1 = cases[['CASE_ID', 'APLC_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'CASE_ACTION_ID', 'ADM_ACTION_DT', 'PARNT_CASE_ID']]
app1 = app[['APLC_ID', 'PROJ_DESC_TXT']]
geo_info1 = geo_info[['CASE_ID', 'PROP_ID']]
la_prop1 = la_prop[la_prop.ASSR_PRCL_NBR.notna()][['PROP_ID', 'ASSR_PRCL_NBR']]

In [5]:
# Subset by years and columns
keep_col = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID', 'PARNT_CASE_ID',
            'CASE_ACTION_ID', 'ADM_ACTION_DT']

cases2 = cases1[(cases1.CASE_YR_NBR >= 2010) & (cases1.CASE_YR_NBR <= 2020)][keep_col]

In [6]:
m1 = pd.merge(cases2, app1, on = 'APLC_ID', how = 'inner', validate = 'm:1')

In [7]:
m2 = pd.merge(cases2, geo_info1, on = 'CASE_ID', how = 'inner', validate = '1:m')

In [8]:
m3 = pd.merge(m2, la_prop1, on = 'PROP_ID', how = 'inner', validate = 'm:1')

In [9]:
df = pd.merge(m3, parcels, left_on = 'ASSR_PRCL_NBR', right_on = 'AIN', how = 'inner', validate = 'm:1')

## Parent case
* APLC_ID or PARNT_CASE_ID
* APLC_ID can be the same across cases with different prefixes. But, they were part of the same application.
* Treat those as separate cases.
* Same prefix, same case. APLC_ID isn't a perfect match, maybe APLC_ID + CASE_SEQ_NBR + CASE_YR_NBR?
* Final decision: use PARNT_CASE_ID, and fill it in whenever it's missing, because those are the parent cases themselves
* Parse case string for some big groups: PAR, ENV, APPEAL, ADM, ENTITLEMENT
* bys parent_case: egen max for PAR, ENV, APPEAL, ADM
* keep parent because it's ENTITLEMENT, but stores stuff from child cases

In [10]:
keep_cols = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID', 'PARNT_CASE_ID', 'CASE_ACTION_ID',
            'AIN', 'TOC_Tier', 'geometry']

df = df[keep_cols].drop_duplicates()

## Sort out parent-child relationship by tagging history
* Entitlement can appear multiple times because it's linked to several parcels 
* There are no duplicates once we add in AIN.

In [11]:
case_ain_crosswalk = df[['CASE_ID', 'AIN']].drop_duplicates()

In [12]:
view_cols = ['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID', 'PARNT_CASE_ID', 'CASE_ACTION_ID']

In [13]:
just_cases = df[view_cols].drop_duplicates()

just_cases['obs'] = just_cases.groupby(['CASE_SEQ_NBR', 'CASE_YR_NBR']).cumcount() + 1
just_cases['max_obs'] = just_cases.groupby(['CASE_SEQ_NBR', 'CASE_YR_NBR'])['obs'].transform('max')

just_cases.max_obs.value_counts()

1    37104
2      968
3       99
4       16
Name: max_obs, dtype: int64

In [14]:
just_cases['parent_is_null'] = just_cases.PARNT_CASE_ID.isna()

just_cases.parent_is_null.value_counts()

True     37633
False      554
Name: parent_is_null, dtype: int64

In [15]:
just_cases['PARENT_CASE'] = just_cases.apply(lambda row: row.CASE_ID if row.parent_is_null == True 
                                             else row.PARNT_CASE_ID, axis = 1)

In [16]:
sample_parents = [195906, 177239, 181967]

just_cases[just_cases.PARENT_CASE.isin(sample_parents)].sort_values(['PARENT_CASE', 'CASE_ID'])

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,PARNT_CASE_ID,CASE_ACTION_ID,obs,max_obs,parent_is_null,PARENT_CASE
7715,177239.0,ZA-2010-28-CU-CUB,28.0,2010.0,107560.0,,11.0,1,3,True,177239.0
7717,185587.0,ZA-2010-28-CU-CUB-1A,28.0,2010.0,113450.0,177239.0,,2,3,False,177239.0
7718,201852.0,ZA-2010-28-CU-CUB-EXT,28.0,2010.0,168927.0,177239.0,,3,3,False,177239.0
1189,181967.0,ENV-2011-328-MND,328.0,2011.0,110982.0,,,1,2,True,181967.0
1190,188354.0,ENV-2011-328-MND-REC1,328.0,2011.0,115327.0,181967.0,,2,2,False,181967.0
32608,195906.0,DIR-2014-886-SPP-SPPA,886.0,2014.0,120419.0,,2.0,1,4,True,195906.0
32610,200623.0,DIR-2014-886-SPP-SPPA-1A,886.0,2014.0,123459.0,195906.0,,2,4,False,195906.0
32611,201290.0,DIR-2014-886-SPP-SPPA-2A,886.0,2014.0,168552.0,195906.0,,3,4,False,195906.0


In [17]:
parsed_col_names = ['prefix', 'suffix', 'invalid_prefix']

def parse_pcts(row):
    try:
        z = utils.PCTSCaseNumber(row.CASE_NBR)
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)
    except ValueError:
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)

parsed = just_cases.apply(parse_pcts, axis = 1)

just_cases = pd.concat([just_cases, parsed], axis = 1)

just_cases.head()

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,PARNT_CASE_ID,CASE_ACTION_ID,obs,max_obs,parent_is_null,PARENT_CASE,prefix,suffix,invalid_prefix
0,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,118838.0,,1.0,1,1,True,193546.0,ZA,[CEX],
1,234299.0,CPC-2019-7393-CA,7393.0,2019.0,192286.0,,,1,1,True,234299.0,CPC,[CA],
2,193547.0,AA-2013-3080-PMLA-SL,3080.0,2013.0,118839.0,,2.0,1,1,True,193547.0,AA,"[PMLA, SL]",
3,193548.0,ENV-2013-3081-MND,3081.0,2013.0,118839.0,,,1,1,True,193548.0,ENV,[MND],
4,193549.0,ZA-2013-3082-ZAA,3082.0,2013.0,118839.0,,2.0,1,1,True,193549.0,ZA,[ZAA],


In [19]:
# Nones are throwing up error when we loop through to tag different suffixes
just_cases.suffix = just_cases.suffix.fillna('')

just_cases.head()

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,PARNT_CASE_ID,CASE_ACTION_ID,obs,max_obs,parent_is_null,PARENT_CASE,prefix,suffix,invalid_prefix
0,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,118838.0,,1.0,1,1,True,193546.0,ZA,[CEX],
1,234299.0,CPC-2019-7393-CA,7393.0,2019.0,192286.0,,,1,1,True,234299.0,CPC,[CA],
2,193547.0,AA-2013-3080-PMLA-SL,3080.0,2013.0,118839.0,,2.0,1,1,True,193547.0,AA,"[PMLA, SL]",
3,193548.0,ENV-2013-3081-MND,3081.0,2013.0,118839.0,,,1,1,True,193548.0,ENV,[MND],
4,193549.0,ZA-2013-3082-ZAA,3082.0,2013.0,118839.0,,2.0,1,1,True,193549.0,ZA,[ZAA],


In [20]:
# Loop through possible suffixes we want to create dummies for
# if case contains 1 or more of them, tag appeal as 1. doesn't matter if there's multiple appeals. 
just_cases['APPEAL'] = just_cases.apply(lambda row: True if (len((set(row.suffix) & {'1A', '2A', '5A'})) > 0) else False, axis = 1)

 
for s in ['ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP']:
    just_cases[s] = just_cases.apply(lambda row: True if s in row.suffix else False, axis = 1)
    
for p in ['ENV', 'PAR']:
    just_cases[p] = just_cases.apply(lambda row: True if p in row.prefix else False, axis = 1)    

    
def adm_groups(row):
    # Define all the ADM cases, which are a combination of prefix-suffix (sometime in the past) or just prefix (more recent years)
    cond1 = row.prefix == 'AA' and 'WIM' in row.suffix
    cond2 = row.prefix == 'DIR' and (len((set(row.suffix) & {'ACI', 'CEX', 'CWC', 'CWNC', 'HPM', 'VSO'})) > 0)
    cond3 = row.prefix == 'PS' and 'A' in row.suffix
    cond4 = row.prefix == 'ZA' and (len((set(row.suffix) & {'AIC', 'CEX'})) > 0)
    
    if (cond1) or (cond2) or (cond3) or (cond4):
        return True
    if 'ADM' in row.prefix:
        return True
    else:
        return False    
    
just_cases['ADM'] = just_cases.apply(adm_groups, axis = 1)

In [21]:
all_dummies = ['APPEAL', 'ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP', 'ENV', 'PAR', 'ADM']

# Groupby the parent CASE_ID and then find the max for all the prefix and suffix dummies
for col in all_dummies:
    just_cases[col] = just_cases[col].astype(int)
    display(col)
    display(just_cases[col].value_counts())
    just_cases[col] = just_cases.groupby('PARENT_CASE')[col].transform('max')

'APPEAL'

0    37947
1      240
Name: APPEAL, dtype: int64

'ADD'

0    38187
Name: ADD, dtype: int64

'CC'

0    38149
1       38
Name: CC, dtype: int64

'EXT'

0    38014
1      173
Name: EXT, dtype: int64

'M'

0    38187
Name: M, dtype: int64

'PA'

0    38187
Name: PA, dtype: int64

'REC'

0    38187
Name: REC, dtype: int64

'SUP'

0    38187
Name: SUP, dtype: int64

'ENV'

0    27630
1    10557
Name: ENV, dtype: int64

'PAR'

0    36814
1     1373
Name: PAR, dtype: int64

'ADM'

0    25204
1    12983
Name: ADM, dtype: int64

In [22]:
all_suffix = ['APPEAL', 'ADD', 'CC', 'EXT', 'M', 'PA', 'REC', 'SUP']
all_prefix = ['ENV', 'PAR', 'ADM']

just_cases['num_suffix'] = just_cases[all_suffix].sum(axis = 1)
just_cases['num_prefix'] = just_cases[all_prefix].sum(axis = 1)

display(just_cases.num_suffix.value_counts())
display(just_cases.num_prefix.value_counts())

0    37353
1      807
2       27
Name: num_suffix, dtype: int64

1    24913
0    13274
Name: num_prefix, dtype: int64

In [23]:
# Only keep parent cases
just_parent = just_cases[just_cases.parent_is_null == True]

# Rename columns
just_parent.rename(columns = {'ADD': 'addendum', 'CC': 'conditional_clearance',
                            'EXT': 'extension', 'M': 'modification', 
                            'PA': 'plan_approval', 'REC': 'reconsideration', 'SUP': 'supplemental',
                            'APPEAL': 'appeal', 'ADM': 'admin', 'ENV': 'env', 'PAR': 'pre_application_review'}, inplace = True)

# Drop columns
drop = ['obs', 'max_obs', 'parent_is_null', 'PARNT_CASE_ID']

just_parent = just_parent.drop(columns = drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [24]:
just_parent.columns

Index(['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 'APLC_ID',
       'CASE_ACTION_ID', 'PARENT_CASE', 'prefix', 'suffix', 'invalid_prefix',
       'appeal', 'addendum', 'conditional_clearance', 'extension',
       'modification', 'plan_approval', 'reconsideration', 'supplemental',
       'env', 'pre_application_review', 'admin', 'num_suffix', 'num_prefix'],
      dtype='object')

In [25]:
entitlement_prefix = ['AA', 'APCC', 'APCE', 'APCH', 'APCNV', 
                  'APCS', 'APCSV', 'APCW', 'CPC', 'DIR', 'PS', 'TT', 'VTT', 'ZA']

## Get counts for each category
* 2010-2019
* 2015-2019

In [26]:
df2 = just_parent[just_parent.CASE_YR_NBR <= 2019]

In [27]:
# These seem in line, but now ENV, ADM, and APPEALS are off
print(f'# ENTITLE 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.prefix.isin(entitlement_prefix))])}')
print(f'# ENTITLE 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.prefix.isin(entitlement_prefix))])}')

# ENTITLE 2010-2019: 18870
# ENTITLE 2015-2019: 9577


In [28]:
print(f'# ENV 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.env == 1)])}')
print(f'# ENV 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.env == 1)])}')

# ENV 2010-2019: 10456
# ENV 2015-2019: 5150


In [29]:
print(f'# ADM 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.admin == 1)])}')
print(f'# ADM 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.admin == 1)])}')

# ADM 2010-2019: 12776
# ADM 2015-2019: 8601


In [30]:
print(f'# APPEAL 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.appeal == 1)])}')
print(f'# APPEAL 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.appeal == 1)])}')

# APPEAL 2010-2019: 222
# APPEAL 2015-2019: 3


In [31]:
print(f'# PAR 2010-2019: {len(df2[(df2.CASE_YR_NBR >= 2010) & (df2.pre_application_review == 1)])}')
print(f'# PAR 2015-2019: {len(df2[(df2.CASE_YR_NBR >= 2015) & (df2.pre_application_review == 1)])}')

# PAR 2010-2019: 1339
# PAR 2015-2019: 1339


## Merge parent cases back with parcel info

In [32]:
# Merge in the parcel info for each case again
final = pd.merge(just_parent, case_ain_crosswalk, on = 'CASE_ID', how = 'inner', validate = '1:m')
final.head()

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,APLC_ID,CASE_ACTION_ID,PARENT_CASE,prefix,suffix,invalid_prefix,...,modification,plan_approval,reconsideration,supplemental,env,pre_application_review,admin,num_suffix,num_prefix,AIN
0,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,118838.0,1.0,193546.0,ZA,[CEX],,...,0,0,0,0,0,0,1,0,1,4420030014
1,234299.0,CPC-2019-7393-CA,7393.0,2019.0,192286.0,,234299.0,CPC,[CA],,...,0,0,0,0,0,0,0,0,0,4420030014
2,234299.0,CPC-2019-7393-CA,7393.0,2019.0,192286.0,,234299.0,CPC,[CA],,...,0,0,0,0,0,0,0,0,0,4240011032
3,234299.0,CPC-2019-7393-CA,7393.0,2019.0,192286.0,,234299.0,CPC,[CA],,...,0,0,0,0,0,0,0,0,0,4237014010
4,234299.0,CPC-2019-7393-CA,7393.0,2019.0,192286.0,,234299.0,CPC,[CA],,...,0,0,0,0,0,0,0,0,0,4420004011


In [35]:
final.drop(columns = ['suffix', 'invalid_prefix']).to_parquet('../data/entitlements.parquet')