# PCTS Validate Counts
* Parse PCTS case number, and use prefix and suffix to validate counts

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import intake
import boto3
import utils

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = 'city-planning-entitlements'

In [3]:
df = catalog.pcts.tCASE.read()

In [4]:
# Subset by years and columns
df = df[(df.CASE_YR_NBR >= 2010) & (df.CASE_YR_NBR <= 2020)][['CASE_ID', 'CASE_NBR', 'CASE_SEQ_NBR', 'CASE_YR_NBR', 
               'CASE_ACTION_ID', 'ADM_ACTION_DT']]

In [5]:
parsed_col_names = ['prefix', 'suffix', 'invalid_prefix']

def parse_pcts(row):
    try:
        z = utils.PCTSCaseNumber(row.CASE_NBR)
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)
    except ValueError:
        return pd.Series([z.prefix, z.suffix, z.invalid_prefix], index = parsed_col_names)

parsed = df.apply(parse_pcts, axis = 1)

df = pd.concat([df, parsed], axis = 1)

df.head()

Unnamed: 0,CASE_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,CASE_ACTION_ID,ADM_ACTION_DT,prefix,suffix,invalid_prefix
451,193546.0,ZA-2013-3079-CEX,3079.0,2013.0,1.0,NaT,ZA,[CEX],
453,193547.0,AA-2013-3080-PMLA-SL,3080.0,2013.0,2.0,NaT,AA,"[PMLA, SL]",
956,179486.0,CHC-2010-1806-MA,1806.0,2010.0,,NaT,CHC,[MA],
957,179489.0,AA-2010-1809-COC,1809.0,2010.0,1.0,NaT,AA,[COC],
1665,188521.0,ZA-2012-2430-CUB,2430.0,2012.0,1.0,NaT,ZA,[CUB],


In [6]:
df.invalid_prefix.value_counts()

      52045
PM        1
Name: invalid_prefix, dtype: int64

In [7]:
# Defining categories
entitlement_prefix = ['AA', 'APCC', 'APCE', 'APCH',
                      'APCNV', 'APCS', 'APCSV', 'APCW',
                      'CPC', 'DIR', 'PS', 'TT', 'VTT', 'ZA']

environmental_prefix = ['ENV']

administrative_prefix = ['ADM']

administrative_suffix = ['WTM', 
                         'ACI', 'CEX', 'CWC', 'CWNC', 'HPM', 'VSO', 
                         'A',
                         'AIC']

appeal_suffix = ['1A', '2A', '5A']

In [8]:
def tag_prefix(row):
    if row.prefix in entitlement_prefix:
        return 'entitlement'
    elif row.prefix in environmental_prefix:
        return 'environmental'
    elif row.prefix in administrative_prefix:
        return 'administrative'
    else: 
        return ''

df['prefix_category'] = df.apply(tag_prefix, axis = 1)

df.prefix_category.value_counts()

entitlement       26352
environmental     16296
administrative     6871
                   2527
Name: prefix_category, dtype: int64

In [9]:
df2 = df[['CASE_ID', 'suffix']]

suffix = df2.suffix.apply(pd.Series)

suffix.rename(columns = {0: 'v1', 1: 'v2', 2: 'v3',
                        3: 'v4', 4: 'v5', 5: 'v6',  6: 'v7', 
                        7: 'v8', 8: 'v9', 9: 'v10'}, inplace = True) 

df2 = pd.concat([df2, suffix], axis = 1)

df2.head()

Unnamed: 0,CASE_ID,suffix,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10
451,193546.0,[CEX],CEX,,,,,,,,,
453,193547.0,"[PMLA, SL]",PMLA,SL,,,,,,,,
956,179486.0,[MA],MA,,,,,,,,,
957,179489.0,[COC],COC,,,,,,,,,
1665,188521.0,[CUB],CUB,,,,,,,,,


In [10]:
#pd.wide_to_long(df2, stubnames = 'v', i = ['CASE_ID'], j = 'suffix').reset_index('CASE_ID')

In [11]:
# Reshape from wide to long
# What's difference between pd.wide_to_long and pd.melt?

df3 = df2.melt(id_vars = ['CASE_ID'], value_vars = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v10'], 
        var_name = 'variable', value_name = 'suffix')

df3 = df3[df3.suffix.notna()].sort_values(['CASE_ID', 'variable'], ascending = [True, True])
df3.head()

Unnamed: 0,CASE_ID,variable,suffix
636,177206.0,v1,CE
286,177207.0,v1,CWC
287,177208.0,v1,CWC
389,177210.0,v1,ZAA
390,177211.0,v1,CE


In [13]:
def tag_suffix(row):
    if row.suffix in administrative_suffix:
        return 'administrative'
    elif row.suffix in appeal_suffix:
        return 'appeal'
    else: 
        return ''

df3['suffix_category'] = df3.apply(tag_suffix, axis = 1)

In [14]:
df3.suffix_category.value_counts()

                  51882
administrative    10076
appeal             1458
Name: suffix_category, dtype: int64

In [15]:
df4 = df3[df3.suffix_category != '']


df4['admin'] = df4.apply(lambda row: 1 if row.suffix_category=='administrative' else 0, axis = 1)
df4['appeal'] = df4.apply(lambda row: 1 if row.suffix_category=='appeal' else 0, axis = 1)

# In case there are duplicate CASE_ID, get the max value for admin and appeal for new dummy
for col in ['admin', 'appeal']:
    new_col = f'{col}_suffix'
    df4[new_col] = df4.groupby('CASE_ID')[col].transform('max')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [16]:
df5 = df4[['CASE_ID', 'admin_suffix', 'appeal_suffix']].drop_duplicates()

In [17]:
# Merge the 2 dummy variables in
df = pd.merge(df, df5, on = 'CASE_ID', how = 'left')

for col in ['admin_suffix', 'appeal_suffix']:
    df[col] = df[col].fillna(0)

## Get counts for each category
* 2010-2019
* 2015-2019

In [18]:
df = df[df.CASE_YR_NBR <= 2019]

In [19]:
# These are the most off...maybe need to add additional conditions where case cannot fall into ADM or ENV or APPEAL?
print(f'# ENTITLE 2010-2019: {len(df[(df.CASE_YR_NBR >= 2010) & (df.prefix_category == "entitlement")])}')
print(f'# ENTITLE 2015-2019: {len(df[(df.CASE_YR_NBR >= 2015) & (df.prefix_category == "entitlement")])}')

# ENTITLE 2010-2019: 26273
# ENTITLE 2015-2019: 13355


In [20]:
print(f'# ENV 2010-2019: {len(df[(df.CASE_YR_NBR >= 2010) & (df.prefix_category == "environmental")])}')
print(f'# ENV 2015-2019: {len(df[(df.CASE_YR_NBR >= 2015) & (df.prefix_category == "environmental")])}')

# ENV 2010-2019: 16232
# ENV 2015-2019: 8687


In [21]:
print(f'# ADM 2010-2019: {len(df[(df.CASE_YR_NBR >= 2010) & ((df.prefix_category == "administrative") | (df.admin_suffix == 1))])}')
print(f'# ADM 2015-2019: {len(df[(df.CASE_YR_NBR >= 2015) & ((df.prefix_category == "administrative") | (df.admin_suffix == 1))])}')

# ADM 2010-2019: 14310
# ADM 2015-2019: 9754


In [22]:
print(f'# APPEAL 2010-2019: {len(df[(df.CASE_YR_NBR >= 2010) & (df.appeal_suffix == 1)])}')
print(f'# APPEAL 2015-2019: {len(df[(df.CASE_YR_NBR >= 2015) & (df.appeal_suffix == 1)])}')

# APPEAL 2010-2019: 1457
# APPEAL 2015-2019: 714
