## Test out the laplan.pcts functions

In [1]:
import boto3
import intake
import pandas

import laplan

In [2]:
# S3 bucket name and client (neither is referenced by the cells shown here).
bucket_name = 'city-planning-entitlements'
s3 = boto3.client('s3')

# Open every YAML catalog matched by the glob in the sibling catalogs folder.
catalog = intake.open_catalog("../catalogs/*.yml")

In [3]:
# Load the `pcts` catalog entry through its read() method.
pcts = catalog.pcts.read()
# The string below is never executed: it quotes the laplan.pcts calls that the
# following cells reproduce step by step. Because it is the last expression
# in the cell, its repr is echoed as the cell output.
"""
Let's replicate what this does


FULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)
remove_prefix = ["ENV", "PAR", "ADM"]
prefix = [x for x in FULL_PREFIX if x not in remove_prefix]

suffix = ["TOC", "CUB"]


pcts = laplan.pcts.subset_pcts(pcts,
                              start_date="2017-10-01",
                              prefix_list=prefix,
                              get_dummies=True)

pcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements=True)
"""

'\nLet\'s replicate what this does\n\n\nFULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)\nremove_prefix = ["ENV", "PAR", "ADM"]\nprefix = [x for x in FULL_PREFIX if x not in remove_prefix]\n\nsuffix = ["TOC", "CUB"]\n\n\npcts = laplan.pcts.subset_pcts(pcts,\n                              start_date="2017-10-01",\n                              prefix_list=prefix,\n                              get_dummies=True)\n\npcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements=True)\n'

In [4]:
# Date window for the subset: from 1 October 2017 up to the current moment.
start_date = "2017-10-01"
end_date = pandas.Timestamp.now()

# Suffixes a case must carry (at least one of them) to be kept.
suffix_list = ["TOC", "CUB"]

# Allowed prefixes: everything laplan lists as valid, minus the exclusions.
FULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)
remove_prefix = ["ENV", "PAR", "ADM"]
prefix_list = [code for code in FULL_PREFIX if code not in remove_prefix]

In [5]:
# Normalise both bounds to timestamps, substituting defaults for falsy values:
# 2010-01-01 for a missing start, the current time for a missing end.
if start_date:
    start_date = pandas.to_datetime(start_date)
else:
    start_date = pandas.to_datetime("2010-01-01")

if end_date:
    end_date = pandas.to_datetime(end_date)
else:
    end_date = pandas.Timestamp.now()

# Keep rows whose FILE_DATE lies inside the (inclusive) window, then drop
# exact duplicate rows and renumber the index from zero.
filed_on_or_after_start = pcts.FILE_DATE >= start_date
filed_on_or_before_end = pcts.FILE_DATE <= end_date
pcts = pcts[filed_on_or_after_start & filed_on_or_before_end]
pcts = pcts.drop_duplicates().reset_index(drop=True)

In [6]:
import re

# GENERAL_PCTS_RE captures: prefix, 4-character year (digits or X), case
# number, and zero or more "-SUFFIX" pieces kept together as one group.
# MISSING_YEAR_RE is the same shape without the year group.
GENERAL_PCTS_RE = re.compile("([A-Z]+)-([0-9X]{4})-([0-9]+)((?:-[A-Z0-9]+)*)$")
MISSING_YEAR_RE = re.compile("([A-Z]+)-([0-9]+)((?:-[A-Z0-9]+)*)$")

# Parse CASE_NUMBER: one output column per capture group; rows that do not
# match come back as NaN in every column.
cols = pcts.CASE_NUMBER.str.extract(GENERAL_PCTS_RE)

# Group 0 is the prefix; group 3 is the suffix run, whose leading "-" is cut.
all_prefixes, all_suffixes = cols[0], cols[3].str[1:]

print("show case number parsed with GENERAL_PCTS_RE")
for parsed in (cols, all_prefixes, all_suffixes):
    display(parsed.head())

show case number parsed with GENERAL_PCTS_RE


Unnamed: 0,0,1,2,3
0,DIR,2017,4014,-CWNC
1,ZA,2017,3955,-CU
2,ENV,2017,3956,-CE
3,DIR,2017,3957,-VSO
4,ZA,2017,3958,-ZAA


0    DIR
1     ZA
2    ENV
3    DIR
4     ZA
Name: 0, dtype: object

0    CWNC
1      CU
2      CE
3     VSO
4     ZAA
Name: 3, dtype: object

In [7]:
# Second pass: case numbers the general regex could not parse (their prefix
# is NaN) are retried with the year-less pattern. The result keeps the
# original row labels so it can be written back by index; note the suffix
# group sits at position 2 here rather than 3.
failed_general_parse = all_prefixes.isna()
unparsed_case_numbers = pcts.loc[failed_general_parse, "CASE_NUMBER"]
additional_cols = unparsed_case_numbers.str.extract(MISSING_YEAR_RE)

print("failed to parse go through MISSING_YEAR_RE")
display(additional_cols.head())

failed to parse go through MISSING_YEAR_RE


Unnamed: 0,0,1,2
42,TT,72841,-REV-1A
120,VTT,74835,-CN
121,VTT,74835,-CN
122,VTT,74835,-CN
348,VTT,78424,-CN


In [8]:
# Back-fill the rows that failed the first regex with the values recovered by
# the second one, matching rows by index label.

additional_prefixes = additional_cols[0]
# Drop the leading "-" from the captured suffix run, as in the first pass.
additional_suffixes = additional_cols[2].str[1:]

# `.loc` is the label-based accessor for assigning many rows at once; `.at`
# only supports a single scalar label and rejects an Index in current pandas.
all_prefixes.loc[additional_prefixes.index] = additional_prefixes.values
all_suffixes.loc[additional_suffixes.index] = additional_suffixes.values

# Split "A-B-C" suffix runs into one column per suffix (padded with None/NaN).
# After this line `all_suffixes` is a DataFrame, not a Series, so this cell
# cannot be re-run without re-running the parsing cells first.
all_suffixes = all_suffixes.str.split("-", expand=True)

In [9]:
# Permissive defaults: one all-True mask per filter, aligned to the rows of
# pcts. They stay all-True unless a prefix or suffix list is supplied.
allow_suffix = pandas.Series(True, index=pcts.index)
allow_prefix = pandas.Series(True, index=pcts.index)

# Rows whose prefix is still missing after both regex passes are excluded.
successfully_parsed = ~all_prefixes.isna()

In [10]:
# Prefix filter: with a prefix list, a row passes only when its parsed prefix
# is in that list; without one, the all-True default mask is left as it was.
allow_prefix = (
    allow_prefix if prefix_list is None else all_prefixes.isin(prefix_list)
)

allow_prefix

0          True
1          True
2         False
3          True
4          True
          ...  
494946     True
494947     True
494948    False
494949    False
494950    False
Name: 0, Length: 494951, dtype: bool

In [11]:
# Suffix filter. A case may carry several suffixes, spread over the columns
# of all_suffixes, so a row passes when ANY of its suffix columns holds one of
# the requested suffixes.
if suffix_list is not None:
    print("before: all Trues")
    display(allow_suffix)
    # Flip the all-True default to all-False so the ORs below start empty.
    allow_suffix = ~allow_suffix
    print("after: all Falses")
    display(allow_suffix)

    # Walk the suffix columns (labelled 0, 1, ..., n). OR-ing means a row that
    # is already True stays True even if a later column holds a suffix that
    # was not requested.
    for c in all_suffixes.columns:
        found_in_column = all_suffixes[c].isin(suffix_list)
        allow_suffix = allow_suffix | found_in_column


before: all Trues


0         True
1         True
2         True
3         True
4         True
          ... 
494946    True
494947    True
494948    True
494949    True
494950    True
Length: 494951, dtype: bool

after: all Falses


0         False
1         False
2         False
3         False
4         False
          ...  
494946    False
494947    False
494948    False
494949    False
494950    False
Length: 494951, dtype: bool

In [12]:
# A row is kept only when it parsed successfully AND passes both filters.
subset = successfully_parsed & allow_prefix & allow_suffix

# Apply the same boolean mask to the cases and to the parsed prefix/suffix
# tables so all three stay row-aligned.
pcts = pcts.loc[subset]
all_prefixes = all_prefixes.loc[subset]
all_suffixes = all_suffixes.loc[subset]

In [13]:
# One boolean indicator column per prefix present in the remaining rows.
prefix_dummies = pandas.get_dummies(data=all_prefixes, dtype="bool")

In [14]:
# Prefixes that were requested: the explicit list or, when none was given,
# every valid prefix known to laplan. The constant is referenced through
# laplan.pcts because no bare VALID_PCTS_PREFIX name exists in this notebook.
requested_prefixes = set(prefix_list or laplan.pcts.VALID_PCTS_PREFIX)
print(f"set of prefix_list or all the valid ones: {requested_prefixes}")
print(f"set of prefixes in our dummies: {set(prefix_dummies.columns)}")

# Requested prefixes with no matching row, hence no dummy column.
missing_prefixes = requested_prefixes - set(prefix_dummies.columns)
print(f"missing prefixes: {missing_prefixes}")

set of prefix_list or all the valid ones: {'APCNV', 'APCS', 'APCSV', 'APCC', 'VTT', 'PS', 'CPC', 'CHC', 'ZA', 'AA', 'TT', 'DIR', 'APCW', 'APCE', 'HPO', 'APCH'}
set of prefixes in our dummies: {'APCC', 'CPC', 'ZA', 'DIR', 'APCW', 'APCE', 'APCH'}
missing prefixes: {'APCNV', 'APCS', 'APCSV', 'VTT', 'CHC', 'AA', 'TT', 'HPO', 'PS'}


In [15]:
# Stack the suffix columns into one long Series (index: row label, column
# position), one-hot encode it, then collapse back to one row per case by
# taking the max over the row label: True if any of the case's suffixes match.
# groupby(level=0).max() replaces the `.max(level=0)` form, which recent
# pandas no longer accepts; sort=False keeps the rows in their original order.
suffix_dummies = (
    pandas.get_dummies(all_suffixes.stack(), dtype="bool")
    .groupby(level=0, sort=False)
    .max()
)
display(suffix_dummies.head())

# Identify whether any of the requested suffixes are missing from the dummy
# columns (this cell only reports them; it does not add the columns).
# NOTE(review): assumes laplan.pcts exposes VALID_PCTS_SUFFIX alongside
# VALID_PCTS_PREFIX; the bare name is not defined in this notebook. Confirm.
requested_suffixes = set(suffix_list or laplan.pcts.VALID_PCTS_SUFFIX)
print(f"set of suffix_list or all the valid ones: {requested_suffixes}")
print(f"set of suffixes in our dummies: {set(suffix_dummies.columns)}")
missing_suffixes = requested_suffixes - set(suffix_dummies.columns)
print(f"missing suffixes: {missing_suffixes}")

Unnamed: 0,1A,2A,CCMP,CDO,CDP,CPIOA,CPIOE,CU,CUB,CUX,...,TOC,VCU,VZC,WDI,ZAA,ZAD,ZAI,ZBA,ZC,ZV
94,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
95,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
96,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
97,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
157,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


set of suffix_list or all the valid ones: {'TOC', 'CUB'}
set of suffixes in our dummies: {'SPR', 'TDR', 'CPIOE', '2A', 'CDO', 'CPIOA', 'GB', 'MEL', 'MSC', 'PA4', 'PA5', 'DB', 'PA1', 'GPA', 'MSP', 'SN', 'ZAD', 'PA3', 'PA9', 'SPE', 'SPP', '1A', 'M1', 'ZC', 'DA', 'DRB', 'HD', 'MCUP', 'SIP', 'ZBA', 'SPPA', 'P', 'PA2', 'VZC', 'TOC', 'CCMP', 'WDI', 'ZAA', 'VCU', 'ZV', 'CUX', 'CU', 'ZAI', 'CDP', 'CUB', 'DD'}
missing suffixes: set()
