## Test out the laplan.pcts functions

In [1]:
import re

import boto3
import intake
import pandas

import laplan

In [2]:
# S3 client plus the name of the bucket used by this project.
bucket_name = "city-planning-entitlements"
s3 = boto3.client("s3")

# Open the intake catalog(s) matched by the glob below.
catalog = intake.open_catalog("../catalogs/*.yml")

In [3]:
# Read the `pcts` entry of the catalog into memory.
pcts = catalog.pcts.read()
# NOTE: the triple-quoted block below is a bare string expression, not executed
# code. Because it is the last expression in the cell, its repr is echoed as the
# cell output. It records the laplan.pcts calls whose steps the rest of this
# notebook walks through by hand on a few selected cases.
"""
Let's replicate what this does, but just on select cases to see what's going on


FULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)
remove_prefix = ["ENV", "PAR", "ADM"]
prefix = [x for x in FULL_PREFIX if x not in remove_prefix]

suffix = ["TOC", "CUB"]


pcts = laplan.pcts.subset_pcts(pcts,
                              start_date="2017-10-01",
                              prefix_list=prefix,
                              get_dummies=True)

pcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements=True)
"""

'\nLet\'s replicate what this does, but just on select cases to see what\'s going on\n\n\nFULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)\nremove_prefix = ["ENV", "PAR", "ADM"]\nprefix = [x for x in FULL_PREFIX if x not in remove_prefix]\n\nsuffix = ["TOC", "CUB"]\n\n\npcts = laplan.pcts.subset_pcts(pcts,\n                              start_date="2017-10-01",\n                              prefix_list=prefix,\n                              get_dummies=True)\n\npcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements=True)\n'

In [4]:
# Hand-picked case numbers used to trace the filtering logic step by step.
cases_to_keep = [
    # Excluded prefix (PAR): expected to be dropped.
    "PAR-2020-384-TOC",
    # Allowed prefix and suffix: expected to be kept.
    "DIR-2020-360-TOC-SPR",
    # Many suffixes on one case, useful for seeing how they are split.
    "ZA-2010-861-CUB-CU-ZV-ZAA-SPR-PA1",
    # Excluded prefix (PAR): expected to be dropped.
    "PAR-2020-387-CUB",
]

# Keep only those cases, one row per case number.
is_selected_case = pcts["CASE_NUMBER"].isin(cases_to_keep)
pcts = pcts.loc[is_selected_case].drop_duplicates(subset="CASE_NUMBER")

In [5]:
# Sanity check: row count should equal the number of distinct case numbers.
n_obs = len(pcts)
n_unique_cases = len(pcts["CASE_NUMBER"].unique())
print(f"# obs: {n_obs}")
print(f"# unique case numbers: {n_unique_cases}")

# obs: 4
# unique case numbers: 4


In [6]:
# Date window for the subset; the end of the window is "now".
start_date = "2017-10-01"
end_date = pandas.Timestamp.now()

# Suffixes we want to keep.
suffix_list = ["TOC", "CUB"]

# Prefixes we want to keep: every valid prefix except the excluded ones.
remove_prefix = ["ENV", "PAR", "ADM"]
FULL_PREFIX = list(laplan.pcts.VALID_PCTS_PREFIX)
prefix_list = [p for p in FULL_PREFIX if p not in remove_prefix]

In [7]:
# Subset PCTS by start / end date.
# A missing start date falls back to 2010-01-01; a missing end date to now.
if start_date:
    start_date = pandas.to_datetime(start_date)
else:
    start_date = pandas.to_datetime("2010-01-01")

if end_date:
    end_date = pandas.to_datetime(end_date)
else:
    end_date = pandas.Timestamp.now()

# Both ends of the window are inclusive.
filed_in_window = pcts.FILE_DATE.between(start_date, end_date)
pcts = pcts[filed_in_window].drop_duplicates().reset_index(drop=True)

In [8]:
# `re` is imported in the notebook's top import cell rather than here, so all
# dependencies are visible in one place.

# PREFIX-YEAR-SEQUENCE followed by zero or more "-SUFFIX" groups.
GENERAL_PCTS_RE = re.compile("([A-Z]+)-([0-9X]{4})-([0-9]+)((?:-[A-Z0-9]+)*)$")
# Same shape, but with no four-character year group.
MISSING_YEAR_RE = re.compile("([A-Z]+)-([0-9]+)((?:-[A-Z0-9]+)*)$")

# Parse CASE_NUMBER: one output column per capture group (0..3).
# Rows that do not match are all-NaN.
cols = pcts.CASE_NUMBER.str.extract(GENERAL_PCTS_RE)

all_prefixes = cols[0]
# Group 3 holds the suffixes with a leading "-"; drop that first character.
all_suffixes = cols[3].str[1:]

print("show case number parsed with GENERAL_PCTS_RE")
display(cols.head())
display(all_prefixes.head())
display(all_suffixes.head())

show case number parsed with GENERAL_PCTS_RE


Unnamed: 0,0,1,2,3
0,DIR,2020,360,-TOC-SPR
1,PAR,2020,384,-TOC
2,ZA,2010,861,-CUB-CU-ZV-ZAA-SPR-PA1
3,PAR,2020,387,-CUB


0    DIR
1    PAR
2     ZA
3    PAR
Name: 0, dtype: object

0                  TOC-SPR
1                      TOC
2    CUB-CU-ZV-ZAA-SPR-PA1
3                      CUB
Name: 3, dtype: object

In [9]:
# Second pass: case numbers that did not match GENERAL_PCTS_RE are re-parsed
# with MISSING_YEAR_RE. With that pattern the suffix group is column 2, not 3.
failed_general_parse = all_prefixes.isna()
unparsed_case_numbers = pcts.loc[failed_general_parse, "CASE_NUMBER"]
additional_cols = unparsed_case_numbers.str.extract(MISSING_YEAR_RE)

print("failed to parse go through MISSING_YEAR_RE")
display(additional_cols.head())

failed to parse go through MISSING_YEAR_RE


Unnamed: 0,0,1,2


In [10]:
# Back-fill the rows that failed the first parse with the prefixes and
# suffixes recovered by the second regex, matching rows by index label.
additional_prefixes = additional_cols[0]
# Drop the leading "-" from the suffix group, as in the first parse.
additional_suffixes = additional_cols[2].str[1:]

# Use .loc for this assignment: .at is the scalar accessor (one label, one
# value) and is not meant to take a whole Index with an array of values.
all_prefixes.loc[additional_prefixes.index] = additional_prefixes.values
all_suffixes.loc[additional_suffixes.index] = additional_suffixes.values

# Split "A-B-C" into one suffix per column (0, 1, ..., n); shorter cases are
# padded with missing values.
all_suffixes = all_suffixes.str.split("-", expand=True)

In [11]:
# Rows with no prefix after both parsing passes count as failed parses
# and will be excluded.
successfully_parsed = ~all_prefixes.isna()

# Two independent all-True masks, one entry per case. The prefix and suffix
# filters below replace them when a filter list is supplied.
allow_prefix = pandas.Series(True, index=pcts.index)
allow_suffix = allow_prefix.copy()

In [12]:
# Subset by prefix. With no prefix list the all-True mask is kept; otherwise
# a case is True only when its prefix is one of the allowed prefixes.
allow_prefix = (
    allow_prefix if prefix_list is None else all_prefixes.isin(prefix_list)
)

allow_prefix

0     True
1    False
2     True
3    False
Name: 0, dtype: bool

In [13]:
# Subset by suffix. A requested suffix may sit in any column of all_suffixes,
# so the per-column membership checks are OR-ed together.
if suffix_list is not None:
    print("before: all Trues")
    display(allow_suffix)
    # Flip the all-True mask to all-False so the OR starts from nothing.
    allow_suffix = ~allow_suffix
    print("after: all Falses")
    display(allow_suffix)

    # Iterating a DataFrame yields its column labels (0, 1, ..., n).
    # OR-ing means a case that has matched once stays True, even when its
    # later columns hold suffixes that are not in the allowed list.
    for c in all_suffixes:
        print(f"Column: {c}")
        column_has_allowed_suffix = all_suffixes[c].isin(suffix_list)
        allow_suffix = allow_suffix | column_has_allowed_suffix
        display(allow_suffix)

before: all Trues


0    True
1    True
2    True
3    True
dtype: bool

after: all Falses


0    False
1    False
2    False
3    False
dtype: bool

Column: 0


0    True
1    True
2    True
3    True
dtype: bool

Column: 1


0    True
1    True
2    True
3    True
dtype: bool

Column: 2


0    True
1    True
2    True
3    True
dtype: bool

Column: 3


0    True
1    True
2    True
3    True
dtype: bool

Column: 4


0    True
1    True
2    True
3    True
dtype: bool

Column: 5


0    True
1    True
2    True
3    True
dtype: bool

In [14]:
# A row is kept only when it parsed and passes both the prefix and the
# suffix filter. The mask is aligned on the frame's row index.
subset = successfully_parsed & allow_prefix & allow_suffix

pcts = pcts.loc[subset]

# Apply the same mask to the parsed prefixes and suffixes so they stay
# row-aligned with pcts.
all_prefixes = all_prefixes.loc[subset]
all_suffixes = all_suffixes.loc[subset]

In [15]:
# One boolean indicator column per prefix present in the remaining cases.
prefix_dummies = pandas.get_dummies(data=all_prefixes, dtype="bool")

In [16]:
# Compare the requested prefixes with the prefixes that actually produced a
# dummy column. When no prefix list is given, fall back to every valid prefix.
# The constant is qualified through laplan.pcts because a bare
# VALID_PCTS_PREFIX is not defined in this notebook and would raise NameError
# as soon as prefix_list is None or empty.
requested_prefixes = set(prefix_list or laplan.pcts.VALID_PCTS_PREFIX)
present_prefixes = set(prefix_dummies.columns)

print(f"set of prefix_list or all the valid ones: {requested_prefixes}")
print(f"set of prefixes in our dummies: {present_prefixes}")
missing_prefixes = requested_prefixes - present_prefixes
print(f"missing prefixes: {missing_prefixes}")

set of prefix_list or all the valid ones: {'VTT', 'DIR', 'PS', 'AA', 'APCW', 'CPC', 'TT', 'ZA', 'APCSV', 'APCH', 'APCE', 'APCS', 'HPO', 'APCNV', 'CHC', 'APCC'}
set of prefixes in our dummies: {'DIR', 'ZA'}
missing prefixes: {'VTT', 'CPC', 'TT', 'APCSV', 'APCH', 'APCE', 'PS', 'AA', 'APCW', 'APCS', 'HPO', 'APCNV', 'CHC', 'APCC'}


In [17]:
# stack() turns the wide suffix table into one row per (case, position).
# get_dummies then gives one indicator column per suffix, and taking the max
# within each case (index level 0) collapses back to one row per case.
# groupby(level=0) replaces the removed `.max(level=0)` form; sort=False keeps
# the cases in their existing order.
suffix_dummies = (
    pandas.get_dummies(all_suffixes.stack(), dtype="bool")
    .groupby(level=0, sort=False)
    .max()
)
display(suffix_dummies.head())

# Identify whether any of the requested suffixes are missing from the dummy
# columns. This cell only reports them; it does not add columns for them.
# The fallback constant is qualified through laplan.pcts because a bare
# VALID_PCTS_SUFFIX is not defined in this notebook.
requested_suffixes = set(suffix_list or laplan.pcts.VALID_PCTS_SUFFIX)
present_suffixes = set(suffix_dummies.columns)

print(f"set of suffix_list or all the valid ones: {requested_suffixes}")
print(f"set of suffixes in our dummies: {present_suffixes}")
missing_suffixes = requested_suffixes - present_suffixes
print(f"missing suffixes: {missing_suffixes}")

Unnamed: 0,CU,CUB,PA1,SPR,TOC,ZAA,ZV
0,False,False,False,True,True,False,False
2,True,True,True,True,False,True,True


set of suffix_list or all the valid ones: {'TOC', 'CUB'}
set of suffixes in our dummies: {'SPR', 'CUB', 'ZV', 'CU', 'PA1', 'ZAA', 'TOC'}
missing suffixes: set()


In [18]:
# Cast both indicator tables to pandas' nullable "boolean" dtype.
prefix_dummies, suffix_dummies = (
    dummies.astype("boolean") for dummies in (prefix_dummies, suffix_dummies)
)

In [19]:
# Attach the prefix and suffix indicator columns to the case table,
# aligned on the row index.
frames_to_join = [pcts, prefix_dummies, suffix_dummies]
pcts = pandas.concat(frames_to_join, axis="columns")

pcts

Unnamed: 0,CASE_ID,CASE_NUMBER,FILE_DATE,APPLICATION_ID,CASE_SEQUENCE_NUMBER,CASE_YEAR_NUMBER,PARENT_CASE_ID,CASE_ACTION_ID,ADDRESS,COUNCIL_DISTRICT,...,GEOID,DIR,ZA,CU,CUB,PA1,SPR,TOC,ZAA,ZV
0,235081,DIR-2020-360-TOC-SPR,2020-01-16,192926,360,2020,,,4517 S MAIN,9,...,6037231900,True,False,False,False,False,True,True,False,False
2,235109,ZA-2010-861-CUB-CU-ZV-ZAA-SPR-PA1,2020-01-17,192948,861,2010,178277.0,,434 S WESTERN,4,...,6037211804,False,True,True,True,True,True,False,True,True
