# Create dummies for all possible suffixes?
* We use parser to grab the prefix and suffix
* We have crosswalk for all possible suffixes
* Do we want to create ~150 new columns to hold whether a particular suffix exists or not?

In [16]:
import intake
import numpy as np
import pandas as pd
import pcts_parser

In [17]:
catalog = intake.open_catalog("../catalogs/*.yml")

bucket_name = 'city-planning-entitlements'

In [18]:
pcts = pd.read_parquet(f's3://{bucket_name}/data/final/master_pcts.parquet')

keep = ['CASE_ID', 'PARENT_CASE', 'CASE_NBR']
pcts = pcts[keep].drop_duplicates()

crosswalk = pd.read_parquet(f's3://{bucket_name}/data/crosswalk_suffix.parquet')

In [19]:
def grab_suffix(df): 
    # Parse PCTS string and grab suffix
    parsed_col_names = ['suffix']

    def parse_pcts(row):
        try:
            z = pcts_parser.PCTSCaseNumber(row.CASE_NBR)
            return pd.Series([z.suffix], index = parsed_col_names)
        except ValueError:
            return pd.Series([z.suffix], index = parsed_col_names)

    parsed = df.apply(parse_pcts, axis = 1)
    df = pd.concat([df, parsed], axis = 1)
    
    # Turn the list of suffixes into dummies
    df2 = pd.get_dummies(df.suffix.apply(pd.Series).stack()).sum(level=0)

    df = pd.concat([df, df2])    
    
    return df

In [20]:
df = grab_suffix(pcts)

In [21]:
df.head()

Unnamed: 0,CASE_ID,PARENT_CASE,CASE_NBR,suffix,1A,2A,A,AC,ACI,ADD1,...,WDI,WTM,YV,ZAA,ZAD,ZAI,ZBA,ZC,ZCJ,ZV
0,193546.0,193546.0,ZA-2013-3079-CEX,[CEX],,,,,,,...,,,,,,,,,,
1,234299.0,234299.0,CPC-2019-7393-CA,[CA],,,,,,,...,,,,,,,,,,
2,193547.0,193547.0,AA-2013-3080-PMLA-SL,"[PMLA, SL]",,,,,,,...,,,,,,,,,,
3,193548.0,193548.0,ENV-2013-3081-MND,[MND],,,,,,,...,,,,,,,,,,
4,193549.0,193549.0,ZA-2013-3082-ZAA,[ZAA],,,,,,,...,,,,,,,,,,


In [26]:
# Groupby parent cases and find max for all dummies
all_parents = (df.drop(columns = ['CASE_ID', 'CASE_NBR', 'suffix'])
                .fillna(0)
                .pivot_table(index = ['PARENT_CASE'], aggfunc = 'sum')
                .reset_index()
            )
all_parents


Unnamed: 0,PARENT_CASE,1A,2A,A,AC,ACI,ADD1,ADU,AIC,B,...,WDI,WTM,YV,ZAA,ZAD,ZAI,ZBA,ZC,ZCJ,ZV
0,0.0,527.0,42.0,3.0,121.0,24.0,3.0,492.0,148.0,1.0,...,137.0,83.0,18.0,1655.0,893.0,51.0,34.0,484.0,25.0,1626.0
1,1499.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1932.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1989.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2260.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47715,235136.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47716,235138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47717,235140.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47718,235141.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
