In [1]:
import pandas
import intake

In [2]:
# Open the intake catalog(s) matched by the glob pattern ../catalogs/*.yml.
cat = intake.open_catalog(r"../catalogs/*.yml")

In [3]:
# Read the `pcts2` catalog entry into memory; the cells below index it like a pandas DataFrame.
pcts = cat.pcts2.read()

In [4]:
# Case-number strings (e.g. "ZA-19XX-19139-PA9"), one per PCTS record.
pcts_cases = pcts.loc[:, "CASE_NBR"]

In [5]:
# Split each case number on "-" into one column per part (integer column labels 0, 1, 2, ...).
pcts_cases_split = pcts_cases.str.split(pat="-", expand=True)

#### Note: `.fillna()` does not change the total number of rows returned by the groupby.

In [6]:
# Keep only the first four parts (columns 0-3), then show the first non-null
# value of the remaining parts for every (part 0, part 1) combination.
pcts_cases_split.iloc[:, :4].groupby(by=[0, 1]).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,2,3
0,1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA,1999,7287,PMLA
AA,2002,5557,PMLA
AA,2003,6204,PMLA
AA,2004,4782,PMLA
AA,2005,6689,PMLA
...,...,...,...
ZA,2018,6,CU
ZA,2019,9,CUB
ZA,2020,32,CUB
ZAI,1992,2018,PA1


In [7]:
# Same grouping, but keeping every split column: first non-null value of each
# part for every (part 0, part 1) combination.
pcts_cases_split.groupby(by=[0, 1]).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,2,3,4,5,6,7,8,9,10,11,12,13
0,1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AA,1999,7287,PMLA,,,,,,,,,,
AA,2002,5557,PMLA,EXT,,,,,,,,,
AA,2003,6204,PMLA,EXT,,,,,,,,,
AA,2004,4782,PMLA,EXT,,,,,,,,,
AA,2005,6689,PMLA,SL,EXT,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZA,2018,6,CU,CUB,ZBA,SPR,,,,,,,
ZA,2019,9,CUB,,,,,,,,,,
ZA,2020,32,CUB,,,,,,,,,,
ZAI,1992,2018,PA1,,,,,,,,,,


### Type the case string parts

In [8]:
def string_type(string):
    """Classify one hyphen-delimited part of a case number by its characters.

    Parameters
    ----------
    string : str or None
        A single part of a split case number. Missing parts (None, NaN) and
        any other non-string value are accepted and classified as None.

    Returns
    -------
    str or None
        "numeric<N>" when the part is all digits (N is the digit count),
        "alpha" when it is all letters, "alphanumeric" when it mixes letters
        and digits, and None for missing/empty parts or parts containing any
        other character (e.g. "P(N)").
    """
    # NaN is a truthy float, so a plain truthiness check would let it through
    # and fail on .isdigit(); test the type explicitly instead.
    if not isinstance(string, str) or not string:
        return None
    if string.isdigit():
        return "numeric" + str(len(string))
    if string.isalpha():
        return "alpha"
    if string.isalnum():
        return "alphanumeric"
    return None

In [9]:
# Work on an independent copy so the raw split parts stay available unchanged.
pcts_cases_typed = pcts_cases_split.copy(deep=True)

In [10]:
# Replace every part after the prefix (column 0) with its type label.
for part_col in pcts_cases_split.columns[1:]:
    raw_part = pcts_cases_typed[part_col]
    pcts_cases_typed[part_col] = raw_part.map(string_type)
    

In [11]:
# Label the first four parts by position; note the string label '4' is
# distinct from the integer column 4 that follows it.
pcts_cases_typed = pcts_cases_typed.rename(
    columns={0: 'prefix', 1: '2', 2: '3', 3: '4'}
)

### Group by the typed parts to see the different combinations. Unfortunately, this only shows the suffix types that appear in the first suffix position.

In [12]:
# dropna=False keeps combinations where one of the grouping parts is missing.
pcts_cases_typed.groupby(
    by=['prefix', '2', '3', '4'],
    dropna=False,
).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,5,6,7,8,9,10,11,12,13
prefix,2,3,4,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AA,numeric4,numeric1,alpha,,,,,,,,,,
AA,numeric4,numeric2,alpha,,,,,,,,,,
AA,numeric4,numeric3,alpha,,,,,,,,,,
AA,numeric4,numeric4,alpha,alphanumeric,,,,,,,,,
AA,numeric4,numeric5,alpha,alphanumeric,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZA,numeric4,numeric4,,,,,,,,,,,
ZA,numeric4,numeric5,alpha,alpha,,,,,,,,,
ZA,numeric4,numeric5,alphanumeric,,,,,,,,,,
ZAI,numeric4,numeric4,alpha,alphanumeric,,,,,,,,,


### Alternative to see suffix types in various suffix positions.

In [13]:
# Long format: one row per (prefix, '2', '3', suffix column), where the
# stacked value (column 0) is the suffix type and 'level_3' is the suffix column label.
suffix_types_long = (
    pcts_cases_typed
    .set_index(['prefix', '2', '3'])
    .stack(dropna=False)
    .reset_index()
)

# One row per distinct (prefix, '2', '3', suffix type) combination, keeping the
# first non-null suffix column label in which that type occurs.
group_suffixes = (
    suffix_types_long
    .groupby(['prefix', '2', '3', 0], dropna=False)
    .first()
    .reset_index()
)
# Optional export: group_suffixes.to_csv('pcts_cases_typed_stacked.csv')

In [14]:
# Give the stacked value column and the stacked level readable names.
group_suffixes = group_suffixes.rename(
    columns={0: '4', 'level_3': 'suffix_position'}
)

In [15]:
# For positions '2', '3' and '4', list the prefixes whose type label contains
# "alpha" (this matches both "alpha" and "alphanumeric").
for c in group_suffixes.columns[1:4]:
    has_alpha = group_suffixes[c].str.contains("alpha").fillna(False)
    alpha_prefixes = set(group_suffixes[has_alpha]["prefix"].to_list())
    print(f'Position {c} contains alpha characters for the following: \n    {alpha_prefixes}')

Position 2 contains alpha characters for the following: 
    {'ZA'}
Position 3 contains alpha characters for the following: 
    {'VTT', 'TT', 'PS'}
Position 4 contains alpha characters for the following: 
    {'CHC', 'EIR', 'APCW', 'APCE', 'DIR', 'APCSV', 'APCS', 'TT', 'CUB', 'APCH', 'VTT', 'PAR', 'CPC', 'CUZ', 'APCC', 'ADM', 'ENV', 'PS', 'APCNV', 'ZAI', 'ZA', 'AA', 'PM'}


### Check the ZA cases that showed up with an alphanumeric year in the second position. It still appears to be a year, but it uses a convention that needs to be confirmed with the systems team or the planners.

In [16]:
# ZA cases whose second part is not purely digits.
# pcts_cases_split was derived from pcts["CASE_NBR"], so it shares pcts' index;
# select with a boolean mask through .loc rather than feeding index labels to
# the positional .iloc, which is only correct for a default 0..n-1 index.
pcts.loc[
    (pcts_cases_split[0] == 'ZA')
    & ~pcts_cases_split[1].str.isdigit()
]

Unnamed: 0,CASE_ID,APLC_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,CASE_ACTION_ID,CASE_FILE_RCV_DT,CASE_FILE_DATE,PARNT_CASE_ID,PARENT_CASE,AIN,PROJ_DESC_TXT,id
71252,188691.0,115543.0,ZA-19XX-19139-PA9,19139.0,,1.0,2012-09-21,2012-09,29631.0,29631.0,4334007009,,19139_2012
109746,192421.0,118072.0,ZA-19XX-19139-PA10,19139.0,,1.0,2013-07-15,2013-07,29631.0,29631.0,4334007009,,19139_2013


### Note that some auto-generated cases have suffixes with a (n) or (N) attached, which the regular expressions won't catch. Based on the 2010-2019 backup, this appears to affect only 3 case records associated with the DRB suffix. More info [here](https://planning.lacity.org/dcpapi/general/prefixsuffix/active/pdf/).

In [17]:
# Case numbers containing a literal "(".
# regex=False matches the character directly, avoiding the invalid "\(" escape
# sequence in a non-raw string; na=False treats missing case numbers as
# non-matches so the mask is always boolean.
pcts[pcts.CASE_NBR.str.contains('(', regex=False, na=False)]

Unnamed: 0,CASE_ID,APLC_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,CASE_ACTION_ID,CASE_FILE_RCV_DT,CASE_FILE_DATE,PARNT_CASE_ID,PARENT_CASE,AIN,PROJ_DESC_TXT,id
106394,190768.0,116952.0,DIR-2013-774-DRB-SPP-P(N),774.0,2013.0,,2013-03-19,2013-03,,190768.0,4326003226,"PURSUANT TO LAMC SECTION 16.50, PRELIMINARY R...",774_2013
106532,190892.0,117033.0,DIR-2013-888-DRB-SPP-P(N),888.0,2013.0,12.0,2013-03-27,2013-03,,190892.0,5581019013,"PURSUANT TO LAMC SECTION 11.5.7.C, PRELIMINARY...",888_2013
106533,190892.0,117033.0,DIR-2013-888-DRB-SPP-P(N),888.0,2013.0,12.0,2013-03-27,2013-03,,190892.0,5581019018,"PURSUANT TO LAMC SECTION 11.5.7.C, PRELIMINARY...",888_2013
