In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import intake
import pcts_parser

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
#list(catalog.pcts)

In [5]:
df = catalog.pcts.tCASE.read()
df = df[['CASE_NBR']]
df.head()

Unnamed: 0,CASE_NBR
0,PC-1987-764-PC
1,PC-1987-761-PC
2,CPC-1987-640-ZC
3,PC-1987-758-PC
4,CPC-1987-630-BL


In [6]:
parsed_col_names = ['prefix', 'year', 'pcts_case_id', 'suffix', 'invalid_prefix']

def parse_pcts(row):
    try:
        z = pcts_parser.PCTSCaseNumber(row.CASE_NBR)
        return pd.Series([z.prefix, z.year, z.pcts_case_id, z.suffix, z.invalid_prefix], index = parsed_col_names)
    except ValueError:
        return pd.Series(['failed', 'failed', 'failed', 'failed', ''], index = parsed_col_names)

parsed = df.apply(parse_pcts, axis = 1)

df = pd.concat([df, parsed], axis = 1)

df.head()

Unnamed: 0,CASE_NBR,prefix,year,pcts_case_id,suffix,invalid_prefix
0,PC-1987-764-PC,invalid,1987,764,[PC],PC
1,PC-1987-761-PC,invalid,1987,761,[PC],PC
2,CPC-1987-640-ZC,CPC,1987,640,[ZC],
3,PC-1987-758-PC,invalid,1987,758,[PC],PC
4,CPC-1987-630-BL,CPC,1987,630,[BL],


In [7]:
df.prefix.value_counts()

ZA         66128
ENV        39391
CPC        33263
DIR        23897
invalid    23467
ADM         6871
AA          5355
TT          5316
VTT         2450
CHC         1792
PAR         1600
APCNV        505
APCSV        430
APCW         246
APCC         178
             148
PS           134
APCE          47
APCS          25
APCH          18
HPO            1
Name: prefix, dtype: int64

In [8]:
# A lot of these seem to be suffixes.
# What do we do when a suffix is listed as the prefix?
df[df.prefix=='invalid'].invalid_prefix.value_counts()

YV      5357
CEX     3699
PC      2577
BZA     2486
ZAI     1903
ZV       928
CE       916
PM       912
AIC      642
SM       588
CUZ      559
CDP      491
TR       425
CUB      288
EAF      284
PMEX     283
COC      230
YC       171
CAL      146
FDC      128
CUX       80
SPR       80
MND       74
EIR       66
CP        66
DL        30
ND        15
BR        10
PMV        9
PMW        4
DLM        4
NP         2
BOB        1
ZQA        1
CF         1
S          1
PWA        1
DLEX       1
CA         1
FCD        1
QPC        1
GCEX       1
T          1
DU         1
OC         1
N          1
Name: invalid_prefix, dtype: int64

In [13]:
df[df.invalid_prefix=='PMEX'].head()

Unnamed: 0,CASE_NBR,prefix,year,pcts_case_id,suffix,invalid_prefix
71801,PMEX-2000-598,invalid,2000,598,,PMEX
71819,PMEX-2000-1114,invalid,2000,1114,,PMEX
71913,PMEX-1980-2456,invalid,1980,2456,,PMEX
71914,PMEX-19XX-94,invalid,19XX,94,,PMEX
71915,PMEX-19XX-1005,invalid,19XX,1005,,PMEX


In [14]:
df[df.prefix==''].head()

Unnamed: 0,CASE_NBR,prefix,year,pcts_case_id,suffix,invalid_prefix
204,ZA-1987-848-ZAI TH,,,,,
1717,ZA-1988-20198-PLAN APPROVAL,,,,,
3194,-2000-2847,,,,,
3971,CPC-1987-853-ZC/GPA,,,,,
5739,CPC-1986-1093-ZC/GPA,,,,,


In [11]:
wrong = df[(df.prefix == "") | (df.prefix=="invalid")]
print(f'% with invalid_prefix: {len(wrong) / len(df)}')
print(f'# obs with invalid prefixes: {len(wrong)}')
print(f'# invalid prefixes: {wrong.invalid_prefix.nunique()}')

% with invalid_prefix: 0.1117806325794511
# obs with invalid prefixes: 23615
# invalid prefixes: 47


In [12]:
"""
Text to check 1st regexr:
PC-1987-764-PC
CPC-1987-640-ZC
APCNV-2007-848-BL-ZAA-ZAD
CPC-2007-3931-ZC-HD-CUB-CU-ZV-SPR
TT-61102
TT-61605-2A
CDP-1987-34


Text to check 2nd regexr:
TT-61102
TT-61605-2A
VTT-61993-1A
VTT-68920-CC

"""

'\nText to check 1st regexr:\nPC-1987-764-PC\nCPC-1987-640-ZC\nAPCNV-2007-848-BL-ZAA-ZAD\nCPC-2007-3931-ZC-HD-CUB-CU-ZV-SPR\nTT-61102\nTT-61605-2A\nCDP-1987-34\n\n\nText to check 2nd regexr:\nTT-61102\nTT-61605-2A\nVTT-61993-1A\nVTT-68920-CC\n\n'