# Parse PCTS CASE_NBR

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import intake
import pcts_parser

In [2]:
catalog = intake.open_catalog("../catalogs/*.yml")

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
#list(catalog.pcts)

['tAPEL_CASE',
 'tAPLC',
 'tCASE',
 'tCOMMENT',
 'tCPC_CASE',
 'tDWELLING_UNITS',
 'tDWELLING_UNIT_SPECIALGROUPS',
 'tENV_CASE',
 'tHOLD_CASE',
 'tLA_PROP',
 'tLOC',
 'tPROP_CPIO',
 'tPROP_CUGU',
 'tPROP_GEO_INFO',
 'tPROP_SPECPLAN',
 'tZONING_CASE',
 'trefDWELLING_UNIT_PROPTYPE',
 'trefDWELLING_UNIT_SPECIALGROUPS',
 'trefHOLD_STG']

In [5]:
df = catalog.pcts.tCASE.read()
df = df[['CASE_NBR']]
df.head()

Unnamed: 0,CASE_NBR
0,PC-1987-764-PC
1,PC-1987-761-PC
2,CPC-1987-640-ZC
3,PC-1987-758-PC
4,CPC-1987-630-BL


In [6]:
parsed_col_names = ['prefix', 'year', 'pcts_case_id', 'suffix', 'invalid_prefix']

def parse_pcts(row):
    try:
        z = pcts_parser.PCTSCaseNumber(row.CASE_NBR)
        return pd.Series([z.prefix, z.year, z.pcts_case_id, z.suffix, z.invalid_prefix], index = parsed_col_names)
    except ValueError:
        return pd.Series(['failed', 'failed', 'failed', 'failed', ''], index = parsed_col_names)

parsed = df.apply(parse_pcts, axis = 1)

df = pd.concat([df, parsed], axis = 1)

df.head()

Unnamed: 0,CASE_NBR,prefix,year,pcts_case_id,suffix,invalid_prefix
0,PC-1987-764-PC,invalid,1987,764,[PC],PC
1,PC-1987-761-PC,invalid,1987,761,[PC],PC
2,CPC-1987-640-ZC,CPC,1987,640,[ZC],
3,PC-1987-758-PC,invalid,1987,758,[PC],PC
4,CPC-1987-630-BL,CPC,1987,630,[BL],


In [7]:
df.year.value_counts()

19XX    34285
         8745
2006     7834
2019     7547
2018     7487
2005     7418
2004     6927
2007     6663
2002     5704
2003     5694
2008     5459
2017     5450
2014     5194
2000     5005
2016     4795
2001     4755
2015     4633
2009     4524
1988     4448
2013     4401
1989     4095
2012     3772
1990     3697
2010     3579
2011     3444
1999     3057
1987     3057
1991     2577
1986     2475
1992     2439
1996     2252
1998     2251
1993     2236
1995     2073
1985     2008
1984     2001
1997     1989
1983     1957
1994     1916
1980     1476
1981     1469
1982     1346
1979     1063
1978     1053
1977      995
1974      420
2020      393
1976      385
1973      361
1975      317
1972      307
1966      230
1959      222
1964      194
1962      179
1965      177
1963      176
1971      165
1956      163
1970      156
1955      145
1957      143
1961      140
1949      140
1958      128
1953      124
1954      119
1968      116
1948      113
1950      113
1967      111
1960  

In [9]:
post_2000 = df[df.year.str.contains('20')]

In [11]:
post_2000.year = post_2000.year.astype(int)

recent = post_2000.loc[post_2000.year >= 2010]
recent2 = recent.loc[recent.year >= 2015]

## Doesn't appear to be any invalid prefixes in 2010-present data

In [12]:
recent.prefix.value_counts()

ENV      16296
DIR      12492
ZA        9332
ADM       6871
AA        1841
PAR       1600
CPC       1024
CHC        924
APCNV      101
APCSV       93
APCC        46
APCW        40
APCS        15
APCE        14
APCH         6
Name: prefix, dtype: int64

In [13]:
recent2.prefix.value_counts()

ENV      8751
DIR      6880
ADM      6871
ZA       3908
PAR      1600
AA        986
CPC       659
CHC       490
APCNV      58
APCSV      41
APCC       32
APCW       13
APCE        8
APCH        4
APCS        4
Name: prefix, dtype: int64

## Validate against City Planning's counts

In [18]:
recent[recent.prefix=='ADM'].year.value_counts()

2019    3432
2018    2909
2017     314
2020     216
Name: year, dtype: int64

In [16]:
print(f'2010 and after total: {len(recent)}')
print(f'2010 and after ENV: {len(recent[recent.prefix=="ENV"])}')
print(f'2010 and after ADM: {len(recent[recent.prefix=="ADM"])}')

print(f'2015 and after total: {len(recent2[recent2.prefix=="ADM"])}')
print(f'2015 and after ADM: {len(recent2[recent2.prefix=="ADM"])}')
print(f'2015 and after ADM: {len(recent2[recent2.prefix=="ADM"])}')

2010 and after total: 50695
2010 and after ENV: 16296
2010 and after ADM: 6871
2015 and after total: 6871
2015 and after ADM: 6871
2015 and after ADM: 6871


In [12]:
"""
Text to check 1st regexr:
PC-1987-764-PC
CPC-1987-640-ZC
APCNV-2007-848-BL-ZAA-ZAD
CPC-2007-3931-ZC-HD-CUB-CU-ZV-SPR
TT-61102
TT-61605-2A
CDP-1987-34


Text to check 2nd regexr:
TT-61102
TT-61605-2A
VTT-61993-1A
VTT-68920-CC

"""

'\nText to check 1st regexr:\nPC-1987-764-PC\nCPC-1987-640-ZC\nAPCNV-2007-848-BL-ZAA-ZAD\nCPC-2007-3931-ZC-HD-CUB-CU-ZV-SPR\nTT-61102\nTT-61605-2A\nCDP-1987-34\n\n\nText to check 2nd regexr:\nTT-61102\nTT-61605-2A\nVTT-61993-1A\nVTT-68920-CC\n\n'

## Validate year and case numbers

In [15]:
recent.pcts_case_id.value_counts()

94      18
2714    17
397     16
1402    16
2084    14
1169    14
767     14
2186    14
305     14
92      14
2119    14
2511    14
307     14
3197    14
3051    14
632     14
103     14
472     14
1288    14
4074    14
2111    13
3280    13
90      13
817     13
977     13
1923    13
336     13
69      13
2527    13
837     13
3310    13
1358    13
277     13
911     13
382     13
3231    13
886     13
1208    13
364     13
864     13
2401    13
1001    13
2658    13
1068    13
1736    13
2178    13
778     13
2164    13
2451    13
2157    13
328     13
1881    13
323     13
3028    13
2015    13
2197    13
914     13
2067    13
366     13
2615    13
2807    13
519     13
543     13
2727    13
852     13
1655    13
727     13
967     13
361     13
552     13
1330    13
35      13
1722    13
385     13
1073    13
370     13
28      13
83      13
108     13
2029    13
3213    13
2548    13
889     13
1862    13
41      13
2593    13
334     13
1880    13
1158    13
1779    12
319     12

In [17]:
check_id = ['2714', '397']

recent[recent.pcts_case_id.isin(check_id)].sort_values(['pcts_case_id', 'year'], ascending = [True, True])

# Looks like case numbers are not unique across years. But, if they have different year, can we treat them as different cases?
# If they have multiple obs in the same year, should we treat them as the same case or different cases?
# It looks like the suffixes are just getting longer and longer, but it's not only you're taking more on, you're also revising some of the suffixes.

Unnamed: 0,CASE_NBR,prefix,year,pcts_case_id,suffix,invalid_prefix
154818,ZA-2010-2714-ZAI,ZA,2010,2714,[ZAI],
158820,ZA-2011-2714-CEX,ZA,2011,2714,[CEX],
165586,ENV-2012-2714-CE,ENV,2012,2714,[CE],
169480,ZA-2013-2714-CUB,ZA,2013,2714,[CUB],
179303,AA-2014-2714-PMEX,AA,2014,2714,[PMEX],
180214,CPC-2015-2714-VZC-SP-DRB-SPP,CPC,2015,2714,"[VZC, SP, DRB, SPP]",
189484,CPC-2015-2714-VZC-SP-DRB-SPP-PA1,CPC,2015,2714,"[VZC, SP, DRB, SPP, PA1]",
190095,CPC-2015-2714-VZC-SP-DRB-SPP-PA1-1A,CPC,2015,2714,"[VZC, SP, DRB, SPP, PA1, 1A]",
192719,CPC-2015-2714-VZC-SP-DRB-SPP-PA2,CPC,2015,2714,"[VZC, SP, DRB, SPP, PA2]",
192819,CPC-2015-2714-VZC-SP-DRB-SPP-PA3,CPC,2015,2714,"[VZC, SP, DRB, SPP, PA3]",


In [19]:
recent.pcts_case_id.nunique()

7772

In [20]:
len(recent)

50695

In [25]:
recent.duplicated(subset = ['pcts_case_id', 'year']).value_counts()

False    48426
True      2269
dtype: int64

In [26]:
recent.duplicated(subset = ['pcts_case_id']).value_counts()

True     42923
False     7772
dtype: int64