# Check original file against published reports
## ADU / SPR

In [1]:
import intake
import numpy as np
import pandas as pd
import laplan

catalog = intake.open_catalog('../catalogs/*.yml')
bucket_name = "city-planning-entitlements"

In [2]:
start_date = "1/1/10"
end_date = "10/31/19"

# Let's throw our new master_pcts into the d1_step_by_step
#master_pcts = catalog.pcts2.read()
master_pcts = pd.read_parquet('s3://city-planning-entitlements/test_new_master_pcts.parquet')

### PCTS Reporting Module Results

In [3]:
def import_and_subset(name):
    df = pd.read_excel(f'../data/pcts_{name}.xlsx', skiprows=4)
    keep = ["CASE NUMBER", "FILE DATE"]
    df = df[keep].rename(columns = {"CASE NUMBER": "CASE_NBR"})
    return df

### ITA laplan function

In [4]:
# All prefixes and suffixes
# This is our old master_pcts
def laplan_subset(name):
    name = name.upper()
    pcts = laplan.pcts.subset_pcts(
        master_pcts, 
        start_date = start_date , end_date = end_date,
        get_dummies=True, verbose=False)
    pcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements=False)
    
    pcts = pcts[pcts[name]==True]
    
    return pcts

### ITA step-by-step in creating master_pcts

In [None]:
def ita_step_by_step(name):
    name = name.upper()
    print(f"{name}: Creating master PCTS step-by-step")
    
    case = pd.read_parquet(f's3://{bucket_name}/data/raw/tCASE.parquet')
    app = pd.read_parquet(f's3://{bucket_name}/data/raw/tAPLC.parquet')
    geo_info = pd.read_parquet(f's3://{bucket_name}/data/raw/tPROP_GEO_INFO.parquet')
    la_prop = pd.read_parquet(f's3://{bucket_name}/data/raw/tLA_PROP.parquet')

    app1 = app[['APLC_ID', 'PROJ_DESC_TXT']]
    geo_info1 = geo_info[['CASE_ID', 'PROP_ID']]
    la_prop1 = la_prop[la_prop.ASSR_PRCL_NBR.notna()][['PROP_ID', 'ASSR_PRCL_NBR']]
    
    # Subset by start/end date
    case2 = case[(case.CASE_FILE_RCV_DT >= start_date) & 
            (case.CASE_FILE_RCV_DT <= end_date)]
    
    # Subset by suffix 
    case3 = case2[case2.CASE_NBR.str.contains(f"-{name}")]
    
    print(f'1-# unique cases (parents + child): {case3.CASE_NBR.nunique()}')
    
    # Keep parent cases only
    case4 = case3[case3.PARNT_CASE_ID.isna()]
    
    print(f'2-# unique cases (parents): {case4.CASE_NBR.nunique()}')
    
    m1 = pd.merge(case4, geo_info1, on = 'CASE_ID', how = 'inner', validate = '1:m')
    m2 = pd.merge(m1, la_prop1, on = 'PROP_ID', how = 'inner', validate = 'm:1')
    m3 = pd.merge(m2, app1, on = 'APLC_ID', how = 'left', validate = 'm:1')
    
    print(f'3-# unique cases (parents), with geo_info merged: {m1.CASE_NBR.nunique()}')
    print(f'4-# unique cases (parents), with la_prop merged: {m2.CASE_NBR.nunique()}')
    print(f'5-# unique cases (parents), with app merged: {m3.CASE_NBR.nunique()}')


### ITA D1 step-by-step for dashboard

In [5]:
prefix_list = laplan.pcts.VALID_PCTS_PREFIX
suffix_list = laplan.pcts.VALID_PCTS_SUFFIX

remove_prefix = ["ENV"]
remove_suffix = [
    "EIR",
    "IPRO",
    "CA",
    "CATEX",
    "CPIO",
    "CPU",
    "FH",
    "G",
    "HD",
    "HPOZ",
    "ICO",
    "K",
    "LCP",
    "NSO",
    "S",
    "SN",
    "SP",
    "ZAI",
    "CRA", 
    "RFA",
]

prefix_list = [x for x in prefix_list if x not in remove_prefix]
suffix_list = [x for x in suffix_list if x not in remove_suffix]

def d1_step_by_step(name):
    name = name.upper()
    print(f"{name}: D1 step-by-step")
        
    # Load PCTS and subset to the prefix / suffix list we want
    pcts = laplan.pcts.subset_pcts(
        master_pcts,
        start_date = start_date, end_date = end_date,
        prefix_list=prefix_list, suffix_list=suffix_list,
        get_dummies=True, verbose=False,
    )
    pcts = laplan.pcts.drop_child_cases(pcts, keep_child_entitlements=True)
    pcts = pcts[pcts[name]==True][["CASE_NBR", "CASE_ID", "AIN"]]
    
    print(f'1-# unique cases (parents) using laplan: {pcts.CASE_NBR.nunique()}')
    
    # Add on tract info
    # See which cases have AINs, but those AINs are not mapped onto tract GEOID
    parcel_to_tract = catalog.crosswalk_parcels_tracts.read()
    parcel_to_tract = parcel_to_tract[["AIN", "num_AIN", "GEOID"]]

    pcts = pd.merge(
        pcts,
        parcel_to_tract, 
        on="AIN",
        how="inner",
        validate="m:1",
    )
    
    print(f'2-# unique cases (parents), with tract merged in: {pcts.CASE_NBR.nunique()}')
    
    # Clean AIN data and get rid of outliers
    case_counts = pcts.CASE_ID.value_counts()
    big_cases = pcts[pcts.CASE_ID.isin(case_counts[case_counts > 20].index)]

    pcts = pcts[~pcts.CASE_ID.isin(big_cases.CASE_ID)]
    
    print(f'3-# unique cases (parents) removing outliers: {pcts.CASE_NBR.nunique()}')

## Comparisons

In [6]:
# Put functions all together
def comparison(suffix):
    dcp = import_and_subset(suffix)
    ita = laplan_subset(suffix)

    print("Discrepancies in DCP vs ITA")
    print(f'DCP-{suffix.upper()} unique cases (parents) in PCTS report: {dcp.CASE_NBR.nunique()}')
    print(f'ITA-{suffix.upper()} unique cases (parents) with laplan, all prefixes/suffixes: {ita.CASE_NBR.nunique()}')    
    #ita_step_by_step(suffix)
    d1_step_by_step(suffix)

In [7]:
comparison("1a")

Discrepancies in DCP vs ITA
DCP-1A unique cases (parents) in PCTS report: 1412
ITA-1A unique cases (parents) with laplan, all prefixes/suffixes: 0
1A: D1 step-by-step
1-# unique cases (parents) using laplan: 1110
2-# unique cases (parents), with tract merged in: 1110
3-# unique cases (parents) removing outliers: 1110


In [8]:
comparison("2a")

Discrepancies in DCP vs ITA
DCP-2A unique cases (parents) in PCTS report: 119
ITA-2A unique cases (parents) with laplan, all prefixes/suffixes: 0
2A: D1 step-by-step
1-# unique cases (parents) using laplan: 67
2-# unique cases (parents), with tract merged in: 67
3-# unique cases (parents) removing outliers: 67


In [9]:
comparison("5a")

Discrepancies in DCP vs ITA
DCP-5A unique cases (parents) in PCTS report: 3
ITA-5A unique cases (parents) with laplan, all prefixes/suffixes: 0
5A: D1 step-by-step


KeyError: '5A'

In [None]:
#comparison("adu")

In [None]:
#comparison("spr")