In [None]:
import os
import sys
import pandas as pd

sys.path.insert(0, '../')
from utils import drop_dups_with_less_info

In [None]:
# Directory (relative path) holding the county assessor CSV files that the
# cells below load with pd.read_csv.
df_dir = 'step_1_outputs/csv_data'

# Inspecting duplicate APNs and building_ids

Tables provided by Santa Clara with duplicate APNs:

- Commercial: pc851ci
- Multi-Family: pc851mf
- Single-Family: pc851sf

This step runs between Steps I and II of the county assessor table processing. Its output is a set of modified county tables in each county's output folder (on S3 and locally).

## Commercial: pc851ci

**Conclusion:** Some information may not be up to date

**Recommendation:** Select the record with more information (fewer NaN and 0 values)

In [None]:
# Load the commercial table from the directory named by df_dir.
pc851ci = pd.read_csv(
    os.path.join(df_dir, 'Santa_Clara_Assessor_Data_pc851ci.csv')
)

# building_id has the form "A<APN>B<BUILDING_NUM>", built from the string
# representation of both columns.
pc851ci['building_id'] = 'A' + pc851ci['APN'].astype(str).str.cat(
    pc851ci['BUILDING_NUM'].astype(str), sep='B'
)

In [None]:
# Tally rows per building_id; any id counted more than once is a duplicate.
vcs = pc851ci['building_id'].value_counts()
dup_ids = vcs.loc[vcs.gt(1)].index

# Keep only the rows carrying a duplicated id and transpose them so the
# competing records are displayed as side-by-side columns.
mini_pc851ci = pc851ci.loc[pc851ci['building_id'].isin(dup_ids)]
mini_pc851ci.T

In [None]:
# Columns whose values are mismatched between duplicate records
inconsistent_cols = [
    'REMARKS',  # noted for APN 42451131
]

# Columns noted as NaN
nan_cols = [
    'CONDITION_CODE',
    'REMARKS',
]

# Columns noted as 0
zero_cols = [
    'EFFECTIVE_YEAR',
    'NUMBER_UNITS',
    'LEASE_AREA',
    'RENTABLE_AREA',
    'TOTAL_AREA',
    'NUMBER_FLOORS',
]

#### Drop duplicates and check that it worked

In [None]:
# Remove duplicated building records from the commercial table.
# NOTE(review): the return value is ignored, so the check below only passes
# if drop_dups_with_less_info modifies pc851ci in place -- confirm in utils.
drop_dups_with_less_info(pc851ci)

# Verify that it worked: no building_id may occur more than once.
vcs = pc851ci['building_id'].value_counts()
dup_ids = vcs[vcs > 1].index
# `assert` is a statement, not a function; the message reports how many
# duplicated ids survived if the check fails.
assert len(dup_ids) == 0, (
    "pc851ci still contains %d duplicated building_id value(s)" % len(dup_ids)
)

## Multi-Family: pc851mf

**Conclusion:** Cannot discern why there are duplicate records 

**Recommendation:** Select the record with more information (fewer NaN and 0 values)

In [None]:
# Load the multi-family table and discard fully identical rows in one step
# (there was 1 duplicate value for APN 31312025).
pc851mf = pd.read_csv(
    os.path.join(df_dir, 'Santa_Clara_Assessor_Data_pc851mf.csv')
).drop_duplicates()

In [None]:
# building_id has the form "A<APN>B<BUILDING_NUM>", built from the string
# representation of both columns.
pc851mf['building_id'] = 'A' + pc851mf['APN'].astype(str).str.cat(
    pc851mf['BUILDING_NUM'].astype(str), sep='B'
)

In [None]:
# Tally rows per building_id; any id counted more than once is a duplicate.
vcs = pc851mf['building_id'].value_counts()
dup_ids = vcs.loc[vcs.gt(1)].index

# Keep only the rows carrying a duplicated id and transpose them so the
# competing records are displayed as side-by-side columns.
mini_pc851mf = pc851mf.loc[pc851mf['building_id'].isin(dup_ids)]
mini_pc851mf.T

In [None]:
# Columns whose values are mismatched between duplicate records.
# Noted for APN 76432056 (the original note lists this APN twice; the second
# entry may have been meant to be a different APN -- TODO confirm).
inconsistent_cols = [
    'YEAR_BUILT',
    'REMARKS',
]

# Columns noted as NaN
nan_cols = [
    'REC_ROOM_FLAG',
    'FIREPLACE_FLAG',
    'CONDITION_CODE',
]

# Columns noted as 0
zero_cols = [
    'EFFECTIVE_YEAR',
    'LEASE_AREA',
    'REMARKS',
]

#### Drop duplicates and check that it worked

In [None]:
# Remove duplicated building records from the multi-family table.
# NOTE(review): the return value is ignored, so the check below only passes
# if drop_dups_with_less_info modifies pc851mf in place -- confirm in utils.
drop_dups_with_less_info(pc851mf)

# Verify that it worked: no building_id may occur more than once.
vcs = pc851mf['building_id'].value_counts()
dup_ids = vcs[vcs > 1].index
# `assert` is a statement, not a function; the message reports how many
# duplicated ids survived if the check fails.
assert len(dup_ids) == 0, (
    "pc851mf still contains %d duplicated building_id value(s)" % len(dup_ids)
)

## Single-Family: pc851sf

**Conclusion:** Duplicate building ids seem to be due to old building records (e.g. pre-addition, pre-demolition)

**Recommendation:** Select the newest record based on YEAR_BUILT or EFFECTIVE_YEAR and the amount of information (fewer NaN and 0 values); if the years are the same, select the record with more information (fewer NaN and 0 values). **EDIT**: just select the record with more data (to be consistent with the other tables' processing)

In [None]:
# Load the single-family table and discard fully identical rows in one step.
pc851sf = pd.read_csv(
    os.path.join(df_dir, 'Santa_Clara_Assessor_Data_pc851sf.csv')
).drop_duplicates()

In [None]:
# building_id has the form "A<APN>B<BUILDING_NUM>", built from the string
# representation of both columns.
pc851sf['building_id'] = 'A' + pc851sf['APN'].astype(str).str.cat(
    pc851sf['BUILDING_NUM'].astype(str), sep='B'
)

In [None]:
# Tally rows per building_id; any id counted more than once is a duplicate.
vcs = pc851sf['building_id'].value_counts()
dup_ids = vcs.loc[vcs.gt(1)].index

# Keep only the rows carrying a duplicated id and transpose them so the
# competing records are displayed as side-by-side columns.
mini_pc851sf = pc851sf.loc[pc851sf['building_id'].isin(dup_ids)]
mini_pc851sf.T

In [None]:
# Columns whose values are mismatched between duplicate records
# (not a complete list); noted for APN 52701013, 48123055
inconsistent_cols = [
    'POOL_SAP_CODE',
    'REMARKS',
    'EFFECTIVE_YEAR',
    'YEAR_BUILT',
]

# Columns noted as NaN
# NOTE(review): 'PATIO-CODE' uses a hyphen while every other name here uses
# underscores -- confirm it matches the CSV header.
nan_cols = [
    'REMARKS',
    'HEAT_AIR_COND',
    'GARAGE_PORT',
    'PORCH_FLAG',
    'PATIO-CODE',
    'MISC_COSTS',
    'POOL_SAP_CODE',
    'HILLSIDE_FLAG',
    'TENNIS_FLAG',
    'GARAGE_CONV_FLAG',
    'EXTRA_KITCHEN',
    'EXTRA_PLUMB',
    'SCREEN_ROOM_FLAG',
]

# Columns noted as 0 (for APN 48123055)
zero_cols = [
    'TOTAL_ROOMS',
    'BATH_ROOMS',
    'BEDROOM',
    'FIRST_FLOOR_AREA',
]

In [None]:
# Show every record for building_id 'A52701013B1.0', transposed for
# side-by-side comparison.
pc851sf.loc[pc851sf['building_id'].eq('A52701013B1.0')].T

In [None]:
# Show every record for building_id 'A51719079B1.0', transposed for
# side-by-side comparison.
pc851sf.loc[pc851sf['building_id'].eq('A51719079B1.0')].T

#### Drop duplicates and check that it worked

In [None]:
# Remove duplicated building records from the single-family table.
# NOTE(review): the return value is ignored, so the check below only passes
# if drop_dups_with_less_info modifies pc851sf in place -- confirm in utils.
drop_dups_with_less_info(pc851sf)

# Verify that it worked: no building_id may occur more than once.
vcs = pc851sf['building_id'].value_counts()
dup_ids = vcs[vcs > 1].index
# `assert` is a statement, not a function; the message reports how many
# duplicated ids survived if the check fails.
assert len(dup_ids) == 0, (
    "pc851sf still contains %d duplicated building_id value(s)" % len(dup_ids)
)