In [1]:
# Load the cleaned datasets from CLEANED_DATA_DIR.
import os
import sys
# Switch the working directory to the parent of the current working directory
# before importing from `src` (presumably so the project-relative `src` package
# and paths resolve — confirm against the repo layout).
# NOTE(review): this is not idempotent — every re-run of this cell climbs one
# more directory level.
current_directory = os.getcwd()
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
os.chdir(parent_directory)

from src.paths import CLEANED_DATA_DIR
import pandas as pd
import numpy as np

# Pandas display options: no limit on the number of columns shown, up to 500 rows,
# and wide cells/output (1000 characters) so long values are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)

# One DataFrame per cleaned CSV; the first CSV column is used as the index.
# SL = sales list, LR = local roll, DS = secured basic, HZ = hazards, LU = land use.
SL = pd.read_csv(CLEANED_DATA_DIR / 'sales_list_cleaned.csv', index_col = [0])
LR = pd.read_csv(CLEANED_DATA_DIR / 'local_roll_cleaned.csv', index_col = [0])
DS = pd.read_csv(CLEANED_DATA_DIR / 'secured_basic_cleaned.csv', index_col = [0])
HZ = pd.read_csv(CLEANED_DATA_DIR / 'hazards_cleaned.csv', index_col = [0])
LU = pd.read_csv(CLEANED_DATA_DIR / 'land_use_cleaned.csv', index_col = [0])

### 1. Adjust the city names in the data to handle cases where they are not spelled consistently. For example, "New York Cty" and "New York City" should be treated as the same city.

In [2]:
from src.city_const import city_names
# Canonical city names (with their variants), ordered by canonical name.
standardized_city_names = {name: city_names[name] for name in sorted(city_names)}

# Map every known variant spelling back to its canonical city name.
# If a variant is listed under more than one city, the city that appears
# later in city_names wins.
reverse_lookup = {
    variant: city
    for city, variants in city_names.items()
    for variant in variants
}
def correct_city_name_DS(city_name):
    """Return the canonical spelling for ``city_name``.

    The value is looked up in the module-level ``reverse_lookup`` mapping
    (variant -> canonical name). Values without an entry are returned unchanged.
    """
    if city_name in reverse_lookup:
        return reverse_lookup[city_name]
    return city_name

def preprocess_city_names_DS(df, column='city'):
    """Drop rows whose city is purely numeric and standardize the remaining names.

    Args:
        df (pd.DataFrame): Frame holding the city column (e.g. DS).
        column (str): Name of the city column. Defaults to 'city'.

    Returns:
        pd.DataFrame: A filtered copy of ``df`` in which ``column`` has been
        passed through ``correct_city_name_DS``. The input frame is not modified.
    """
    # Build the mask from a string view of the column. Calling `.str.isnumeric()`
    # on the raw column yields NaN for missing values, and `~` on that result
    # raises TypeError. Missing cities are never flagged as numeric, so they are kept.
    is_numeric = df[column].astype(str).str.isnumeric().fillna(False).astype(bool)
    df = df[~is_numeric].copy()

    # Replace known variant spellings with their canonical city name.
    df[column] = df[column].map(correct_city_name_DS)

    return df

# Standardize city names in the secured-basic frame; DS is rebound to the filtered copy.
DS = preprocess_city_names_DS(DS)

In [None]:
from src.city_const import city_names
# Canonical city names (with their variants), ordered by canonical name.
# This rebuilds the same mappings as the DS city-name cell above.
standardized_city_names = {name: city_names[name] for name in sorted(city_names)}

# Map every known variant spelling back to its canonical city name.
# If a variant is listed under more than one city, the city that appears
# later in city_names wins.
reverse_lookup = {
    variant: city
    for city, variants in city_names.items()
    for variant in variants
}
def correct_city_name_LR(city_name):
    """Return the canonical spelling for ``city_name``.

    The value is looked up in the module-level ``reverse_lookup`` mapping
    (variant -> canonical name). Values without an entry are returned unchanged.
    """
    if city_name in reverse_lookup:
        return reverse_lookup[city_name]
    return city_name

def preprocess_city_names_LR(df, column='mail_city'):
    """Drop rows whose mailing city is purely numeric and standardize the rest.

    Args:
        df (pd.DataFrame): Frame holding the city column (e.g. LR).
        column (str): Name of the city column. Defaults to 'mail_city'.

    Returns:
        pd.DataFrame: A filtered copy of ``df`` in which ``column`` has been
        passed through ``correct_city_name_LR``. The input frame is not modified.
    """
    # Build the mask from a string view of the column. Calling `.str.isnumeric()`
    # on the raw column yields NaN for missing values, and `~` on that result
    # raises TypeError. Missing cities are never flagged as numeric, so they are kept.
    is_numeric = df[column].astype(str).str.isnumeric().fillna(False).astype(bool)
    df = df[~is_numeric].copy()

    # Replace known variant spellings with their canonical city name.
    df[column] = df[column].map(correct_city_name_LR)

    return df

# Standardize mailing-city names in the local-roll frame; LR is rebound to the filtered copy.
LR = preprocess_city_names_LR(LR)

### 2. Create a composite key for the assessor_identification_number
##### The first four digits are the mapbook number, the next three digits are the page number, and the last three digits are the parcel number.

In [3]:
# Binds a second name to the same DataFrame object as DS (no copy is made), so
# in-place changes made to that object later (e.g. by split_ain) are visible via `temp` too.
# NOTE(review): `temp` is not referenced again in the cells visible here — looks like
# leftover scratch state; confirm nothing later depends on it before removing.
temp = DS

In [4]:
def split_ain(df, ain_column):
    """
    Splits the assessor identification number (AIN) into mapbook number, page number,
    and parcel number and adds these as the first three columns of the DataFrame.

    The AIN column itself is normalized to a zero-padded, 10-character string.
    Missing AINs stay missing (in the AIN column and in the three new columns)
    rather than being turned into the text '0000000nan'. AINs that were parsed
    as floats (e.g. 1234567890.0) lose the artificial '.0' suffix before padding.

    Note: the DataFrame is modified in place and the same object is returned.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        ain_column (str): The column name in the DataFrame containing the AIN.

    Returns:
        pd.DataFrame: The DataFrame with the new columns added.
    """
    raw_ain = df[ain_column]
    is_missing = raw_ain.isna()

    ain = (
        raw_ain.astype(str)
        # A column containing blanks is read as float, so a whole-number AIN
        # stringifies as e.g. '1234567890.0'; strip that suffix.
        .str.replace(r'^(\d+)\.0$', r'\1', regex=True)
        .str.zfill(10)
        # Restore missing values that astype(str) turned into text.
        .mask(is_missing)
    )
    df[ain_column] = ain

    # Positions within the 10-character AIN: 4 (mapbook) + 3 (page) + 3 (parcel).
    components = {
        'mapbook_number': ain.str[:4],
        'page_number': ain.str[4:7],
        'parcel_number': ain.str[7:10],
    }

    # Place the components at the front of the frame; drop any existing column of
    # the same name first so re-running on an already-split frame still works.
    for position, (name, values) in enumerate(components.items()):
        if name in df.columns:
            df.pop(name)
        df.insert(position, name, values)

    return df

In [7]:
# Add the mapbook/page/parcel columns to every dataset; all five frames
# keep their AIN in the same column.
DS, LR, HZ, LU, SL = (
    split_ain(frame, 'assessor_identification_number')
    for frame in (DS, LR, HZ, LU, SL)
)

In [8]:
def validate_ain(df, column_name='assessor_identification_number'):
    """Keep only the rows whose AIN is exactly 10 characters long.

    The length is measured on the string form of each value, so numeric columns
    are supported. The input DataFrame is left untouched: no helper column is
    written to it.

    Args:
        df (pd.DataFrame): The DataFrame to filter.
        column_name (str): Column holding the assessor identification number.

    Returns:
        pd.DataFrame: A copy of ``df`` restricted to rows with a 10-character AIN.
    """
    # Compute the lengths as a standalone Series instead of a temporary column,
    # so the caller's frame is never modified.
    ain_lengths = df[column_name].astype(str).str.len()

    return df[ain_lengths == 10].copy()

# Drop rows with a malformed AIN from every dataset (default AIN column).
DS, LR, SL, HZ, LU = (validate_ain(frame) for frame in (DS, LR, SL, HZ, LU))

In [9]:
from src.paths import VALIDATION_DIR

# Write each validated dataset to VALIDATION_DIR (index included).
validated_outputs = {
    'secured_basic_validated.csv': DS,
    'local_roll_validated.csv': LR,
    'hazards_validated.csv': HZ,
    'land_use_validated.csv': LU,
    'sales_list_validated.csv': SL,
}
for output_name, frame in validated_outputs.items():
    frame.to_csv(VALIDATION_DIR / output_name)

### 4. Optional - Convert the hazards columns to binary columns, where 1 indicates that the property is in a hazard zone and 0 indicates that it is not. A property that is near a hazard zone should also be considered in a hazard zone.