### This notebook processes the owner names in this large dataset.
##### The following custom functions process the owner names in the dataset. The columns we transform come from the secured basic dataset (loaded below from `secured_basic_validated.csv`): "first_owner_name", "first_owner_name_overflow", and "second_owner_name".
##### These columns also contain useful information about trusts, which we store and label in separate columns. The main approach is to parse out the trust names, clean up the care-of ('CO') markers, and remove any special characters.

In [1]:
# Set up the working directory and imports for this notebook; the datasets
# themselves are read further down from VALIDATION_DIR.
import os
import sys
# Get the current working directory
current_directory = os.getcwd()
# Switch to the parent directory before importing from `src` -- presumably the
# notebook lives one level below the project root; confirm against the repo.
# NOTE(review): re-running this cell moves up one more directory each time.
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
os.chdir(parent_directory)

# Project directory constants come from src.paths; pandas/numpy/re are used below
from src.paths import FINAL_DATA_DIR, FRONTEND_DIR, VALIDATION_DIR, NORMALIZED_DATA_DIR
import pandas as pd
import numpy as np
import re

# Raise the pandas display limits so wide frames and long strings are shown
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 1000),
    ('display.max_rows', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)


def _read_validated(filename):
    """Read one CSV from VALIDATION_DIR, using its first column as the index."""
    return pd.read_csv(VALIDATION_DIR / filename, index_col = [0])


# Load the five validated datasets
SL = _read_validated('sales_list_validated.csv')
LR = _read_validated('local_roll_validated.csv')
DS = _read_validated('secured_basic_validated.csv')
HZ = _read_validated('hazards_validated.csv')
LU = _read_validated('land_use_validated.csv')

In [11]:
# Preview the first rows of the local roll data (LR)
LR.head()

Unnamed: 0,mapbook_number,page_number,parcel_number,assessor_identification_number,taxrate_area,administrative_region_number,common_area_key,year_sold_to_state,recording_date,land_value,improvement_value,exemption_claim_type_key,gross_personal_property_value,fixture_value,real_estate_exemption,personal_property_exemption,fixture_exemption_value,homeowner's_exemption,first_owner_assessee_name,first_owner_assessee_name_overflow,second_owner_assessee_name,special_name_assessee,address_key,address_date_of_last_change,postal_city_code,situs_house_number,unit,zip,street_name,city,legal_description_narrative,legal_description_lot,legal_description_division,legal_description_region,legal_description_line_one,legal_description_line_two,legal_description_line_three,legal_description_line_four,legal_description_line_five,zoning_code,use_code,effective_year,year_built,main_building_square_feet,county_name,state_name
0,2004,1,3,2004001003,16,2,0,0,2009-05-29,697735,295271,0,0,0,0,0,0,0,"PATEL,HANISH B AND ALPA P TRS",PATEL TRUST,Unknown,Unknown,D,1993-07-01,15,8321,Unknown,91304,FAUST AVE,LOS ANGELES CA,TRACT NO 25040,99,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,101,1973,1973,2090,Los Angeles,California
1,2004,1,4,2004001004,16,2,0,0,2021-12-01,363273,250862,0,0,0,0,0,0,0,"ALI,SYED SHAH AND DILRUBA S TRS",ALI FAMILY TRUST,Unknown,Unknown,D,1993-07-01,15,8313,Unknown,91304,FAUST AVE,LOS ANGELES CA,TRACT NO 25040,100,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,101,1973,1973,2479,Los Angeles,California
2,2004,1,5,2004001005,16,2,0,0,2017-08-18,516040,194684,0,0,0,0,0,0,0,"POPAL,ARINA AND",Unknown,"POPAL,MALIHA",Unknown,D,1993-07-01,15,8309,Unknown,91304,FAUST AVE,LOS ANGELES CA,TRACT NO 25040,101,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,100,1973,1973,2057,Los Angeles,California
3,2004,1,8,2004001008,16,2,0,0,1979-07-02,125903,217613,0,0,0,0,0,0,7000,"SZABO,GEORGE AND JOY",Unknown,Unknown,Unknown,D,1993-07-01,15,8325,Unknown,91304,MAYNARD AVE,LOS ANGELES CA,*TR=30333,1,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE11,101,1978,1978,2423,Los Angeles,California
4,2004,1,9,2004001009,16,2,0,0,1984-06-19,137190,205895,0,0,0,0,0,0,7000,"CAPUTO,DONATO AND IMMACOLATA",Unknown,Unknown,Unknown,D,1993-07-01,15,8311,Unknown,91304,MAYNARD AVE,LOS ANGELES CA,*TR=30333,2,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE11,101,1978,1978,2226,Los Angeles,California


In [7]:
def reindex_column(df, column_name, new_position):
    """
    Move one column of a DataFrame to a given position, in place.

    Args:
        df (pd.DataFrame): The DataFrame whose column is moved (modified in place).
        column_name (str): The name of the column to move.
        new_position (int): Target 0-based position for the column.

    Returns:
        pd.DataFrame: The same DataFrame object. If the column is missing, a
        warning is printed and the DataFrame is returned untouched.
    """
    # Guard clause: nothing to move when the column is absent
    if column_name not in df.columns:
        print(f"Warning: Column '{column_name}' not found in DataFrame.")
        return df

    # Detach the column, then re-attach it at the requested slot
    moved_values = df.pop(column_name)
    df.insert(new_position, column_name, moved_values)
    return df

# Move the assessor identification number to 0-based column position 3 in the
# land use (LU) and hazards (HZ) frames
LU = reindex_column(LU, 'assessor_identification_number', 3)
HZ = reindex_column(HZ, 'assessor_identification_number', 3)

In [12]:
# Trustee marker: "TR" or "TRS" as a whole token preceded by a space and
# followed by either a space or the end of the name.
_TRUSTEE_MARKER = re.compile(r' TRS?(?= |\Z)')


def identify_trusts(row):
    """
    Identify trust names and indicators from the owner's name.

    The name is split at the first " TR" / " TRS" token. Everything before the
    token stays in 'first_owner_name'; the token and everything after it go to
    'trust_names' (the token itself is kept in that text). A marker at the very
    end of the name (e.g. "SMITH,JOHN TR") is also recognised, so such rows are
    flagged as trusts as well.

    Parameters:
    row (pd.Series): A row of the DataFrame with a 'first_owner_name' entry.

    Returns:
    pd.Series: Updated row with 'trust_names' and 'trust_indicator'. Rows whose
    name is not a string (e.g. missing) are left unchanged with
    trust_indicator 0.
    """
    owner_name = row['first_owner_name']
    trust_name = ""
    trust_indicator = 0

    if isinstance(owner_name, str):
        # search() returns the left-most marker, i.e. whichever of TR/TRS
        # occurs first in the name.
        marker = _TRUSTEE_MARKER.search(owner_name)
        if marker is not None:
            # Split just after the space that precedes the marker
            split_at = marker.start() + 1
            trust_name = owner_name[split_at:].strip()
            owner_name = owner_name[:split_at].strip()
            trust_indicator = 1

    row['first_owner_name'] = owner_name
    row['trust_names'] = trust_name
    row['trust_indicator'] = trust_indicator

    return row

def _owner_text(value):
    """Return value as text, mapping missing values (None / NaN) to ''."""
    # NaN is the only float that is not equal to itself
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def replace_unknown_and_concatenate(row):
    """
    Replace 'Unknown' with empty strings and concatenate owner names.

    The overflow and second-owner fields have every 'Unknown' removed and are
    written back to the row. The three name fields are then joined with single
    spaces, skipping blank parts so that an empty overflow field does not leave
    a double space in the result. Missing (None/NaN) fields count as blank.

    Parameters:
    row (pd.Series): A row of the DataFrame with 'first_owner_name',
        'first_owner_name_overflow' and 'second_owner_name' entries.

    Returns:
    pd.Series: Updated row with concatenated 'first_owner_name'.
    """
    overflow = _owner_text(row['first_owner_name_overflow']).replace('Unknown', '')
    second = _owner_text(row['second_owner_name']).replace('Unknown', '')
    row['first_owner_name_overflow'] = overflow
    row['second_owner_name'] = second

    pieces = (_owner_text(row['first_owner_name']), overflow, second)
    # Join only the non-blank parts so the separator never doubles up
    row['first_owner_name'] = " ".join(
        piece.strip() for piece in pieces if piece.strip()
    )
    return row

def process_care_of(row):
    """
    Identify 'care of' indicator and clean owner names accordingly.

    A standalone "CO" token is treated as the care-of marker. It is removed
    when it appears between two spaces, at the start of the name ("CO ...") or
    at the end of the name ("... CO"). The trailing marker is removed by
    slicing off the suffix, so an earlier " CO" inside a word such as "COOPER"
    is never touched.

    Parameters:
    row (pd.Series): A row of the DataFrame with a 'first_owner_name' entry.

    Returns:
    pd.Series: Updated row with 'care_of' indicator (1 if a marker was found,
    otherwise 0). Rows whose name is not a string are returned with care_of 0
    and the name untouched.
    """
    owner_name = row['first_owner_name']
    row['care_of'] = 0

    if not isinstance(owner_name, str):
        return row

    found = False

    # Marker in the middle of the name
    if ' CO ' in owner_name:
        found = True
        owner_name = owner_name.replace(' CO ', ' ').strip()

    # Marker at the very beginning
    if owner_name.startswith('CO '):
        found = True
        owner_name = owner_name[len('CO '):].strip()

    # Marker at the very end: drop only the suffix itself
    if owner_name.endswith(' CO'):
        found = True
        owner_name = owner_name[:-len(' CO')].strip()

    if found:
        row['care_of'] = 1

    row['first_owner_name'] = owner_name
    return row

# Whole-word "TR"/"TRS" (any case) together with the whitespace in front of it,
# so removing a token from the middle of a name does not leave a double space.
_TR_TOKEN = re.compile(r'\s*\b(?:TRS|TR)\b', flags=re.IGNORECASE)


def remove_trs_tr(row):
    """
    Remove 'TRS' and 'TR' from owner names and trust names.

    Only whole-word tokens are removed (case-insensitive); words that merely
    contain the letters, such as "TRUST", are kept. Whitespace directly before
    a removed token is removed with it, and the result is stripped.

    Parameters:
    row (pd.Series): A row of the DataFrame with string 'first_owner_name'
        and 'trust_names' entries.

    Returns:
    pd.Series: Updated row with cleaned 'first_owner_name' and 'trust_names'.
    """
    for column in ('first_owner_name', 'trust_names'):
        row[column] = _TR_TOKEN.sub('', row[column]).strip()
    return row

def remove_et_al_DS(df):
    """
    Remove 'ET AL' from 'first_owner_name' and 'trust_names'.

    'ET AL' is matched only as whole words, so names that merely contain the
    letter sequence (e.g. "JANET ALLEN", "MARGARET ALVAREZ") are left intact.
    Whitespace directly before a removed 'ET AL' is removed with it, and the
    result is stripped.

    Parameters:
    df (pd.DataFrame): The DataFrame to process; both columns are overwritten
        in place.

    Returns:
    pd.DataFrame: Updated DataFrame with 'ET AL' removed.
    """
    # \b on both sides keeps the match from starting or ending inside a word
    et_al_pattern = r'\s*\bET AL\b'
    for column in ('first_owner_name', 'trust_names'):
        df[column] = df[column].str.replace(et_al_pattern, '', regex=True).str.strip()
    return df

def process_dataframe(df):
    """
    Run the full owner-name clean-up pipeline on a DataFrame.

    Steps, in order:
    1. Blank out 'Unknown' in the overflow / second-owner fields and
       concatenate the three name fields into 'first_owner_name'.
    2. Split off trust names and set the trust indicator.
    3. Flag and remove 'care of' markers.
    4. Remove 'TRS' and 'TR' from the owner and trust names.
    5. Remove 'ET AL' from the owner and trust names.
    6. Drop the overflow and second-owner columns.

    Parameters:
    df (pd.DataFrame): The DataFrame to process. The three new columns are
        added to this object before the row-wise steps run.

    Returns:
    pd.DataFrame: Fully processed DataFrame.
    """
    # Seed the output columns with their defaults
    for new_column, default in (
        ('trust_names', ""),
        ('trust_indicator', 0),
        ('care_of', 0),
    ):
        df[new_column] = default

    # Row-wise steps; the order matters because each one reads what the
    # previous one wrote.
    row_steps = (
        replace_unknown_and_concatenate,
        identify_trusts,
        process_care_of,
        remove_trs_tr,
    )
    for step in row_steps:
        df = df.apply(step, axis=1)

    # Column-wise clean-up of 'ET AL'
    cleaned = remove_et_al_DS(df)

    # The helper columns have been folded into 'first_owner_name'
    helper_columns = ['first_owner_name_overflow', 'second_owner_name']
    cleaned.drop(columns=helper_columns, inplace=True)

    return cleaned

In [13]:
# Run the owner-name processing pipeline on the secured basic data (DS)
DS = process_dataframe(DS)


In [14]:
# Write every dataset to NORMALIZED_DATA_DIR under its normalized file name
for _frame, _filename in (
    (SL, 'sales_list_normalized.csv'),
    (LR, 'local_roll_normalized.csv'),
    (DS, 'secured_basic_normalized.csv'),
    (HZ, 'hazards_normalized.csv'),
    (LU, 'land_use_normalized.csv'),
):
    _frame.to_csv(NORMALIZED_DATA_DIR / _filename)