## Data Exploration of Local Roll and Secured Basic File

In [1]:
import os
import sys
# Move the working directory to the parent of wherever the kernel currently is.
# NOTE(review): the target is computed from the *current* directory, so re-running
# this cell climbs one more level each time; run it once per kernel session.
# Presumably the parent is the project root that contains the `src` package — confirm.
current_directory = os.getcwd()
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
os.chdir(parent_directory)

In [2]:
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import norm
from src.paths import TRANSFORMED_DATA_DIR

# Set PD display options to show all columns and column width
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)

# Load the five transformed CSV extracts, in the same order as before:
# land use (LU), hazards (HZ), DS, local roll (LR) and sales list (SL).
LU, HZ, DS, LR, SL = (
    pd.read_csv(TRANSFORMED_DATA_DIR / csv_name)
    for csv_name in (
        'land_use.csv',
        'hazards_city.csv',
        'DS_transformed.csv',
        'local_roll_transformed.csv',
        'sales_list_transformed.csv',
    )
)

  LU = pd.read_csv(TRANSFORMED_DATA_DIR / 'land_use.csv')
  HZ = pd.read_csv(TRANSFORMED_DATA_DIR / 'hazards_city.csv')
  DS = pd.read_csv(TRANSFORMED_DATA_DIR / 'DS_transformed.csv')
  LR = pd.read_csv(TRANSFORMED_DATA_DIR / 'local_roll_transformed.csv')
  SL = pd.read_csv(TRANSFORMED_DATA_DIR / 'sales_list_transformed.csv')


In [3]:
# Column-name groups per dataset. The "*_unknown_object_columns" and
# "*_datetime_columns" lists are combined into exclusion sets further down in this cell.

# DS columns treated as free-text / categorical ("object") columns.
DS_unknown_object_columns = ['Fraction', 'Direction', 'Street Name', 'Unit', 'City State', 'M Fraction', 'M Direction', 'M Street Name', 'M Unit', 'M City State', 
'First Owner Name Overflow', 'Special Name Assessee', 'Second Owner Name', 'Use Code', 'BD1 Quality',
'BD2 Quality', 'BD3 Quality', 'BD4 Quality', 'BD5 Quality', 'BD1 Design', 'BD2 Design', 'BD3 Design', 
'BD4 Design', 'BD5 Design', 'Legal First Line', 'Legal Second Line', 'Legal Third Line', 'Legal Fourth Line',
'Legal Fifth Line', 'Legal Last Line', 'Land Reason Key', 'Impairment Key', 'First Transfree Overflow', 'Second Transfree Name']

# Sales-list (SL) columns treated as free-text / categorical ("object") columns.
SL_unknown_object_columns = [ 'Direction', 'street name', 'first owner', 'first owner tr', 'use code', 'zone', 'doc type', 'design ty', 'last sale1 ver', '2 verification', '3 verif'] 

# Local-roll (LR) columns treated as free-text / categorical ("object") columns.
LR_unknown_object_columns = ['First Owner Assessee Name', 'First Owner Assessee Name Overflow', 'Second Owner Assessee Name', 'Special Name Assessee', 'Situs Address Key', 'Situs Address Fraction', 'Situs Address Direction', 
'Situs Address Street Name', 'Situs Address City and State', 'Mail Address Key', 'Mail Address Unit', 'Situs Address Unit', 'Mail Address Direction', 'Mail Address Fraction', 'Mail Address Street Name', 'Mail Address City and State',
'Legal Description - Last LIne Narrative', 'Legal Description - Last LIne Lot', 'Legal Description - Last LIne Division', 'Legal Description - Last LIne Region', 'Legal Description Line One', 
'Legal Description Line Two', 'Legal Description Line Three', 'Legal Description Line Four', 'Legal Description Line Five', 'Zoning Code', 'Use Code']

# DS columns that hold dates.
DS_datetime_columns = ['Recording Date', 'Last Sale Date', 'Sale Two Date', 'Sale Three Date']

# SL columns that hold dates.
# NOTE(review): 'transferee' does not look like a date column by name — confirm it belongs here.
SL_datetime_columns = ['transferee', 'last 1 sale date', '2 dat', '3 sal da']

# LR columns that hold dates.
LR_datetime_columns = ['Recording Date', 'Situs Address date of Last Change', 'Mail Address date of Last Change']

# For each dataset, collect the columns that are neither datetime columns nor
# "unknown object" columns. The resulting lists come from a set difference, so
# their order is not meaningful.
DS_all_columns = set(DS.columns)
DS_excluded_columns = set(DS_datetime_columns).union(DS_unknown_object_columns)
DS_included_columns = list(DS_all_columns.difference(DS_excluded_columns))

LR_all_columns = set(LR.columns)
LR_excluded_columns = set(LR_datetime_columns).union(LR_unknown_object_columns)
LR_included_columns = list(LR_all_columns.difference(LR_excluded_columns))

SL_all_columns = set(SL.columns)
SL_excluded_columns = set(SL_datetime_columns).union(SL_unknown_object_columns)
SL_included_columns = list(SL_all_columns.difference(SL_excluded_columns))

In [4]:
# Value-recoding helpers for the DS dataset
def replace_DS_hazard_city_key(df, column, old_values, new_values):
    """
    Recode the hazard city key values in ``column``.

    ``old_values`` and ``new_values`` are handed to ``Series.replace`` as-is.
    The column is overwritten on ``df`` itself and that same frame is returned.
    """
    recoded = df[column].replace(old_values, new_values)
    df[column] = recoded
    return df

def replace_DS_special_name_legend(df, column, old_values, new_values):
    """
    Recode placeholder values in the special name legend column.

    Each entry of ``old_values`` found in ``df[column]`` is swapped for the
    matching entry of ``new_values``. ``df`` is modified in place and returned.
    """
    df[column] = df[column].replace(to_replace=old_values, value=new_values)
    return df

def replace_DS_doc_reason_code(df, column, old_values, new_values):
    """
    Recode placeholder values in the doc reason code column.

    Delegates to ``Series.replace(old_values, new_values)``; the result is
    written back onto ``df[column]`` and ``df`` is returned.
    """
    updated_column = df[column].replace(old_values, new_values)
    df[column] = updated_column
    return df

def replace_DS_tax_stat_key(df, column, old_values, new_values):
    """
    Recode placeholder values in the tax stat key column.

    ``old_values`` are replaced by the corresponding ``new_values`` via
    ``Series.replace``. The frame is updated in place and returned.
    """
    df[column] = df[column].replace(to_replace=old_values, value=new_values)
    return df

def replace_DS_exemption(df, column, old_values, new_values):
    """
    Recode null/unknown markers in the exemption column.

    Values listed in ``old_values`` become the matching ``new_values``.
    ``df`` is mutated (the column is reassigned) and returned.
    """
    exemption_values = df[column]
    df[column] = exemption_values.replace(old_values, new_values)
    return df

def drop_DS_ownership_code(df, column, old_values=None, new_values=None):
    """
    Drop ``column`` from ``df`` in place and return the same frame.

    Args:
        df (pd.DataFrame): Frame to modify.
        column (str or list of str): Column label(s) to remove.
        old_values: Unused. Kept only so the signature lines up with the
            ``replace_*`` helpers; defaults to ``None`` so the function can be
            called with just the frame and the column name.
        new_values: Unused, see ``old_values``.

    Returns:
        pd.DataFrame: ``df`` itself, without ``column``.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    df.drop(columns=column, inplace=True)
    return df

def replace_DS_pp_key(df, column, old_values, new_values):
    """
    Recode values in the pp key column.

    Thin wrapper over ``Series.replace``: the recoded column is stored back on
    ``df`` and ``df`` is returned.
    """
    pp_key = df[column].replace(old_values, new_values)
    df[column] = pp_key
    return df

# Value-recoding helpers for the LR dataset
def replace_LR_administrative_values(df, column, old_values, new_values):
    """
    Recode the administrative region values of the LR dataset.

    The caller supplies the string codes to look for (``old_values``) and their
    replacements (``new_values``); both go straight to ``Series.replace``.
    ``df`` is updated in place and returned.
    """
    df[column] = df[column].replace(to_replace=old_values, value=new_values)
    return df

def replace_LR_situs_address_key(df, column, old_values, new_values):
    """
    Recode values in the LR situs address key column.

    Every value in ``old_values`` is replaced by its counterpart in
    ``new_values``; the column on ``df`` is overwritten and ``df`` returned.
    """
    address_key = df[column]
    df[column] = address_key.replace(old_values, new_values)
    return df

def drop_LR_filler(df, column):
    """
    Drop the filler column from ``df`` in place and return the same frame.

    Args:
        df (pd.DataFrame): Frame to modify.
        column (str or list of str): Column label(s) to remove.

    Returns:
        pd.DataFrame: ``df`` itself, without ``column``.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    # `columns=` is required: a bare df.drop(column) looks for a *row* label.
    df.drop(columns=column, inplace=True)
    return df

# Value-recoding helpers for the Sales List (SL) dataset, used to clean up mixed-type columns
def replace_SL_administrative_values(df, column, old_values, new_values):
    """
    Recode the administrative region values of the SL dataset.

    ``old_values`` and ``new_values`` are forwarded to ``Series.replace``.
    The column is reassigned on ``df`` and ``df`` is returned.
    """
    region = df[column].replace(old_values, new_values)
    df[column] = region
    return df

def replace_SL_dtt_type(df, column, old_values, new_values):
    """
    Recode placeholder values in the SL dtt type column.

    Uses ``Series.replace(old_values, new_values)`` and writes the result back
    to ``df[column]``; returns ``df``.
    """
    df[column] = df[column].replace(to_replace=old_values, value=new_values)
    return df

def replace_SL_key(df, column, old_values, new_values):
    """
    Recode placeholder values in the SL key column.

    Values in ``old_values`` are replaced by the corresponding ``new_values``.
    ``df`` is modified in place and returned.
    """
    key_values = df[column]
    df[column] = key_values.replace(old_values, new_values)
    return df

def replace_SL_design_ty(df, column, old_values, new_values):
    """
    Recode values in the SL design type column.

    Thin wrapper over ``Series.replace``; the recoded column replaces
    ``df[column]`` and ``df`` is returned.
    """
    design = df[column].replace(old_values, new_values)
    df[column] = design
    return df

def replace_SL_verification(df, columns, old_values, new_values):
    """
    Recode placeholder values in several verification columns at once.

    For every name in ``columns``, ``Series.replace(old_values, new_values)`` is
    applied and the result stored back on ``df``. Returns ``df``.
    """
    for name in columns:
        recoded = df[name].replace(old_values, new_values)
        df[name] = recoded
    return df


In [5]:
# --- DS: recode letter keys and replace NaN / 'ÿ' placeholders with fixed codes ---
DS = replace_DS_hazard_city_key(DS, 'Hazard City Key', ['A', 'B', 'C', 'D', np.nan], [10, 11, 12, 13, 0])
DS = replace_DS_special_name_legend(DS, 'Special Name Legend', [np.nan, 'ÿÿÿÿÿ'], ['Unknown', 'Unknown'])
DS = replace_DS_doc_reason_code(DS, 'Doc Reason Code', [np.nan, 'ÿ'], ['0', '0'])
DS = replace_DS_tax_stat_key(DS, 'Tax Stat Key', [np.nan, 'ÿ'], ['4', '4'])
DS = replace_DS_exemption(DS, 'Exemption Type', [np.nan, 'ÿ'], ['10', '10'])
# The two calls below are disabled.
# NOTE(review): drop_DS_ownership_code is declared with four required parameters,
# so the two-argument call would fail if re-enabled as written.
#DS = drop_DS_ownership_code(DS, 'Ownership Code')
#DS = replace_DS_pp_key(DS, 'PP Key', [np.nan, 'ÿ'], [0, 0])
# Constant geography labels added to every row.
DS['county_name'] = 'Los Angeles'

DS['state_name'] = 'California'

# --- LR: same treatment for the local roll ---
LR = replace_LR_administrative_values(LR, 'Administrative Region Number', ['A1', 'B1', np.nan], [1, 21, 0])
LR = replace_LR_situs_address_key(LR, 'Situs Address Key', [np.nan], ['O'])
LR['county_name'] = 'Los Angeles'
LR['state_name'] = 'California'

# --- SL: same treatment for the sales list ---
SL = replace_SL_administrative_values(SL, 'Administrative Region', ['A1', 'B1', 'ÿÿ', np.nan], [1, 21, 99, 99])
SL = replace_SL_dtt_type(SL, 'dtt type', [np.nan, 'nan', 'ÿ'], [0, 0, 0])
SL = replace_SL_key(SL, 'Key', ['ÿ'], [0])
SL = replace_SL_design_ty(SL, 'design ty', [np.nan], [0])
# NOTE(review): 'ÿ' maps to '0' but NaN maps to '0S' — '0S' looks like a typo; confirm the intended code.
SL = replace_SL_verification(SL, ['last sale1 ver', '2 verification', '3 verif'], ['ÿ', np.nan], ['0','0S'])
SL['county_name'] = 'Los Angeles'
SL['state_name'] = 'California'

# Keep only the Los Angeles County rows of HZ and LU, then tag them with the state.
# `.copy()` makes each filtered result an independent frame, so the column
# assignment that follows does not write into a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
HZ = HZ.loc[HZ['COUNTY_NAME'] == 'Los Angeles'].copy()
HZ['state_name'] = 'California'

LU = LU.loc[LU['COUNTY_NAME'] == 'Los Angeles'].copy()
LU['state_name'] = 'California'


### Map the categorical columns to their respective values.

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and reads the
# snake_case columns 'impairment_key', 'tax_stat_key' and 'city'. Those names
# match what clean_column_names() produces further down, so the cell presumably
# has to run after the rename/clean cells — confirm its intended position.
# NOTE(review): the star import hides where impairment_key_dict and
# tax_status_dict come from; presumably both are defined in src.const.
from src.const import *
# Create a new column where the impairment key is replaced with the impairment description
DS['impairment_description'] = DS['impairment_key'].map(impairment_key_dict)
DS['tax_status'] = DS['tax_stat_key'].map(tax_status_dict)

# Hard-coded constants written to every row (same value for all parcels).
DS['residential_zone_price'] = 800000
DS['commercial_zone_price'] = 1000000
DS['industrial_zone_price'] = 1200000
DS['mixed_use_zone_price'] = 900000

# Market comparable: placeholder values until real data is available.
DS['market_comparable_city'] = DS['city']
DS['market_comparable'] = 'Above Average for Single Family Homes'

# More hard-coded constants, again identical for every row.
DS['average_unit_rent'] = 2250
DS['average_single_family_sale'] = 900000  
DS['average_multi_family_sale'] = 1300000
DS['average_commercial_sale'] = 2000000
DS['average_industrial_sale'] = 3000000
DS['average_condominium_sale'] = 500000

In [7]:
# Second name for the same DataFrame object (no copy is made): in-place changes
# to DS are visible through temp_df until DS is rebound to a new frame.
# NOTE(review): if a snapshot was intended, this needs DS.copy() — confirm.
temp_df = DS

In [8]:
import pandas as pd
import numpy as np

def clean_and_convert_strings(df, columns):
    """
    Normalise the given columns to strings, with 'Unknown' standing in for missing data.

    For each listed column that exists in ``df``: every run of 'ÿ' inside a
    string value is replaced by 'Unknown', nulls are filled with 'Unknown', and
    the column is cast to ``str``. Columns that are not in ``df`` are reported
    with a printed warning and skipped.

    Args:
        df (pd.DataFrame): The dataframe to process (modified in place).
        columns (list of str): Names of the columns to treat as strings.

    Returns:
        pd.DataFrame: The same dataframe, with the listed columns processed.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue
        # The regex matches one or more consecutive 'ÿ' characters anywhere in a value.
        without_filler = df[name].replace(to_replace=r'ÿ+', value='Unknown', regex=True)
        filled = without_filler.fillna('Unknown')
        df[name] = filled.astype(str, errors='ignore')
    return df

def convert_to_datetime(df, datetime_columns):
    """
    Converts specified columns from YYYYMMDD values to datetime, using
    1970-01-01 as the placeholder for zero, missing or unparseable entries.

    Handles three input situations per column:
      * text or integer values such as '20090529' / 20090529;
      * float values such as 20090529.0 (what pandas produces when an integer
        column contains NaN) — the trailing '.0' is stripped before parsing;
      * columns that are already datetime — these are only normalised, so
        running the function twice does not wipe previously parsed dates.

    Columns that are not in ``df`` are reported with a printed warning and
    skipped, matching the other conversion helpers.

    Args:
        df (pd.DataFrame): The DataFrame to process (modified in place).
        datetime_columns (list): List of columns to convert to datetime.

    Returns:
        pd.DataFrame: The DataFrame with the datetime columns processed.
    """
    placeholder = pd.Timestamp('1970-01-01')
    # String forms of "no date" that should map straight to the placeholder.
    missing_tokens = ['0', 'nan', 'NaT', 'NaN']

    for column in datetime_columns:
        if column not in df.columns:
            print(f"Warning: Column '{column}' not found in DataFrame.")
            continue

        if pd.api.types.is_datetime64_any_dtype(df[column]):
            # Already parsed: re-parsing its string form with '%Y%m%d' would fail.
            parsed = df[column].dt.normalize()
        else:
            as_text = df[column].astype(str).str.replace(r'\.0$', '', regex=True)
            as_text = as_text.replace(missing_tokens, '19700101')
            # Anything that still does not match YYYYMMDD becomes NaT here...
            parsed = pd.to_datetime(as_text, format='%Y%m%d', errors='coerce').dt.normalize()

        # ...and is replaced by the placeholder date.
        df[column] = parsed.fillna(placeholder)

    return df

def convert_to_int(df, columns):
    """
    Cast the given columns to int64, using 0 for anything that is not numeric.

    Values are first parsed with ``pd.to_numeric(errors='coerce')``; entries that
    cannot be parsed (and nulls) become 0, and the result is cast to ``int64``
    (so fractional parts are dropped by the cast). Columns missing from ``df``
    are reported with a printed warning and skipped.

    Args:
        df (pd.DataFrame): The dataframe to process (modified in place).
        columns (list of str): Names of the columns to treat as integers.

    Returns:
        pd.DataFrame: The same dataframe, with the listed columns processed.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue
        numeric = pd.to_numeric(df[name], errors='coerce')
        df[name] = numeric.fillna(0).astype('int64')
    return df

def convert_to_float(df, columns):
    """
    Cast the given columns to float64, using 0 for anything that is not numeric.

    Values are parsed with ``pd.to_numeric(errors='coerce')``; entries that
    cannot be parsed (and nulls) become 0 before the cast to ``float64``.
    Columns missing from ``df`` are reported with a printed warning and skipped.

    Args:
        df (pd.DataFrame): The dataframe to process (modified in place).
        columns (list of str): Names of the columns to treat as floats.

    Returns:
        pd.DataFrame: The same dataframe, with the listed columns processed.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue
        numeric = pd.to_numeric(df[name], errors='coerce')
        df[name] = numeric.fillna(0).astype('float64')
    return df

def detect_column_types_from_dict(df, column_types_dict):
    """
    Selects the target data types for the columns that actually exist in ``df``.

    The provided dictionary is the single source of truth for the target type
    of each column: a column present in ``df`` is always reported with the type
    given in ``column_types_dict``, regardless of its current dtype or of the
    Python types of its values. No per-value inspection of the data is done.
    Columns that are listed in the dictionary but absent from ``df`` are
    reported with a printed warning and left out of the result.

    Args:
        df (pd.DataFrame): The DataFrame to check column names against.
        column_types_dict (dict): A dictionary mapping column names to their desired data types.

    Returns:
        dict: ``{column: desired_type}`` for every dictionary entry whose column
        exists in ``df``, in the dictionary's order.
    """
    column_types = {}
    for column, expected_type in column_types_dict.items():
        if column in df.columns:
            column_types[column] = expected_type
        else:
            print(f"Warning: Column '{column}' not found in DataFrame.")
    return column_types

def apply_column_conversions(df, column_types):
    """
    Route each column to the converter that matches its target type.

    Columns are grouped by target type ('object', 'int64', 'float64',
    'datetime64'); any other target type is ignored. The non-empty groups are
    then converted in that fixed order: strings, ints, floats, datetimes.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_types (dict): A dictionary mapping column names to their desired data types.

    Returns:
        pd.DataFrame: The DataFrame returned by the last converter that ran
        (or ``df`` unchanged when no column matched a supported type).
    """
    # Insertion order of this dict is also the order the converters run in.
    grouped = {'object': [], 'int64': [], 'float64': [], 'datetime64': []}

    for name, target in column_types.items():
        for supported_type, bucket in grouped.items():
            if target == supported_type:
                bucket.append(name)
                break

    if grouped['object']:
        df = clean_and_convert_strings(df, grouped['object'])
    if grouped['int64']:
        df = convert_to_int(df, grouped['int64'])
    if grouped['float64']:
        df = convert_to_float(df, grouped['float64'])
    if grouped['datetime64']:
        df = convert_to_datetime(df, grouped['datetime64'])

    return df

def clean_column_names(df):
    """
    Normalise the column labels of ``df`` in place.

    Dashes are deleted, the labels are lower-cased and every space becomes an
    underscore. Because dashes are removed rather than converted, a label such
    as 'A - B' ends up as 'a__b' (two underscores).

    Args:
        df (pd.DataFrame): The DataFrame whose columns are renamed.

    Returns:
        pd.DataFrame: The same DataFrame, with cleaned column names.
    """
    cleaned_labels = (
        df.columns
        .str.replace('-', '', regex=False)
        .str.lower()
        .str.replace(' ', '_', regex=False)
    )
    df.columns = cleaned_labels
    return df


In [9]:
from src.const import secured_roll_types, sales_list_types, local_roll_types, hazard_column_types, land_use_column_types

# Convert each frame's columns to the target dtypes declared in src.const.
# For every dataset: first build the {column: target type} mapping for the columns
# that exist in the frame, then run the matching converters on the frame.
column_types_DS = detect_column_types_from_dict(DS, secured_roll_types)
DS = apply_column_conversions(DS, column_types_DS)

column_types_LR = detect_column_types_from_dict(LR, local_roll_types)
LR = apply_column_conversions(LR, column_types_LR)

column_types_SL = detect_column_types_from_dict(SL, sales_list_types)
SL = apply_column_conversions(SL, column_types_SL)

column_types_HZ = detect_column_types_from_dict(HZ, hazard_column_types)
HZ = apply_column_conversions(HZ, column_types_HZ)

column_types_LU = detect_column_types_from_dict(LU, land_use_column_types)
LU = apply_column_conversions(LU, column_types_LU)

In [10]:
# rename columns, add a county column, and drop columns that are not needed
DS_columns_to_drop = ['Special Name Legend', 'Ownership Code', 'Special Name Assessee', 'Hazard City Key', 'Hazard Info', 'Fraction', 'Direction','M Fraction', 'M Direction', 'M Street Name', 'M Unit', 'M City State', 'M Zip', 'Mail House No']
DS_columns_to_rename = ['ain', 'City State', 'TRA','BD1 Subpart', 'BD1 Design', 'BD1 Quality', 'BD1 Year Built', 'BD1 Units', 'BD1 Bedrooms', 'BD1 Baths', 'BD1 Square Feet', 'BD2 Subpart', 'BD2 Design', 'BD2 Quality', 
                        'BD2 Year Built', 'BD2 Units', 'BD2 Bedrooms', 'BD2 Baths', 'BD2 Square Feet', 'BD3 Subpart', 'BD3 Design', 'BD3 Quality', 'BD3 Year Built', 'BD3 Units', 'BD3 Bedrooms', 'BD3 Baths', 
                        'BD3 Square Feet', 'BD4 Subpart', 'BD4 Design','BD4 Quality', 'BD4 Year Built', 'BD4 Units', 'BD4 Bedrooms', 'BD4 Baths', 'BD4 Square Feet', 'BD5 Subpart', 'BD5 Design', 'BD5 Quality', 'BD5 Year Built',
                        'BD5 Units', 'BD5 Bedrooms', 'BD5 Baths', 'BD5 Square Feet''BD1 Year Change', 'BD1 Unit Cost', 'BD1 RCN Main', 'BD2 Year Change', 'BD2 Unit Cost', 'BD2 RCN Main', 'BD3 Year Change', 'BD3 Unit Cost', 'BD3 RCN Main', 'BD4 Year Change', 
                        'BD4 Unit Cost', 'BD4 RCN Main', 'BD5 Year Change', 'BD5 Unit Cost', 'BD5 RCN Main']

DS_renamed_columns = ['assessor_identification_number', 'City', 'taxrate_area', 'Building Subpart 1', 'Building Design 1', 'Building Quality 1', 'Building Year Built 1', 'Building Units 1', 'Building Bedrooms 1', 'Building Baths 1', 'Building Square Feet 1', 'Building Subpart 2', 'Building Design 2', 'Building Quality 2',
                      'Building Year Built 2', 'Building Units 2', 'Building Bedrooms 2', 'Building Baths 2', 'Building Square Feet 2', 'Building Subpart 3', 'Building Design 3', 'Building Quality 3', 'Building Year Built 3', 'Building Units 3', 'Building Bedrooms 3', 'Building Baths 3',
                      'Building Square Feet 3', 'Building Subpart 4', 'Building Design 4', 'Building Quality 4', 'Building Year Built 4', 'Building Units 4', 'Building Bedrooms 4', 'Building Baths 4', 'Building Square Feet 4', 'Building Subpart 5', 'Building Design 5', 'Building Quality 5'
                      'Building Year Built 5', 'Building Units 5', 'Building Bedrooms 5', 'Building Baths 5', 'Building Square Feet 5', 'Building Year Change 1', 'Building Unit Cost 1', 'Building RCN Main 1', 'Building Year Change 2', 'Building Unit Cost 2', 'Building RCN Main 2',
                      'Building Year Change 3', 'Building Unit Cost 3', 'Building RCN Main 3', 'Building Year Change 4', 'Building Unit Cost 4', 'Building RCN Main 4', 'Building Year Change 5', 'Building Unit Cost 5', 'Building RCN Main 5']

# Sales list (SL): columns to drop, and parallel old-name / new-name lists for renaming.
SL_columns_to_drop = ['fill', 'Direction', 'dttt amount']
# NOTE(review): 'bdl1 eff y' is listed twice (positions 2 and 5). When the two lists
# are zipped into a dict the later pair wins, so it ends up as
# 'Building Effective Year 1' and 'Building Effective Year' is never used — confirm
# whether one of the two entries was meant to be a different column.
SL_columns_to_rename = ['first owner tr', 'bdl1 eff y', 'asss rec', 'bdl1 yb', 'bdl1 eff y', 'bdl1 bath', 'bdl1 bed', 'design ty', 'bdl1 sq ft', 'last sale1 ver', 'last 1 sale date', 'last 1 amount', '2 verification', '2 dat', '2 amoun', '3 verif', '3 sal da', '3 sal amou']

SL_renamed_columns = ['First Owner Transfer', 'Building Effective Year', 'Assessee Recording Date', 'Building Year Built 1', 'Building Effective Year 1', 'Building Baths 1', 'Building Bedrooms 1', 'Design Type', 'Building Square Feet', 'Last Sale Verification', 'Last Sale Date', 'Last Sale Amount', 'Second Sale Verification', 'Second Sale Date', 'Second Sale Amount', 'Third Sale Verification', 'Third Sale Date', 'Third Sale Amount']

# Local roll (LR): columns to drop. This is the list applied to LR at the end of this cell.
LR_columns_to_drop = ['Gross Personal Property Key', 'Mail Address Fraction', 'Mail Address Direction', 'Mail Address Postal City Code', 'Situs Address Fraction', 'Situs Address Direction', 'Filler']

# Alternative LR drop list that also removes the remaining mail-address columns.
# NOTE(review): the "_2" lists are not referenced anywhere in the visible part of
# this notebook — confirm whether they are still needed.
LR_columns_to_drop_2 = ['Gross Personal Property Key', 'Mail Address Unit', 'Mail Address Zip Code', 'Mail Address Street Name', 'Mail Address City and State', 'Mail Address Key', 
                      'Mail Address date of Last Change', 'Mail Address Postal City Code', 
                      'Mail Address House Number', 'Mail Address Fraction', 'Mail Address Direction',
                      'Situs Address Fraction', 'Situs Address Direction', 'Filler']

# LR old names, positionally paired with LR_renamed_columns below.
# Some entries (e.g. 'Situs Address Fraction') are also in LR_columns_to_drop above.
LR_columns_to_rename = ['Mail Address date of Last Change', 
                        'Mail Address House Number', 'Mail Address Key' ,'Mail Address City and State', 'Mail Address Street Name', 'Mail Address Zip Code', 'Mail Address Unit', 'Situs Address Key', 'Situs Address date of Last Change', 'Situs Address Postal City Code', 'Situs Address House Number',
                         'Situs Address Fraction', 'Situs Address Direction', 'Situs Address Unit', 'Situs Address Zip Code', 'Situs Address Street Name', 'Situs Address City and State',
                         'Building - Square Feet - Main', 'Legal Description - Last LIne Narrative', 'Legal Description - Last LIne Lot', 'Legal Description - Last LIne Division',	'Legal Description - Last LIne Region']

# Situs-only variant of the old-name list (pairs with LR_renamed_columns_2).
LR_columns_to_rename_2 = ['Situs Address Key', 'Situs Address date of Last Change', 'Situs Address Postal City Code', 'Situs Address House Number',
                         'Situs Address Fraction', 'Situs Address Direction', 'Situs Address Unit', 'Situs Address Zip Code', 'Situs Address Street Name', 'Situs Address City and State',
                         'Building - Square Feet - Main', 'Legal Description - Last LIne Narrative', 'Legal Description - Last LIne Lot', 'Legal Description - Last LIne Division',	'Legal Description - Last LIne Region']

# LR new names, positionally paired with LR_columns_to_rename.
LR_renamed_columns = ['Mail Address Date of Last Change', 'Mail House Number', 'Mail Address Key' , 'Mail City', 'Mail Street Name' ,'Mail Zip Code', 'Mail Unit', 'Address Key', 'Address Date of Last Change', 'Postal City Code', 'Situs House Number', 'Address Fraction', 'Address Direction', 'Unit', 'Zip', 'Street Name', 'City', 'Main Building Square Feet', 'Legal Description Narrative', 'Legal Description Lot', 'Legal Description Division', 'Legal Description Region']

# New names for the situs-only variant (pairs with LR_columns_to_rename_2).
LR_renamed_columns_2 = ['Address Key', 'Address Date of Last Change', 'Postal City Code', 'Situs House Number', 'Address Fraction', 'Address Direction', 'Unit', 'Zip', 'Street Name', 'City', 'Main Building Square Feet', 'Legal Description Narrative', 'Legal Description Lot', 'Legal Description Division', 'Legal Description Region']

# Land use (LU): columns to drop. The hazard/area flag names listed here also
# appear in HZ_columns_to_boolean later in the notebook.
LU_columns_to_drop = ['OID_', 'RI_PARCEL_ID', 'APN_RAW_2019', 'RI_DEM_GEO_ID_20', 'RI_UNIQUE_PARCEL_ID_2016', 'LAND_USE_2016', 
 'FIRE_HAZARD', 'SEARISE_1_METER', 'SEARISE_2_METER', 'FLOOD_PLAIN_ZONE', 'EQUAKE_ZONE', 'LIQUAFACTION_ZONE', 
 'LANDSLIDE_ZONE', 'PROTECTED_AREA', 'RIVER_WETLAND_AREA', 'WILDLIFE_AREA', 'CNDDB_RARE_SPECIES_AREA', 
 'HABITAT_RESERVE_AREA', 'WETLAND_AREA', 'URBANIZED_AREA']

# Hazards (HZ): columns to drop.
HZ_columns_to_drop = ['APN_RAW_2019']


# Drop the unwanted columns of each frame, then rename using the old-name -> new-name
# mapping obtained by pairing the two parallel lists position by position.
# (drop raises if a listed column is absent; rename silently ignores absent names.)
DS = (
    DS
    .drop(columns=DS_columns_to_drop)
    .rename(columns=dict(zip(DS_columns_to_rename, DS_renamed_columns)))
)

SL = (
    SL
    .drop(columns=SL_columns_to_drop)
    .rename(columns=dict(zip(SL_columns_to_rename, SL_renamed_columns)))
)

LR = (
    LR
    .drop(columns=LR_columns_to_drop)
    .rename(columns=dict(zip(LR_columns_to_rename, LR_renamed_columns)))
)

# HZ and LU only have columns removed; nothing is renamed here.
HZ = HZ.drop(columns=HZ_columns_to_drop)

LU = LU.drop(columns=LU_columns_to_drop)


In [11]:
# Normalise the column labels of all five frames (lower-case, spaces -> underscores,
# dashes removed). From here on, columns are referred to by their snake_case names.
DS = clean_column_names(DS)
SL = clean_column_names(SL)
LR = clean_column_names(LR)
HZ = clean_column_names(HZ)
LU = clean_column_names(LU)

In [12]:
# Preview the local roll after dropping, renaming and label clean-up.
# NOTE(review): the rendered rows include owner names and street addresses —
# consider clearing this output before sharing the notebook.
LR.head()

Unnamed: 0,assessor_identification_number,taxrate_area,administrative_region_number,common_area_key,year_sold_to_state,recording_date,land_value,improvement_value,exemption_claim_type_key,gross_personal_property_value,fixture_value,real_estate_exemption,personal_property_exemption,fixture_exemption_value,homeowner's_exemption,first_owner_assessee_name,first_owner_assessee_name_overflow,second_owner_assessee_name,special_name_assessee,address_key,address_date_of_last_change,postal_city_code,situs_house_number,unit,zip,street_name,city,mail_address_key,mail_address_date_of_last_change,mail_house_number,mail_unit,mail_zip_code,mail_street_name,mail_city,legal_description_narrative,legal_description_lot,legal_description_division,legal_description_region,legal_description_line_one,legal_description_line_two,legal_description_line_three,legal_description_line_four,legal_description_line_five,zoning_code,use_code,effective_year,year_built,main_building_square_feet,county_name,state_name
0,2004001003,16,2,0,0,2009-05-29,697735,295271,0,0,0,0,0,0,0,"PATEL,HANISH B AND ALPA P TRS",PATEL TRUST,Unknown,Unknown,D,1993-07-01,15,8321,Unknown,913040000,FAUST AVE,LOS ANGELES CA,F,2009-06-23,8321,Unknown,913040000,FAUST AVE,WEST HILLS CA,TRACT NO 25040,99,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,101,1973,1973,2090,Los Angeles,California
1,2004001004,16,2,0,0,2021-12-01,363273,250862,0,0,0,0,0,0,0,"ALI,SYED SHAH AND DILRUBA S TRS",ALI FAMILY TRUST,Unknown,Unknown,D,1993-07-01,15,8313,Unknown,913040000,FAUST AVE,LOS ANGELES CA,F,2022-01-11,8313,Unknown,913040000,FAUST AVE,WEST HILLS CA,TRACT NO 25040,100,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,101,1973,1973,2479,Los Angeles,California
2,2004001005,16,2,0,0,2017-08-18,516040,194684,0,0,0,0,0,0,0,"POPAL,ARINA AND",Unknown,"POPAL,MALIHA",Unknown,D,1993-07-01,15,8309,Unknown,913040000,FAUST AVE,LOS ANGELES CA,F,2017-10-10,8309,Unknown,913040000,FAUST AVE,WEST HILLS CA,TRACT NO 25040,101,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,100,1973,1973,2057,Los Angeles,California
3,2004001008,16,2,0,0,1979-07-02,125903,217613,0,0,0,0,0,0,7000,"SZABO,GEORGE AND JOY",Unknown,Unknown,Unknown,D,1993-07-01,15,8325,Unknown,913040000,MAYNARD AVE,LOS ANGELES CA,F,1993-07-01,8325,Unknown,913040000,MAYNARD AVE,WEST HILLS CA,*TR=30333,1,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE11,101,1978,1978,2423,Los Angeles,California
4,2004001009,16,2,0,0,1984-06-19,137190,205895,0,0,0,0,0,0,7000,"CAPUTO,DONATO AND IMMACOLATA",Unknown,Unknown,Unknown,D,1993-07-01,15,8311,Unknown,913040000,MAYNARD AVE,LOS ANGELES CA,F,1993-07-01,8311,Unknown,913040000,MAYNARD AVE,CANOGA PARK CA,*TR=30333,2,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE11,101,1978,1978,2226,Los Angeles,California


In [15]:
# Give every frame the same parcel-identifier column name:
# 'apn' in HZ and LU, 'ain' in DS -> 'assessor_identification_number'.
for frame, old_name in ((HZ, 'apn'), (LU, 'apn'), (DS, 'ain')):
    frame.rename(columns={old_name: 'assessor_identification_number'}, inplace=True)

# Keep only the first 5 characters of each ZIP value and store the result as int64.
for frame, zip_column in ((DS, 'zip'), (LR, 'zip'), (LR, 'mail_zip_code')):
    frame[zip_column] = frame[zip_column].astype(str).str[:5].astype('int64')

### Find a way to handle the duplicate data in the dataset.

In [16]:
# Collect, per frame, every row whose assessor_identification_number occurs more than
# once (keep=False flags all occurrences, not just the repeats).
DS_duplicates = DS.loc[DS.duplicated(subset=['assessor_identification_number'], keep=False)]
SL_duplicates = SL.loc[SL.duplicated(subset=['assessor_identification_number'], keep=False)]
LR_duplicates = LR.loc[LR.duplicated(subset=['assessor_identification_number'], keep=False)]
HZ_duplicates = HZ.loc[HZ.duplicated(subset=['assessor_identification_number'], keep=False)]
LU_duplicates = LU.loc[LU.duplicated(subset=['assessor_identification_number'], keep=False)]

In [17]:
# De-duplicate DS, SL, LR, HZ and LU on assessor_identification_number.
# The first occurrence of each identifier is kept; later rows with the same
# identifier are removed, whatever their other column values are.
# Each frame is modified in place.
DS.drop_duplicates(subset=['assessor_identification_number'], keep='first', inplace=True)
SL.drop_duplicates(subset=['assessor_identification_number'], keep='first', inplace=True)
LR.drop_duplicates(subset=['assessor_identification_number'], keep='first', inplace=True)
HZ.drop_duplicates(subset=['assessor_identification_number'], keep='first', inplace=True)
LU.drop_duplicates(subset=['assessor_identification_number'], keep='first', inplace=True)

### Validate that every assessor_identification_number is unique and exactly 10 digits long.

In [18]:
def validate_ain(df, column_name='assessor_identification_number'):
    """
    Keep only the rows whose identifier is exactly 10 characters long.

    The length is measured on the string form of each value, so an integer
    identifier must have exactly 10 digits. Uniqueness is not checked here.
    The input frame is left untouched: no helper column is added to it.

    Args:
        df (pd.DataFrame): Frame to filter.
        column_name (str): Name of the identifier column.

    Returns:
        pd.DataFrame: An independent copy containing only the rows that pass
        the length check, with the same columns as ``df``.
    """
    has_ten_characters = df[column_name].astype(str).str.len() == 10
    return df[has_ten_characters].copy()

# Filter every frame down to the rows with a 10-character identifier.
DS = validate_ain(DS)
LR = validate_ain(LR)
SL = validate_ain(SL)
HZ = validate_ain(HZ)
LU = validate_ain(LU)

#### Convert all binary (0/1) columns to lowercase 'true'/'false' strings.

In [19]:
# Candidate 0/1 flag columns (snake_case names) in the hazards frame (HZ).
HZ_columns_to_boolean = ['fire_hazard', 'searise_1_meter', 'searise_2_meter', 'flood_plain_zone', 'equake_zone', 'liquafaction_zone', 'landslide_zone', 'protected_area', 'river_wetland_area', 'wildlife_area', 'cnddb_rare_species_area', 'habitat_reserve_area', 'wetland_area', 'urbanized_area']
# Candidate 0/1 flag columns (snake_case names) in the land-use frame (LU).
LU_columns_to_boolean = ['high_quality_transit_area', 'job_center', 'neighborhood_mobility_area', 'absolute_constraint', 'variable_constraint', 'environment_justice_area', 
                         'disadvantaged_community_area', 'community_of_concern', 'adu_space_possibility', 'setback_reduction_adu', 'small_adu_possibility', 'parking_exemption_adu', 
                         'setback_small_adu',	'setback_parking_adu',	'small_parking_adu', 'setback_small_parking_adu']

def convert_binary_to_boolean(df, columns_to_check=None):
    """
    Turn 0/1 columns into lowercase 'true'/'false' string columns.

    A column is converted only when every one of its values is 0 or 1; any
    other column (including one that was already converted) is left as is.
    Columns named in ``columns_to_check`` but absent from ``df`` are reported
    with a printed warning and skipped.

    Args:
        df (pd.DataFrame): Frame to process (modified in place).
        columns_to_check (list of str, optional): Columns to examine. When
            ``None``, every column of ``df`` is examined.

    Returns:
        pd.DataFrame: The same frame, with the binary columns converted.
    """
    if columns_to_check is None:
        columns_to_check = list(df.columns)

    for col in columns_to_check:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in DataFrame.")
            continue
        if df[col].isin([0, 1]).all():
            # 1 -> 'true', 0 -> 'false'
            df[col] = df[col].eq(1).map({True: 'true', False: 'false'})

    return df

# Convert the flag columns of each frame, using that frame's own column list.
LU = convert_binary_to_boolean(LU, LU_columns_to_boolean)
HZ = convert_binary_to_boolean(HZ, HZ_columns_to_boolean)

In [26]:
#DS.drop(columns=['fixture_val', 'pp_key', 'doc_reason_code', 'm_direction'], inplace=True)

### Remove special characters from the dataset.

In [21]:
import re

# Replace special characters in a single value
def remove_special_characters(text):
    """
    Replace every character that is not a letter, digit or whitespace with a space.

    Non-string values (numbers, None, NaN, ...) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[^A-Za-z0-9\s]', ' ', text)

def clean_non_numeric_columns(df):
    """
    Apply remove_special_characters to every object-dtype column of the dataframe.

    The dataframe is modified in place and returned. If a column fails to
    process, the error is printed and the remaining columns are still handled.
    """
    for name in df.select_dtypes(include=['object']).columns:
        try:
            df[name] = df[name].map(remove_special_characters)
        except Exception as e:
            print(f"Error processing column {name}: {e}")

    return df

# Replace special characters with spaces in the object (text) columns of LR, DS and SL.
# HZ and LU are not passed through this step.
LR = clean_non_numeric_columns(LR)
DS = clean_non_numeric_columns(DS)
SL = clean_non_numeric_columns(SL)

### Normalize the datetime columns by splitting them into year, month, and day columns.

In [22]:
LR.head()

Unnamed: 0,assessor_identification_number,taxrate_area,administrative_region_number,common_area_key,year_sold_to_state,recording_date,land_value,improvement_value,exemption_claim_type_key,gross_personal_property_value,fixture_value,real_estate_exemption,personal_property_exemption,fixture_exemption_value,homeowner's_exemption,first_owner_assessee_name,first_owner_assessee_name_overflow,second_owner_assessee_name,special_name_assessee,address_key,address_date_of_last_change,postal_city_code,situs_house_number,unit,zip,street_name,city,mail_address_key,mail_address_date_of_last_change,mail_house_number,mail_unit,mail_zip_code,mail_street_name,mail_city,legal_description_narrative,legal_description_lot,legal_description_division,legal_description_region,legal_description_line_one,legal_description_line_two,legal_description_line_three,legal_description_line_four,legal_description_line_five,zoning_code,use_code,effective_year,year_built,main_building_square_feet,county_name,state_name
0,2004001003,16,2,0,0,2009-05-29,697735,295271,0,0,0,0,0,0,0,PATEL HANISH B AND ALPA P TRS,PATEL TRUST,Unknown,Unknown,D,1993-07-01,15,8321,Unknown,91304,FAUST AVE,LOS ANGELES CA,F,2009-06-23,8321,Unknown,91304,FAUST AVE,WEST HILLS CA,TRACT NO 25040,99,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,101,1973,1973,2090,Los Angeles,California
1,2004001004,16,2,0,0,2021-12-01,363273,250862,0,0,0,0,0,0,0,ALI SYED SHAH AND DILRUBA S TRS,ALI FAMILY TRUST,Unknown,Unknown,D,1993-07-01,15,8313,Unknown,91304,FAUST AVE,LOS ANGELES CA,F,2022-01-11,8313,Unknown,91304,FAUST AVE,WEST HILLS CA,TRACT NO 25040,100,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,101,1973,1973,2479,Los Angeles,California
2,2004001005,16,2,0,0,2017-08-18,516040,194684,0,0,0,0,0,0,0,POPAL ARINA AND,Unknown,POPAL MALIHA,Unknown,D,1993-07-01,15,8309,Unknown,91304,FAUST AVE,LOS ANGELES CA,F,2017-10-10,8309,Unknown,91304,FAUST AVE,WEST HILLS CA,TRACT NO 25040,101,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE9,100,1973,1973,2057,Los Angeles,California
3,2004001008,16,2,0,0,1979-07-02,125903,217613,0,0,0,0,0,0,7000,SZABO GEORGE AND JOY,Unknown,Unknown,Unknown,D,1993-07-01,15,8325,Unknown,91304,MAYNARD AVE,LOS ANGELES CA,F,1993-07-01,8325,Unknown,91304,MAYNARD AVE,WEST HILLS CA,TR 30333,1,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE11,101,1978,1978,2423,Los Angeles,California
4,2004001009,16,2,0,0,1984-06-19,137190,205895,0,0,0,0,0,0,7000,CAPUTO DONATO AND IMMACOLATA,Unknown,Unknown,Unknown,D,1993-07-01,15,8311,Unknown,91304,MAYNARD AVE,LOS ANGELES CA,F,1993-07-01,8311,Unknown,91304,MAYNARD AVE,CANOGA PARK CA,TR 30333,2,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,LARE11,101,1978,1978,2226,Los Angeles,California


In [23]:
# Columns of each dataset that are passed to normalize_datetime_columns below.
datetime_LR = ['recording_date', 'address_date_of_last_change', 'mail_address_date_of_last_change']
datetime_DS = ['recording_date', 'last_sale_date', 'sale_two_date', 'sale_three_date']
# NOTE(review): 'transferee' does not look like a date column by name. normalize_datetime_columns
# parses with errors='coerce' and then drops the original column, so unparseable values would be
# lost as NaT — confirm this column belongs in the list.
datetime_SL = ['transferee', 'assessee_recording_date', 'last_sale_date', 'second_sale_date', 'third_sale_date']

def normalize_datetime_columns(df, datetime_columns):
    """
    Replace each listed datetime column with separate year, month, and day columns.

    Values that cannot be parsed as dates are coerced to NaT, so their
    year/month/day parts come out missing. The dataframe is modified in place.

    Parameters:
    df (pd.DataFrame): The dataframe containing the datetime columns.
    datetime_columns (list of str): Names of the columns to split.

    Returns:
    pd.DataFrame: The same dataframe, with `<col>_year`, `<col>_month` and
    `<col>_day` appended for every listed column and the original column dropped.
    """
    for name in datetime_columns:
        parsed = pd.to_datetime(df[name], errors='coerce')

        # Append the three date parts in year, month, day order.
        for part in ('year', 'month', 'day'):
            df[f'{name}_{part}'] = getattr(parsed.dt, part)

        # The source column is no longer needed once its parts are stored.
        df.drop(columns=[name], inplace=True)

    return df

# Replace each listed datetime column with <col>_year, <col>_month and <col>_day columns.
# HZ and LU are not processed here.
LR = normalize_datetime_columns(LR, datetime_LR)
DS = normalize_datetime_columns(DS, datetime_DS)
SL = normalize_datetime_columns(SL, datetime_SL)

### Remove apostrophes from the column names.

In [24]:
# Remove apostrophes from column names (e.g. homeowner's_exemption -> homeowners_exemption).
# Only the column labels are changed; cell values are not touched here.
LR.columns = LR.columns.str.replace("'", "")
DS.columns = DS.columns.str.replace("'", "")
HZ.columns = HZ.columns.str.replace("'", "")
LU.columns = LU.columns.str.replace("'", "")
SL.columns = SL.columns.str.replace("'", "")

### Run validation checks on the data.

In [25]:
def find_non_numeric(df, columns):
    """
    Identify values in the specified columns that cannot be converted to a number.

    Missing values (NaN/None) also fail the conversion and are therefore reported.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        columns (list of str): The column names to check. Names that are not
            columns of `df` are skipped.

    Returns:
        dict: Maps each column that has at least one non-convertible value to an
        array of its unique non-convertible values. Columns without any are omitted.
    """
    non_numeric_values = {}
    for column in columns:
        if column not in df.columns:
            continue

        # One vectorized conversion per column instead of one pd.to_numeric call per cell;
        # anything that cannot be parsed becomes NaN.
        converted = pd.to_numeric(df[column], errors='coerce')
        non_numeric = df.loc[converted.isna(), column].unique()

        if len(non_numeric) > 0:
            non_numeric_values[column] = non_numeric
    return non_numeric_values

In [26]:
def has_special_characters(text):
    """
    Return True when `text` is a string containing at least one character that is
    not a letter, digit, or whitespace; return False otherwise (including non-strings).
    """
    if not isinstance(text, str):
        return False
    return re.search(r'[^A-Za-z0-9\s]', text) is not None

def find_special_characters(df):
    """
    Collect every cell of the object-dtype columns that contains a special character.

    Returns a list of (column name, row index label, value) tuples, ordered by
    column and then by row.
    """
    text_columns = df.select_dtypes(include=['object']).columns

    return [
        (name, row_label, cell)
        for name in text_columns
        for row_label, cell in df[name].items()
        if has_special_characters(cell)
    ]

# Verify the cleaning step: collect (column, index, value) for every remaining special
# character in the object columns of LR, DS and SL. An empty list means none were found.
special_chars_LR = find_special_characters(LR)
special_chars_DS = find_special_characters(DS)
special_chars_SL = find_special_characters(SL)

print("Special Characters in DataFrame LR:")
print(special_chars_LR)
print("\nSpecial Characters in DataFrame DS:")
print(special_chars_DS)
print("\nSpecial Characters in DataFrame SL:")
print(special_chars_SL)

Special Characters in DataFrame LR:
[]

Special Characters in DataFrame DS:
[]

Special Characters in DataFrame SL:
[]


In [27]:
def is_utf8_encodable(value):
    """
    Return False only when `value` is a string that cannot be encoded as UTF-8.

    Non-string values are not encoded at all and always yield True.
    """
    if not isinstance(value, str):
        return True
    try:
        value.encode('utf-8')
    except (UnicodeEncodeError, TypeError):
        return False
    return True

def check_utf8_encoding(df):
    """
    Scan every cell of the dataframe and collect those that fail is_utf8_encodable.

    Returns a list of (column name, row index label, value) tuples, ordered by
    column and then by row; the list is empty when every value passes.
    """
    return [
        (name, row_label, cell)
        for name in df.columns
        for row_label, cell in df[name].items()
        if not is_utf8_encodable(cell)
    ]

In [28]:
# Check every cell of LR, DS and SL for strings that cannot be encoded as UTF-8.
# Each result is a list of (column, index, value) tuples; an empty list means no failures.
utf8_errors_LR = check_utf8_encoding(LR)
utf8_errors_DS = check_utf8_encoding(DS)
utf8_errors_SL = check_utf8_encoding(SL)

print("UTF-8 Encoding Errors in DataFrame LR:")
print(utf8_errors_LR)
print("\nUTF-8 Encoding Errors in DataFrame DS:")
print(utf8_errors_DS)
print("\nUTF-8 Encoding Errors in DataFrame SL:")
print(utf8_errors_SL)

UTF-8 Encoding Errors in DataFrame LR:
[]

UTF-8 Encoding Errors in DataFrame DS:
[]

UTF-8 Encoding Errors in DataFrame SL:
[]


In [29]:
# Remove leading and trailing whitespace from the text values of every object column.
def strip_columns(df):
    """
    Return a new dataframe whose object-dtype columns have their string values stripped.

    Only values that are actual `str` instances are stripped. Non-string values
    stored in an object column (numbers, None, NaN, ...) are kept as they are;
    calling `.str.strip()` on such a column would turn them into NaN.
    Columns of any other dtype are passed through unchanged.

    Parameters:
    df (pd.DataFrame): The dataframe to process. It is not modified.

    Returns:
    pd.DataFrame: A dataframe with the same columns and stripped text values.
    """
    def _strip_text(value):
        # Leave non-string cells alone so mixed-type columns do not lose data.
        return value.strip() if isinstance(value, str) else value

    return df.apply(lambda col: col.map(_strip_text) if col.dtype == "object" else col)

# Strip leading and trailing spaces from the object columns of LR, DS and SL.
# HZ and LU are not processed here.
LR = strip_columns(LR)
DS = strip_columns(DS)
SL = strip_columns(SL)

In [30]:
# see if assessor_identification_number has any duplicates
print('Number of duplicates in SL:', SL.duplicated(subset = 'assessor_identification_number').sum())

Number of duplicates in SL: 0


In [31]:
print('Number of duplicates in DS:', DS.duplicated(subset = 'assessor_identification_number').sum())

Number of duplicates in DS: 0


In [32]:
print('Number of duplicates in LU:', LU.duplicated(subset = 'assessor_identification_number').sum())

Number of duplicates in LU: 0


In [33]:
print('Number of duplicates in LR:', LR.duplicated(subset = 'assessor_identification_number').sum())

Number of duplicates in LR: 0


In [34]:
def examine_mixed_types_by_index(df, column_indices):
    """
    Report the Python types found in the columns at the given positions.

    For each position the result holds the column name, the array of distinct
    value types, and a count of values per type (keyed by the type's string form).
    Positions outside the dataframe are reported with a printed message and skipped.

    Returns a dict keyed by column position.
    """
    report = {}

    for position in column_indices:
        try:
            series = df.iloc[:, position]
        except IndexError:
            print(f"Column index '{position}' is out of bounds for the DataFrame.")
            continue

        report[position] = {
            'column_name': df.columns[position],
            # Distinct Python types present in the column
            'unique_types': series.apply(lambda value: type(value)).unique(),
            # Number of values of each type
            'type_counts': series.apply(lambda value: str(type(value))).value_counts(),
        }

    return report

# Positional indices of the DS columns to inspect.
# NOTE(review): presumably the columns named in a mixed-dtype warning when DS was read — TODO confirm.
# These are positions, not names: normalize_datetime_columns drops and appends DS columns earlier in
# the notebook, so verify the printed column names below are the ones intended.
columns_to_examine = [16, 34, 37, 40]

# Examine mixed types
mixed_types_info = examine_mixed_types_by_index(DS, columns_to_examine)

# Print the mixed types information
for index, info in mixed_types_info.items():
    print(f"Column Index: {index} (Column Name: {info['column_name']})")
    print(f"Unique Types: {info['unique_types']}")
    print(f"Type Counts:\n{info['type_counts']}\n")

Column Index: 16 (Column Name: year_sold_to_state)
Unique Types: [<class 'int'>]
Type Counts:
year_sold_to_state
<class 'int'>    2422890
Name: count, dtype: int64

Column Index: 34 (Column Name: sale_three_verif_key)
Unique Types: [<class 'int'>]
Type Counts:
sale_three_verif_key
<class 'int'>    2422890
Name: count, dtype: int64

Column Index: 37 (Column Name: building_design_1)
Unique Types: [<class 'int'>]
Type Counts:
building_design_1
<class 'int'>    2422890
Name: count, dtype: int64

Column Index: 40 (Column Name: building_units_1)
Unique Types: [<class 'int'>]
Type Counts:
building_units_1
<class 'int'>    2422890
Name: count, dtype: int64



In [35]:
def find_mixed_data_types_and_nulls(df):
    """
    Report columns that hold values of more than one Python type and columns with nulls.

    Parameters:
    df (pd.DataFrame): The dataframe to inspect.

    Returns:
    dict or str: When something is found, a dict with up to two keys:
        'Mixed Data Types' maps each mixed column to {str(type): first value of exactly that type};
        'Null Values' maps each column containing nulls to its null count.
    When nothing is found, a plain message string is returned instead.
    """
    mixed_types_info = {}
    null_values_info = {}

    for column in df.columns:
        values = df[column]
        value_types = values.map(type)

        # Mark the first occurrence of each distinct type. Matching on the exact type
        # (rather than isinstance) keeps a bool from being reported as the example for
        # int, and avoids rescanning the column once per type.
        first_of_type = ~value_types.duplicated().to_numpy()

        # Check for mixed data types
        if first_of_type.sum() > 1:
            mixed_types_info[column] = {
                str(value_type): example
                for value_type, example in zip(value_types[first_of_type], values[first_of_type])
            }

        # Check for null values
        null_count = values.isnull().sum()
        if null_count > 0:
            null_values_info[column] = null_count

    results = {}

    if mixed_types_info:
        results['Mixed Data Types'] = mixed_types_info

    if null_values_info:
        results['Null Values'] = null_values_info

    if not results:
        return "The DataFrame has no columns with mixed types or null values."

    return results

def validate_ain_column(df, column_name='assessor_identification_number'):
    """
    Check that the AIN column has no negative values, only 10-character entries,
    and no duplicates.

    Parameters:
    df (pd.DataFrame): The dataframe to validate.
    column_name (str): Name of the AIN column.

    Returns:
    str: A confirmation message when every check passes.

    Raises:
    ValueError: On the first failed check (negative value, wrong length, duplicate),
    in that order.
    """
    ain_values = df[column_name]

    # Negative numbers are rejected first.
    has_negative = (ain_values < 0).any()
    if has_negative:
        raise ValueError(f"The column '{column_name}' contains negative numbers.")

    # Every entry must be exactly 10 characters long when rendered as a string.
    entry_lengths = ain_values.astype(str).str.len()
    if (entry_lengths != 10).any():
        raise ValueError(f"The column '{column_name}' does not have all entries with 10 digits.")

    # Each identifier may appear only once.
    if ain_values.duplicated().any():
        raise ValueError(f"The column '{column_name}' contains duplicate values.")

    return "The DataFrame does not contain any issues with the AIN column."

In [36]:
find_mixed_data_types_and_nulls(DS)

{'Mixed Data Types': {'tax_status': {"<class 'str'>": 'Taxes paid not delinquent',
   "<class 'float'>": nan}},
 'Null Values': {'tax_status': 1}}

In [37]:
validate_ain_column(DS)

'The DataFrame does not contain any issues with the AIN column.'

In [38]:
find_mixed_data_types_and_nulls(LR)

'The DataFrame has no columns with mixed types or null values.'

In [39]:
validate_ain_column(LR)

'The DataFrame does not contain any issues with the AIN column.'

In [40]:
find_mixed_data_types_and_nulls(SL)

'The DataFrame has no columns with mixed types or null values.'

In [41]:
validate_ain_column(SL)

'The DataFrame does not contain any issues with the AIN column.'

In [42]:
find_mixed_data_types_and_nulls(HZ)

'The DataFrame has no columns with mixed types or null values.'

In [43]:
validate_ain_column(HZ)

'The DataFrame does not contain any issues with the AIN column.'

In [44]:
find_mixed_data_types_and_nulls(LU)

'The DataFrame has no columns with mixed types or null values.'

In [45]:
validate_ain_column(LU)

'The DataFrame does not contain any issues with the AIN column.'

In [40]:
from src.paths import CLEANED_DATA_DIR

# Save the five cleaned datasets as CSV files in CLEANED_DATA_DIR.
# NOTE(review): to_csv is called without index=False, so pandas also writes each frame's row index
# as an unnamed first column — confirm downstream readers expect that.
DS.to_csv(CLEANED_DATA_DIR / 'secured_basic_cleaned.csv')
LR.to_csv(CLEANED_DATA_DIR / 'local_roll_cleaned.csv')
SL.to_csv(CLEANED_DATA_DIR / 'sales_list_cleaned.csv')
HZ.to_csv(CLEANED_DATA_DIR / 'hazards_cleaned.csv')
LU.to_csv(CLEANED_DATA_DIR / 'land_use_cleaned.csv')

In [41]:
DS.head()

Unnamed: 0,assessor_identification_number,taxrate_area,agency_number,land_roll_year,land_current_value,imp_current_roll_year,imp_current_value,situs_house_no,street_name,unit,city,zip,first_owner_name,first_owner_name_overflow,second_owner_name,recording_date,tax_stat_key,year_sold_to_state,zoning_code,use_code,partial_interest,doc_reason_code,exemption_type,pp_key,pp_value,pp_exemption_val,fixture_val,fixture_exemption_val,num_howmowner_exemption,homeowner_exemption_val,real_estate_exemption_val,last_sale_verif_key,last_sale_amount,last_sale_date,sale_two_verif_key,sale_two_amount,sale_two_date,sale_three_verif_key,sale_three_amount,sale_three_date,building_subpart_1,building_design_1,building_quality_1,building_year_built_1,building_units_1,building_bedrooms_1,building_baths_1,building_square_feet_1,building_subpart_2,building_design_2,building_quality_2,building_year_built_2,building_units_2,building_bedrooms_2,building_baths_2,building_square_feet_2,building_subpart_3,building_design_3,building_quality_3,building_year_built_3,building_units_3,building_bedrooms_3,building_baths_3,building_square_feet_3,building_subpart_4,building_design_4,building_quality_4,building_year_built_4,building_units_4,building_bedrooms_4,building_baths_4,building_square_feet_4,building_subpart_5,building_design_5,building_quality_5building_year_built_5,building_units_5,building_bedrooms_5,building_baths_5,building_square_feet_5,bd5_square_feet,legal_first_line,legal_second_line,legal_third_line,legal_fourth_line,legal_fifth_line,legal_last_line,land_base_year,imp_base_year,land_base_val,imp_base_val,cluster_location,cluster_type,cluster_appraisal_unit,land_reason_key,impairment_key,ddt_amount,bd1_year_change,building_unit_cost_1,building_rcn_main_1,building_year_change_2,building_unit_cost_2,building_rcn_main_2,building_year_change_3,building_unit_cost_3,building_rcn_main_3,building_year_change_4,building_unit_cost_4,building_rcn_main_4,building_year_change_5,building_unit_cost_5,build
ing_rcn_main_5,landlord_reappraisal_year,landlord_units,first_transfree_name,first_transfree_overflow,second_transfree_name,document_key,document_number,county_name,state_name
0,2004001003,16,0,2024,711689,2024,301176,8321,FAUST AVE,Unknown,LOS ANGELES CA,91304,"PATEL,HANISH B AND ALPA P TRS",PATEL TRUST,Unknown,2009-05-29,0,0,LARE9,101,0,U,10,0,0,0,0,0,0,0,0,1,9,2005-09-06,0,0,2003-08-26,0,0,2000-01-11,101,131,0,1973,1,4,3,2090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,TRACT NO 25040 LOT 99,Unknown,Unknown,Unknown,Unknown,Unknown,2006,2006,600254,252282,2,1,21,0,0,9,1978,2379,49720,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Unknown,Unknown,0,1,798640,Los Angeles,California
1,2004001004,16,0,2024,370538,2024,255879,8313,FAUST AVE,Unknown,LOS ANGELES CA,91304,"ALI,SYED SHAH AND DILRUBA S TRS",ALI FAMILY TRUST,Unknown,2021-12-01,0,0,LARE9,101,0,U,10,0,0,0,0,0,0,0,0,1,9,2009-10-19,0,0,2006-01-05,0,0,1993-03-16,101,131,0,1973,1,5,3,2479,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,TRACT NO 25040 LOT 100,Unknown,Unknown,Unknown,Unknown,Unknown,2010,2010,292800,202200,2,1,21,0,0,9,1978,2346,58160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Unknown,Unknown,0,1,1775715,Los Angeles,California
2,2004001005,16,0,2024,526360,2024,198577,8309,FAUST AVE,Unknown,LOS ANGELES CA,91304,"POPAL,ARINA AND",Unknown,"POPAL,MALIHA",2017-08-18,0,0,LARE9,100,333,M,10,0,0,0,0,0,0,0,0,1,9,2017-08-18,0,0,2016-12-08,0,0,1997-11-25,101,130,0,1973,1,4,2,2057,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,TRACT NO 25040 LOT 101,Unknown,Unknown,Unknown,Unknown,Unknown,2018,2018,465645,175665,2,1,21,0,0,9,1978,2438,50150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Unknown,Unknown,0,1,938966,Los Angeles,California
3,2004001008,16,0,2024,128421,2024,221965,8325,MAYNARD AVE,Unknown,LOS ANGELES CA,91304,"SZABO,GEORGE AND JOY",Unknown,Unknown,1979-07-02,0,0,LARE11,101,0,0,10,0,0,0,0,0,1,7000,0,1,137001,1977-08-05,0,0,1970-01-01,0,0,1970-01-01,101,131,0,1978,1,4,3,2423,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,*TR=30333 LOT 1,Unknown,Unknown,Unknown,Unknown,Unknown,1980,1980,59000,105800,2,1,21,0,0,137001,1989,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Unknown,Unknown,0,1,718176,Los Angeles,California
4,2004001009,16,0,2024,139933,2024,210012,8311,MAYNARD AVE,Unknown,LOS ANGELES CA,91304,"CAPUTO,GIANFRANCO AND",JACQUELINE AND,"CAPUTO,DONATO L AND IMMACOLATA",2023-08-29,0,0,LARE11,101,0,W,10,0,0,0,0,0,1,7000,0,1,9,1984-06-19,0,0,1979-06-19,0,0,1977-08-05,101,131,0,1978,1,4,3,2226,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,*TR=30333 LOT 2,Unknown,Unknown,Unknown,Unknown,Unknown,1984,1984,68900,103400,2,1,21,0,0,9,1993,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Unknown,Unknown,0,1,575706,Los Angeles,California
