#### Some useful functions

In [None]:
import pandas as pd

def dataframe_to_csv(dataframe, output_file_name):
    """
    Save a DataFrame as a CSV file and report the outcome on stdout.

    The index is not written to the file. Any exception raised while writing
    is caught and reported with a printed message rather than propagated.

    Args:
    - dataframe: The pandas DataFrame to save.
    - output_file_name: Destination file name; may include a directory path.

    Returns:
    - None
    """
    try:
        dataframe.to_csv(output_file_name, index=False)
        outcome = f"DataFrame has been successfully written to {output_file_name}"
    except Exception as e:
        # Best-effort helper: turn the failure into a message instead of raising
        outcome = f"An error occurred while writing the DataFrame to a CSV file: {e}"
    print(outcome)

#### Imports

In [None]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_rows', None)

#### Function that preprocesses the files

In [None]:
def pre_process_all_files(path_1945, path_1975, path_1985):
    """
    Load the 1945, 1975 and 1985 CSV files and normalise them.

    For each file: columns that are entirely NaN are dropped, the 'ADDRESS'
    column is converted to string, and every column name gets a '_<year>'
    suffix (e.g. 'ADDRESS' becomes 'ADDRESS_1945').

    Args:
    - path_1945: Path of the 1945 CSV file.
    - path_1975: Path of the 1975 CSV file.
    - path_1985: Path of the 1985 CSV file.

    Returns:
    - A tuple of three DataFrames, in the order (1945, 1975, 1985).
    """
    years = (1945, 1975, 1985)
    csv_paths = (path_1945, path_1975, path_1985)

    # Stage 1: read every file
    raw_frames = [pd.read_csv(csv_path, low_memory=False) for csv_path in csv_paths]

    # Stage 2: drop all-NaN columns, then copy so later assignments hit an independent frame
    frames = [raw.dropna(axis=1, how='all').copy() for raw in raw_frames]

    # Stage 3: addresses are handled as text everywhere downstream
    for frame in frames:
        frame['ADDRESS'] = frame['ADDRESS'].astype(str)

    # Stage 4: tag each column name with the year of its source file
    for year, frame in zip(years, frames):
        frame.columns = [f"{col}_{year}" for col in frame.columns]

    frame_1945, frame_1975, frame_1985 = frames
    return frame_1945, frame_1975, frame_1985

# Absolute paths of the three input CSV files, one per year.
# NOTE(review): the 1985 file name contains two spaces before "1985.csv" — confirm it matches the file on disk.
path_for_1945 = '/content/Lubbock - 1945 - 1945.csv'
path_for_1975 = '/content/Lubbock - 1975 - 1975.csv'
path_for_1985 = '/content/Lubbock - 1985 -  1985.csv'

#### Apply the preprocess function

In [None]:
# Load the three CSVs; every column of each returned frame carries a '_<year>' suffix
df1945, df1975, df1985 = pre_process_all_files(path_for_1945, path_for_1975, path_for_1985)

## Pre-Process 1945

#### Add a column to the df1945 dataframe to track the old index when updated

In [None]:
# Rebind df1945 to a copy before adding a column to it
df1945 = df1945.copy()

# Store each row's current index label in 'oldindex_1945' so it survives later index resets
df1945['oldindex_1945'] = df1945.index

#### Function that processes addresses with patterns

In [None]:
def process_addresses(dfc, pattern, dfn=None):
    """
    Moves rows whose 'ADDRESS_1945' matches a regex pattern from dfc into dfn.

    The pattern is applied with `match`, i.e. anchored at the start of the
    address only. For every matching row, the non-empty captured groups are
    stripped and joined with ', ' into a 'keywords' value.

    Args:
    - dfc: DataFrame containing the original addresses in column 'ADDRESS_1945'.
    - pattern: Regex pattern (string or compiled) to match addresses.
    - dfn: Optional DataFrame to append matched addresses. If None, a new DataFrame is created.

    Returns:
    - A tuple (dfc_updated, dfn_updated, match_count):
      - dfc_updated: dfc without the matched rows, with its index reset.
      - dfn_updated: the rows of dfn followed by the newly matched rows.
      - match_count: number of rows matched by this call.
    """
    # Convert the pattern to a compiled regex object if it's a string
    if isinstance(pattern, str):
        pattern = re.compile(pattern)

    # Start from the rows already collected, if any
    dfn_rows = [] if dfn is None else dfn.to_dict('records')

    matched_indices = []  # Index labels to remove from dfc

    for index, row in dfc.iterrows():
        match = pattern.match(row['ADDRESS_1945'])
        if not match:
            continue

        new_row = row.to_dict()
        new_row['keywords'] = ', '.join(
            group.strip() for group in match.groups() if group and group.strip()
        )
        # dfc's index is reset after every call, so `index` is only the original
        # row position on the first pass. Keep an 'oldindex_1945' value that is
        # already present; fall back to the current index only when it is absent.
        new_row.setdefault('oldindex_1945', index)

        dfn_rows.append(new_row)
        matched_indices.append(index)

    # Rebuild dfn from the accumulated rows
    dfn_updated = pd.DataFrame(dfn_rows)

    # Remove matched rows from dfc
    dfc_updated = dfc.drop(matched_indices).reset_index(drop=True)

    return dfc_updated, dfn_updated, len(matched_indices)

In [None]:
# Regexes for the compact 1945 address formats (house number, street parts, then ",<word>").
# They are tried in insertion order, and a row matched by one pattern is removed before
# the next pattern runs, so earlier patterns take precedence.
# Matching uses `pattern.match`, which anchors at the start of the address only.
# Every captured group becomes one entry of the row's 'keywords' string.
# NOTE(review): pattern4 accepts everything pattern3 does (\d{3,4} includes \d{4}).
patterns_dict = {
    "pattern1": r'(\d{4})(\d{2})(ND|ST|TH|nd|st|th)(ST|st),([A-Za-z]+)',
    "pattern2": r'(\d{4})(N|S|E|W|n|s|e|w)(\d{2})(ND|ST|TH|nd|st|th)(ST|st),([A-Za-z]+)',
    "pattern3": r'(\d{4})(AVE|ave)([A-Za-z]),([A-Za-z]+)',
    "pattern4": r'(\d{3,4})(AVE|ave)([A-Za-z]),([A-Za-z]+)',
    "pattern5": r'(\d{4})(N|S|E|W|n|s|e|w)(MAIN|main)(ST|st),([A-Za-z]+)',
    "pattern6": r'(\d{3,4})(N|S|E|W|n|s|e|w)([A-Za-z]+)(AVE|ave),([A-Za-z]+)',
    "pattern7": r'(\d{3,4})([A-Za-z]+)(AVE|ave),([A-Za-z]+)',
    "pattern8": r'(\d{4})(\d{2})(RD|ND|ST|TH|rd|nd|st|th)(AVE|ave|ST|st),([A-Za-z]+)',
    "pattern9": r'(\d{4})(\d{3})(RD|ND|ST|TH|rd|nd|st|th)(AVE|ave|ST|st),([A-Za-z]+)',
    "pattern10": r'(\d{4})(MAIN|main)(AVE|ave|ST|st),([A-Za-z]+)',
    "pattern11": r'(\d{3,4})(N|S|E|W|n|s|e|w)(AVE|ave)([A-Za-z]),([A-Za-z]+)',
    "pattern12": r'(\d{3,4})(N|S|E|W|n|s|e|w)?(HARVARD|URSULINE|ERSKINE|FISHER|STANFORD|BROADWAY)(ST|st)?,([A-Za-z]+)',
    "pattern13": r'(\d{4})(N|S|E|W|n|s|e|w)?([A-Za-z]+)(\d+),([A-Za-z]+)'
}

In [None]:
def process_addresses_with_patterns(dfc, patterns_dict, dfn=None):
    """
    Runs every pattern of patterns_dict over dfc, one after another.

    Each pattern is applied through process_addresses to the rows left over
    by the previous patterns; matched rows accumulate in dfn. The match count
    per pattern and the overall total are printed.

    Args:
    - dfc: DataFrame whose 'ADDRESS_1945' column is matched against the patterns.
    - patterns_dict: Mapping of pattern name -> regex, applied in iteration order.
    - dfn: Optional DataFrame of previously matched rows; an empty one is used if None.

    Returns:
    - A tuple (remaining rows of dfc, accumulated matched rows, total match count).
    """
    remaining = dfc
    collected = pd.DataFrame() if dfn is None else dfn
    counts = []

    for pattern_name in patterns_dict:
        remaining, collected, hits = process_addresses(
            remaining, patterns_dict[pattern_name], collected
        )
        counts.append(hits)
        print(f'The number of matched indices for {pattern_name} is {hits}')

    total_matched_indices = sum(counts)
    print(f'The total number of matched indices so far are {total_matched_indices}')
    return remaining, collected, total_matched_indices

#### Apply the function to process the addresses

In [None]:
# Run all patterns over df1945: df1945_updated keeps the unmatched rows,
# dfn1945_updated collects the matched rows together with their 'keywords'
df1945_updated, dfn1945_updated, total_matched = process_addresses_with_patterns(df1945, patterns_dict)

#### Expanding the short forms in the dataframe like Lub -> Lubbock, Bx -> Box and more

In [None]:
# Case-insensitive whole-word replacements on the rows not matched by any pattern:
# "lub" -> "LUBBOCK" and "Bx" -> "BOX"
df1945_updated['ADDRESS_1945'] = df1945_updated['ADDRESS_1945'].str.replace(r'\blub\b', 'LUBBOCK', case=False, regex=True)
df1945_updated['ADDRESS_1945'] = df1945_updated['ADDRESS_1945'].str.replace(r'\bBx\b', 'BOX', case=False, regex=True)

#### Function to separate the number and its suffix. Ex: 23RD -> 23 RD etc.

In [None]:
def add_space_before_suffix(address):
    """
    Inserts a space between a run of digits and the suffix glued to it.

    Recognised suffixes are RD, ND, ST and TH, either all upper-case or all
    lower-case (e.g. '23RD' -> '23 RD', '5th' -> '5 th'). Every occurrence in
    the string is rewritten; text without such a sequence is returned unchanged.

    Args:
    - address: The address string to rewrite.

    Returns:
    - The address with a space inserted before each matched suffix.
    """
    # Group 1 is the number, group 2 the suffix that directly follows it
    digits_then_suffix = re.compile(r'(\d+)(RD|ND|ST|TH|rd|nd|st|th)')

    # Re-emit both groups with a single space in between
    return digits_then_suffix.sub(r'\1 \2', address)

#### Apply the `add_space_before_suffix()`

In [None]:
# Separate digits from a directly attached RD/ND/ST/TH suffix in the remaining 1945 addresses
df1945_updated["ADDRESS_1945"] = df1945_updated["ADDRESS_1945"].apply(add_space_before_suffix)

#### Process Non-Empty addresses from 1945 dataframe

In [None]:
def process_nonempty_addresses(dfc, dfn=None):
    """
    Moves every row of dfc that has a usable 'ADDRESS_1945' into dfn.

    An address is unusable when it is null or equal to one of the placeholder
    strings 'nan', '"', '""', '"""', '""""'. For usable rows, the address is
    split on commas and whitespace and the pieces are joined with ', ' into a
    'keywords' column.

    Args:
    - dfc: DataFrame with an 'ADDRESS_1945' column.
    - dfn: Optional DataFrame of already processed rows to append to.

    Returns:
    - A tuple (dfc_updated, dfn_updated):
      - dfc_updated: the rows of dfc without a usable address, index reset
        (dfc itself, unchanged, when no row is usable).
      - dfn_updated: dfn followed by the newly processed rows (or just the
        new rows / an empty DataFrame when dfn is None).
    """
    # Strings that stand in for a missing address (stringified NaN, stray quote runs)
    placeholders = ['nan', '"', '""', '"""', '""""']

    addresses = dfc['ADDRESS_1945']
    valid_addresses = addresses.notna() & ~addresses.isin(placeholders)
    # Copy so the column assignments below do not target a slice of dfc
    valid_rows = dfc[valid_addresses].copy()

    # Nothing to process: hand back the inputs as they are
    if valid_rows.empty:
        return dfc, (dfn if dfn is not None else pd.DataFrame())

    # Split by commas or spaces, then join with ', '
    valid_rows['keywords'] = valid_rows['ADDRESS_1945'].apply(
        lambda address: ', '.join(address.replace(',', ' ').split())
    )

    # dfc may already have had its index reset by earlier processing, so its
    # index is not necessarily the original row position. Keep an existing
    # 'oldindex_1945' column and only derive one from the index when missing.
    if 'oldindex_1945' not in valid_rows.columns:
        valid_rows['oldindex_1945'] = valid_rows.index

    # Append to dfn, or start a new result frame
    if dfn is not None:
        dfn_updated = pd.concat([dfn, valid_rows], ignore_index=True)
    else:
        dfn_updated = valid_rows

    # Remove processed rows from dfc
    dfc_updated = dfc.drop(valid_rows.index).reset_index(drop=True)

    return dfc_updated, dfn_updated

#### Apply the function that processes the non-empty addresses

In [None]:
# Move the remaining rows with a usable address into dfn1945_updated; df1945_updated keeps the rest
df1945_updated, dfn1945_updated = process_nonempty_addresses(df1945_updated, dfn1945_updated)

#### Check for the updates in the dataframes

In [None]:
# Display the rows that were left without a usable address
df1945_updated

In [None]:
# Save the rows without a usable address to a CSV file
dataframe_to_csv(df1945_updated, '1945_no_addresses_data.csv')

In [None]:
# From here on df1945 refers to the processed rows, which carry a 'keywords' column
df1945 = dfn1945_updated

## Pre-Process 1975

In [None]:
# Separate digits from a directly attached RD/ND/ST/TH suffix in the 1975 addresses
df1975["ADDRESS_1975"] = df1975["ADDRESS_1975"].apply(add_space_before_suffix)

## Pre-Process 1985

In [None]:
# Separate digits from a directly attached RD/ND/ST/TH suffix in the 1985 addresses
df1985["ADDRESS_1985"] = df1985["ADDRESS_1985"].apply(add_space_before_suffix)

## Matching the addresses that are common in all 3 dataframes (1945, 1975 and 1985)

In [None]:
def extract_numbers_from_keywords(keywords):
    """Returns, in order, every standalone run of digits found in the keyword string.

    Digits attached to letters (e.g. '22ND') are not standalone and are skipped.
    """
    standalone_digits = re.compile(r'\b\d+\b')
    return [hit.group() for hit in standalone_digits.finditer(keywords)]

In [None]:
def match_numbers_in_addresses(numbers, df, address_column):
    """Finds addresses that contain all the numbers as whole whitespace-separated tokens.

    Args:
    - numbers: The numbers to look for (digit strings, as produced by
      extract_numbers_from_keywords).
    - df: DataFrame to search.
    - address_column: Name of the column of df holding the address strings.

    Returns:
    - A list of (index label, address) tuples, in row order, for every address
      containing each number as a token of its own. Empty when `numbers` is
      empty: with nothing to look for, no address counts as a match.
    """
    wanted = {str(num) for num in numbers}
    # Without this guard an empty `numbers` would vacuously match every row
    if not wanted:
        return []

    matched_rows = []
    for index, address in df[address_column].items():
        # A number matches only when it equals a whole token, so '2407' does
        # not match '24070' or '2407,'
        if wanted.issubset(address.split()):
            matched_rows.append((index, address))
    return matched_rows

In [None]:
def print_unmatched_info(index, numbers, matches_1975, matches_1985):
    """
    Prints why a 1945 row did not yield a single match in both 1975 and 1985.

    With no match in either year, one "Not matched" line is printed.
    Otherwise up to four lines are printed, in this order: multiple matches
    in 1975, multiple matches in 1985, no match in 1975, no match in 1985.

    Args:
    - index: Index label of the 1945 row.
    - numbers: The number strings extracted from the row's keywords.
    - matches_1975: Matches found in the 1975 data.
    - matches_1985: Matches found in the 1985 data.
    """
    keywords = ', '.join(numbers)
    none_in_1975 = len(matches_1975) == 0
    none_in_1985 = len(matches_1985) == 0

    # Guard clause: nothing found anywhere
    if none_in_1975 and none_in_1985:
        print(f"Not matched for row {index} in df1945 with keywords: {keywords}")
        return

    reports = []
    if len(matches_1975) > 1:
        reports.append(f"The count is {len(matches_1975)} in 1975 for row {index} with keywords: {keywords}")
    if len(matches_1985) > 1:
        reports.append(f"The count is {len(matches_1985)} in 1985 for row {index} with keywords: {keywords}")
    if none_in_1975:
        reports.append(f"No match in 1975 for row {index} with keywords: {keywords}")
    if none_in_1985:
        reports.append(f"No match in 1985 for row {index} with keywords: {keywords}")

    for report in reports:
        print(report)

In [None]:
def print_matched_info(match_info):
    """
    Prints a three-line summary of a 1945 row matched in both 1975 and 1985,
    followed by a blank line.

    Args:
    - match_info: Dict with the keys 'Index_1945', 'Keywords', 'Index_1975',
      'Address_1975', 'Index_1985' and 'Address_1985'.
    """
    summary_lines = [
        f"Match found for row {match_info['Index_1945']} in df1945 with keywords: {match_info['Keywords']}",
        f"- df1975 match at index {match_info['Index_1975']}: {match_info['Address_1975']}",
        # The trailing newline produces the blank separator line
        f"- df1985 match at index {match_info['Index_1985']}: {match_info['Address_1985']}\n",
    ]
    print(*summary_lines, sep='\n')

In [None]:
def find_common_matches(df1945, df1975, df1985):
    """
    Finds 1945 rows whose keyword numbers identify exactly one 1975 address
    and exactly one 1985 address.

    For every 1945 row, the numbers in its 'keywords' value are looked up in
    'ADDRESS_1975' and 'ADDRESS_1985'. When both lookups return exactly one
    row, the match is recorded and printed, and the three rows involved are
    dropped from their DataFrames. Any other outcome is only reported through
    print_unmatched_info.

    Matched rows are dropped immediately, so later 1945 rows are compared
    only against the 1975/1985 rows that are still unmatched; the result
    therefore depends on row order.

    Args:
    - df1945: DataFrame with a 'keywords' column. Modified in place.
    - df1975: DataFrame with an 'ADDRESS_1975' column. Modified in place.
    - df1985: DataFrame with an 'ADDRESS_1985' column. Modified in place.

    Returns:
    - A tuple (matched_in_all, df1945, df1975, df1985): a DataFrame with one
      row per match (columns 'Index_1945', 'Keywords', 'Index_1975',
      'Address_1975', 'Index_1985', 'Address_1985'), followed by the three
      input DataFrames without their matched rows.
    """
    matches_list = []

    # NOTE(review): df1945 is dropped from (inplace) inside this loop while
    # iterrows() is running over it — confirm this is safe on the pandas version in use.
    for index, row in df1945.iterrows():
        keywords = row['keywords']
        numbers = extract_numbers_from_keywords(keywords)

        # Each lookup scans the whole (remaining) frame for the row's numbers
        matches_1975 = match_numbers_in_addresses(numbers, df1975, 'ADDRESS_1975')
        matches_1985 = match_numbers_in_addresses(numbers, df1985, 'ADDRESS_1985')

        # Handle different match scenarios
        if len(matches_1975) == 1 and len(matches_1985) == 1:
            # Exactly one candidate in both years
            match_info = {
                'Index_1945': index, 'Keywords': ', '.join(numbers),
                'Index_1975': matches_1975[0][0], 'Address_1975': matches_1975[0][1],
                'Index_1985': matches_1985[0][0], 'Address_1985': matches_1985[0][1]
            }
            matches_list.append(match_info)

            # Print the matched information
            print_matched_info(match_info)

            # Remove matched rows from the caller's DataFrames (in place)
            df1945.drop(index, inplace=True)
            df1975.drop(matches_1975[0][0], inplace=True)
            df1985.drop(matches_1985[0][0], inplace=True)
        else:
            # Report no match / multiple matches
            print_unmatched_info(index, numbers, matches_1975, matches_1985)

    # Convert list of matches to DataFrame
    matched_in_all = pd.DataFrame(matches_list)

    return matched_in_all, df1945, df1975, df1985

In [None]:
# Run the three-way match; the frames passed in are modified in place (matched rows dropped) and returned
matched_in_all, df1945, df1975, df1985 = find_common_matches(df1945, df1975, df1985)

Match found for row 0 in df1945 with keywords: 2407, 22
- df1975 match at index 475: 2407 22 ND ST
- df1985 match at index 24562: 2407 22 nd Pl., Lubbock, TX

Match found for row 1 in df1945 with keywords: 2808, 28
- df1975 match at index 28726: 2808 28 TH
- df1985 match at index 895: 2808 28 TH ST

Match found for row 2 in df1945 with keywords: 2316, 31
- df1975 match at index 17574: 2316 31 ST
- df1985 match at index 18469: 2316 31 ST ST

Not matched for row 3 in df1945 with keywords: 1913, 27
Match found for row 4 in df1945 with keywords: 2102, 15
- df1975 match at index 739: 2102 15 TH ST
- df1985 match at index 10843: 2102 15 TH ST

No match in 1985 for row 5 with keywords: 2306, 29
Match found for row 6 in df1945 with keywords: 2212, 60
- df1975 match at index 29164: 2212 60 TH
- df1985 match at index 8405: 2212 60 TH ST

The count is 2 in 1985 for row 7 with keywords: 2905, 20
No match in 1975 for row 7 with keywords: 2905, 20
Match found for row 8 in df1945 with keywords: 1523,