## Pre-process all files

#### Imports

In [None]:
!pip install uszipcode

Collecting uszipcode
  Downloading uszipcode-1.0.1-py2.py3-none-any.whl (35 kB)
Collecting pathlib-mate (from uszipcode)
  Downloading pathlib_mate-1.3.2-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting atomicwrites (from uszipcode)
  Downloading atomicwrites-1.4.1.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fuzzywuzzy (from uszipcode)
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting haversine>=2.5.0 (from uszipcode)
  Downloading haversine-2.8.1-py2.py3-none-any.whl (7.7 kB)
Collecting sqlalchemy-mate>=1.4.28.3 (from uszipcode)
  Downloading sqlalchemy_mate-1.4.28.4-py2.py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting SQLAlchemy>=1.4.0 (from uszipcode)
  Downloading SQLAlchemy-1.4.51-cp310-cp310-manylinux_2_5_x8

In [None]:
import pandas as pd
import numpy as np
import re
from uszipcode import SearchEngine

pd.set_option('display.max_rows', None)



#### Some useful functions

In [None]:
def dataframe_to_csv(dataframe, output_file_name):
    """
    Save a DataFrame as a CSV file (without the index column) and report the outcome.

    Any exception raised while writing is caught and reported on stdout rather
    than propagated, so a failed export does not stop the surrounding workflow.

    Args:
    - dataframe: The pandas DataFrame to export.
    - output_file_name: Destination file name; may include a directory path.

    Returns:
    - None
    """
    try:
        dataframe.to_csv(output_file_name, index=False)
    except Exception as err:
        print(f"An error occurred while writing the DataFrame to a CSV file: {err}")
    else:
        print(f"DataFrame has been successfully written to {output_file_name}")

In [None]:
def split_keywords_vectorized(df, column_name):
    """
    Split a keyword column into numerical, short and long alphabetical keywords.

    Three columns are added to a copy of ``df``:
    - 'numerical_keywords': whole-word digit runs found in ``column_name``.
    - 'alphabetical_short_keywords': purely alphabetic words of length <= 2.
    - 'alphabetical_long_keywords': purely alphabetic words of length > 2.
    Each column holds the tokens joined with ', ' (an empty string when none).

    Args:
    - df: DataFrame containing the keyword column. It is not modified.
    - column_name: Name of the column holding the keyword text.

    Returns:
    - A copy of ``df`` with the three keyword columns appended.
    """
    # Work on a copy to avoid SettingWithCopyWarning if df is a slice from another DataFrame
    df = df.copy()

    # Missing keywords (NaN/None) are treated as "no keywords" instead of
    # crashing later on; the source column itself is left untouched.
    text = df[column_name].fillna('').astype(str)

    df['numerical_keywords'] = text.str.findall(r'\b\d+\b').str.join(', ')

    # Extract the alphabetic words once and bucket them by length.
    words = text.str.findall(r'\b[a-zA-Z]+\b')
    df['alphabetical_short_keywords'] = words.apply(
        lambda tokens: ', '.join(word for word in tokens if len(word) <= 2)
    )
    df['alphabetical_long_keywords'] = words.apply(
        lambda tokens: ', '.join(word for word in tokens if len(word) > 2)
    )

    return df

In [None]:
def add_space_before_suffix(address):
    """
    Insert a space between a run of digits and an ordinal-style suffix glued to it.

    Every occurrence of digits immediately followed by RD, ND, ST or TH
    (all upper-case or all lower-case) is rewritten as "<digits> <suffix>",
    e.g. "10TH" becomes "10 TH". Nothing is required after the suffix, so
    digits followed by a longer word starting with those letters are split too.

    Args:
    - address: The address string to rewrite.

    Returns:
    - The address with the spaces inserted.
    """
    glued_suffix = re.compile(r'(\d+)(RD|ND|ST|TH|rd|nd|st|th)')

    # \1 is the digit run, \2 the suffix; re-emit them separated by one space
    return glued_suffix.sub(r'\1 \2', address)

In [None]:
def remove_rows_without_alphabets_or_numbers(df, address_column):
    """
    Keep only the rows whose address contains at least one ASCII letter or digit.

    Rows whose address is missing (NaN/None) are dropped as well.

    Args:
    - df: DataFrame to filter.
    - address_column: Name of the column holding the address strings.

    Returns:
    - The filtered DataFrame with a fresh 0..n-1 index.
    """
    # na=False makes missing addresses count as "no match" so they are filtered out
    has_letter_or_digit = df[address_column].str.contains('[a-zA-Z0-9]', regex=True, na=False)

    return df.loc[has_letter_or_digit].reset_index(drop=True)

#### Function that preprocesses the files

In [None]:
def pre_process_all_files(path_1945, path_1975, path_1985):
    """
    Load the 1945, 1975 and 1985 CSV files and normalise them for later steps.

    For each file: columns that are entirely NaN are dropped, the 'ADDRESS'
    column is converted to strings, and every column name gets the year
    appended as a suffix (e.g. 'ADDRESS' -> 'ADDRESS_1945').

    Args:
    - path_1945: Path of the 1945 CSV file.
    - path_1975: Path of the 1975 CSV file.
    - path_1985: Path of the 1985 CSV file.

    Returns:
    - A tuple (dfc1945, dfc1975, dfc1985) of the cleaned DataFrames.
    """
    # Read every file up front, then clean them one by one
    raw_frames = [
        (year, pd.read_csv(path, low_memory=False))
        for year, path in ((1945, path_1945), (1975, path_1975), (1985, path_1985))
    ]

    cleaned = []
    for year, raw in raw_frames:
        # Drop all-NaN columns; copy so the assignments below act on an independent frame
        frame = raw.dropna(axis=1, how='all').copy()
        frame['ADDRESS'] = frame['ADDRESS'].astype(str)
        frame.columns = [f"{name}_{year}" for name in frame.columns]
        cleaned.append(frame)

    return tuple(cleaned)

#### Apply the preprocess function

In [None]:
# Locations of the three input CSV files, one per year.
# NOTE(review): the 1985 file name contains a double space ("1985 -  1985.csv");
# presumably this mirrors the real file name — confirm before normalising it.
path_for_1945 = '/content/Lubbock - 1945 - 1945.csv'
path_for_1975 = '/content/Lubbock - 1975 - 1975.csv'
path_for_1985 = '/content/Lubbock - 1985 -  1985.csv'

# Load and clean the three files; the returned frames have year-suffixed column names.
df1945, df1975, df1985 = pre_process_all_files(path_for_1945, path_for_1975, path_for_1985)

## Pre-Process 1945

#### Add a column to the df1945 dataframe to track the old index when updated

In [None]:
# Rebind df1945 to a copy before adding a column to it
df1945 = df1945.copy()

# Record each row's current index label in 'oldindex_1945' so the row can still
# be traced after later steps drop rows and reset the index
df1945['oldindex_1945'] = df1945.index

#### Function that processes addresses with patterns

In [None]:
def process_addresses(dfc, pattern, dfn=None):
    """
    Move every row whose 1945 address matches a regex pattern from ``dfc`` into ``dfn``.

    The pattern is applied with ``match``, i.e. it is anchored at the start of
    the address only. For each matching row the non-empty captured groups are
    joined with ', ' and stored in 'keywords_1945'.

    Args:
    - dfc: DataFrame containing the original addresses in 'ADDRESS_1945'.
    - pattern: Regex pattern (string or compiled) whose groups become the keywords.
    - dfn: Optional DataFrame to append matched addresses. If None, a new DataFrame is created.

    Returns:
    - A 3-tuple ``(dfc_updated, dfn_updated, match_count)``: ``dfc`` without the
      matched rows (index reset), ``dfn`` with the matched rows appended, and
      the number of rows matched in this call.
    """

    # Convert the pattern to a compiled regex object if it's a string
    if isinstance(pattern, str):
        pattern = re.compile(pattern)

    # Start from the rows already collected in dfn, if any
    dfn_rows = [] if dfn is None else dfn.to_dict('records')

    matched_indices = []  # Keep track of indices to remove from dfc

    for index, row in dfc.iterrows():
        address = row['ADDRESS_1945']
        # Non-string addresses (e.g. NaN) cannot match and are left in dfc
        match = pattern.match(address) if isinstance(address, str) else None
        if not match:
            continue

        new_row = row.to_dict()
        new_row['keywords_1945'] = ', '.join(
            group.strip() for group in match.groups() if group and group.strip()
        )

        # Keep an already-recorded original index: dfc's own index is renumbered
        # by the reset_index below on every call, so after the first pass it no
        # longer identifies the row in the source data. Fall back to the current
        # index only when nothing has been recorded yet.
        tracked_index = new_row.get('oldindex_1945')
        if tracked_index is None or pd.isna(tracked_index):
            tracked_index = index
        new_row['oldindex_1945'] = tracked_index

        dfn_rows.append(new_row)
        matched_indices.append(index)

    # Update dfn with the new rows
    dfn_updated = pd.DataFrame(dfn_rows)

    # Remove matched rows from dfc
    dfc_updated = dfc.drop(matched_indices).reset_index(drop=True)

    return dfc_updated, dfn_updated, len(matched_indices)

In [None]:
# Regex patterns for 1945 addresses written without separators
# (e.g. "240722NDST,LUBBOCK"). Each capture group becomes one keyword.
# The patterns are tried in dictionary order and a row is removed from the
# pool as soon as one pattern matches it, so earlier patterns take precedence.
# They are applied with `match`, i.e. anchored at the start of the address only.
# Note: pattern3 (4-digit number) is a special case of pattern4 (3-4 digits), so
# pattern4 only picks up what pattern3 left behind.
patterns_dict = {
    "pattern1": r'(\d{4})(\d{2})(ND|ST|TH|nd|st|th)(ST|st),([A-Za-z]+)',
    "pattern2": r'(\d{4})(N|S|E|W|n|s|e|w)(\d{2})(ND|ST|TH|nd|st|th)(ST|st),([A-Za-z]+)',
    "pattern3": r'(\d{4})(AVE|ave)([A-Za-z]),([A-Za-z]+)',
    "pattern4": r'(\d{3,4})(AVE|ave)([A-Za-z]),([A-Za-z]+)',
    "pattern5": r'(\d{4})(N|S|E|W|n|s|e|w)(MAIN|main)(ST|st),([A-Za-z]+)',
    "pattern6": r'(\d{3,4})(N|S|E|W|n|s|e|w)([A-Za-z]+)(AVE|ave),([A-Za-z]+)',
    "pattern7": r'(\d{3,4})([A-Za-z]+)(AVE|ave),([A-Za-z]+)',
    "pattern8": r'(\d{4})(\d{2})(RD|ND|ST|TH|rd|nd|st|th)(AVE|ave|ST|st),([A-Za-z]+)',
    "pattern9": r'(\d{4})(\d{3})(RD|ND|ST|TH|rd|nd|st|th)(AVE|ave|ST|st),([A-Za-z]+)',
    "pattern10": r'(\d{4})(MAIN|main)(AVE|ave|ST|st),([A-Za-z]+)',
    "pattern11": r'(\d{3,4})(N|S|E|W|n|s|e|w)(AVE|ave)([A-Za-z]),([A-Za-z]+)',
    "pattern12": r'(\d{3,4})(N|S|E|W|n|s|e|w)?(HARVARD|URSULINE|ERSKINE|FISHER|STANFORD|BROADWAY)(ST|st)?,([A-Za-z]+)',
    "pattern13": r'(\d{4})(N|S|E|W|n|s|e|w)?([A-Za-z]+)(\d+),([A-Za-z]+)'
}

In [None]:
def process_addresses_with_patterns(dfc, patterns_dict, dfn=None):
    """
    Run every pattern in ``patterns_dict`` over the addresses, in dictionary order.

    Each pattern is handed to ``process_addresses``; rows it matches are moved
    out of the remaining pool, so later patterns only see still-unmatched rows.
    The per-pattern and total match counts are printed.

    Args:
    - dfc: DataFrame with the addresses still to be matched.
    - patterns_dict: Mapping of pattern name -> regex pattern.
    - dfn: Optional DataFrame of already matched rows to extend.

    Returns:
    - A tuple (remaining rows, matched rows, total number of matches).
    """
    remaining = dfc
    # Start from an empty frame when no previously matched rows were supplied
    collected = pd.DataFrame() if dfn is None else dfn

    counts = []
    for pattern_name, pattern_regex in patterns_dict.items():
        remaining, collected, hits = process_addresses(remaining, pattern_regex, collected)
        counts.append(hits)
        print(f'The number of matched indices for {pattern_name} is {hits}')

    total_matched_indices = sum(counts)
    print(f'The total number of matched indices so far are {total_matched_indices}')
    return remaining, collected, total_matched_indices

#### Apply the function to process the addresses

In [None]:
df1945_updated, dfn1945_updated, total_matched = process_addresses_with_patterns(df1945, patterns_dict)

The number of matched indices for pattern1 is 215
The number of matched indices for pattern2 is 8
The number of matched indices for pattern3 is 33
The number of matched indices for pattern4 is 6
The number of matched indices for pattern5 is 3
The number of matched indices for pattern6 is 23
The number of matched indices for pattern7 is 48
The number of matched indices for pattern8 is 14
The number of matched indices for pattern9 is 6
The number of matched indices for pattern10 is 5
The number of matched indices for pattern11 is 3
The number of matched indices for pattern12 is 12
The number of matched indices for pattern13 is 6
The total number of matched indices so far are 382


#### Expanding the short forms in the dataframe like Lub -> Lubbock, Bx -> Box and more

In [None]:
dfn1945_updated

Unnamed: 0,People_1945,document number_1945,PARCEL NUMBER_1945,ADDRESS_1945,ORIGINAL GRANTEE CITY OR TOWN_1945,DESIGNATE HOMESTEAD_1945,VALUE OF CITY PROPERTY_1945,VALUE OF PERSONAL PROPERTY_1945,STATE TAX_1945,COUNTY TAX_1945,Total Tax_1945,Zipcode_1945,2022 Assessed Value_1945,oldindex_1945,keywords_1945
0,Aoxue,,,"240722NDST,LUBBOCK",,,2700.0,,,,13.5,79411.0,153148,0,"2407, 22, ND, ST, LUBBOCK"
1,,,,"280828THST,LUBBOCK",,,1800.0,,,,9.0,79410.0,153602,1,"2808, 28, TH, ST, LUBBOCK"
2,,,,"231631STST,LUBBOCK",,,2200.0,,,,11.0,79411.0,96701,4,"2316, 31, ST, ST, LUBBOCK"
3,,,,"191327THST,LUBBOCK",,,1500.0,,,,7.5,79411.0,104699,5,"1913, 27, TH, ST, LUBBOCK"
4,,,,"210215THST,LUBBOCK",,,2000.0,,,,,79401.0,96597,6,"2102, 15, TH, ST, LUBBOCK"
5,,,,"230629THST,LUBBOCK",,,1750.0,,,,8.75,79411.0,80415,7,"2306, 29, TH, ST, LUBBOCK"
6,,,,"221260THST,LUBBOCK",,,1400.0,,,,7.0,79412.0,161252,8,"2212, 60, TH, ST, LUBBOCK"
7,,,,"290520THST,LUBBOCK",,,2650.0,,,,14.33,79410.0,302489,9,"2905, 20, TH, ST, LUBBOCK"
8,,,,"152327THST,LUBBOCK",,,1250.0,,,,6.25,79411.0,56683,10,"1523, 27, TH, ST, LUBBOCK"
9,,,,"201188THST,LUBBOCK",,,1400.0,,,,7.0,79423.0,189088,11,"2011, 88, TH, ST, LUBBOCK"


In [None]:
# Expand the whole-word abbreviations "lub" -> "LUBBOCK" and "Bx" -> "BOX"
# (case-insensitive) in the addresses that none of the regex patterns matched.
df1945_updated['ADDRESS_1945'] = df1945_updated['ADDRESS_1945'].str.replace(r'\blub\b', 'LUBBOCK', case=False, regex=True)
df1945_updated['ADDRESS_1945'] = df1945_updated['ADDRESS_1945'].str.replace(r'\bBx\b', 'BOX', case=False, regex=True)

#### Apply the `add_space_before_suffix()`

In [None]:
df1945_updated["ADDRESS_1945"] = df1945_updated["ADDRESS_1945"].apply(add_space_before_suffix)

#### Process Non-Empty addresses from 1945 dataframe

In [None]:
def process_nonempty_addresses(dfc, dfn=None):
    """
    Move every row with a usable 1945 address from ``dfc`` into ``dfn``.

    An address is usable when it is not missing and is not one of a few
    placeholder strings ('nan', lone quotes, a lone comma). Its keywords are
    the address tokens (split on commas and whitespace) joined with ', '.

    Args:
    - dfc: DataFrame with the addresses still to be processed ('ADDRESS_1945').
    - dfn: Optional DataFrame of already processed rows to append to.

    Returns:
    - A tuple (dfc without the processed rows, dfn with the processed rows
      appended). When no address is usable, ``dfc`` is returned unchanged
      together with ``dfn`` (or an empty DataFrame if ``dfn`` is None).
    """
    # Values that stand for "no address"
    placeholders = (None, 'nan', '"', '""', '"""', '""""', '" "', ',')

    # Filter out missing/placeholder addresses and work on a copy to avoid SettingWithCopyWarning
    valid_addresses = dfc['ADDRESS_1945'].apply(lambda x: x not in placeholders) & dfc['ADDRESS_1945'].notna()
    valid_rows = dfc[valid_addresses].copy()

    # If there are no valid addresses to process, just return the original dataframes
    if valid_rows.empty:
        return dfc, dfn if dfn is not None else pd.DataFrame()

    # Split each address on commas/whitespace and re-join the tokens with ', '
    valid_rows['keywords_1945'] = valid_rows['ADDRESS_1945'].apply(
        lambda address: ', '.join(str(address).replace(',', ' ').split())
    )

    # Keep an already-recorded original index. dfc's own index may have been
    # renumbered by earlier processing steps, so overwriting the column with it
    # would lose the link to the source rows. Only fill in what is missing.
    if 'oldindex_1945' not in valid_rows.columns:
        valid_rows['oldindex_1945'] = valid_rows.index
    else:
        missing = valid_rows['oldindex_1945'].isna().to_numpy()
        if missing.any():
            valid_rows.loc[missing, 'oldindex_1945'] = valid_rows.index[missing]

    # Prepare dfn (either append to it or create a new one)
    if dfn is not None:
        dfn_updated = pd.concat([dfn, valid_rows], ignore_index=True)
    else:
        dfn_updated = valid_rows

    # Remove processed rows from the original dfc
    dfc_updated = dfc.drop(valid_rows.index).reset_index(drop=True)

    return dfc_updated, dfn_updated

#### Apply the function that processes the non-empty addresses

In [None]:
df1945_updated, dfn1945_updated = process_nonempty_addresses(df1945_updated, dfn1945_updated)

#### The abbreviations and their full forms

In [None]:
# Whole-word abbreviation patterns mapped to the full form to add as an extra keyword
abbreviations = {
    r'\bbwy\b': 'BROADWAY',
    r'\brt\b': 'ROUTE',
    r'\bave\b': 'AVENUE',
    r'\bbld\b': 'BUILDING'
}

# Append the full form right after each abbreviation (case-insensitive match), keeping
# the abbreviation itself, e.g. "AVE" -> "AVE, AVENUE".
# NOTE: not idempotent — running this cell again matches the kept abbreviation once
# more and appends the full form a second time.
for abbr, full in abbreviations.items():
    dfn1945_updated['keywords_1945'] = dfn1945_updated['keywords_1945'].str.replace(abbr, lambda m: f"{m.group(0)}, {full}", flags=re.IGNORECASE, regex=True)

#### Remove the addresses that don't have any alphabet or number in them

In [None]:
df1945 = dfn1945_updated

In [None]:
df1945 = remove_rows_without_alphabets_or_numbers(df1945, 'ADDRESS_1945')

#### Split the addresses

In [None]:
# Apply the function to the DataFrame
df1945 = split_keywords_vectorized(df1945, 'keywords_1945')

#### Rename column

In [None]:
df1945 = df1945.copy()
df1945.rename(columns={"numerical_keywords": "NK_1945", "alphabetical_short_keywords": "ASK_1945","alphabetical_long_keywords": "ALK_1945" }, inplace=True)

#### Delete a column

In [None]:
df1945.drop('oldindex_1945', axis=1, inplace=True)


## Preprocessing done df1945
---



## Pre-Process 1975

#### Remove the addresses that don't have any alphabet or number in them

In [None]:
df1975 = remove_rows_without_alphabets_or_numbers(df1975, 'ADDRESS_1975')

#### Add spacing before the suffix

In [None]:
df1975["ADDRESS_1975"] = df1975["ADDRESS_1975"].apply(add_space_before_suffix)

#### Add a keywords column to 1975 dataframe

In [None]:
def process_address_and_city(addresses, cities):
    """
    Combine address and city/town text into one keyword string per row.

    Both inputs are split on spaces, commas and periods; the resulting tokens
    (address tokens first, then city tokens) are joined with ',' after empty
    tokens are discarded.

    Args:
    - addresses: Series of address strings.
    - cities: Series of city/town strings, aligned with ``addresses``.

    Returns:
    - Series of comma-separated keyword strings.
    """
    # Treat missing addresses like missing cities: they contribute no tokens.
    # Without this, a NaN address makes the combination/join below fail.
    split_addresses = addresses.fillna('').astype(str).str.split('[ ,.]')
    split_cities = cities.fillna('').astype(str).str.split('[ ,.]')

    # Element-wise list concatenation of the two token lists
    combined = split_addresses + split_cities

    # Remove empty strings from lists and join the elements into comma-separated strings
    return combined.apply(lambda tokens: ','.join(filter(None, tokens)))

# Build 'keywords_1975' from the 1975 address tokens followed by the city/town tokens
df1975['keywords_1975'] = process_address_and_city(df1975['ADDRESS_1975'], df1975['ORIGINAL GRANTEE CITY OR TOWN_1975'])

#### Split the addresses

In [None]:
df1975 = split_keywords_vectorized(df1975, 'keywords_1975')

#### Rename the column

In [None]:
df1975 = df1975.copy()
df1975.rename(columns={"numerical_keywords": "NK_1975", "alphabetical_short_keywords": "ASK_1975","alphabetical_long_keywords": "ALK_1975" }, inplace=True)


## Preprocessing done df1975
---



## Pre-Process 1985

#### Remove the addresses that don't have any alphabet or number in them

In [None]:
df1985 = remove_rows_without_alphabets_or_numbers(df1985, 'ADDRESS_1985')

#### Convert the zip codes to their respective city names

In [None]:
# Distinct ZIP codes that occur in the 1985 data (may include a missing value)
zip_codes = list(df1985["zip code_1985"].unique())

# Initialize the uszipcode SearchEngine used for the ZIP -> city lookups below
search = SearchEngine()

def zip_to_city_dict(zip_codes):
    """
    Map each ZIP code to its major city name using the module-level ``search`` engine.

    Args:
    - zip_codes: Iterable of ZIP codes (ints, strings, or missing values).

    Returns:
    - dict keyed by the ZIP codes exactly as given, with the city name as value;
      'Unknown' when the code is missing or the lookup finds nothing.
    """
    zip_city_dict = {}
    for zip_code in zip_codes:
        # Missing values arrive as None, as a real NaN (what `Series.unique()`
        # yields for a numeric column) or as the string 'nan'; none of them
        # can be looked up.
        if zip_code is None or pd.isna(zip_code) or zip_code == 'nan':
            zip_city_dict[zip_code] = 'Unknown'
            continue

        # A ZIP column containing NaN is read as float (e.g. 79404.0); look it
        # up as the integer it represents, but keep the original value as key
        # so the dictionary still maps the column's own values.
        lookup_key = zip_code
        if isinstance(zip_code, float) and zip_code.is_integer():
            lookup_key = int(zip_code)

        result = search.by_zipcode(lookup_key)
        zip_city_dict[zip_code] = result.major_city if result else 'Unknown'
    return zip_city_dict

# Build the ZIP -> city lookup once for the distinct ZIP codes; it is applied to the
# whole 'zip code_1985' column when the 1985 keywords are built
zip_city_dict = zip_to_city_dict(zip_codes)

Download /root/.uszipcode/simple_db.sqlite from https://github.com/MacHu-GWU/uszipcode-project/releases/download/1.0.1.db/simple_db.sqlite ...
  1.00 MB downloaded ...
  2.00 MB downloaded ...
  3.00 MB downloaded ...
  4.00 MB downloaded ...
  5.00 MB downloaded ...
  6.00 MB downloaded ...
  7.00 MB downloaded ...
  8.00 MB downloaded ...
  9.00 MB downloaded ...
  10.00 MB downloaded ...
  11.00 MB downloaded ...
  Complete!


#### Add space before suffix

In [None]:
df1985["ADDRESS_1985"] = df1985["ADDRESS_1985"].apply(add_space_before_suffix)

#### Add keywords column to 1985 data frame

In [None]:
def process_addresses_and_zip_codes(addresses, zip_codes, zip_city_dict):
    """
    Build one keyword string per row from the address tokens plus the city of its ZIP code.

    Args:
    - addresses: Series of address strings.
    - zip_codes: Series of ZIP codes aligned with ``addresses``.
    - zip_city_dict: Mapping of ZIP code -> city name; codes that are absent
      (or map to a missing value) become 'Unknown'.

    Returns:
    - Series of keyword strings: the non-empty address tokens followed by the
      city name, joined with ', '.
    """
    # Tokenise on space, comma and period
    address_tokens = addresses.str.split('[ ,.]')

    # Look up the city for every ZIP code; anything unmapped becomes 'Unknown'
    cities = zip_codes.map(zip_city_dict).fillna('Unknown')

    # Wrap each city in a one-element list so it can be appended to the token list
    tokens_with_city = address_tokens + cities.apply(lambda city: [city])

    # Drop empty tokens and flatten each list into a single string
    return tokens_with_city.apply(lambda tokens: ', '.join(token for token in tokens if token))

# Build 'keywords_1985' from the 1985 address tokens plus the city looked up from each row's ZIP code
df1985['keywords_1985'] = process_addresses_and_zip_codes(df1985['ADDRESS_1985'], df1985['zip code_1985'], zip_city_dict)

#### Splitting keywords

In [None]:
df1985 = split_keywords_vectorized(df1985, 'keywords_1985')

#### Rename the column

In [None]:
df1985 = df1985.copy()
df1985.rename(columns={"numerical_keywords": "NK_1985", "alphabetical_short_keywords": "ASK_1985","alphabetical_long_keywords": "ALK_1985" }, inplace=True)


## Preprocessing done df1985
---



## Check all the preprocessed dataframes

In [None]:
df1945.head()

Unnamed: 0,People_1945,document number_1945,PARCEL NUMBER_1945,ADDRESS_1945,ORIGINAL GRANTEE CITY OR TOWN_1945,DESIGNATE HOMESTEAD_1945,VALUE OF CITY PROPERTY_1945,VALUE OF PERSONAL PROPERTY_1945,STATE TAX_1945,COUNTY TAX_1945,Total Tax_1945,Zipcode_1945,2022 Assessed Value_1945,keywords_1945,NK_1945,ASK_1945,ALK_1945
0,Aoxue,,,"240722NDST,LUBBOCK",,,2700,,,,13.5,79411.0,153148,"2407, 22, ND, ST, LUBBOCK","2407, 22","ND, ST",LUBBOCK
1,,,,"280828THST,LUBBOCK",,,1800,,,,9.0,79410.0,153602,"2808, 28, TH, ST, LUBBOCK","2808, 28","TH, ST",LUBBOCK
2,,,,"231631STST,LUBBOCK",,,2200,,,,11.0,79411.0,96701,"2316, 31, ST, ST, LUBBOCK","2316, 31","ST, ST",LUBBOCK
3,,,,"191327THST,LUBBOCK",,,1500,,,,7.5,79411.0,104699,"1913, 27, TH, ST, LUBBOCK","1913, 27","TH, ST",LUBBOCK
4,,,,"210215THST,LUBBOCK",,,2000,,,,,79401.0,96597,"2102, 15, TH, ST, LUBBOCK","2102, 15","TH, ST",LUBBOCK


In [None]:
df1975.head()

Unnamed: 0,People_1975,document number_1975,ADDRESS_1975,ORIGINAL GRANTEE CITY OR TOWN_1975,ACERES RENDERED_1975,VALUE DOLLARS (STATE VALUE)_1975,DESIGNATE HOMESTEAD_1975,VALUE OF CITY PROPERTY (TOTAL COUNTY VALUE)_1975,VALUE OF PERSONAL PROPERTY_1975,TOTAL VALUE FOR CONTY TAX_1975,STATE TAX_1975,COUNTY TAX_1975,DISTRICT SCHOOL_1975,TAX TOTAL (INCLUDING HOSPITAL AND WATER TAXES)_1975,Unnamed: 15_1975,Unnamed: 25_1975,keywords_1975,NK_1975,ASK_1975,ALK_1975
0,Ainur,ScanPro1022,1917 10 TH ST,LUBBOCK,,,2340,340,,,0,0.0,,0,,,"1917,10,TH,ST,LUBBOCK","1917, 10","TH, ST",LUBBOCK
1,,,2820 62 ND ST,LUBBOCK,,750.0,3000,3750,,,90,585.0,,1276,,,"2820,62,ND,ST,LUBBOCK","2820, 62","ND, ST",LUBBOCK
2,,,3110 39 TH ST,LUBBOCK,,4650.0,3000,7650,,,558,3627.0,,7906,,,"3110,39,TH,ST,LUBBOCK","3110, 39","TH, ST",LUBBOCK
3,,,2704 B COLGATE,LUBBOCK,,,520,520,,,0,0.0,,0,,,"2704,B,COLGATE,LUBBOCK",2704,B,"COLGATE, LUBBOCK"
4,,,3412 29 TH ST,LUBBOCK,,610.0,3000,3610,,,73,476.0,,1036,,,"3412,29,TH,ST,LUBBOCK","3412, 29","TH, ST",LUBBOCK


In [None]:
df1985.head()

Unnamed: 0,People_1985,Folder Name_1985,document number_1985,ADDRESS_1985,zip code_1985,Area_1985,Roll Seq_1985,LAND _1985,BLD-VAL_1985,SPEC_FEAT_1985,...,LBB SCHOOL_1985,LBB CITY_1985,L C H D_1985,COUNTY.1_1985,GROSS TAX_1985,Unnamed: 20_1985,keywords_1985,NK_1985,ASK_1985,ALK_1985
0,,,none,0402 40 TH ST,79404,,,2600,,,...,104.44,67.38,12.98,18.79,204.35,,"0402, 40, TH, ST, Lubbock","0402, 40","TH, ST",Lubbock
1,,,,407 38 TH ST,79404,205.0,1505 AVE D BLK 1 L 4,2580,10580.0,100.0,...,76.82,79.56,15.32,22.16,194.79,,"407, 38, TH, ST, Lubbock","407, 38","TH, ST",Lubbock
2,,,,0501 35 TH ST,79404,206.0,3059,1250,7790.0,,...,37.57,54.24,10.44,15.11,117.99,,"0501, 35, TH, ST, Lubbock","0501, 35","TH, ST",Lubbock
3,,,,0507 35 TH ST,79404,206.0,3062,1340,11830.0,100.0,...,76.91,79.62,15.33,22.17,194.96,,"0507, 35, TH, ST, Lubbock","0507, 35","TH, ST",Lubbock
4,,,,515 E 35 TH ST,79404,206.0,3066,1190,10910.0,500.0,...,117.18,75.6,14.55,21.05,229.26,,"515, E, 35, TH, ST, Lubbock","515, 35","E, TH, ST",Lubbock


## Matching Try 01

In [None]:
df1945[["keywords_1945",	"NK_1945",	"ASK_1945",	"ALK_1945"]].head()

Unnamed: 0,keywords_1945,NK_1945,ASK_1945,ALK_1945
0,"2407, 22, ND, ST, LUBBOCK","2407, 22","ND, ST",LUBBOCK
1,"2808, 28, TH, ST, LUBBOCK","2808, 28","TH, ST",LUBBOCK
2,"2316, 31, ST, ST, LUBBOCK","2316, 31","ST, ST",LUBBOCK
3,"1913, 27, TH, ST, LUBBOCK","1913, 27","TH, ST",LUBBOCK
4,"2102, 15, TH, ST, LUBBOCK","2102, 15","TH, ST",LUBBOCK


In [None]:
df1975[["keywords_1975",	"NK_1975",	"ASK_1975",	"ALK_1975"]].head()

Unnamed: 0,keywords_1975,NK_1975,ASK_1975,ALK_1975
0,"1917,10,TH,ST,LUBBOCK","1917, 10","TH, ST",LUBBOCK
1,"2820,62,ND,ST,LUBBOCK","2820, 62","ND, ST",LUBBOCK
2,"3110,39,TH,ST,LUBBOCK","3110, 39","TH, ST",LUBBOCK
3,"2704,B,COLGATE,LUBBOCK",2704,B,"COLGATE, LUBBOCK"
4,"3412,29,TH,ST,LUBBOCK","3412, 29","TH, ST",LUBBOCK


In [None]:
df1985[["keywords_1985",	"NK_1985",	"ASK_1985",	"ALK_1985"]].head()

Unnamed: 0,keywords_1985,NK_1985,ASK_1985,ALK_1985
0,"0402, 40, TH, ST, Lubbock","0402, 40","TH, ST",Lubbock
1,"407, 38, TH, ST, Lubbock","407, 38","TH, ST",Lubbock
2,"0501, 35, TH, ST, Lubbock","0501, 35","TH, ST",Lubbock
3,"0507, 35, TH, ST, Lubbock","0507, 35","TH, ST",Lubbock
4,"515, E, 35, TH, ST, Lubbock","515, 35","E, TH, ST",Lubbock


In [None]:
import pandas as pd

def _keyword_set(value, lowercase=False):
    """Return the distinct non-empty tokens of a ', '-separated keyword string."""
    if not isinstance(value, str):
        # Missing keywords (NaN/None) contribute no tokens
        return set()
    if lowercase:
        value = value.lower()
    return {token for token in value.split(', ') if token}


def prepare_vectorized_matching(df_base, df_targets, base_year, target_years):
    """
    Match base-year addresses against target-year addresses by their keywords.

    A target row matches a base row when both conditions hold:
    - NK: the rows share at least 2 numerical keywords if the base row has two
      or more of them, otherwise at least 1. A row without numerical keywords
      never matches.
    - ALK: at least one long alphabetical keyword of the base row occurs
      (case-insensitively, as a substring) in the target row's ALK string.
      A base row without long keywords never matches.

    One line is printed for every base row that has at least one match.
    The input DataFrames are not modified.

    :param df_base: DataFrame containing base addresses and keywords
        (columns ``NK_<base_year>`` and ``ALK_<base_year>``).
    :param df_targets: List of DataFrames for comparison
        (columns ``NK_<year>`` and ``ALK_<year>``).
    :param base_year: The base year for df_base.
    :param target_years: List of years corresponding to each DataFrame in df_targets.
    :return: DataFrame with one row per (base row, target year) that has matches
        and the columns Base_Index, Base_Year, Target_Year, Matched_Indices,
        Match_Count. Empty when nothing matched.
    """
    # Container for match results
    match_results = []

    # Tokenise the base keywords once; they are reused for every target frame
    base_rows = list(zip(
        df_base.index,
        (_keyword_set(value) for value in df_base[f'NK_{base_year}']),
        (_keyword_set(value, lowercase=True) for value in df_base[f'ALK_{base_year}']),
    ))

    # Iterate through target DataFrames for comparison
    for df_target, year in zip(df_targets, target_years):
        target_rows = list(zip(
            df_target.index,
            (_keyword_set(value) for value in df_target[f'NK_{year}']),
            (value.lower() if isinstance(value, str) else '' for value in df_target[f'ALK_{year}']),
        ))

        for index, base_nk, base_alk in base_rows:
            # Without numbers or without long keywords there is nothing to match on
            if not base_nk or not base_alk:
                continue

            # NK match criteria based on the base NK count
            required_overlap = 2 if len(base_nk) >= 2 else 1

            # Target rows (in target order) that satisfy both the NK and the ALK criteria
            final_match_indices = [
                target_index
                for target_index, target_nk, target_alk in target_rows
                if len(base_nk & target_nk) >= required_overlap
                and any(keyword in target_alk for keyword in base_alk)
            ]

            if final_match_indices:
                match_info = {
                    'Base_Index': index,
                    'Base_Year': base_year,
                    'Target_Year': year,
                    'Matched_Indices': final_match_indices,
                    'Match_Count': len(final_match_indices)
                }
                match_results.append(match_info)
                print(f"Match found: Base Year {base_year}, Target Year {year}, Base Index {index}, Match Count: {len(final_match_indices)}")

    return pd.DataFrame(match_results)


In [None]:
# Match the 1945 addresses (base) against the 1975 and 1985 addresses (targets).
# Note: df_base is the same object as df1945 and the list holds df1975/df1985
# themselves — no copies are made here.
df_base = df1945  # Example base DataFrame
df_targets = [df1975, df1985]  # List of target comparison DataFrames
base_year = 1945
target_years = [1975, 1985]  # must be in the same order as df_targets

match_results_df = prepare_vectorized_matching(df_base, df_targets, base_year, target_years)

In [None]:
match_results_df.head()

In [None]:
# NOTE(review): `matches_df` and the columns 'Index_1945', 'Year', 'Matched_Index' are not
# defined anywhere in the visible notebook (the matching cell above produces
# `match_results_df` with Base_Index/Base_Year/Target_Year/Matched_Indices/Match_Count) —
# presumably left over from an earlier version; confirm before running this cell.
matches_df.set_index(['Index_1945', 'Year', 'Matched_Index'], inplace=True)

In [None]:
writ