#### Prerequisites

In [None]:
import pandas as pd
import numpy as np
import re
# Remove the row cap on DataFrame display: every frame shown below is rendered
# in full, so prefer .head()/.tail() on large frames.
pd.set_option('display.max_rows', None)

In [None]:
# Load the three yearly extracts (1945, 1975, 1985) in one pass.
# low_memory=False makes pandas read each file in a single chunk so that
# column dtypes are inferred from the whole column.
df1945, df1975, df1985 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '/content/Lubbock - 1945 - 1945.csv',
        '/content/Lubbock - 1975 - 1975.csv',
        '/content/Lubbock - 1985 -  1985.csv',
    )
)

In [None]:
# Discard columns that hold no data at all (every value NaN); rows are untouched.
dfc1945, dfc1975, dfc1985 = (
    raw_frame.dropna(axis=1, how='all')
    for raw_frame in (df1945, df1975, df1985)
)

In [None]:
# Rebind each cleaned frame to an independent copy whose 'ADDRESS' column is
# text. Note that astype(str) turns missing values into the literal string
# 'nan'; later steps filter on that string.
dfc1945, dfc1975, dfc1985 = (
    frame.assign(ADDRESS=frame['ADDRESS'].astype(str))
    for frame in (dfc1945, dfc1975, dfc1985)
)

## Treatment of 1945 dataframe

#### Step 1: Add a New Column in dfc1945 for the Index

In [None]:
# Record each row's original index label in an 'oldindex' column so the row
# can still be traced after later steps reset the index.
dfc1945 = dfc1945.assign(oldindex=dfc1945.index)

#### Step 2: Function that processes the addresses

In [None]:
def process_addresses(dfc, pattern, dfn=None):
    """
    Move rows whose ADDRESS matches a regex out of one DataFrame into another.

    Every row of `dfc` is tested with `pattern.match`, so the pattern is
    anchored at the start of the address but NOT at the end (trailing text is
    allowed). Each matching row is copied to the output with two extra fields:

    - 'keywords': the non-empty captured groups, stripped and joined by ', '.
    - 'oldindex': the row's existing 'oldindex' value when `dfc` already has
      that column; otherwise the row's index label in `dfc`. An existing value
      is never overwritten, because `dfc` is re-indexed on every call and the
      current index label stops identifying the original row after the first
      call.

    Args:
    - dfc: DataFrame with an 'ADDRESS' column. It is not modified.
    - pattern: Regex pattern, either a string or a compiled pattern object.
    - dfn: Optional DataFrame of rows matched by earlier calls; new matches are
      appended after its rows. If None, a new DataFrame is created.

    Returns:
    - A 3-tuple (dfc_updated, dfn_updated, n_matched):
      dfc_updated is `dfc` without the matched rows and with a fresh 0..n-1
      index; dfn_updated holds the previous rows of `dfn` followed by the new
      matches; n_matched is the number of rows matched by this call.
    """
    # Accept both raw pattern strings and pre-compiled patterns.
    if isinstance(pattern, str):
        pattern = re.compile(pattern)

    # Accumulate plain dict records and build the DataFrame once at the end.
    dfn_rows = [] if dfn is None else dfn.to_dict('records')

    has_oldindex = 'oldindex' in dfc.columns
    matched_indices = []  # index labels of rows to remove from dfc

    for index, row in dfc.iterrows():
        address = row['ADDRESS']
        # Missing or non-text addresses cannot match; skip them instead of
        # letting pattern.match raise TypeError.
        if not isinstance(address, str):
            continue

        match = pattern.match(address)
        if not match:
            continue

        new_row = row.to_dict()
        new_row['keywords'] = ', '.join(
            group.strip() for group in match.groups() if group and group.strip()
        )
        if not has_oldindex:
            # No provenance column yet: the current label is the best id we have.
            new_row['oldindex'] = index

        dfn_rows.append(new_row)
        matched_indices.append(index)

    dfn_updated = pd.DataFrame(dfn_rows)

    # Drop the matched rows and renumber what is left.
    dfc_updated = dfc.drop(matched_indices).reset_index(drop=True)

    return dfc_updated, dfn_updated, len(matched_indices)

#### Step 3: Identify Patterns

In [None]:
# Expand two abbreviations, case-insensitively and only as whole words:
# 'lub' -> 'LUBBOCK', then 'Bx' -> 'BOX'.
dfc1945['ADDRESS'] = (
    dfc1945['ADDRESS']
    .str.replace(r'\blub\b', 'LUBBOCK', case=False, regex=True)
    .str.replace(r'\bBx\b', 'BOX', case=False, regex=True)
)

In [None]:
# Running total of rows matched by all pattern cells below. Re-run this cell
# before re-running the pattern cells, otherwise the total double-counts.
# NOTE(review): the name is misspelled ("lenght", "mathced"); it is kept as-is
# because every pattern cell below references it.
lenght_of_total_mathced_indices = 0

In [None]:
# pattern1: 4 digits, then a 2-digit number with an ordinal suffix (ND/ST/TH;
# RD is not in the alternation), then ST, a comma and a run of letters.
pattern1 = r'(\d{4})(\d{2})(ND|ST|TH|nd|st|th)(ST|st),([A-Za-z]+)'

# First pass: starts from the full dfc1945 and creates the results frame
# (no dfn argument is passed).
dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945, pattern1)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              215
The total number of matched indices so far are 215


In [None]:
# pattern2: same shape as pattern1, with a single direction letter (N/S/E/W)
# between the 4-digit number and the ordinal.
pattern2 = r'(\d{4})(N|S|E|W|n|s|e|w)(\d{2})(ND|ST|TH|nd|st|th)(ST|st),([A-Za-z]+)'

# From here on each pass consumes the leftovers of the previous pass and
# appends its matches to the accumulated results frame.
dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern2, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              8
The total number of matched indices so far are 223


In [None]:
# pattern3: 4 digits, AVE, one letter, a comma and a run of letters.
pattern3 = r'(\d{4})(AVE|ave)([A-Za-z]),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern3, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              33
The total number of matched indices so far are 256


In [None]:
# pattern4: pattern3 with 3 or 4 leading digits. Addresses with 4 leading
# digits were already removed by pattern3, so this pass effectively adds the
# 3-digit case.
pattern4 = r'(\d{3,4})(AVE|ave)([A-Za-z]),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern4, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              6
The total number of matched indices so far are 262


In [None]:
# pattern5: 4 digits, a direction letter (N/S/E/W), MAIN, ST, a comma and a
# run of letters.
pattern5 = r'(\d{4})(N|S|E|W|n|s|e|w)(MAIN|main)(ST|st),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern5, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              3
The total number of matched indices so far are 265


In [None]:
# pattern6: 3-4 digits, a direction letter (N/S/E/W), a run of letters, AVE,
# a comma and a run of letters.
# NOTE(review): the direction group is mandatory here, so a street name that
# merely starts with N/S/E/W has its first letter captured as the direction.
pattern6 = r'(\d{3,4})(N|S|E|W|n|s|e|w)([A-Za-z]+)(AVE|ave),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern6, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              23
The total number of matched indices so far are 288


In [None]:
# pattern7: 3-4 digits, a run of letters, AVE, a comma and a run of letters
# (pattern6 without the direction letter).
pattern7 = r'(\d{3,4})([A-Za-z]+)(AVE|ave),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern7, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              48
The total number of matched indices so far are 336


In [None]:
# pattern8: 4 digits, a 2-digit number with an ordinal suffix (RD included
# this time), then AVE or ST, a comma and a run of letters. Covers the RD and
# AVE variants that pattern1 does not accept.
pattern8 = r'(\d{4})(\d{2})(RD|ND|ST|TH|rd|nd|st|th)(AVE|ave|ST|st),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern8, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              14
The total number of matched indices so far are 350


In [None]:
# pattern9: pattern8 with a 3-digit (instead of 2-digit) ordinal number.
pattern9 = r'(\d{4})(\d{3})(RD|ND|ST|TH|rd|nd|st|th)(AVE|ave|ST|st),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern9, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              6
The total number of matched indices so far are 356


In [None]:
# pattern10: 4 digits, MAIN, then AVE or ST, a comma and a run of letters
# (pattern5 without the direction letter, and also accepting AVE).
pattern10 = r'(\d{4})(MAIN|main)(AVE|ave|ST|st),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern10, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              5
The total number of matched indices so far are 361


In [None]:
# pattern11: 3-4 digits, a direction letter (N/S/E/W), AVE, one letter, a
# comma and a run of letters (pattern4 with a direction letter).
pattern11 = r'(\d{3,4})(N|S|E|W|n|s|e|w)(AVE|ave)([A-Za-z]),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern11, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              3
The total number of matched indices so far are 364


In [None]:
# pattern12: 3-4 digits, an optional direction letter, one of six street names
# spelled out in upper case only, an optional ST, a comma and a run of letters.
pattern12 = r'(\d{3,4})(N|S|E|W|n|s|e|w)?(HARVARD|URSULINE|ERSKINE|FISHER|STANFORD|BROADWAY)(ST|st)?,([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern12, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              12
The total number of matched indices so far are 376


In [None]:
# 4 digits, a run of letters, DR, a comma and an optional run of letters.
# NOTE(review): this rebinds the name `pattern12`, which an earlier cell uses
# for the named-street pattern; after this cell the earlier regex is no longer
# reachable by name. Consider giving this pattern its own name.
pattern12 = r'(\d{4})([A-Za-z]+)(DR|dr|Dr),([A-Za-z]+)?'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern12, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              3
The total number of matched indices so far are 379


In [None]:
# pattern13: 4 digits, an optional direction letter, a run of letters followed
# by a run of digits, a comma and a run of letters. The recorded output of the
# inspection cell below shows it matching addresses such as
# '1420NCR1300,LUBBOCK' and '2419FM597,ABERNATHY'.
pattern13 = r'(\d{4})(N|S|E|W|n|s|e|w)?([A-Za-z]+)(\d+),([A-Za-z]+)'

dfc1945_updated, dfcn1945_updated, number_of_matched_indices = process_addresses(dfc1945_updated, pattern13, dfcn1945_updated)
lenght_of_total_mathced_indices += number_of_matched_indices

print(f'The number of mathced indices are              {number_of_matched_indices}')
print(f'The total number of matched indices so far are {lenght_of_total_mathced_indices}')

The number of mathced indices are              6
The total number of matched indices so far are 385


In [None]:
# Preview up to the first 100 rows that none of the patterns above has claimed,
# to spot address shapes that still need a pattern.
dfc1945_updated.head(100)

Unnamed: 0,People,document number,PARCEL NUMBER,ADDRESS,ORIGINAL GRANTEE CITY OR TOWN,DESIGNATE HOMESTEAD,VALUE OF CITY PROPERTY,VALUE OF PERSONAL PROPERTY,STATE TAX,COUNTY TAX,Total Tax,Zipcode,2022 Assessed Value,oldindex
0,,,,220814THST,,,2000.0,,,,10.37,79401.0,7540.0,43
1,,,,IDALOU,,,20.0,,,,,,0.0,46
2,,,,"202282NDST#101,LUBBOCK",,,1600.0,,,,8.0,79423.0,18608.0,56
3,,,,"1907E1STPL,LUBBOCK",,,,,,,,79403.0,87955.0,64
4,,,,,,,,,,,,,,107
5,,,,"812MAINST,LUBBOCK",,,2400.0,,,,,79401.0,147780.0,109
6,,,,"1910RESEARCHBLVD,LUBBOCK",,,150.0,,,,0.75,79407.0,340465.0,115
7,,,,"21179THST,LUBBOCK",,,600.0,,,,3.0,79401.0,296825.0,144
8,,,,"1210S6THST,SLATON",,,,,,,30.5,79364.0,750.0,148
9,,,,"14098THST,SHALLOWATER",,,450.0,,,,2.25,79363.0,271861.0,153


In [None]:
# Show only the rows appended by the most recent pattern cell: new matches are
# appended at the end of the results frame, so they start at position
# (running total - last match count). Assumes the running total equals the
# number of rows in dfcn1945_updated, i.e. each pattern cell ran exactly once.
dfcn1945_updated[lenght_of_total_mathced_indices-number_of_matched_indices:]

Unnamed: 0,People,document number,PARCEL NUMBER,ADDRESS,ORIGINAL GRANTEE CITY OR TOWN,DESIGNATE HOMESTEAD,VALUE OF CITY PROPERTY,VALUE OF PERSONAL PROPERTY,STATE TAX,COUNTY TAX,Total Tax,Zipcode,2022 Assessed Value,oldindex,keywords
379,,,,"1420NCR1300,LUBBOCK",,,1100,,,,,79416.0,52861,3,"1420, N, CR, 1300, LUBBOCK"
380,,,,"1631CR7340,LUBBOCK",,,600,,,,3.0,79423.0,293997,8,"1631, CR, 7340, LUBBOCK"
381,,,,"1711ECR5400,LUBBOCK",,,800,,,,,79403.0,176208,15,"1711, E, CR, 5400, LUBBOCK"
382,,,,"1711ECR5400,LUBBOCK",,,80,,,,0.98,79403.0,176208,16,"1711, E, CR, 5400, LUBBOCK"
383,,,,"2007EFM1294,LUBBOCK",,,1300,,,,6.5,79403.0,284059,31,"2007, E, FM, 1294, LUBBOCK"
384,,,,"2419FM597,ABERNATHY",,,1700,,,,,79311.0,335728,33,"2419, FM, 597, ABERNATHY"


#### Step 4: Adjust and split addresses of a specific kind

In [None]:
def add_space_before_suffix(address):
    """
    Insert a space between a run of digits and the suffix that follows it.

    Wherever one or more digits are immediately followed by RD, ND, ST or TH
    (all upper case or all lower case), a single space is placed between the
    digits and the suffix. Text without such a sequence is returned unchanged.

    Args:
    - address: the address string to rewrite.

    Returns:
    - The rewritten address string.
    """
    # Group 1 is the digit run, group 2 the suffix; they are re-emitted with a
    # space in between.
    number_then_suffix = re.compile(r'(\d+)(RD|ND|ST|TH|rd|nd|st|th)')
    return number_then_suffix.sub(r'\1 \2', address)

In [None]:
# Rewrite the leftover addresses in place, separating each digit run from a
# following RD/ND/ST/TH so that the whitespace split in the next step yields
# them as separate keywords.
dfc1945_updated["ADDRESS"] = dfc1945_updated["ADDRESS"].apply(add_space_before_suffix)

In [None]:
def process_nonempty_addresses_alternative(dfc, dfn=None):
    """
    Move every row with a usable ADDRESS from `dfc` into `dfn`.

    An address is usable when it is not missing (NaN/None) and is not one of
    the placeholder strings 'nan', '"' or '""'. Each usable row is copied to
    the output with two extra fields:

    - 'keywords': the address split on commas and whitespace, with the pieces
      re-joined by ', '.
    - 'oldindex': the row's existing 'oldindex' value when `dfc` already has
      that column; otherwise the row's index label in `dfc`. An existing value
      is never overwritten, because `dfc` may have been re-indexed since the
      column was recorded.

    Args:
    - dfc: DataFrame with an 'ADDRESS' column. It is not modified.
    - dfn: Optional DataFrame of previously processed rows; the usable rows
      are appended after its rows. If None, a new DataFrame is created.

    Returns:
    - A 2-tuple (dfc_updated, dfn_updated). dfc_updated is `dfc` without the
      usable rows, re-indexed from 0. When no row is usable, `dfc` itself is
      returned together with `dfn` (or an empty DataFrame if `dfn` is None).
    """
    address = dfc['ADDRESS']
    # notna() rejects NaN/None; isin() rejects the textual placeholders.
    is_valid = address.notna() & ~address.isin(['nan', '"', '""'])

    # Nothing to process: hand the inputs back unchanged.
    if not is_valid.any():
        return dfc, dfn if dfn is not None else pd.DataFrame()

    # Work on an explicit copy so the assignments below neither touch `dfc`
    # nor trigger SettingWithCopyWarning.
    valid_rows = dfc.loc[is_valid].copy()

    # Commas become spaces, the text is split on whitespace, and the pieces
    # are re-joined with ', '.
    valid_rows['keywords'] = (
        valid_rows['ADDRESS']
        .str.replace(',', ' ', regex=False)
        .str.split()
        .str.join(', ')
    )

    if 'oldindex' not in valid_rows.columns:
        # No provenance column yet: the current label is the best id we have.
        valid_rows['oldindex'] = valid_rows.index

    if dfn is not None:
        dfn_updated = pd.concat([dfn, valid_rows], ignore_index=True)
    else:
        dfn_updated = valid_rows

    # Drop the processed rows and renumber what is left.
    dfc_updated = dfc.drop(valid_rows.index).reset_index(drop=True)

    return dfc_updated, dfn_updated

# Requires dfc1945_updated and dfcn1945_updated from the Step 3 cells.
# Moves every leftover row that still has a usable address into the results
# frame; what remains in dfc1945_updated are the rows rejected by the filter.
dfc1945_updated, dfcn1945_updated = process_nonempty_addresses_alternative(dfc1945_updated, dfcn1945_updated)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_rows['keywords'] = valid_rows['ADDRESS'].apply(lambda address: ', '.join(address.replace(',', ' ').split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_rows['oldindex'] = valid_rows.index


In [None]:
# Display the leftover frame (rendered in full, since display.max_rows is None).
dfc1945_updated

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Display the accumulated results frame (rendered in full, since
# display.max_rows is None).
dfcn1945_updated

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# The rows still left are those whose ADDRESS was rejected as missing or a
# placeholder. Plain assignment: this is a second name for the same DataFrame
# object, not a copy.
empty_addresses_1945 = dfc1945_updated