In [None]:
# Import the required modules/libraries
import re
import ast # This is useful to convert the string representation of lists to actual lists
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from matplotlib.gridspec import GridSpec
import textwrap
from functools import reduce
import time
import os
import json



# Mount the google drive
# (Colab-only step: exposes the Drive contents under /content/drive so the
# CSV paths used by the later cells resolve.)
from google.colab import drive
drive.mount('/content/drive')

# Ignore warnings
# NOTE(review): this filter hides every warning for the rest of the session,
# including pandas deprecation / FutureWarning messages — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Lift every pandas display limit: rows, columns, overall width, and the
# per-cell text width are all set to None (no cap).
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

In [None]:
# Every yearly input file sits in the same Drive folder; build each path from it.
FINAL_DATA_DIR = '/content/drive/MyDrive/RA Part-time UMD/Lubbock Project/After Jan 16th 2025/Final data from each year'

df1945 = pd.read_csv(f'{FINAL_DATA_DIR}/df1945 - df1945.csv')
df1975 = pd.read_csv(f'{FINAL_DATA_DIR}/df1975 - df1975.csv')
df1985 = pd.read_csv(f'{FINAL_DATA_DIR}/df1985 - df1985.csv')
df2012_2020_2021 = pd.read_csv(f'{FINAL_DATA_DIR}/2012 2020 2021 data.csv')

In [None]:
# Gather the four source tables so they can be inspected uniformly.
dataframes = [df1945, df1975, df1985, df2012_2020_2021]

# Print the column index of each table, one after another.
for dataframe in dataframes:
    column_index = dataframe.columns
    print(column_index)

Index(['google_standard_address_1945', 'zip_code_1945', 'latitude_1945', 'longitude_1945',
       'value_of_city_property_1945', 'value_of_personal_property_1945',
       'total_property_value_1945', '2025_dollar_value_total_property_value_1945',
       'state_tax_1945', 'county_tax_1945', 'gross_tax_1945', 'area_1945', 'homestead_1945',
       'district_school_1945', 'address_repetation_1945'],
      dtype='object')
Index(['google_standard_address_1975', 'zip_code_1975', 'latitude_1975', 'longitude_1975',
       'value_dollars_(state_value)_1975', 'value_of_city_property_(total_county_value)_1975',
       'value_of_personal_property_1975', 'total_property_value_1975',
       '2025_dollar_value_total_property_value_1975', 'state_tax_1975', 'county_tax_1975',
       'gross_tax_1975', 'area_1975', 'homestead_1975', 'district_school_1975',
       'address_repetation_1975'],
      dtype='object')
Index(['google_standard_address_1985', 'zip_code_1985', 'latitude_1985', 'longitude_1985',
   

In [None]:
# Pair each source table with the name of its address column.
address_sources = [
    (df1945, 'google_standard_address_1945'),
    (df1975, 'google_standard_address_1975'),
    (df1985, 'google_standard_address_1985'),
    (df2012_2020_2021, 'Standard_Address'),
]

# Stack the address columns under one common name, keep the first occurrence
# of each address, renumber the rows, and wrap the result in a DataFrame.
stacked_addresses = pd.concat(
    [frame[column].rename('Address') for frame, column in address_sources]
)
all_addresses = stacked_addresses.drop_duplicates().reset_index(drop=True).to_frame()

# Report how many distinct addresses were found.
print(f"Total unique addresses: {len(all_addresses)}")

Total unique addresses: 119276


In [None]:
import pandas as pd
import os
import re

# ===============================
# Step 0: Define Helper Function
# ===============================

def safe_save(df, filename):
    """
    Best-effort checkpoint: pickle ``df`` to ``filename`` and report the outcome.

    A success message (including the frame's shape) is printed when the write
    works. Any exception is caught and reported with a printed message instead
    of being raised, so a failed checkpoint never interrupts the caller.
    Nothing is returned.
    """
    try:
        df.to_pickle(filename)
        saved_shape = df.shape
        print(f"✅ Successfully saved DataFrame with shape {saved_shape} to '{filename}'")
    except Exception as err:
        print(f"⚠️ ERROR: Could not save DataFrame to '{filename}'. Reason: {err}")

# ===============================
# Step 1: Collect Unique Addresses
# ===============================

try:
    print("📌 Collecting all addresses from each dataframe...")

    # One address column per source table, stacked into a single Series.
    address_columns = [
        df1945['google_standard_address_1945'],
        df1975['google_standard_address_1975'],
        df1985['google_standard_address_1985'],
        df2012_2020_2021['Standard_Address'],
    ]
    all_addresses_series = pd.concat(address_columns)

    # Keep one copy of every address, then sort and renumber the rows 0..n-1.
    deduplicated = all_addresses_series.drop_duplicates()
    all_addresses_series = deduplicated.sort_values(ignore_index=True)

    # This single-column frame is the base that every yearly table is merged onto.
    final_df = all_addresses_series.to_frame(name='Address')

    print(f"✅ Total unique addresses collected: {len(final_df)}")

    # Checkpoint the base table before the merges start.
    safe_save(final_df, "intermediate_step1.pkl")

except Exception as err:
    print("❌ ERROR during Step 1 (Collecting Unique Addresses):", err)
    raise err


# ===============================
# Step 2: Rename Address Columns in Each Yearly DataFrame
# ===============================

try:
    print("\n📌 Renaming address columns in each yearly DataFrame to a common name 'Address'...")

    # Each source table paired with the name its address column currently has.
    address_column_by_frame = [
        (df1945, 'google_standard_address_1945'),
        (df1975, 'google_standard_address_1975'),
        (df1985, 'google_standard_address_1985'),
        (df2012_2020_2021, 'Standard_Address'),
    ]

    # rename() returns new frames, so the originals keep their column names.
    (df1945_renamed,
     df1975_renamed,
     df1985_renamed,
     df2012_2020_2021_renamed) = [
        frame.rename(columns={original_name: 'Address'})
        for frame, original_name in address_column_by_frame
    ]

    print("✅ Successfully renamed columns in all DataFrames.")

except Exception as err:
    print("❌ ERROR during Step 2 (Renaming columns):", err)
    raise err


# ===============================
# Step 3: Merge Datasets
# ===============================

def merge_data(df_source, name):
    """
    Left-merge one yearly dataset into the module-level ``final_df`` on 'Address'.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Table to merge in; it must contain an 'Address' column.
    name : str
        Label used in the progress messages and in the checkpoint file name
        (``intermediate_after_{name}.pkl``).

    Side effects
    ------------
    Rebinds the global ``final_df`` to the merged frame and checkpoints it with
    ``safe_save``. If anything fails, the current ``final_df`` is saved to
    ``final_df_error.pkl`` and the exception is re-raised.

    A left merge repeats a base row once per matching source row, so repeated
    addresses in ``df_source`` would silently grow ``final_df``. The row count
    is therefore compared before and after, and a warning is printed when it
    changes. The merge result itself is not altered.
    """
    global final_df
    try:
        print(f"\n📌 Merging {name} into final_df...")
        rows_before = len(final_df)
        final_df = final_df.merge(df_source, on='Address', how='left')
        print(f"✅ After merging {name}, final_df shape: {final_df.shape}")
        if len(final_df) != rows_before:
            # Only reachable when an address matched more than one source row.
            print(
                f"⚠️ WARNING: merging {name} changed the row count from {rows_before} "
                f"to {len(final_df)}; '{name}' probably contains repeated addresses."
            )
        safe_save(final_df, f"intermediate_after_{name}.pkl")
    except Exception as e:
        print(f"❌ ERROR during merging {name}: {e}")
        safe_save(final_df, "final_df_error.pkl")
        raise e

# Merge the yearly tables oldest first; merge_data re-raises on failure, so a
# failing merge stops the sequence.
for yearly_frame, yearly_label in (
    (df1945_renamed, "df1945"),
    (df1975_renamed, "df1975"),
    (df1985_renamed, "df1985"),
    (df2012_2020_2021_renamed, "df2012_2020_2021"),
):
    merge_data(yearly_frame, yearly_label)


# ===============================
# Step 4: Extract ZIP Codes from Address
# ===============================

def extract_zip(address):
    """
    Extract the 5-digit ZIP code that ends an address string.

    The ZIP must be the last element of the address, optionally followed by a
    ZIP+4 suffix ("-1234") and/or a trailing ", USA". Anchoring to the end
    stops a 5-digit house number at the start of the address from being
    mistaken for the ZIP code.

    Parameters
    ----------
    address : any
        Value to inspect; it is converted with ``str()`` first, so missing
        values (None / NaN) simply produce no match.

    Returns
    -------
    str or None
        The 5-digit ZIP code, or None when the address does not end with one.
    """
    match = re.search(r'\b(\d{5})(?:-\d{4})?(?:,\s*USA)?\s*$', str(address))
    return match.group(1) if match else None

print("\n📌 Extracting ZIP codes from Address column...")

# Run the extractor on every address and store the result as a new column.
final_df["zip_code"] = final_df["Address"].apply(extract_zip)

# Report how many addresses produced a ZIP code.
zip_found = final_df['zip_code'].count()
print(f"✅ ZIP codes extracted: {zip_found} out of {len(final_df)}")


# ===============================
# Step 5: Unify Latitude and Longitude
# ===============================

print("\n📌 Unifying latitude and longitude from all available sources...")

# Start the unified columns as float NaN rather than None. A None-filled
# column has object dtype, and relying on fillna() to downcast it back to a
# numeric dtype is deprecated in recent pandas releases.
final_df["latitude"] = float("nan")
final_df["longitude"] = float("nan")

# Define year-based columns
year_list = [1945, 1975, 1985]

# Fill latitude and longitude from older datasets first.
# fillna() only fills rows that are still missing, so the earliest year that
# has a coordinate for an address wins.
for year in year_list:
    lat_col = f"latitude_{year}"
    lon_col = f"longitude_{year}"

    if lat_col in final_df.columns:
        final_df["latitude"] = final_df["latitude"].fillna(final_df[lat_col])
    if lon_col in final_df.columns:
        final_df["longitude"] = final_df["longitude"].fillna(final_df[lon_col])

# Finally, fill whatever is still missing from the most recent dataset,
# whose coordinate columns are capitalised.
if "Latitude" in final_df.columns:
    final_df["latitude"] = final_df["latitude"].fillna(final_df["Latitude"])
if "Longitude" in final_df.columns:
    final_df["longitude"] = final_df["longitude"].fillna(final_df["Longitude"])

print("✅ Latitude and Longitude unified successfully!")


# ===============================
# Step 6: Drop Unnecessary Columns
# ===============================

print("\n📌 Dropping old year-based zip_code, latitude, and longitude columns...")

# Per-year columns are recognised by prefix; the most recent dataset uses the
# capitalised names listed separately.
year_prefixes = ("zip_code_", "latitude_", "longitude_")
recent_coord_cols = ("Latitude", "Longitude")

# Select only columns that are actually present, so the reported count equals
# the number of columns really removed.
cols_to_drop = [
    col for col in final_df.columns
    if col.startswith(year_prefixes) or col in recent_coord_cols
]

final_df = final_df.drop(columns=cols_to_drop)

print(f"✅ Dropped {len(cols_to_drop)} unnecessary columns.")


# ===============================
# Step 7: Reorder Columns
# ===============================

print("\n📌 Reordering columns for clarity...")

# The address and location columns lead; every other column follows in its
# current relative order.
desired_columns = ["Address", "zip_code", "latitude", "longitude"]
all_other_cols = [name for name in final_df.columns if name not in desired_columns]
new_col_order = [*desired_columns, *all_other_cols]

final_df = final_df.loc[:, new_col_order]

print("✅ Final column order set!")
print("📊 Final dataframe preview:")
print(final_df.head(5))


# ===============================
# Step 8: Final Save and Summary
# ===============================

# Checkpoint the finished table, then print its size and a short sample.
safe_save(final_df, "final_consolidated_dataframe.pkl")

n_addresses, n_columns = final_df.shape

print("\n✅✅✅ ----- Final DataFrame Summary ----- ✅✅✅")
print(f"📌 Number of addresses: {n_addresses}")
print(f"📌 Number of columns: {n_columns}")
print("📊 Sample rows:")
print(final_df.head(10))
print("-------------------------------------------------")


📌 Collecting all addresses from each dataframe...
✅ Total unique addresses collected: 119276
✅ Successfully saved DataFrame with shape (119276, 1) to 'intermediate_step1.pkl'

📌 Renaming address columns in each yearly DataFrame to a common name 'Address'...
✅ Successfully renamed columns in all DataFrames.

📌 Merging df1945 into final_df...
✅ After merging df1945, final_df shape: (119276, 15)
✅ Successfully saved DataFrame with shape (119276, 15) to 'intermediate_after_df1945.pkl'

📌 Merging df1975 into final_df...
✅ After merging df1975, final_df shape: (119276, 30)
✅ Successfully saved DataFrame with shape (119276, 30) to 'intermediate_after_df1975.pkl'

📌 Merging df1985 into final_df...
✅ After merging df1985, final_df shape: (119276, 50)
✅ Successfully saved DataFrame with shape (119276, 50) to 'intermediate_after_df1985.pkl'

📌 Merging df2012_2020_2021 into final_df...
✅ After merging df2012_2020_2021, final_df shape: (119276, 74)
✅ Successfully saved DataFrame with shape (119276,

In [None]:
# Remove the raw-address column when it is present; report the outcome either way.
raw_address_col = 'Address (as appeared in the raw data)'

if raw_address_col not in final_df.columns:
    print(f"⚠️ Column '{raw_address_col}' not found in final_df.")
else:
    final_df.drop(columns=[raw_address_col], inplace=True)
    print(f"✅ Successfully removed column: '{raw_address_col}'")


✅ Successfully removed column: 'Address (as appeared in the raw data)'


In [None]:
def extract_zip(address):
    """
    Return the ZIP code of an address that ends in "<ZIP>, USA".

    The value is converted to a string and stripped of surrounding whitespace.
    A match requires five digits preceded by a single space and immediately
    followed by ", USA" at the very end of the text. The five digits are
    returned as a string; anything else yields None.
    """
    text = str(address).strip()
    found = re.search(r' (\d{5}), USA$', text)
    if found is None:
        return None
    return found.group(1)

# Recompute the zip_code column from Address using the extractor defined above.
final_df["zip_code"] = final_df["Address"].apply(extract_zip)

# Report how many addresses produced a ZIP code.
zip_found = final_df['zip_code'].count()
print(f"✅ ZIP codes extracted: {zip_found} out of {len(final_df)}")


✅ ZIP codes extracted: 119216 out of 119276


In [None]:
# Summarise the consolidated frame: column names, non-null counts, and dtypes.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119276 entries, 0 to 119275
Data columns (total 65 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   Address                                           119276 non-null  object 
 1   zip_code                                          119216 non-null  object 
 2   latitude                                          119276 non-null  float64
 3   longitude                                         119276 non-null  float64
 4   value_of_city_property_1945                       2037 non-null    float64
 5   value_of_personal_property_1945                   297 non-null     float64
 6   total_property_value_1945                         2186 non-null    float64
 7   2025_dollar_value_total_property_value_1945       2186 non-null    float64
 8   state_tax_1945                                    441 non-null     float64
 9   coun

In [None]:
# Write the consolidated table to Drive as CSV, without the row index.
consolidated_csv_path = (
    '/content/drive/MyDrive/RA Part-time UMD/Lubbock Project/After Jan 16th 2025/'
    'Final data from each year/all_years_consolidated_dataframe.csv'
)
final_df.to_csv(consolidated_csv_path, index=False)

In [1]:
# Categorizing columns
# Maps a human-readable group label to the list of column names in that group.
# NOTE(review): the names are presumably the columns of the consolidated
# all-years DataFrame — confirm against the saved CSV header.
categories = {
    # Columns that identify and locate a property (not tied to a year).
    "Location Information": [
        "Address", "zip_code", "latitude", "longitude"
    ],
    # ---- 1945 ----
    "Property Values (1945)": [
        "value_of_city_property_1945", "value_of_personal_property_1945",
        "total_property_value_1945", "2025_dollar_value_total_property_value_1945"
    ],
    "Tax Information (1945)": [
        "state_tax_1945", "county_tax_1945", "gross_tax_1945"
    ],
    "Area & Homestead (1945)": [
        "area_1945", "homestead_1945", "district_school_1945", "address_repetation_1945"
    ],
    # ---- 1975 ----
    "Property Values (1975)": [
        "value_dollars_(state_value)_1975", "value_of_city_property_(total_county_value)_1975",
        "value_of_personal_property_1975", "total_property_value_1975",
        "2025_dollar_value_total_property_value_1975"
    ],
    "Tax Information (1975)": [
        "state_tax_1975", "county_tax_1975", "gross_tax_1975"
    ],
    "Area & Homestead (1975)": [
        "area_1975", "homestead_1975", "district_school_1975", "address_repetation_1975"
    ],
    # ---- 1985 ----
    # NOTE(review): "landvalue _1985" has a space before the underscore —
    # confirm it matches the real column name exactly.
    "Property Values (1985)": [
        "landvalue _1985", "buildingvalue_1985", "specialfeaturesvalue_1985",
        "total_property_value_1985", "2025_dollar_value_total_property_value_1985"
    ],
    "Tax Information (1985)": [
        "county_1985", "city_tax_1985", "school_tax_1985", "LCHD_tax_1985", "gross_tax_1985"
    ],
    "Area & Homestead (1985)": [
        "area_sqm_1985", "area_sqft_1985", "homestead_1985", "schoolvalue_1985",
        "countyvalue_1985", "cityvalue_1985", "address_repetation_1985"
    ],
    # ---- 2012 / 2020 / 2021 ----
    # These three groups share one layout; besides the value columns they also
    # list the land-size and year-built columns.
    "Property Values (2012)": [
        "2025_dollar_value_total_property_value_2012", "TotalValue_2012", "LandValue_2012",
        "ImpValue_2012", "LandSizeAC_2012", "LandSizeFT_2012", "YearBuilt_2012"
    ],
    "Property Values (2020)": [
        "2025_dollar_value_total_property_value_2020", "TotalValue_2020", "LandValue_2020",
        "ImpValue_2020", "LandSizeAC_2020", "LandSizeFT_2020", "YearBuilt_2020"
    ],
    "Property Values (2021)": [
        "2025_dollar_value_total_property_value_2021", "TotalValue_2021", "LandValue_2021",
        "ImpValue_2021", "LandSizeAC_2021", "LandSizeFT_2021", "YearBuilt_2021"
    ]
}

In [6]:
# Display the property-value column group of every year, oldest first.
tuple(
    categories[f'Property Values ({year})']
    for year in (1945, 1975, 1985, 2012, 2020, 2021)
)

(['value_of_city_property_1945',
  'value_of_personal_property_1945',
  'total_property_value_1945',
  '2025_dollar_value_total_property_value_1945'],
 ['value_dollars_(state_value)_1975',
  'value_of_city_property_(total_county_value)_1975',
  'value_of_personal_property_1975',
  'total_property_value_1975',
  '2025_dollar_value_total_property_value_1975'],
 ['landvalue _1985',
  'buildingvalue_1985',
  'specialfeaturesvalue_1985',
  'total_property_value_1985',
  '2025_dollar_value_total_property_value_1985'],
 ['2025_dollar_value_total_property_value_2012',
  'TotalValue_2012',
  'LandValue_2012',
  'ImpValue_2012',
  'LandSizeAC_2012',
  'LandSizeFT_2012',
  'YearBuilt_2012'],
 ['2025_dollar_value_total_property_value_2020',
  'TotalValue_2020',
  'LandValue_2020',
  'ImpValue_2020',
  'LandSizeAC_2020',
  'LandSizeFT_2020',
  'YearBuilt_2020'],
 ['2025_dollar_value_total_property_value_2021',
  'TotalValue_2021',
  'LandValue_2021',
  'ImpValue_2021',
  'LandSizeAC_2021',
  'LandS

In [3]:
# Display the columns grouped under the 1975 property values.
categories['Property Values (1975)']

['value_dollars_(state_value)_1975',
 'value_of_city_property_(total_county_value)_1975',
 'value_of_personal_property_1975',
 'total_property_value_1975',
 '2025_dollar_value_total_property_value_1975']

In [4]:
# Display the columns grouped under the 1985 property values.
categories['Property Values (1985)']

['landvalue _1985',
 'buildingvalue_1985',
 'specialfeaturesvalue_1985',
 'total_property_value_1985',
 '2025_dollar_value_total_property_value_1985']