In [None]:
import pandas as pd

# Load the cleaned yearly extracts, one DataFrame per year, in chronological order.
# Paths are relative to the notebook's working directory.
# Note: the 2025 file follows a different naming pattern than 2023/2024.
df_2023_clean, df_2024_clean, df_2025_clean = (
    pd.read_csv(csv_name)
    for csv_name in (
        "environmental_data_2023_clean.csv",
        "environmental_data_2024_clean.csv",
        "cleaned_data_2025_environment.csv",
    )
)


In [None]:
# Structural sanity check before stacking: compare the column Index of each
# yearly frame with the next one (2023 vs 2024, then 2024 vs 2025).
# The cell evaluates to a pair of booleans; both must be True for the three
# frames to be concatenated without column misalignment.
tuple(
    earlier.columns.equals(later.columns)
    for earlier, later in (
        (df_2023_clean, df_2024_clean),
        (df_2024_clean, df_2025_clean),
    )
)


In [None]:
# Combine the three yearly frames row-wise into one multi-year DataFrame.
# ignore_index=True discards the per-file row labels and assigns a fresh
# 0..n-1 index, so index values are not repeated across years.
df_env_all = pd.concat(
    [df_2023_clean, df_2024_clean, df_2025_clean],
    axis=0,
    ignore_index=True,
)


In [None]:
# Count rows whose (Site, Year, Month) key repeats an earlier row of the
# stacked frame. A result of 0 means every combination occurs exactly once.
df_env_all.duplicated(subset=["Site", "Year", "Month"], keep="first").sum()

In [None]:
# Show up to five rows whose Month is not within 1–12 (bounds inclusive).
# An empty result means every Month value lies in the expected range.
df_env_all.loc[
    lambda frame: ~frame["Month"].between(1, 12),
    ["Site", "Year", "Month"]
].head()


In [None]:
# Tally rows per Year and list the tallies in ascending Year order, so a
# missing or unexpectedly small year stands out.
(
    df_env_all["Year"]
    .value_counts()
    .sort_index()
)

In [None]:
# Row count for every (Year, Site) pair present in the stacked frame.
# Compare against the expected sites per year to spot anything lost in stacking.
df_env_all.groupby(by=["Year", "Site"]).size()

In [None]:
# Order rows by Year, then Month, then Site, and renumber the index 0..n-1
# so row position matches the sorted order (easier to read and to query in SQL).
df_env_all = (
    df_env_all
    .sort_values(by=["Year", "Month", "Site"])
    .reset_index(drop=True)
)


In [None]:
# Eyeball the first 15 rows of the key columns to confirm the sort order.
df_env_all.loc[:, ["Year", "Month", "Site"]].head(n=15)

In [None]:
# Create a surrogate key for each record: env_id runs 1..N in the current row
# order and is placed as the first column. It will be used as the primary key
# in SQL Server.
# DataFrame.insert raises ValueError when the column already exists, so drop any
# existing env_id first; this keeps the cell safe to re-run on a live kernel.
if "env_id" in df_env_all.columns:
    df_env_all = df_env_all.drop(columns="env_id")
df_env_all.insert(0, "env_id", range(1, len(df_env_all) + 1))

In [None]:
# Sanity check on the surrogate key: evaluates to True when no env_id value repeats.
df_env_all.loc[:, "env_id"].is_unique

In [None]:
# Snapshot the stacked frame under a separate name for export and downstream use.
# .copy() yields an independent copy (not an immutable one): later edits to
# df_env_all will not affect df_env_all_final.
df_env_all_final = df_env_all.copy(deep=True)

# Persist the snapshot as a pickle file.
# Pickle preserves data types and is fast to reload in Python.
df_env_all_final.to_pickle("right_environmental_data_2023_2025_final.pkl")

In [None]:
# Write the snapshot to CSV for SQL Server ingestion.
# index=False omits the DataFrame index, so no extra index column is written.
df_env_all_final.to_csv(
    path_or_buf="right_environmental_data_2023_2025_final.csv",
    index=False,
)