In [None]:
# Notebook Purpose: - Clean 2025 environmental dataset. - Standardise units and flags. - Prepare for stacking.

# Assumptions: 
# - Raw data contains mixed boolean types. 
# - Numeric fields may contain excessive decimals. 
# - Compliance fields may contain nulls.

# Output: - Cleaned 2025 dataset ready for stacking

In [None]:
import pandas as pd 
# Load the raw 2025 extract (path is relative to the notebook's working directory)
raw_csv_path = "../data/raw/environmental_sites_with_errors.csv"
df_fresh = pd.read_csv(raw_csv_path)
df_fresh

In [None]:
# Data Inspection Utility. Purpose: Provides a structured, repeatable data quality overview of the raw 2025 dataset 
# before any cleaning or transformation is applied. This function performs structural, completeness, duplication, and schema validation checks to support defensible preprocessing. 
# Note: This function does NOT modify the dataset. It is purely diagnostic.

def inspect_df(df_fresh: pd.DataFrame, sample_size: int = 5):
    """Print a read-only data-quality overview of a DataFrame.

    The report covers shape and columns, dtypes, head/tail/random sample rows,
    missing values, fully duplicated rows, summary statistics of the numeric
    columns, cardinality of the object/category columns and the final dtypes.
    The frame is only read, never assigned to.

    Rich output goes through ``display``, so the function must run in an
    IPython/Jupyter session (or anywhere a ``display`` callable is available).

    Parameters
    ----------
    df_fresh : pd.DataFrame
        Frame to inspect.
    sample_size : int, default 5
        Number of rows shown for head, tail and the random sample (the sample
        is capped at the number of rows in the frame).

    Returns
    -------
    None
    """
    def section(title: str):
        """Print ``title`` framed by two 80-character rules."""
        rule = "=" * 80
        print("\n" + rule)
        print(title)
        print(rule)

    section("SHAPE & STRUCTURE")
    print("Rows, Columns:", df_fresh.shape)
    print("Columns:", df_fresh.columns.tolist())
    print("Index:", df_fresh.index)

    section("DATA TYPES & NULL OVERVIEW")
    df_fresh.info()

    section("ROW-LEVEL SANITY CHECKS")
    display(df_fresh.head(sample_size))
    display(df_fresh.tail(sample_size))
    # sample() raises when asked for more rows than exist, hence the cap.
    if len(df_fresh) > 0:
        display(df_fresh.sample(min(sample_size, len(df_fresh))))

    section("MISSING VALUES")
    missing_counts = df_fresh.isna().sum()
    missing_pct = df_fresh.isna().mean().sort_values(ascending=False)
    print(missing_counts[missing_counts > 0])
    print(missing_pct[missing_pct > 0])

    section("DUPLICATES")
    print("Total duplicated rows:", df_fresh.duplicated().sum())

    section("NUMERIC COLUMNS")
    num_df_fresh = df_fresh.select_dtypes(include="number")
    # Guard on the numeric subset, not on the whole frame: describe() raises
    # ValueError for a frame without columns, which is exactly what
    # select_dtypes returns when the data has no numeric column.
    if not num_df_fresh.empty:
        display(num_df_fresh.describe().T)

    section("CATEGORICAL COLUMNS")
    cat_df_fresh = df_fresh.select_dtypes(include=["object", "category"])
    if not cat_df_fresh.empty:
        print(cat_df_fresh.nunique().sort_values())
        # Only the first object/category column gets a value-count preview.
        display(cat_df_fresh.iloc[:, 0].value_counts(dropna=False).head(10))

    section("SCHEMA SNAPSHOT")
    print(df_fresh.dtypes)

    print("\nINSPECTION COMPLETE — NO DATA MODIFIED")
    
# Run the read-only inspection on the freshly loaded raw frame (default 5-row samples).
inspect_df(df_fresh)

In [None]:
# Trim surrounding whitespace and upper-case the Site labels so spelling variants
# that differ only by case/padding collapse before they are mapped.
site_labels = df_fresh['Site'].str.strip()
df_fresh['Site'] = site_labels.str.upper()


In [None]:
# List the distinct normalised Site labels so that every spelling variant can be given an entry in the mapping below
df_fresh['Site'].unique()

In [None]:
# Map each normalised Site label (including the 'SITE-C' and 'S1TE E' variants)
# to its single-letter site code.

raw_site_labels = ['SITE A', 'SITE B', 'SITE-C', 'SITE D', 'S1TE E']
site_map = dict(zip(raw_site_labels, 'ABCDE'))

# replace() leaves any label that is not a key of site_map unchanged.
df_fresh['Site'] = df_fresh['Site'].replace(site_map)

In [None]:
# Store Site as plain strings and print the column to eyeball the mapped codes.
# NOTE(review): astype(str) would turn a missing Site into the literal text 'nan' — confirm none exist.
site_as_text = df_fresh['Site'].astype(str)
df_fresh['Site'] = site_as_text
print(df_fresh['Site'])

In [None]:
# Parse Energy_kWh as numbers; entries that cannot be parsed become NaN (errors='coerce').

energy_raw = df_fresh['Energy_kWh']
df_fresh['Energy_kWh'] = pd.to_numeric(energy_raw, errors='coerce')

In [None]:
# Manual patch: set Energy_kWh on the row labelled 3 to 5000.0.
# Presumably the raw entry was a typo that the to_numeric(errors='coerce') step turned into NaN — verify against the raw CSV.
# The label refers to the original read_csv index, so this must run before the frame is sorted / re-indexed.
df_fresh.loc[3, 'Energy_kWh'] = 5000.0

In [None]:
# Keep Energy_kWh as a float column for later calculations and SQL export

energy_values = df_fresh['Energy_kWh']
df_fresh['Energy_kWh'] = energy_values.astype('float64')

In [None]:
# Spot-check the patched Energy_kWh value on the row labelled 3

df_fresh.at[3, 'Energy_kWh']

In [None]:
# Manual patch: set Water_m3 on the row labelled 10 to 800.3 (the raw entry was a typo, per the original note).
# Runs before the astype(float) conversion of Water_m3 in a later cell; the label refers to the original read_csv index.

df_fresh.loc[10, 'Water_m3'] = 800.3

In [None]:
# Spot-check the patched Water_m3 value on the row labelled 10

df_fresh.at[10, 'Water_m3']

In [None]:
# Keep Water_m3 as a float column for later calculations and SQL export.
# Unlike to_numeric(errors='coerce'), astype raises if an unparseable entry is still present.
water_values = df_fresh['Water_m3']
df_fresh['Water_m3'] = water_values.astype('float64')

In [None]:
# Manual patch: set Waste_tonnes on the row labelled 12 to 50.0 (the raw entry was a typo, per the original note).
# The label refers to the original read_csv index. Waste_tonnes itself is only converted to float in a later cell.

df_fresh.loc[12, 'Waste_tonnes'] = 50.000000

In [None]:
# Before any gaps are filled, remember per row whether each key measurement held a value:
# <column>_recorded is True where the cell is non-missing.

cols = [
    'Energy_kWh', 'Water_m3', 'Waste_tonnes',
    'Recycled_percent', 'CO2_tonnes', 'Compliance_score',
]

for col in cols:
    flag_name = col + '_recorded'
    df_fresh[flag_name] = ~df_fresh[col].isna()

In [None]:
# Fill the gaps in the key measurement columns with 0 to match the 2023–2024 datasets.
# From here on a missing value and a genuine 0 look the same in these columns;
# the *_recorded flags are the only place that still tells them apart.

filled_measurements = df_fresh[cols].fillna(value=0)
df_fresh[cols] = filled_measurements

In [None]:
# Manual patch: set CO2_tonnes on the row labelled 59 to 1.456382 (label refers to the original read_csv index).
# NOTE(review): this runs after the *_recorded flags were built and after fillna(0), so
# CO2_tonnes_recorded for this row is not refreshed here — confirm the later flag rebuild covers it.
df_fresh.loc[59, 'CO2_tonnes'] = 1.456382	

In [None]:
# The 2025 extract carries no Year column; add a constant one so it lines up with the 2023/2024 data
df_fresh = df_fresh.assign(Year=2025)

In [None]:
# Move Year from the end of the frame to column position 2 (pop removes it, insert puts it back)
year_values = df_fresh.pop('Year')
df_fresh.insert(2, 'Year', year_values)

In [None]:
# Sort the rows by Site, then Year, then Month and renumber the index from 0.
# After this the original read_csv row labels are gone, so label-based patches must come before it.
sort_keys = ["Site", "Year", "Month"]
df_fresh = df_fresh.sort_values(by=sort_keys).reset_index(drop=True)

In [None]:
# Temporary 1-based running id (env_id), a candidate primary key for SQL Server.
# It is dropped again a few cells further down.

row_count = len(df_fresh)
df_fresh["env_id"] = range(1, row_count + 1)

In [None]:
# Put env_id first, keeping every other column in its current order

other_columns = [name for name in df_fresh.columns if name != "env_id"]
df_fresh = df_fresh[["env_id", *other_columns]]

In [None]:
# env_id will be generated on the stacked multi-year dataset instead, so remove it here

df_fresh = df_fresh.drop('env_id', axis="columns")

In [None]:
# Independent copy of the Site A / Year 2025 rows, used for focused inspection
# and a trial aggregation of a single site.

is_site_a = df_fresh['Site'] == "A"
is_year_2025 = df_fresh["Year"] == 2025
df_a_2025 = df_fresh[is_site_a & is_year_2025].copy()

In [None]:
# Show every Site A record whose Month equals 10 (October)

october_rows = df_a_2025['Month'] == 10
df_a_2025.loc[october_rows]

In [None]:
# How several records of the same Site–Year–Month are collapsed into one row:
# - totals (Energy, Water, Waste, CO2, Incidents) are summed,
# - percentages and compliance scores are averaged,
# - a *_recorded flag is True when any of the merged rows had a value.

_totals = ["Energy_kWh", "Water_m3", "Waste_tonnes", "CO2_tonnes", "Environmental_incidents"]
_averages = ["Recycled_percent", "Compliance_score"]
_flags = [
    f"{name}_recorded"
    for name in (
        "Energy_kWh", "Water_m3", "Waste_tonnes",
        "Recycled_percent", "CO2_tonnes", "Compliance_score",
    )
]

agg_rules = {
    **dict.fromkeys(_totals, "sum"),
    **dict.fromkeys(_averages, "mean"),
    **dict.fromkeys(_flags, "any"),
}

In [None]:
# Build the Site A aggregate and inspect its October row.
# df_a_2025_clean was never assigned anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; it is now derived here from df_a_2025 and agg_rules.

# Measurement columns are coerced to numbers first: at this point some of them have not
# been converted yet, and "sum"/"mean" need numeric input.
measure_cols = [name for name in agg_rules if not name.endswith("_recorded")]
df_a_2025_clean = (
    df_a_2025
    .assign(**{name: pd.to_numeric(df_a_2025[name], errors="coerce") for name in measure_cols})
    .groupby(["Site", "Year", "Month"], as_index=False)
    .agg(agg_rules)
)

df_a_2025_clean[df_a_2025_clean["Month"] == 10]

In [None]:
# Incidents: a count of 0 is a real value, only NaN means "not recorded"
df_fresh["Environmental_incidents_recorded"] = ~df_fresh["Environmental_incidents"].isna()

# Force every measurement column to float (unparseable entries become NaN) and
# rebuild its *_recorded flag in the same pass.
# NOTE(review): because missing values were zero-filled earlier, the flag treats 0 as
# "not recorded" — a genuine zero measurement is therefore flagged False; confirm this is intended.
numeric_cols = ["Energy_kWh","Water_m3","Waste_tonnes","CO2_tonnes","Recycled_percent","Compliance_score"]
for col in numeric_cols:
    as_number = pd.to_numeric(df_fresh[col], errors="coerce").astype(float)
    df_fresh[col] = as_number
    df_fresh[f"{col}_recorded"] = (as_number.notna() & as_number.ne(0)).astype(bool)


In [None]:
# Final aggregation rules, grouped by the reducer applied to each column
# (this replaces the earlier agg_rules and adds the incidents flag).
_rule_groups = {
    "sum": ["Energy_kWh", "Water_m3", "Waste_tonnes", "CO2_tonnes", "Environmental_incidents"],
    "mean": ["Recycled_percent", "Compliance_score"],
    "any": [
        "Energy_kWh_recorded",
        "Water_m3_recorded",
        "Waste_tonnes_recorded",
        "Recycled_percent_recorded",
        "CO2_tonnes_recorded",
        "Compliance_score_recorded",
        "Environmental_incidents_recorded",
    ],
}
agg_rules = {column: how for how, columns in _rule_groups.items() for column in columns}

# One output row per Site–Year–Month; the keys stay ordinary columns (as_index=False).
# NOTE(review): the means include the zeros that were filled in for missing values — confirm that is wanted.
group_keys = ["Site", "Year", "Month"]
df_2025_clean = df_fresh.groupby(group_keys, as_index=False).agg(agg_rules)


In [None]:
# Sanity check after aggregation: display the aggregated frame (one row per Site–Year–Month)
df_2025_clean

In [None]:
# Write the cleaned 2025 data to disk (current working directory, no index column)
# so the multi-year environmental pipeline can pick it up reproducibly.

output_csv = "cleaned_data_2025_environment.csv"
df_2025_clean.to_csv(output_csv, index=False)