In [None]:
# Notebook Purpose: - Clean 2024 environmental dataset. - Standardise units and flags. - Prepare for stacking.

# Assumptions: 
# - Raw data contains mixed boolean types. 
# - Numeric fields may contain excessive decimals. 
# - Compliance fields may contain nulls.

# Output: - Cleaned 2024 dataset ready for stacking

In [None]:
import pandas as pd
# Load the raw 2024 extract (path is relative to the notebook's working directory)
# and echo the frame as the cell output.
raw_2024_path = "../data/env_2024_60.csv"
df_2024 = pd.read_csv(raw_2024_path)
df_2024


In [None]:
# Data Inspection Utility. Purpose: Provides a structured, repeatable data quality overview of the raw 2024 dataset 
# before any cleaning or transformation is applied. This function performs structural, completeness, duplication, and schema validation checks to support defensible preprocessing. 
# Note: This function does NOT modify the dataset. It is purely diagnostic.


def inspect_df(df_2024: pd.DataFrame, sample_size: int = 5) -> None:
    """Print a read-only data-quality overview of a DataFrame.

    Sections reported, in order: shape/columns/index, ``info()``, head/tail/random
    sample rows, missing-value counts and fractions, fully duplicated rows,
    ``describe()`` of numeric columns, cardinality of object/category columns
    (plus the top 10 values of the first such column), and the dtype listing.

    Parameters
    ----------
    df_2024 : pd.DataFrame
        Frame to inspect. It is only read, never modified.
    sample_size : int, default 5
        Number of rows shown for head, tail and the random sample (the sample
        is capped at the number of rows available).

    Returns
    -------
    None
        Everything is written to the cell output.
    """
    # `display` is injected by IPython/Jupyter; fall back to `print` so the
    # function also runs in a plain Python interpreter.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    def section(title: str):
        print("\n" + "=" * 80)
        print(title)
        print("=" * 80)

    section("SHAPE & STRUCTURE")
    print("Rows, Columns:", df_2024.shape)
    print("Columns:", df_2024.columns.tolist())
    print("Index:", df_2024.index)

    section("DATA TYPES & NULL OVERVIEW")
    df_2024.info()

    section("ROW-LEVEL SANITY CHECKS")
    display(df_2024.head(sample_size))
    display(df_2024.tail(sample_size))
    if len(df_2024) > 0:
        # Sampling more rows than exist raises, so cap at the frame length.
        display(df_2024.sample(min(sample_size, len(df_2024))))

    section("MISSING VALUES")
    missing_counts = df_2024.isna().sum()
    missing_pct = df_2024.isna().mean().sort_values(ascending=False)
    print(missing_counts[missing_counts > 0])
    print(missing_pct[missing_pct > 0])

    section("DUPLICATES")
    print("Total duplicated rows:", df_2024.duplicated().sum())

    section("NUMERIC COLUMNS")
    num_df_2024 = df_2024.select_dtypes(include="number")
    # Guard on the numeric subset, not the full frame: describe() raises on a
    # frame that has no columns (e.g. when every column is text).
    if not num_df_2024.empty:
        display(num_df_2024.describe().T)

    section("CATEGORICAL COLUMNS")
    cat_df_2024 = df_2024.select_dtypes(include=["object", "category"])
    if not cat_df_2024.empty:
        print(cat_df_2024.nunique().sort_values())
        # Only the first object/category column gets a value breakdown.
        display(cat_df_2024.iloc[:, 0].value_counts(dropna=False).head(10))

    section("SCHEMA SNAPSHOT")
    print(df_2024.dtypes)

    print("\nINSPECTION COMPLETE — NO DATA MODIFIED")
    
# Run the diagnostic overview on the 2024 frame as loaded, before any cleaning step.
inspect_df(df_2024)

In [None]:
# List the raw column names to decide whether they need reordering or renaming.
list(df_2024.columns)


In [None]:
# Raw column name -> standardised column name.
# NOTE(review): "energy_mwh" -> "Energy_kWh" changes the unit in the label;
# renaming alone does not rescale values, so any MWh -> kWh conversion has to
# be applied to the data separately.
rename_map = dict(
    site="Site",
    year="Year",
    month="Month",
    energy_mwh="Energy_kWh",
    water_m3="Water_m3",
    waste_tonnes="Waste_tonnes",
    ghg_tonnes="CO2_tonnes",
    incidents="Environmental_incidents",
)

In [None]:
# Standardising column names (and the energy unit that the new name implies).
# The raw column is named "energy_mwh" but is relabelled "Energy_kWh", so the
# values are scaled by 1,000 (1 MWh = 1,000 kWh) before the rename; otherwise the
# figures would be 1,000x too small under the new label.
# NOTE(review): confirm the raw values really are MWh, as the column name says.
# The guard keeps the cell idempotent: once the raw column has been renamed,
# re-running the cell does not scale the values a second time.
if "energy_mwh" in df_2024.columns:
    df_2024 = df_2024.assign(energy_mwh=df_2024["energy_mwh"] * 1000)
df_2024 = df_2024.rename(columns=rename_map)

In [None]:
# Fixed column layout used for readability and for downstream stacking /
# SQL ingestion: record keys first, then measured quantities, then incidents.
target_order = (
    ["Site", "Year", "Month"]
    + ["Energy_kWh", "Water_m3", "Waste_tonnes", "CO2_tonnes"]
    + ["Environmental_incidents"]
)
# Keep exactly these columns, in this order (a missing column raises KeyError).
df_2024 = df_2024.loc[:, target_order]

In [None]:
# Quick inspection to confirm the new column order.
# Only a cell's last expression is rendered automatically, so both results are
# shown explicitly; otherwise the column list would be computed and discarded.
display(df_2024.columns.tolist())
display(df_2024.head())

In [None]:
# Add placeholder columns for metrics not present in the 2024 data.
# Recycled_percent and Compliance_score start as pd.NA (missing) so the frame
# carries the full set of columns expected when it is stacked.
# NOTE(review): the earlier comment said this matches "the 2024 dataset";
# presumably another year's dataset was meant - confirm.
df_2024 = df_2024.assign(
    Recycled_percent=pd.NA,
    Compliance_score=pd.NA,
)

In [None]:
# Schema-alignment rename kept for consistency. The mapping sends
# "Recycled_percent" to itself, so no column label changes here.
identity_rename = {"Recycled_percent": "Recycled_percent"}
df_2024 = df_2024.rename(columns=identity_rename)

In [None]:
# Replace missing values in the percentage/score placeholder columns with 0
# so later numeric steps do not hit missing values.
# (Newer pandas versions may emit a FutureWarning here; the result is unchanged.)
placeholder_cols = ["Recycled_percent", "Compliance_score"]
df_2024[placeholder_cols] = df_2024[placeholder_cols].fillna(0)

In [None]:
# Store both placeholder columns as float so they behave as numeric columns
# in later calculations and when stacked.
df_2024 = df_2024.astype(
    {"Recycled_percent": float, "Compliance_score": float}
)



In [None]:
# Quick structural check after adding the placeholder columns:
# column names, dtypes, and non-null counts.

df_2024.info()

In [None]:
# Shorten site identifiers ("SITE A" -> "A", ..., "SITE E" -> "E") for
# consistency and easier querying in SQL Server. Values that do not match a
# key exactly are left unchanged by replace().
site_map = {f"SITE {letter}": letter for letter in "ABCDE"}

df_2024["Site"] = df_2024["Site"].replace(site_map)
df_2024.head()

In [None]:
# Metric columns that each get a companion "<column>_recorded" presence flag.
measurement_cols = [
    # columns renamed from the raw extract
    "Energy_kWh", "Water_m3", "Waste_tonnes", "CO2_tonnes",
    "Environmental_incidents",
    # placeholder columns added in this notebook
    "Recycled_percent", "Compliance_score",
]

In [None]:
# Build one "<metric>_recorded" flag per metric:
# True only where the value is present (not NaN) and different from zero.
for metric in measurement_cols:
    values = df_2024[metric]
    df_2024[f"{metric}_recorded"] = values.notna() & values.ne(0)

In [None]:
# Make sure every *_recorded flag column is stored with bool dtype.
flag_dtypes = {f"{metric}_recorded": bool for metric in measurement_cols}
df_2024 = df_2024.astype(flag_dtypes)



In [None]:
# Preview the metric columns followed by their matching *_recorded flags.
preview_flag_cols = [f"{metric}_recorded" for metric in measurement_cols]
df_2024[measurement_cols + preview_flag_cols].head()

In [None]:
# Narrow measurement_cols to the metrics where zero means "not recorded";
# Environmental_incidents is handled on its own below.
# Note: this rebinds measurement_cols, so cells above that use it behave
# differently if re-run after this one.
measurement_cols = [
    "Energy_kWh", "Water_m3", "Waste_tonnes", "CO2_tonnes",
    "Recycled_percent", "Compliance_score",
]

# Rebuild the flags for those metrics: present and non-zero -> True.
for metric in measurement_cols:
    series = df_2024[metric]
    df_2024[f"{metric}_recorded"] = series.notna() & series.ne(0)

# Incidents: a count of 0 is a valid recorded value, so only missing values
# count as "not recorded".
df_2024["Environmental_incidents_recorded"] = ~df_2024["Environmental_incidents"].isna()

In [None]:
# Show the first 60 rows (positional slice).
df_2024.iloc[:60]

In [None]:
# Filter example: every row whose Site equals "A".
df_2024.loc[df_2024["Site"].eq("A")]

In [None]:
# Filter example: rows for Site "A" in Month 2, built from two named masks.
is_month_2 = df_2024["Month"].eq(2)
is_site_a = df_2024["Site"].eq("A")
filtered_df = df_2024.loc[is_month_2 & is_site_a]
filtered_df

In [None]:
# Aggregation rules for monthly consolidation, grouped by how each column is
# combined. Key order is kept deliberately: it determines the column order of
# the aggregated frame.
summed_metrics = [
    "Energy_kWh", "Water_m3", "Waste_tonnes", "CO2_tonnes",
    "Environmental_incidents",
]
averaged_metrics = ["Recycled_percent", "Compliance_score"]
recorded_flags = [
    "Energy_kWh_recorded",
    "Water_m3_recorded",
    "Waste_tonnes_recorded",
    "Recycled_percent_recorded",
    "CO2_tonnes_recorded",
    "Compliance_score_recorded",
    "Environmental_incidents_recorded",
]

agg_rules = {
    **dict.fromkeys(summed_metrics, "sum"),      # quantities add up across partial records
    **dict.fromkeys(averaged_metrics, "mean"),   # percentages/scores are averaged
    **dict.fromkeys(recorded_flags, "any"),      # flag is True if any partial record had data
}

In [None]:
# Collapse multiple partial records that share the same Site-Year-Month into a
# single row, applying agg_rules (sums, means, and 'any' for the recorded flags).
# as_index=False keeps the grouping keys as ordinary columns.
group_keys = ["Site", "Year", "Month"]
monthly_groups = df_2024.groupby(group_keys, as_index=False)
df_clean = monthly_groups.agg(agg_rules)

In [None]:
# Count rows whose Site-Year-Month key repeats an earlier row; 0 means one
# record per key after aggregation.
df_clean[["Site", "Year", "Month"]].duplicated().sum()

In [None]:
# Verify that the incidents completeness flag survived aggregation
# (evaluates to True when the column is present in df_clean).

"Environmental_incidents_recorded" in df_clean.columns

In [None]:
# Quick inspection of the aggregated DataFrame: first rows, then the column
# names/order after aggregation.
# Only a cell's last expression is rendered automatically, so both results are
# shown explicitly; otherwise the head() preview would be computed and discarded.
display(df_clean.head())
display(df_clean.columns.tolist())

In [None]:
# Create a final cleaned copy for export and downstream processing.
# copy() gives an independent frame, so later edits to df_2024_clean
# do not alter df_clean.

df_2024_clean = df_clean.copy()

In [None]:
# Write the cleaned 2024 data to CSV (without the index column) so later
# stages of the multi-year pipeline can reload it reproducibly.
# The path is relative to the notebook's working directory.
clean_output_path = "Environmental_data_2024_clean.csv"
df_2024_clean.to_csv(clean_output_path, index=False)