In [5]:
import pandas as pd 
import numpy as np 

# Load the cleaned environmental dataset; the relative path is resolved
# against the notebook's working directory.
df_env = pd.read_csv(
    filepath_or_buffer="../data/cleaned/right_environmental_data_2023_2025_final.csv"
)

In [23]:
# Work on an independent copy so the loaded frame (df_env) is left untouched
# by the feature-engineering steps applied to df.
df = df_env.copy(deep=True)

In [7]:
# Names of the *_recorded indicator columns, one per tracked metric.
# Each one marks whether that metric was reported for the row; together they
# are the basis for the data-completeness figures.
recorded_cols = [
    "Energy_kWh_recorded", "Water_m3_recorded", "Waste_tonnes_recorded",
    "CO2_tonnes_recorded", "Compliance_score_recorded",
    "Environmental_incidents_recorded",
]

In [8]:
# Per-row completeness metrics derived from the *_recorded indicator columns.

# Number of metrics reported in each row (row-wise sum of the indicators)
df["recorded_count"] = df.loc[:, recorded_cols].sum(axis="columns")

# Denominator for the completeness ratio: metrics every row is expected to report
df["total_expected_metrics"] = len(recorded_cols)

# Share of expected metrics actually reported, as a percentage to 1 d.p.
# NOTE: the column keeps its original spelling ("completness") because later
# cells look the column up under exactly this name.
df["data_completness_pct"] = (
    df["recorded_count"].div(df["total_expected_metrics"]).mul(100).round(1)
)

# Number of expected metrics that were not reported
df["missing_metric_count"] = df["total_expected_metrics"].sub(df["recorded_count"])

In [9]:
# True where a row reports one or more environmental incidents.
# A missing incident count is filled with 0, so it counts as "no incident".
df["incident_flag"] = df["Environmental_incidents"].fillna(0) > 0

In [10]:
# Translate a numeric compliance score into a named band
def compliance_band(score):
    """Return the compliance band label for a single score.

    Parameters
    ----------
    score : number or missing value
        The compliance score to classify.

    Returns
    -------
    str
        "Unknown" when the score is missing, "Good" for scores of 80 or more,
        "Watch" for scores from 60 up to (but not including) 80, and "Poor"
        for anything lower.
    """
    if pd.isna(score):
        return "Unknown"

    # Thresholds are checked from highest to lowest; the first one met wins.
    for threshold, label in ((80, "Good"), (60, "Watch")):
        if score >= threshold:
            return label

    return "Poor"

# Classify every row's compliance score (missing scores are passed through
# to compliance_band, which labels them "Unknown").
df["compliance_band"] = df["Compliance_score"].map(compliance_band)

In [11]:
# Metric whose within-site spread drives the abnormal-variation check
baseline_metric = "CO2_tonnes"

# Broadcast each site's mean and standard deviation of the baseline metric
# back onto that site's rows.
site_mean = df.groupby("Site")[baseline_metric].transform("mean")
site_std = df.groupby("Site")[baseline_metric].transform("std")
# A zero spread would mean dividing by zero below, so blank it out to NaN.
site_std = site_std.mask(site_std == 0)

# Row-level z-score of the baseline metric relative to the row's own site
df["variation_z"] = df[baseline_metric].sub(site_mean).div(site_std)

# Flag rows at least 2 standard deviations from the site mean, in either
# direction; rows without a usable z-score end up False.
df["variation_flag"] = (df["variation_z"].abs() >= 2).fillna(False)



In [12]:
# Overall review status, decided by the most severe rule a row triggers:
#   Red   = immediate concern
#   Amber = monitoring required
#   Green = no issues
def review_status(row):
    """Return "Red", "Amber" or "Green" for one record.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "incident_flag", "compliance_band",
        "data_completness_pct" and "variation_flag".

    Returns
    -------
    str
        "Red" if an incident is flagged, the compliance band is "Poor", or
        completeness is below 50; otherwise "Amber" if the band is "Watch",
        completeness is below 80, or the variation flag is set; otherwise
        "Green".
    """
    # Red rules take priority over every Amber rule.
    if (
        row["incident_flag"]
        or row["compliance_band"] == "Poor"
        or row["data_completness_pct"] < 50
    ):
        return "Red"

    if (
        row["compliance_band"] == "Watch"
        or row["data_completness_pct"] < 80
        or row["variation_flag"]
    ):
        return "Amber"

    return "Green"

# Evaluate the status rules once per row
df["review_status"] = df.apply(review_status, axis="columns")

# Anything that is not Green (i.e. Amber or Red) needs a follow-up action
df["follow_up_required"] = df["review_status"].isin(["Amber", "Red"])


In [13]:
# Explain, in plain language, which checks a record tripped
def build_notes(row):
    """Return a "; "-separated list of the reasons a record was flagged.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "incident_flag", "compliance_band",
        "data_completness_pct", "variation_flag" and, when the variation
        flag is set, "variation_z".

    Returns
    -------
    str
        The applicable reasons joined with "; ", or "No issues flagged" when
        none apply. The variation message names the metric held in the
        module-level ``baseline_metric`` variable.
    """
    notes = []

    if row["incident_flag"]:
        notes.append("Incident recorded")

    band = row["compliance_band"]
    if band == "Poor":
        notes.append("Low compliance score")
    elif band == "Watch":
        notes.append("Moderate compliance score")

    completeness = row["data_completness_pct"]
    if completeness < 80:
        notes.append(f"Incomplete monitoring ({completeness}%)")

    # Only report the z-score when there is an actual value to show.
    if row["variation_flag"] and pd.notna(row["variation_z"]):
        notes.append(
            f"Abnormal {baseline_metric} variation (z={row['variation_z']:.2f})"
        )

    # An empty join yields "", which falls through to the default message.
    return "; ".join(notes) or "No issues flagged"

# Generate the explanatory note for every row
df["review_notes"] = df.apply(build_notes, axis="columns")


In [14]:
# Assemble the compliance review table for reporting / SQL ingestion:
# record identifiers, data-quality figures, risk indicators and the review
# outcome. The .copy() detaches the table from df, so later changes to one
# do not affect the other.
compliance_review = df.loc[
    :,
    [
        # identifiers
        "env_id", "Site", "Year", "Month",
        # data quality
        "data_completness_pct", "missing_metric_count",
        # incidents and compliance
        "Environmental_incidents", "incident_flag",
        "Compliance_score", "compliance_band",
        # variation check and review outcome
        "variation_flag", "review_status", "follow_up_required", "review_notes",
    ],
].copy()


In [15]:
# Sort for readability and reporting consistency, then number the rows.
#
# Any review_id left over from a previous run of this cell is dropped first:
# DataFrame.insert raises ValueError when the column already exists, so without
# this step the cell could only ever be executed once per kernel session.
# On a first run the drop is a no-op (errors="ignore").
compliance_review = (
    compliance_review
    .drop(columns="review_id", errors="ignore")
    .sort_values(["Year", "Month", "Site"])
    .reset_index(drop=True)
)

# Add a surrogate primary key for SQL usage (1-based, following the sort order)
compliance_review.insert(0, "review_id", range(1, len(compliance_review) + 1))

In [16]:
# Sanity check: how many records fall into each review status
# NOTE(review): the output saved with this cell shows 102 Red, 9 Green and
# 2 Amber - confirm the Red rules are meant to capture that large a share.
compliance_review.loc[:, "review_status"].value_counts()

review_status
Red      102
Green      9
Amber      2
Name: count, dtype: int64

In [17]:
# Sanity check: env_id must identify each record exactly once
compliance_review.loc[:, "env_id"].is_unique

True

In [19]:
# Correct the misspelled completeness column ("completness") before export.
#
# The rename has to be applied to compliance_review as well as df:
# compliance_review is an independent copy taken from df earlier, and it is the
# table that gets exported, so renaming df alone leaves the typo in the output.
# rename() ignores labels that are not present, so re-running this cell is safe.
#
# NOTE: after this cell, df no longer has "data_completness_pct"; the earlier
# cells that read that name from df need a fresh df before being re-run.
column_fixes = {"data_completness_pct": "data_completeness_pct"}
df = df.rename(columns=column_fixes)
compliance_review = compliance_review.rename(columns=column_fixes)

In [21]:
# Write the compliance review table to CSV in the working directory,
# leaving out the DataFrame index.
compliance_review.to_csv(path_or_buf="compliance_review.csv", index=False)