In [1]:
import pandas as pd 
import numpy as np 

df_env = pd.read_csv("env_2023_2025_final_sql.csv")

In [2]:
#Working on a copy not the OG 
df = df_env.copy()

In [4]:
recorded_cols = [
    "Energy_kWh_recorded", 
    "Water_m3_recorded", 
    "Waste_tonnes_recorded", 
    "CO2_tonnes_recorded", 
    "Compliance_score_recorded", 
    "Environmental_incidents_recorded", 
] 

In [6]:
df["recorded_count"] = df[recorded_cols].sum(axis=1) 
df["total_expected_metrics"] = len(recorded_cols)
df["data_completness_pct"] = ( 
    df["recorded_count"] / df["total_expected_metrics"] * 100 
).round(1)

df["missing_metric_count"] = (
    df["total_expected_metrics"] - df["recorded_count"] 
    )

In [7]:
df["incident_flag"] = df["Environmental_incidents"].fillna(0).gt(0)

In [9]:
def compliance_band(score): 
    if pd.isna(score): 
        return "Unknown" 
    if score >= 80: 
        return "Good"
    if score >= 60: 
        return "Watch" 
    return "Poor" 

df["compliance_band"] = df["Compliance_score"].apply(compliance_band) 

In [11]:
baseline_metric = "CO2_tonnes" 

site_mean = df.groupby("Site")[baseline_metric].transform("mean")
site_std = df.groupby("Site")[baseline_metric].transform("std").replace(0, np.nan)

df["variation_z"] = (df[baseline_metric] - site_mean) / site_std
df["variation_flag"] = df["variation_z"].abs().ge(2).fillna(False) 

In [13]:
def review_status(row):
    if row["incident_flag"]:
        return "Red"
    if row["compliance_band"] == "Poor":
        return "Red"
    if row["data_completness_pct"] < 50:
        return "Red"

    if row["compliance_band"] == "Watch":
        return "Amber"
    if row["data_completness_pct"] < 80:
        return "Amber"
    if row["variation_flag"]:
        return "Amber"

    return "Green"

df["review_status"] = df.apply(review_status, axis=1)
df["follow_up_required"] = df["review_status"].isin(["Amber", "Red"])


In [14]:
def build_notes(row):
    reasons = []
    if row["incident_flag"]:
        reasons.append("Incident recorded")
    if row["compliance_band"] == "Poor":
        reasons.append("Low compliance score")
    if row["compliance_band"] == "Watch":
        reasons.append("Moderate compliance score")
    if row["data_completness_pct"] < 80:
        reasons.append(f"Incomplete monitoring ({row['data_completness_pct']}%)")
    if row["variation_flag"] and not pd.isna(row["variation_z"]):
        reasons.append(
            f"Abnormal {baseline_metric} variation (z={row['variation_z']:.2f})"
        )
    return "; ".join(reasons) if reasons else "No issues flagged"

df["review_notes"] = df.apply(build_notes, axis=1)


In [16]:
compliance_review = df[
    [
        "env_id",
        "Site", "Year", "Month",
        "data_completness_pct",
        "missing_metric_count",
        "Environmental_incidents",
        "incident_flag",
        "Compliance_score",
        "compliance_band",
        "variation_flag",
        "review_status",
        "follow_up_required",
        "review_notes",
    ]
].copy()


In [17]:
compliance_review = compliance_review.sort_values(
    ["Year", "Month", "Site"]
).reset_index(drop=True)

compliance_review.insert(0, "review_id", range(1, len(compliance_review) + 1))

In [20]:
compliance_review["review_status"].value_counts()


review_status
Red      75
Green     3
Amber     1
Name: count, dtype: int64

In [21]:
compliance_review["env_id"].is_unique

True

In [22]:
compliance_review

Unnamed: 0,review_id,env_id,Site,Year,Month,data_completness_pct,missing_metric_count,Environmental_incidents,incident_flag,Compliance_score,compliance_band,variation_flag,review_status,follow_up_required,review_notes
0,1,1,A,2023,1,83.3,1,0,False,0.0,Poor,False,Red,True,Low compliance score
1,2,2,C,2023,1,83.3,1,2,True,0.0,Poor,False,Red,True,Incident recorded; Low compliance score
2,3,3,D,2023,1,83.3,1,2,True,0.0,Poor,False,Red,True,Incident recorded; Low compliance score
3,4,4,E,2023,1,83.3,1,2,True,0.0,Poor,False,Red,True,Incident recorded; Low compliance score
4,5,5,B,2023,2,83.3,1,0,False,0.0,Poor,False,Red,True,Low compliance score
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,75,75,A,2025,6,100.0,0,0,False,72.9,Watch,False,Amber,True,Moderate compliance score
75,76,76,A,2025,7,100.0,0,0,False,84.5,Good,False,Green,False,No issues flagged
76,77,77,A,2025,8,100.0,0,3,True,79.1,Watch,False,Red,True,Incident recorded; Moderate compliance score
77,78,78,A,2025,10,100.0,0,3,True,75.2,Watch,False,Red,True,Incident recorded; Moderate compliance score


In [23]:
df = df.rename(columns={"data_completness_pct": "data_completeness_pct"})

In [24]:
df

Unnamed: 0,env_id,Site,Year,Month,Energy_kWh,Water_m3,Waste_tonnes,CO2_tonnes,Environmental_incidents,Recycled_percent,...,total_expected_metrics,data_completeness_pct,missing_metric_count,incident_flag,compliance_band,variation_z,variation_flag,review_status,follow_up_required,review_notes
0,1,A,2023,1,804.8,306.7,32.90,56.20,0,0.0,...,6,83.3,1,False,Poor,-0.503234,False,Red,True,Low compliance score
1,2,C,2023,1,1871.3,864.2,73.60,159.60,2,0.0,...,6,83.3,1,True,Poor,-0.168228,False,Red,True,Incident recorded; Low compliance score
2,3,D,2023,1,916.5,307.1,40.70,194.80,2,0.0,...,6,83.3,1,True,Poor,-0.294241,False,Red,True,Incident recorded; Low compliance score
3,4,E,2023,1,772.1,378.3,39.90,70.80,2,0.0,...,6,83.3,1,True,Poor,-0.760985,False,Red,True,Incident recorded; Low compliance score
4,5,B,2023,2,1348.9,372.3,27.90,88.40,0,0.0,...,6,83.3,1,False,Poor,-1.131729,False,Red,True,Low compliance score
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,75,A,2025,6,5941.0,676.0,36.98,5.89,0,44.8,...,6,100.0,0,False,Watch,-0.863416,False,Amber,True,Moderate compliance score
75,76,A,2025,7,3988.0,703.0,35.22,8.47,0,46.7,...,6,100.0,0,False,Good,-0.844945,False,Green,False,No issues flagged
76,77,A,2025,8,5730.0,1019.0,32.73,15.94,3,45.9,...,6,100.0,0,True,Watch,-0.791465,False,Red,True,Incident recorded; Moderate compliance score
77,78,A,2025,10,8739.0,1735.0,48.06,19.24,3,31.8,...,6,100.0,0,True,Watch,-0.767840,False,Red,True,Incident recorded; Moderate compliance score


In [25]:
compliance_review.to_csv("compliance_review.csv", index=False)