# ERA5 vs Station Performance Analysis

This notebook evaluates the performance of ERA5 daily temperature against station observations
under different environmental conditions, using the file `stations_daily_with_features.csv`.

We compute and visualise metrics for:
- Distance to sea (C.1)
- Wind speed and direction (C.2, 2022–2023)
- Rainfall regimes (C.3, 2020–2023)

The error is defined as **ERA5 − station**, so:
- Negative values → ERA5 underestimates station temperature.
- Positive values → ERA5 overestimates station temperature.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Load the daily station table (dates parsed on read) and keep only the rows
# that actually carry an error value; every metric below is computed on it.
df = (
    pd.read_csv("stations_daily_with_features.csv", parse_dates=["date"])
      .dropna(subset=["error"])
      .copy()
)

def compute_error_metrics(group: pd.Series) -> pd.Series:
    """Summarise a set of errors with bias, spread, RMSE and sample count.

    Missing values are dropped first, so one NaN no longer turns every metric
    into NaN and ``n`` reports the number of samples actually used.

    Parameters
    ----------
    group : pd.Series
        Error values (one group of a ``groupby``, or any numeric Series).

    Returns
    -------
    pd.Series
        Indexed by ``error_mean``, ``error_std`` (sample standard deviation,
        ``ddof=1``), ``error_rmse`` and ``n``. ``error_mean`` and
        ``error_rmse`` are NaN when no valid sample is left; ``error_std`` is
        NaN when fewer than two valid samples are left.
    """
    arr = group.dropna().to_numpy(dtype=float)
    n = arr.size
    # Guard the tiny-group cases explicitly: numpy would otherwise emit
    # RuntimeWarnings (mean of empty slice, degrees of freedom <= 0).
    return pd.Series({
        "error_mean": np.mean(arr) if n else np.nan,
        "error_std": np.std(arr, ddof=1) if n > 1 else np.nan,
        "error_rmse": np.sqrt(np.mean(arr**2)) if n else np.nan,
        "n": n
    })


## C.1 – Metrics by distance to the sea

In [None]:
# Distance-to-sea classes in km. pd.cut builds right-closed intervals, so a
# station at exactly 10 km falls in the first class and one at exactly 50 km in
# the second; include_lowest=True keeps stations located at exactly 0 km.
bins = [0, 10, 50, np.inf]
labels = ["< 10 km", "10–50 km", "> 50 km"]
df["dist_cat"] = pd.cut(df["distance_to_sea_km"], bins=bins, labels=labels, include_lowest=True)

# dist_cat is categorical: pass observed=True explicitly so the result does not
# depend on the pandas version (the implicit default raises a FutureWarning on
# recent releases) and only classes that actually contain data are summarised.
dist_summary = (
    df.dropna(subset=["dist_cat"])
      .groupby("dist_cat", observed=True)["error"]
      .apply(compute_error_metrics)
      .unstack()
)
dist_summary


In [None]:
# One row per station: descriptive columns take the first value seen for that
# station, the error is averaged, and n is the number of rows for the station.
station_summary = df.groupby("station_id").agg(
    name=pd.NamedAgg(column="name", aggfunc="first"),
    env_class=pd.NamedAgg(column="env_class", aggfunc="first"),
    distance_to_sea_km=pd.NamedAgg(column="distance_to_sea_km", aggfunc="first"),
    error_mean_station=pd.NamedAgg(column="error", aggfunc="mean"),
    n=pd.NamedAgg(column="error", aggfunc="size"),
)

# Box plot of the daily errors in each distance class (fliers hidden so the
# boxes stay readable).
fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
data_for_box = [df.loc[df["dist_cat"] == lab, "error"].dropna() for lab in labels]
# Tick labels are set on the axis instead of through boxplot's `labels=`
# keyword, which is deprecated since Matplotlib 3.9 (renamed `tick_labels`);
# set_xticklabels works on both older and newer releases.
ax.boxplot(data_for_box, showfliers=False)
ax.set_xticklabels(labels)
ax.set_xlabel("Distance to sea category")
ax.set_ylabel("Temperature error (ERA5 − station) [°C]")
ax.set_title("Error vs distance to sea")
plt.tight_layout()

# Per-station mean error against distance to the sea, one marker series per
# environment class; the dashed line marks zero bias.
fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
for env_name, env_stations in station_summary.reset_index().groupby("env_class"):
    ax.scatter(
        env_stations["distance_to_sea_km"],
        env_stations["error_mean_station"],
        label=env_name,
        alpha=0.8,
    )
ax.axhline(0, linestyle="--")
ax.set(
    xlabel="Distance to sea [km]",
    ylabel="Mean temperature error (ERA5 − station) [°C]",
    title="Station bias vs distance to sea",
)
ax.legend(title="Env class")
plt.tight_layout()


## C.2 – Metrics by wind (2022–2023)

In [None]:
# Wind analysis window: calendar years 2022–2023, as stated in the section
# heading and in the plot titles. The half-open upper bound keeps any record
# dated 2024 or later out of the sample, which the original one-sided filter
# did not.
df_ws = df[(df["date"] >= "2022-01-01") & (df["date"] < "2024-01-01")].copy()
# Rows without a wind-speed value cannot be assigned to a regime.
df_ws = df_ws[~df_ws["WS"].isna()].copy()

# Quartiles of wind speed over the window; they delimit the speed regimes.
p25, p75 = df_ws["WS"].quantile([0.25, 0.75])

def categorize_ws(ws):
    """Label one wind-speed value as "weak", "medium" or "strong".

    Values strictly below the module-level ``p25`` are "weak", values strictly
    above ``p75`` are "strong", and everything else (both thresholds included)
    is "medium".
    """
    if ws < p25:
        return "weak"
    if ws > p75:
        return "strong"
    return "medium"

# Assign every record to a wind-speed regime and summarise the error per regime.
df_ws["WS_cat"] = df_ws["WS"].apply(categorize_ws)
ws_order = ["weak", "medium", "strong"]

ws_summary = (
    df_ws.groupby("WS_cat")["error"]
         .apply(compute_error_metrics)
         .unstack()
)
# reindex rather than .loc: if a regime has no samples (e.g. many tied wind
# speeds sitting exactly on a quartile) it is shown as a NaN row instead of
# raising a KeyError. With all three regimes present the output is the same.
ws_summary.reindex(ws_order)


In [None]:
# Box plot of the daily errors per wind-speed regime (fliers hidden).
fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
data_ws_box = [df_ws.loc[df_ws["WS_cat"] == cat, "error"].dropna() for cat in ws_order]
# Tick labels are set on the axis instead of through boxplot's `labels=`
# keyword, which is deprecated since Matplotlib 3.9 (renamed `tick_labels`);
# set_xticklabels works on both older and newer releases.
ax.boxplot(data_ws_box, showfliers=False)
ax.set_xticklabels(ws_order)
ax.set_xlabel("Wind speed category")
ax.set_ylabel("Temperature error (ERA5 − station) [°C]")
ax.set_title("Error vs wind speed regime (2022–2023)")
plt.tight_layout()

# Error metrics per wind-direction regime, using only the records of the wind
# sample that have a regime assigned.
df_ws_dir = df_ws[df_ws["wind_regime"].notna()].copy()
wind_regime_summary = (
    df_ws_dir
    .groupby("wind_regime")["error"]
    .apply(compute_error_metrics)
    .unstack()
)
wind_regime_summary


In [None]:
# Bar chart of the mean error for each wind-direction regime; the dashed line
# marks zero bias.
fig, ax = plt.subplots(figsize=(7, 4), dpi=120)
regime_names = wind_regime_summary.index
regime_bias = wind_regime_summary["error_mean"]
ax.bar(regime_names, regime_bias)
ax.axhline(0, linestyle="--")
ax.set(
    xlabel="Wind direction regime",
    ylabel="Mean temperature error [°C]",
    title="Mean error by wind direction (2022–2023)",
)
plt.xticks(rotation=45)
plt.tight_layout()


## C.3 – Metrics by rainfall (2020–2023)

In [None]:
# Keep only the records that have a precipitation value, then split them into
# "dry" (precip exactly 0) and "wet" (anything else).
df_rain = df.dropna(subset=["precip"]).copy()
is_dry_day = df_rain["precip"] == 0
df_rain["rain_cat"] = np.where(is_dry_day, "dry", "wet")

# Error metrics for each of the two rainfall categories.
rain_summary = (
    df_rain
    .groupby("rain_cat")["error"]
    .apply(compute_error_metrics)
    .unstack()
)
rain_summary


In [None]:
# Heavy-rain days: precipitation above the 90th percentile of the wet-day
# (precip > 0) distribution.
p90_rain = df_rain.loc[df_rain["precip"] > 0, "precip"].quantile(0.9)
df_heavy = df_rain[df_rain["precip"] > p90_rain].copy()

if not df_heavy.empty:
    heavy_metrics = compute_error_metrics(df_heavy["error"])
    # A bare expression inside an `if` block is never rendered by the notebook
    # (only the last top-level expression of a cell is), so show it explicitly.
    # `display` is provided by the IPython/Jupyter kernel.
    display(heavy_metrics)

# Box plot of the daily errors on dry versus wet days (fliers hidden).
fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
data_rain_box = [df_rain.loc[df_rain["rain_cat"] == lab, "error"].dropna()
                 for lab in ["dry", "wet"]]
# Tick labels are set on the axis instead of through boxplot's `labels=`
# keyword, which is deprecated since Matplotlib 3.9 (renamed `tick_labels`);
# set_xticklabels works on both older and newer releases.
ax.boxplot(data_rain_box, showfliers=False)
ax.set_xticklabels(["dry", "wet"])
ax.set_xlabel("Rainfall category")
ax.set_ylabel("Temperature error (ERA5 − station) [°C]")
ax.set_title("Error on dry vs wet days")
plt.tight_layout()
