# Spanish Energy Load Forecasting with Weather

This notebook focuses specifically on the Spanish **Hourly energy demand, generation and weather** dataset from Kaggle.

We will:
- Load and clean the Spanish dataset (load + weather).
- Perform time-series EDA (decomposition, anomalies, load vs temperature).
- Train SARIMAX (with exogenous temperature).
- Train a Gradient Boosting model with lagged load, temperature and calendar features.
- Build simple probabilistic forecasts (quantile GB and a conformal-style residual band) and a champion–challenger comparison.


## 0. Setup and Data Loading

We expect the original Kaggle file `energy_dataset.csv` at:

`data/spain/energy_dataset.csv`

If it lives elsewhere, adjust `RAW_PATH` below.


In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Plot styling shared by every figure in the notebook.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (11, 5)})

# Seed NumPy's global RNG; RANDOM_STATE is also handed to the estimators
# and to DataFrame.sample further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Location of the raw Kaggle export; edit if the file lives elsewhere.
RAW_PATH = Path("data", "spain", "energy_dataset.csv")

# Stop early with a readable message when the file is not where we expect.
if not RAW_PATH.exists():
    missing_msg = (
        f"Expected raw Spanish dataset at {RAW_PATH.resolve()}\n"
        "Place 'energy_dataset.csv' there or adjust RAW_PATH."
    )
    raise FileNotFoundError(missing_msg)

raw_df = pd.read_csv(RAW_PATH)
raw_df.head()

### 0.1 Clean and standardise columns

We standardise to:
- `timestamp` (index)
- `load` – from `total load actual`
- `temperature` – row-wise mean across temperature-like columns (any column whose name contains `temp`); all-NaN if the raw file has no such column


In [None]:
def build_spain_energy_frame(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, gap-free hourly frame of Spanish load and temperature.

    The ``time`` column is parsed with ``utc=True`` so that rows carrying
    different UTC offsets (e.g. local stamps that switch offset at a
    daylight-saving change) land on one consistent timeline; without it such
    a column does not parse to a single datetime index. The timezone is then
    dropped, so the returned index is timezone-naive and expressed in UTC.
    Timezone-naive inputs are kept exactly as given.

    Parameters
    ----------
    raw:
        Frame with a ``'time'`` column, a ``'total load actual'`` column and,
        optionally, one or more columns whose name contains ``temp``.

    Returns
    -------
    pd.DataFrame
        Hourly-indexed frame with two columns: ``load`` (linearly
        interpolated, edges filled) and ``temperature`` (row-wise mean of the
        temperature-like columns, interpolated the same way; all-NaN when the
        input has no such column).

    Raises
    ------
    ValueError
        If a required column is missing or no timestamp can be parsed.
    """
    required = ("time", "total load actual")
    if any(col not in raw.columns for col in required):
        raise ValueError("Missing 'time' or 'total load actual' in raw data.")

    df = raw.copy()
    stamps = pd.to_datetime(df["time"], utc=True, errors="coerce")
    df["timestamp"] = stamps.dt.tz_localize(None)
    # Stable sort so that "keep first" below means first in the input order.
    df = df.dropna(subset=["timestamp"]).sort_values("timestamp", kind="mergesort")
    if df.empty:
        raise ValueError("No parsable timestamps in 'time' column.")
    # asfreq() reindexes and cannot do so on an index with repeated labels.
    df = df.drop_duplicates(subset="timestamp", keep="first")

    df["load"] = df["total load actual"].astype(float)

    temp_cols = [c for c in df.columns if "temp" in c.lower()]
    if temp_cols:
        df["temperature"] = df[temp_cols].astype(float).mean(axis=1)
    else:
        df["temperature"] = np.nan

    # An offset object avoids depending on the spelling of the hourly alias.
    df = df.set_index("timestamp").asfreq(pd.offsets.Hour())
    df["load"] = df["load"].interpolate(limit_direction="both")
    if df["temperature"].notna().any():
        df["temperature"] = df["temperature"].interpolate(limit_direction="both")

    return df[["load", "temperature"]]


# Cleaned hourly frame that the remaining cells work from.
df = build_spain_energy_frame(raw=raw_df)
df.head(5)

### 0.2 Save a reusable cleaned file (optional)


In [None]:
clean_path = Path("data", "energy.csv")
# Create the target folder if needed; harmless when it already exists.
clean_path.parent.mkdir(exist_ok=True, parents=True)
df.to_csv(path_or_buf=clean_path)
print("Saved cleaned Spanish data to", clean_path)

### 0.3 Helper metrics


In [None]:
def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the root mean squared error between targets and predictions.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)`` and
    converted to a built-in ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)


def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error, in percent.

    Each absolute error is divided by the magnitude of the true value. The
    magnitude is floored at ``1e-6`` so a true value of zero does not divide
    by zero. Taking the absolute value before flooring matters: clipping the
    signed value would turn every negative true value into a ``1e-6``
    denominator and blow the score up.

    Parameters
    ----------
    y_true, y_pred:
        Array-likes of equal shape (lists and pandas Series are accepted).

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / max(|y_true|, 1e-6)) * 100``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(actual), 1e-6)
    return float(np.mean(np.abs(actual - predicted) / denom) * 100)


# Scoring the series against itself: the error should come out as zero.
_load_self = df["load"].to_numpy()
print("Sanity RMSE (self):", rmse(_load_self, _load_self))

## 1. Tier 1 – EDA and Decomposition


In [None]:
# Whole series at a glance.
ax_full = df["load"].plot(alpha=0.8)
ax_full.set_title("Spanish energy load – full series")
ax_full.set_ylabel("Load")
plt.show()

# Zoom on the final 60 days of hourly rows.
last_period = df.tail(24 * 60)
ax_zoom = last_period["load"].plot()
ax_zoom.set_title("Spanish load – last ~60 days")
ax_zoom.set_ylabel("Load")
plt.show()

# Additive decomposition with a 24-hour seasonal period.
decomp = seasonal_decompose(df["load"].dropna(), model="additive", period=24)
fig = decomp.plot()
fig.set_size_inches(11, 8)
plt.tight_layout()
plt.show()

# The components are written onto df itself, so every later cell that reads
# df sees these three extra columns.
for part in ("trend", "seasonal", "resid"):
    df[part] = getattr(decomp, part)
df[["load", "trend", "seasonal", "resid"]].head()

### 1.1 Calendar profiles and anomalies


In [None]:
# Calendar columns go on a copy, leaving df as it is.
cal_df = df.assign(hour=df.index.hour, dayofweek=df.index.dayofweek)

# Mean load by hour of day, then by day of week.
for calendar_col, profile_title in (
    ("hour", "Average daily load profile (Spain)"),
    ("dayofweek", "Average weekly load profile (Spain)"),
):
    sns.lineplot(data=cal_df, x=calendar_col, y="load", estimator="mean", ci=None)
    plt.title(profile_title)
    plt.show()

# Anomaly detection
# Statistical rule: decomposition residual further than 3 standard deviations
# from zero.
resid = df["resid"].dropna()
resid_std = resid.std()
stat_mask = resid.abs() > 3 * resid_std
stat_anoms = resid[stat_mask]
# Simple rule: negative load, or load above its own 99.5th percentile.
load_ceiling = df["load"].quantile(0.995)
rule_mask = (df["load"] < 0) | (df["load"] > load_ceiling)
rule_anoms = df.loc[rule_mask, "load"]

plt.figure(figsize=(12, 5))
plt.plot(df.index, df["load"], label="load", alpha=0.6)
plt.scatter(stat_anoms.index, df.loc[stat_anoms.index, "load"], color="red", label="stat", s=10)
plt.scatter(rule_anoms.index, rule_anoms.values, color="orange", label="rule", s=10)
plt.legend()
plt.title("Anomalies – Spanish load")
plt.show()

### 1.2 Load vs temperature and cross-correlation


In [None]:
if df["temperature"].notna().any():
    # Size the scatter sample from the rows that survive dropna(): sample()
    # raises when asked for more rows than the frame holds, so the cap must
    # be taken against the frame actually being sampled, not against df.
    paired = df[["load", "temperature"]].dropna()
    tmp = paired.sample(min(8000, len(paired)), random_state=RANDOM_STATE)
    sns.scatterplot(data=tmp, x="temperature", y="load", alpha=0.3)
    plt.title("Spanish load vs temperature (sample)")
    plt.show()

    # Pearson correlation between the two series at shifts of up to 72 hours
    # in either direction.
    max_lag = 72
    load_vals = df["load"].interpolate(limit_direction="both").to_numpy()
    temp_vals = df["temperature"].interpolate(limit_direction="both").to_numpy()
    lags = np.arange(-max_lag, max_lag + 1)
    ccs: List[float] = []
    for lag in lags:
        # lag < 0: temp[t] is paired with load[t + |lag|] (temperature leads).
        # lag > 0: temp[t + lag] is paired with load[t] (load leads).
        if lag < 0:
            x = temp_vals[:lag]
            y = load_vals[-lag:]
        elif lag > 0:
            x = temp_vals[lag:]
            y = load_vals[:-lag]
        else:
            x = temp_vals
            y = load_vals
        ccs.append(float(np.corrcoef(x, y)[0, 1]))

    plt.plot(lags, ccs)
    plt.axhline(0, color="black", linewidth=0.8)
    plt.title("Temperature–load cross-correlation (Spain)")
    plt.xlabel("Lag (hours, negative = temp leads)")
    plt.ylabel("Correlation")
    plt.show()
else:
    print("No temperature data – skipping weather analysis.")

## 2. Tier 2 – Modelling (SARIMAX + GB with weather)


In [None]:
# Hold out the final 30 days of hourly rows as the test window.
horizon_days = 30
horizon = horizon_days * 24
if not len(df) > 2 * horizon:
    raise ValueError("Not enough data for 30-day test horizon.")

# Positive split position; the guard above guarantees it is past zero.
split_at = len(df) - horizon
train_df = df.iloc[:split_at].copy()
test_df = df.iloc[split_at:].copy()

for span_label, span in (("Train:", train_df), ("Test: ", test_df)):
    print(span_label, span.index.min(), "→", span.index.max())

In [None]:
def fit_sarimax(train: pd.DataFrame, test: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """Fit a seasonal ARIMA on ``load`` and forecast across the test window.

    ``temperature`` is supplied as an exogenous regressor when the training
    frame holds at least one non-missing temperature value; otherwise the
    model is fitted without exogenous inputs.

    Returns
    -------
    tuple of np.ndarray
        ``(fitted, forecast)``: the in-sample fitted values for ``train`` and
        a forecast of ``len(test)`` steps.
    """
    use_temperature = bool(train["temperature"].notna().any())
    exog_train = train[["temperature"]] if use_temperature else None
    exog_test = test[["temperature"]] if use_temperature else None

    # (1, 0, 2) non-seasonal order, (1, 1, 1) seasonal order with period 24.
    spec = dict(
        order=(1, 0, 2),
        seasonal_order=(1, 1, 1, 24),
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    results = SARIMAX(train["load"].astype(float), exog=exog_train, **spec).fit(disp=False)

    in_sample = results.fittedvalues.to_numpy()
    out_of_sample = results.forecast(steps=len(test), exog=exog_test).to_numpy()
    return in_sample, out_of_sample


sarimax_fitted, sarimax_forecast = fit_sarimax(train=train_df, test=test_df)

# In-sample fit error first, then error over the held-out window.
for rmse_label, frame, estimate in (
    ("SARIMAX train RMSE:", train_df, sarimax_fitted),
    ("SARIMAX test RMSE: ", test_df, sarimax_forecast),
):
    print(rmse_label, rmse(frame["load"].to_numpy(), estimate))

plt.plot(train_df.index, train_df["load"], alpha=0.6, label="train")
plt.plot(test_df.index, test_df["load"], alpha=0.8, label="test")
plt.plot(test_df.index, sarimax_forecast, linestyle="--", label="SARIMAX")
plt.title("Spanish load – SARIMAX forecast vs actual")
plt.legend()
plt.show()

In [None]:
def make_lagged_features(df_in: pd.DataFrame, n_lags: int = 24) -> pd.DataFrame:
    """Create lag, calendar and temperature features.

    Parameters
    ----------
    df_in:
        Frame with a ``DatetimeIndex`` and a ``load`` column; a
        ``temperature`` column is used when present. The input is not
        modified.
    n_lags:
        Number of lagged copies of ``load`` to add (``lag_1`` .. ``lag_n``).

    Returns
    -------
    pd.DataFrame
        Copy of ``df_in`` with ``lag_*``, ``hour`` and ``dayofweek`` columns
        added and every row containing a missing value removed. A
        ``temperature`` column that has gaps is interpolated; one that is
        entirely missing is dropped, because keeping it would make the final
        ``dropna()`` discard every row.
    """
    df_feat = df_in.copy()
    lag_cols = {
        f"lag_{lag}": df_feat["load"].shift(lag) for lag in range(1, n_lags + 1)
    }
    df_feat = df_feat.assign(**lag_cols)
    df_feat["hour"] = df_feat.index.hour
    df_feat["dayofweek"] = df_feat.index.dayofweek

    if "temperature" in df_feat.columns:
        if df_feat["temperature"].notna().any():
            df_feat["temperature"] = df_feat["temperature"].interpolate(limit_direction="both")
        else:
            df_feat = df_feat.drop(columns="temperature")

    return df_feat.dropna()


# Build features from load and temperature only. By this point df also
# carries the "trend", "seasonal" and "resid" columns written by the
# decomposition cell. Those are computed from the load itself, so passing
# them on as features would leak the target into the model inputs.
lagged_df = make_lagged_features(df[["load", "temperature"]], n_lags=24)

# Split by timestamp so the feature rows line up with the earlier
# train/test windows (rows removed by dropna() simply fall out).
lagged_train = lagged_df.loc[train_df.index.intersection(lagged_df.index)]
lagged_test = lagged_df.loc[test_df.index.intersection(lagged_df.index)]

# NOTE: the lag columns of the test rows hold observed loads, so predictions
# made from X_test are one-step-ahead rather than a multi-step forecast.
feature_cols = [c for c in lagged_train.columns if c != "load"]
X_train = lagged_train[feature_cols].to_numpy()
y_train = lagged_train["load"].to_numpy()
X_test = lagged_test[feature_cols].to_numpy()
y_test = lagged_test["load"].to_numpy()

X_train.shape, X_test.shape

In [None]:
def evaluate_gb_cv(X: np.ndarray, y: np.ndarray, n_splits: int = 5) -> pd.DataFrame:
    """Score a Gradient Boosting regressor with ``TimeSeriesSplit`` folds.

    A fresh model is fitted on the training part of each fold and scored on
    its validation part.

    Returns
    -------
    pd.DataFrame
        One row per fold with columns ``fold`` (numbered from 1), ``rmse``,
        ``mae`` and ``mape``.
    """
    gb_params = dict(
        random_state=RANDOM_STATE,
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
    )
    splitter = TimeSeriesSplit(n_splits=n_splits)

    def _score_fold(fold_no: int, fit_idx: np.ndarray, val_idx: np.ndarray) -> Dict[str, float]:
        # Fit on the fold's training rows, then score on its validation rows.
        booster = GradientBoostingRegressor(**gb_params)
        booster.fit(X[fit_idx], y[fit_idx])
        observed = y[val_idx]
        predicted = booster.predict(X[val_idx])
        return {
            "fold": fold_no,
            "rmse": rmse(observed, predicted),
            "mae": mean_absolute_error(observed, predicted),
            "mape": mape(observed, predicted),
        }

    records = [
        _score_fold(fold_no, fit_idx, val_idx)
        for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(X), start=1)
    ]
    return pd.DataFrame(records)


# Cross-validate on the training rows only; the test window is not touched.
cv_metrics = evaluate_gb_cv(X=X_train, y=y_train, n_splits=5)
cv_metrics

In [None]:
# Long format: one row per (fold, metric) pair for the box plot.
cv_long = cv_metrics.melt(id_vars="fold", value_vars=["rmse", "mape"])
sns.boxplot(data=cv_long, x="variable", y="value")
plt.title("GB – rolling-origin CV errors (Spain)")
plt.ylabel("Error")
plt.show()

# Final model: same settings as the CV folds, fitted on all training rows.
gbr_settings = {
    "random_state": RANDOM_STATE,
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
}
gbr_model = GradientBoostingRegressor(**gbr_settings)
gbr_model.fit(X_train, y_train)
y_pred_test = gbr_model.predict(X_test)

for metric_label, metric_fn in (
    ("GBR test RMSE:", rmse),
    ("GBR test MAE: ", mean_absolute_error),
    ("GBR test MAPE:", mape),
):
    print(metric_label, metric_fn(y_test, y_pred_test))

plt.plot(lagged_test.index, y_test, alpha=0.8, label="actual")
plt.plot(lagged_test.index, y_pred_test, linestyle="--", label="GBR")
plt.title("Spanish load – GBR forecast vs actual")
plt.legend()
plt.show()

## 3. Tier 3 – Probabilistic Forecasts and Champion–Challenger


In [None]:
# Quantile Gradient Boosting: one model per quantile level, all sharing the
# same tree settings.
quantiles = [0.1, 0.5, 0.9]
shared_gb_settings = dict(
    random_state=RANDOM_STATE,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
)
q_models: Dict[float, GradientBoostingRegressor] = {}
for q in quantiles:
    m = GradientBoostingRegressor(loss="quantile", alpha=q, **shared_gb_settings)
    q_models[q] = m.fit(X_train, y_train)

q_preds = {level: fitted.predict(X_test) for level, fitted in q_models.items()}

# Median as a dashed line, with the 0.1-0.9 range shaded around it.
idx = lagged_test.index
lower_q, median_q, upper_q = q_preds[0.1], q_preds[0.5], q_preds[0.9]
plt.figure(figsize=(12, 5))
plt.plot(idx, y_test, color="black", linewidth=1, label="actual")
plt.plot(idx, median_q, linestyle="--", label="q0.5")
plt.fill_between(idx, lower_q, upper_q, alpha=0.3, label="q0.1–0.9")
plt.title("Spanish load – quantile GB forecast")
plt.legend()
plt.show()

In [None]:
# Conformal-style interval from CV residuals
tscv = TimeSeriesSplit(n_splits=5)
# Calibrate with exactly the settings of gbr_model: the band is drawn around
# that model's predictions, so the residuals should come from the same
# model specification rather than from a differently sized ensemble.
calib_params = gbr_model.get_params()
fold_resids: List[np.ndarray] = []
for tr, val in tscv.split(X_train):
    X_tr, X_val = X_train[tr], X_train[val]
    y_tr, y_val = y_train[tr], y_train[val]
    m = GradientBoostingRegressor(**calib_params)
    m.fit(X_tr, y_tr)
    y_hat = m.predict(X_val)
    fold_resids.append(np.abs(y_val - y_hat))

# Pool the out-of-fold absolute errors and take their (1 - alpha) quantile
# as a constant half-width for the band.
abs_resids = np.concatenate(fold_resids)
alpha = 0.1
q_conf = float(np.quantile(abs_resids, 1 - alpha))
print("Approx. 90% absolute error quantile:", q_conf)

lower = y_pred_test - q_conf
upper = y_pred_test + q_conf

plt.figure(figsize=(12, 5))
plt.plot(lagged_test.index, y_test, label="actual", color="black")
plt.plot(lagged_test.index, y_pred_test, label="GBR", linestyle="--")
plt.fill_between(lagged_test.index, lower, upper, alpha=0.3, label="conformal band")
plt.title("Spanish load – GBR with conformal-style band")
plt.legend()
plt.show()

In [None]:
# Champion–challenger
# Baseline: repeat the last training load across the whole test window.
y_test_sarimax = test_df["load"].to_numpy()
naive = np.full(len(y_test_sarimax), train_df["load"].iloc[-1])

def model_metrics(name: str, y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
    """Build one comparison-table row for a model.

    The returned dict holds the model's name under ``"model"`` followed by
    its ``"rmse"``, ``"mae"`` and ``"mape"`` scores on the given arrays.
    """
    scorers = (("rmse", rmse), ("mae", mean_absolute_error), ("mape", mape))
    row = {"model": name}
    row.update((key, scorer(y_true, y_pred)) for key, scorer in scorers)
    return row

# GBR is scored on the trailing rows it shares with the SARIMAX test window.
aligned = min(len(y_test_sarimax), len(y_test))
contenders = (
    ("NaiveLast", y_test_sarimax, naive),
    ("SARIMAX", y_test_sarimax, sarimax_forecast),
    ("GBR", y_test[-aligned:], y_pred_test[-aligned:]),
)
rows: List[Dict[str, float]] = [
    model_metrics(label, truth, guess) for label, truth, guess in contenders
]

champion_df = pd.DataFrame(rows).set_index("model")
champion_df

In [None]:
# Bars ordered from lowest to highest RMSE.
ranked = champion_df.sort_values("rmse")
ranked.plot.bar()
plt.title("Spanish load – champion–challenger (lower is better)")
plt.ylabel("Error")
plt.xticks(rotation=0)
plt.show()