# Spanish Energy Load – Extended Model Zoo

This notebook applies **multiple forecasting models** to the Spanish
dataset (Hourly energy demand, generation and weather – Spain):

- Naïve & seasonal naïve baselines
- Holt–Winters Exponential Smoothing
- SARIMAX with exogenous temperature
- Linear Regression with lagged load + weather + calendar
- Random Forest Regressor
- HistGradientBoostingRegressor
- GradientBoostingRegressor

At the end we build an extended **champion–challenger** comparison.


## 0. Load and clean Spanish dataset

We first try to load `data/energy.csv` (the cleaned Spanish load + temperature file).
If it is missing, we fall back to the raw `data/spain/energy_dataset.csv`, build the cleaned frame from it, and cache the result as `data/energy.csv`.


In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Reproducibility: a single seed for NumPy's global RNG and for the estimators
# that accept a ``random_state``.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input locations: the cleaned file and the raw export it can be rebuilt from.
CLEAN_PATH = Path("data", "energy.csv")
RAW_PATH = Path("data", "spain", "energy_dataset.csv")

# Plot styling shared by the figures in this notebook.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (11, 5)})

def build_spain_energy_frame(raw: pd.DataFrame) -> pd.DataFrame:
    """Return cleaned hourly Spanish load + temperature.

    Expects:
    - 'time' column with timestamps
    - 'total load actual' as target
    - temperature-like columns (name contains 'temp', case-insensitive).

    Processing steps:
    1. Parse 'time' and normalise it to UTC; unparseable rows are dropped.
    2. Drop repeated timestamps (first occurrence wins) and sort by time.
    3. Average all temperature-like columns into one 'temperature' column
       (all-NaN when no such column exists).
    4. Reindex onto a gap-free hourly grid and interpolate 'load' (and
       'temperature' when it has any value) in both directions.

    Args:
        raw: Raw frame containing at least 'time' and 'total load actual'.

    Returns:
        Frame indexed by a UTC hourly ``DatetimeIndex`` with exactly the
        columns ``['load', 'temperature']``.

    Raises:
        ValueError: If 'time' or 'total load actual' is missing.
    """
    if "time" not in raw.columns or "total load actual" not in raw.columns:
        raise ValueError("Missing 'time' or 'total load actual' in raw data.")

    df = raw.copy()

    # utc=True is required for timestamps that carry different UTC offsets
    # (e.g. +01:00 / +02:00 across DST changes): without it pandas cannot build
    # a datetime64 column and the hourly reindex below breaks.
    df["timestamp"] = pd.to_datetime(df["time"], errors="coerce", utc=True)

    # asfreq() refuses to reindex duplicate labels, so de-duplicate before
    # sorting; doing it first keeps "first" meaning first in the raw order.
    df = (
        df.dropna(subset=["timestamp"])
        .drop_duplicates(subset="timestamp", keep="first")
        .sort_values("timestamp")
    )
    df["load"] = df["total load actual"].astype(float)

    # "temperature" contains "temp", so one substring test covers both.
    temp_cols = [c for c in df.columns if "temp" in str(c).lower()]
    if temp_cols:
        df["temperature"] = df[temp_cols].astype(float).mean(axis=1)
    else:
        df["temperature"] = np.nan

    # Offset object instead of the "H" string alias, which newer pandas
    # versions deprecate.
    df = df.set_index("timestamp").asfreq(pd.offsets.Hour())
    df["load"] = df["load"].interpolate(limit_direction="both")
    if df["temperature"].notna().any():
        df["temperature"] = df["temperature"].interpolate(limit_direction="both")

    return df[["load", "temperature"]]


if not CLEAN_PATH.exists():
    # No cleaned file yet: rebuild it from the raw export and cache the result.
    if not RAW_PATH.exists():
        raise FileNotFoundError(f"Neither {CLEAN_PATH} nor {RAW_PATH} were found.")
    raw_df = pd.read_csv(RAW_PATH)
    df = build_spain_energy_frame(raw_df)
    CLEAN_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(CLEAN_PATH)
else:
    # Cleaned file is present: its first column is the timestamp index.
    df = pd.read_csv(CLEAN_PATH, index_col=0, parse_dates=[0])

df.head()

### 0.1 Metrics and train–test split


In [None]:
def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the root mean squared error between *y_true* and *y_pred*.

    The mean squared error is computed by scikit-learn's
    ``mean_squared_error``; its square root is returned as a plain ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))


def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error (in %).

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values, same length as *y_true* (array-like).

    Returns:
        ``mean(|y_true - y_pred| / max(|y_true|, 1e-6)) * 100`` as a float.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Floor the *magnitude* of the actuals: this avoids division by zero
    # without turning negative actuals into a 1e-6 denominator, which would
    # blow the percentage up by many orders of magnitude.
    denominator = np.maximum(np.abs(actual), 1e-6)
    return float(np.mean(np.abs(actual - predicted) / denominator) * 100)


horizon_days = 30
horizon = horizon_days * 24  # hourly data: 24 observations per day

# Require strictly more than two horizons of history before splitting.
if not len(df) > 2 * horizon:
    raise ValueError("Not enough data for 30-day test horizon.")

# Chronological split: the last `horizon` rows are held out for testing.
split_at = len(df) - horizon
train_df = df.iloc[:split_at].copy()
test_df = df.iloc[split_at:].copy()

y_train = train_df["load"].to_numpy()
y_test = test_df["load"].to_numpy()

train_start, train_end = train_df.index.min(), train_df.index.max()
test_start, test_end = test_df.index.min(), test_df.index.max()
print("Train:", train_start, "→", train_end)
print("Test: ", test_start, "→", test_end)
print("Train size:", len(train_df), "Test size:", len(test_df))

## 1. Baselines & Holt–Winters


In [None]:
seasonal_period = 24
n_test = len(y_test)

# Naïve: carry the last training observation across the whole test window.
naive_forecast = np.full(n_test, y_train[-1])

# Seasonal naïve: cycle the final training day until the test window is filled.
last_day_pattern = train_df["load"].iloc[-seasonal_period:].to_numpy()
seasonal_naive_forecast = np.resize(last_day_pattern, n_test)

naive_rmse = rmse(y_test, naive_forecast)
seasonal_naive_rmse = rmse(y_test, seasonal_naive_forecast)
print("Naive RMSE:         ", naive_rmse)
print("Seasonal naive RMSE:", seasonal_naive_rmse)

# Holt–Winters: additive trend and additive daily seasonality, with the
# smoothing parameters optimised during fitting.
hw_model = ExponentialSmoothing(
    train_df["load"],
    seasonal_periods=seasonal_period,
    seasonal="add",
    trend="add",
)
hw_fit = hw_model.fit(optimized=True)
hw_forecast = hw_fit.forecast(steps=len(test_df))

hw_rmse = rmse(y_test, hw_forecast.to_numpy())
print("Holt–Winters RMSE:  ", hw_rmse)


## 2. SARIMAX with exogenous temperature


In [None]:
def fit_sarimax(train: pd.DataFrame, test: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """Fit a SARIMAX(1,0,2)x(1,1,1,24) model on 'load' and forecast the test span.

    'temperature' is passed as an exogenous regressor only when the training
    frame holds at least one non-missing temperature value; otherwise the
    model is fitted without exogenous inputs.

    Returns:
        ``(fitted, forecast)``: the in-sample fitted values and a forecast of
        ``len(test)`` steps, both as NumPy arrays.
    """
    has_temperature = bool(train["temperature"].notna().any())
    exog_cols: List[str] = ["temperature"] if has_temperature else []

    if exog_cols:
        exog_train, exog_test = train[exog_cols], test[exog_cols]
    else:
        exog_train = exog_test = None

    sarimax_options = dict(
        order=(1, 0, 2),
        seasonal_order=(1, 1, 1, 24),
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    results = SARIMAX(
        train["load"].astype(float),
        exog=exog_train,
        **sarimax_options,
    ).fit(disp=False)

    in_sample = results.fittedvalues.to_numpy()
    out_of_sample = results.forecast(steps=len(test), exog=exog_test).to_numpy()
    return in_sample, out_of_sample


sarimax_fitted, sarimax_forecast = fit_sarimax(train_df, test_df)

# Score the in-sample fit and the out-of-sample forecast separately.
sarimax_train_rmse = rmse(train_df["load"].to_numpy(), sarimax_fitted)
sarimax_test_rmse = rmse(test_df["load"].to_numpy(), sarimax_forecast)
print("SARIMAX train RMSE:", sarimax_train_rmse)
print("SARIMAX test RMSE: ", sarimax_test_rmse)

## 3. Lagged feature matrix (load + temperature + calendar)


In [None]:
def make_lagged_features(df_in: pd.DataFrame, n_lags: int = 24) -> pd.DataFrame:
    """Create lag, calendar and temperature features for ML models.

    Args:
        df_in: Frame with a ``DatetimeIndex`` and a 'load' column; a
            'temperature' column is optional.
        n_lags: Number of lagged 'load' columns to add (``lag_1`` .. ``lag_n``).

    Returns:
        A copy of *df_in* with the lag columns, 'hour' and 'dayofweek' added
        and every row containing a missing value removed (this always removes
        the first *n_lags* rows). 'temperature' is interpolated when it has at
        least one value and dropped when it is entirely missing.
    """
    df_feat = df_in.copy()

    # Build all lag columns at once and attach them with a single concat
    # rather than inserting them into the frame one by one.
    lag_columns = {
        f"lag_{lag}": df_feat["load"].shift(lag) for lag in range(1, n_lags + 1)
    }
    if lag_columns:
        lag_frame = pd.DataFrame(lag_columns, index=df_feat.index)
        df_feat = pd.concat([df_feat, lag_frame], axis=1)

    df_feat["hour"] = df_feat.index.hour
    df_feat["dayofweek"] = df_feat.index.dayofweek

    if "temperature" in df_feat.columns:
        if df_feat["temperature"].notna().any():
            df_feat["temperature"] = df_feat["temperature"].interpolate(
                limit_direction="both"
            )
        else:
            # An all-NaN temperature column would make dropna() below discard
            # every row; without any value it carries no information anyway.
            df_feat = df_feat.drop(columns="temperature")

    return df_feat.dropna()


lagged_df = make_lagged_features(df, n_lags=24)

# Re-apply the chronological split, keeping only the timestamps that are still
# present in the lagged frame.
train_rows = train_df.index.intersection(lagged_df.index)
test_rows = test_df.index.intersection(lagged_df.index)
lagged_train = lagged_df.loc[train_rows]
lagged_test = lagged_df.loc[test_rows]

# Every column except the target is a model input.
feature_cols = [col for col in lagged_train.columns if col != "load"]

X_train, y_train_lag = (
    lagged_train[feature_cols].to_numpy(),
    lagged_train["load"].to_numpy(),
)
X_test, y_test_lag = (
    lagged_test[feature_cols].to_numpy(),
    lagged_test["load"].to_numpy(),
)

X_train.shape, X_test.shape

## 4. ML models on lagged features


In [None]:
# Each model below is fitted on the lagged training matrix and scored on the
# lagged test matrix with RMSE, MAE and MAPE.

# --- Linear Regression ---
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train_lag)
y_pred_lin = lin_reg.predict(X_test)

lin_rmse = rmse(y_test_lag, y_pred_lin)
lin_mae = mean_absolute_error(y_test_lag, y_pred_lin)
lin_mape = mape(y_test_lag, y_pred_lin)
print("LinearReg RMSE:", lin_rmse)
print("LinearReg MAE: ", lin_mae)
print("LinearReg MAPE:", lin_mape)

# --- Random Forest ---
rf_model = RandomForestRegressor(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
)
rf_model.fit(X_train, y_train_lag)
y_pred_rf = rf_model.predict(X_test)

rf_rmse = rmse(y_test_lag, y_pred_rf)
rf_mae = mean_absolute_error(y_test_lag, y_pred_rf)
rf_mape = mape(y_test_lag, y_pred_rf)
print("RandomForest RMSE:", rf_rmse)
print("RandomForest MAE: ", rf_mae)
print("RandomForest MAPE:", rf_mape)

# --- HistGradientBoosting ---
hgb_model = HistGradientBoostingRegressor(
    random_state=RANDOM_STATE,
    max_iter=300,
    max_depth=7,
    learning_rate=0.05,
)
hgb_model.fit(X_train, y_train_lag)
y_pred_hgb = hgb_model.predict(X_test)

hgb_rmse = rmse(y_test_lag, y_pred_hgb)
hgb_mae = mean_absolute_error(y_test_lag, y_pred_hgb)
hgb_mape = mape(y_test_lag, y_pred_hgb)
print("HistGB RMSE:", hgb_rmse)
print("HistGB MAE: ", hgb_mae)
print("HistGB MAPE:", hgb_mape)

# --- GradientBoostingRegressor ---
gbr_model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.05,
    random_state=RANDOM_STATE,
)
gbr_model.fit(X_train, y_train_lag)
y_pred_gbr = gbr_model.predict(X_test)

gbr_rmse = rmse(y_test_lag, y_pred_gbr)
gbr_mae = mean_absolute_error(y_test_lag, y_pred_gbr)
gbr_mape = mape(y_test_lag, y_pred_gbr)
print("GBR RMSE:", gbr_rmse)
print("GBR MAE: ", gbr_mae)
print("GBR MAPE:", gbr_mape)

## 5. Extended champion–challenger table


In [None]:
# Timestamps shared by the plain test window and the lagged test matrix; all
# models are compared on exactly these rows.
common_index = test_df.index.intersection(lagged_test.index)
y_true_common = test_df.loc[common_index, "load"].to_numpy()


def as_series(pred: np.ndarray, index: pd.Index) -> pd.Series:
    """Wrap a prediction array in a Series labelled by *index*."""
    return pd.Series(pred, index=index)


# Baselines and classical models were forecast over test_df.index.
naive_series, seasonal_naive_series, hw_series, sarimax_series = (
    as_series(values, test_df.index).loc[common_index].to_numpy()
    for values in (
        naive_forecast,
        seasonal_naive_forecast,
        hw_forecast.to_numpy(),
        sarimax_forecast,
    )
)

# ML models were predicted over lagged_test.index.
gbr_series, lin_series, rf_series, hgb_series = (
    as_series(values, lagged_test.index).loc[common_index].to_numpy()
    for values in (y_pred_gbr, y_pred_lin, y_pred_rf, y_pred_hgb)
)

rows: List[Dict[str, float | str]] = []


def add_metrics(name: str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
    """Append one row of RMSE / MAE / MAPE for model *name* to ``rows``."""
    entry = {
        "model": name,
        "rmse": rmse(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
        "mape": mape(y_true, y_pred),
    }
    rows.append(entry)


for model_name, aligned_pred in (
    ("NaiveLast", naive_series),
    ("SeasonalNaive_24", seasonal_naive_series),
    ("HoltWinters", hw_series),
    ("SARIMAX", sarimax_series),
    ("GBR", gbr_series),
    ("LinearReg", lin_series),
    ("RandomForest", rf_series),
    ("HistGB", hgb_series),
):
    add_metrics(model_name, y_true_common, aligned_pred)

# Lowest RMSE first.
champion_df = pd.DataFrame(rows).set_index("model").sort_values("rmse")
champion_df

In [None]:
# Bar chart of test RMSE per model, in the table's sorted order.
ax = champion_df["rmse"].plot(kind="bar")
ax.set_title("Spanish load – extended champion–challenger (RMSE, lower is better)")
ax.set_ylabel("RMSE")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()