# 4.1 — Temporal Fusion Transformer: Wind Onshore
Point predictions with interpretable attention. 24h ahead, trained 2015–2017, tested 2018.

In [1]:
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Seed the global RNGs so weight initialisation and DataLoader shuffling are
# repeatable from a fresh kernel (some backend kernels may still be non-deterministic).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the cleaned dataset and make the timestamp column timezone-aware (UTC).
df = pd.read_parquet('../cleaned_data.parquet')
df['time'] = pd.to_datetime(df['time'], utc=True)

# Prefer the MPS backend when PyTorch reports it as available, but confirm it
# works with a tiny matmul before committing to it.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
try:
    probe = torch.randn(2, 2, device=device)
    _ = probe @ probe
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
    print(f"Device '{device}' failed a smoke test ({exc!r}); falling back to CPU")
    device = torch.device('cpu')

print(f"Shape: {df.shape}")
print(f"PyTorch: {torch.__version__}")
print(f"Device: {device}")

Shape: (35056, 80)
PyTorch: 2.10.0
Device: mps


Prepare features and normalize using training stats

In [2]:
# Column names: the series we forecast and the TSO's own day-ahead forecast.
target_col = 'generation wind onshore'
tso_col = 'forecast wind onshore day ahead'

# Weather inputs, followed by calendar inputs and the TSO forecast.
weather_cols = [
    'wind_speed_madrid',
    'wind_speed_bilbao',
    'wind_speed_barcelona',
    'wind_speed_seville',
    'wind_speed_valencia',
    'pressure_bilbao',
    'pressure_barcelona',
    'pressure_seville',
    'pressure_madrid',
    'pressure_valencia',
    'humidity_valencia',
    'humidity_bilbao',
    'temp_barcelona',
    'temp_max_barcelona',
]
time_cols = ['hour', 'month']
feature_cols = weather_cols + time_cols + [tso_col]

# Scaling statistics come from rows dated 2017 or earlier only, so nothing
# from the 2018 test year leaks into the normalisation.
train_mask = df['time'].dt.year.le(2017)
train_target = df.loc[train_mask, target_col]
train_features = df.loc[train_mask, feature_cols]

target_mean, target_std = train_target.mean(), train_target.std()
feat_means = train_features.mean()
feat_stds = train_features.std().replace(0, 1)  # constant columns: avoid dividing by zero

# Z-score every row (train and test alike) with the training statistics;
# missing feature values become 0 after scaling.
target_norm = df[target_col].sub(target_mean).div(target_std).to_numpy()
features_norm = df[feature_cols].sub(feat_means).div(feat_stds).fillna(0).to_numpy()

# Channel 0 is the normalised target; channels 1.. are the normalised features.
all_data = np.concatenate([target_norm[:, None], features_norm], axis=1).astype(np.float32)

print(f"Input channels: {all_data.shape[1]} (1 target + {len(feature_cols)} features)")
print(f"Features: {feature_cols}")
print(f"Target mean: {target_mean:.0f} MW, std: {target_std:.0f} MW")

Input channels: 18 (1 target + 17 features)
Features: ['wind_speed_madrid', 'wind_speed_bilbao', 'wind_speed_barcelona', 'wind_speed_seville', 'wind_speed_valencia', 'pressure_bilbao', 'pressure_barcelona', 'pressure_seville', 'pressure_madrid', 'pressure_valencia', 'humidity_valencia', 'humidity_bilbao', 'temp_barcelona', 'temp_max_barcelona', 'hour', 'month', 'forecast wind onshore day ahead']
Target mean: 5426 MW, std: 3181 MW


Sliding window dataset — 168h context, 24h prediction

In [3]:
# Number of past rows handed to the model as context for each sample.
context_length = 168   # 7 days of history
# Number of future rows the model predicts for each sample.
prediction_length = 24  # 24h ahead

class TimeSeriesDataset(Dataset):
    """Sliding-window view over a 2-D array of shape (rows, channels).

    Each sample is a window that lies entirely inside ``[start_idx, end_idx)``:
    ``ctx_len`` context rows followed by ``pred_len`` forecast rows.
    Channel 0 is treated as the target; channels 1.. as features.

    Args:
        data: 2-D NumPy array, rows ordered in time.
        ctx_len: number of context rows per sample.
        pred_len: number of forecast rows per sample.
        start_idx: first row (inclusive) this dataset may read.
        end_idx: last row (exclusive) this dataset may read.
    """
    def __init__(self, data, ctx_len, pred_len, start_idx, end_idx):
        self.data = data
        self.ctx_len = ctx_len
        self.pred_len = pred_len
        self.start = start_idx
        self.end = end_idx

    def __len__(self):
        # Clamp at zero: a range shorter than one full window holds no samples,
        # and a negative value would make len() raise ValueError.
        return max(0, self.end - self.start - self.ctx_len - self.pred_len + 1)

    def __getitem__(self, idx):
        """Return ``(x, x_future, y)`` tensors for window ``idx``.

        x:        (ctx_len, channels)      context rows, all channels
        x_future: (pred_len, channels - 1) forecast rows, feature channels only
        y:        (pred_len,)              forecast rows, target channel only

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        # Without this check an out-of-range index would silently read rows
        # outside [start, end) (e.g. a training dataset reading validation rows).
        if not 0 <= idx < n_samples:
            raise IndexError(f"index {idx} out of range for dataset of length {n_samples}")

        i = self.start + idx
        split = i + self.ctx_len
        stop = split + self.pred_len
        x = self.data[i:split]                 # (ctx_len, all_channels)
        y = self.data[split:stop, 0]           # (pred_len,)
        x_future = self.data[split:stop, 1:]   # (pred_len, features)
        return (
            torch.from_numpy(x),
            torch.from_numpy(x_future),
            torch.from_numpy(y),
        )

# Rows flagged by train_mask form the training period; its first 80% is used
# for fitting and the remaining 20% for validation.
train_end = int(train_mask.sum())
val_split = int(train_end * 0.8)

window_args = (all_data, context_length, prediction_length)
train_ds = TimeSeriesDataset(*window_args, 0, val_split)
val_ds = TimeSeriesDataset(*window_args, val_split, train_end)

# Only the training loader shuffles; validation keeps chronological order.
train_loader = DataLoader(dataset=train_ds, batch_size=128, shuffle=True, num_workers=0)
val_loader = DataLoader(dataset=val_ds, batch_size=256, shuffle=False, num_workers=0)

print(f"Train samples: {len(train_ds)}, Val samples: {len(val_ds)}")
print(f"Context: {context_length}h, Prediction: {prediction_length}h")

Train samples: 20846, Val samples: 5069
Context: 168h, Prediction: 24h


TFT model — variable selection, gated residual networks, LSTM, interpretable multi-head attention

In [4]:
class GatedResidualNetwork(nn.Module):
    """Gated residual block.

    Computes ``LayerNorm(sigmoid(gate_fc(h)) * fc2(h) + skip(x))`` where
    ``h = dropout(ELU(fc1(x)))``. ``skip`` is a linear projection when the
    input and output widths differ, and the identity otherwise.
    """
    def __init__(self, input_size, hidden_size, output_size, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.gate_fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(output_size)
        # The residual path only needs a projection when the widths disagree.
        if input_size != output_size:
            self.skip = nn.Linear(input_size, output_size)
        else:
            self.skip = nn.Identity()

    def forward(self, x):
        hidden = self.dropout(F.elu(self.fc1(x)))
        # Both the candidate output and its sigmoid gate are read off the same hidden state.
        gated = torch.sigmoid(self.gate_fc(hidden)) * self.fc2(hidden)
        return self.layer_norm(gated + self.skip(x))


class VariableSelectionNetwork(nn.Module):
    """Embeds each input variable separately and mixes them with learned softmax weights.

    Every scalar variable gets its own ``Linear(1, d_model)`` embedding. A
    GatedResidualNetwork over the concatenated embeddings yields one logit per
    variable; the softmax of those logits weights the embeddings before they
    are summed.
    """
    def __init__(self, num_vars, d_model, dropout=0.1):
        super().__init__()
        self.num_vars = num_vars
        self.d_model = d_model
        # One independent scalar -> d_model embedding per variable.
        self.var_transforms = nn.ModuleList(
            [nn.Linear(1, d_model) for _ in range(num_vars)]
        )
        # Maps the flattened embeddings to num_vars selection logits.
        self.weight_network = GatedResidualNetwork(
            num_vars * d_model, d_model, num_vars, dropout
        )

    def forward(self, x):
        """x: (batch, time, num_vars) -> (selected (batch, time, d_model), weights (batch, time, num_vars))."""
        batch, steps = x.shape[0], x.shape[1]

        # (batch, time, num_vars, d_model): column j embedded by its own layer.
        embedded = torch.stack(
            [embed(x[:, :, j:j + 1]) for j, embed in enumerate(self.var_transforms)],
            dim=2,
        )

        logits = self.weight_network(embedded.reshape(batch, steps, -1))
        weights = F.softmax(logits, dim=-1)

        # Softmax-weighted sum over the variable axis.
        selected = (embedded * weights.unsqueeze(-1)).sum(dim=2)
        return selected, weights


class InterpretableMultiHeadAttention(nn.Module):
    """Multi-head attention with shared value weights for interpretability (Lim et al. 2019).

    Queries and keys are projected per head; a single value projection of
    width ``d_model // n_heads`` is shared by all heads, and the per-head
    contexts are averaged before the output projection.

    Args:
        d_model: model width; must be a positive multiple of ``n_heads``.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``n_heads``.
    """
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        # Without this check the head split in forward() can silently reshape
        # to a different sequence length (e.g. d_model=10, n_heads=4) instead
        # of failing.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, self.d_k)  # Shared across heads
        self.out_proj = nn.Linear(self.d_k, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q, k, v: tensors of shape (batch, length, d_model).
            mask: optional tensor broadcastable to (batch, n_heads, q_len, k_len);
                positions where ``mask == 0`` are excluded from attention.

        Returns:
            (output, attn): output is (batch, q_len, d_model); attn is
            (batch, n_heads, q_len, k_len), taken after dropout.
        """
        bs = q.size(0)
        # Spell out the sequence length so a bad shape fails loudly rather
        # than being absorbed by a -1 dimension.
        Q = self.W_q(q).view(bs, q.size(1), self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(k).view(bs, k.size(1), self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(v).unsqueeze(1).expand(-1, self.n_heads, -1, -1)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attn = F.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        context = torch.matmul(attn, V)
        context = context.mean(dim=1)  # Average over heads
        return self.out_proj(context), attn


class TemporalFusionTransformer(nn.Module):
    """Compact TFT for point forecasts.

    Pipeline: variable selection -> LSTM encoder/decoder -> interpretable
    multi-head self-attention -> linear head emitting one value per forecast step.
    """
    def __init__(self, num_observed, num_known_future, d_model=32, n_heads=4,
                 n_lstm_layers=1, pred_len=24, dropout=0.1):
        super().__init__()
        self.pred_len = pred_len
        self.d_model = d_model

        # Separate selection networks: past inputs vs. inputs known over the horizon.
        self.obs_vsn = VariableSelectionNetwork(num_observed, d_model, dropout)
        self.fut_vsn = VariableSelectionNetwork(num_known_future, d_model, dropout)

        # Encoder and decoder LSTMs share one configuration; inter-layer
        # dropout is only passed when there is more than one layer.
        lstm_kwargs = dict(
            input_size=d_model,
            hidden_size=d_model,
            num_layers=n_lstm_layers,
            batch_first=True,
            dropout=dropout if n_lstm_layers > 1 else 0,
        )
        self.encoder_lstm = nn.LSTM(**lstm_kwargs)
        self.decoder_lstm = nn.LSTM(**lstm_kwargs)

        # Gate applied to the LSTM outputs.
        self.lstm_gate = GatedResidualNetwork(d_model, d_model, d_model, dropout)

        # Attention over the full (context + horizon) sequence, plus its gate.
        self.attention = InterpretableMultiHeadAttention(d_model, n_heads, dropout)
        self.attn_gate = GatedResidualNetwork(d_model, d_model, d_model, dropout)

        # Per-step scalar head.
        self.output_proj = nn.Linear(d_model, 1)

    def forward(self, x_observed, x_future):
        """Return ``(forecast, encoder_selection_weights, attention_weights)``.

        ``forecast`` holds one value for each of the last ``pred_len`` positions
        of the combined sequence.
        """
        past_emb, enc_weights = self.obs_vsn(x_observed)
        future_emb, _ = self.fut_vsn(x_future)  # decoder selection weights are not returned

        # The decoder LSTM starts from the encoder's final (h, c) state.
        past_states, final_state = self.encoder_lstm(past_emb)
        future_states, _ = self.decoder_lstm(future_emb, final_state)

        states = torch.cat([past_states, future_states], dim=1)
        embeddings = torch.cat([past_emb, future_emb], dim=1)

        # Gated LSTM output plus a skip from the selected-variable embeddings.
        temporal = self.lstm_gate(states) + embeddings

        # Unmasked self-attention across context and horizon positions.
        attended, attn_weights = self.attention(temporal, temporal, temporal)
        fused = self.attn_gate(attended) + temporal

        # Only the horizon positions feed the output head.
        horizon = fused[:, -self.pred_len:, :]
        return self.output_proj(horizon).squeeze(-1), enc_weights, attn_weights


# Past inputs carry every channel (target + features); future inputs carry
# the feature channels only, since the target is unknown over the horizon.
num_observed = all_data.shape[1]
num_known_future = len(feature_cols)

tft_config = dict(
    num_observed=num_observed,
    num_known_future=num_known_future,
    d_model=16,
    n_heads=4,
    n_lstm_layers=1,
    pred_len=prediction_length,
    dropout=0.1,
)
model = TemporalFusionTransformer(**tft_config).to(device)

n_params = sum(param.numel() for param in model.parameters())
print(f"TFT parameters: {n_params:,}")

TFT parameters: 27,972


Train with MSE loss, early stopping (patience=15)

In [5]:
# Adam with a cosine-annealed learning rate. T_max=100 equals n_epochs below,
# so the schedule is laid out over the full epoch budget.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
# MSE between predictions and y; y is taken from channel 0 of all_data, i.e.
# the normalised target, so losses are in normalised units rather than MW.
criterion = nn.MSELoss()

n_epochs = 100
patience = 15                   # epochs without a new best val loss before stopping
best_val_loss = float('inf')
best_state = None               # snapshot of the best-scoring weights so far
epochs_no_improve = 0

for epoch in range(n_epochs):
    # Train
    model.train()
    train_losses = []
    for x_ctx, x_fut, y in train_loader:
        x_ctx, x_fut, y = x_ctx.to(device), x_fut.to(device), y.to(device)
        preds, _, _ = model(x_ctx, x_fut)
        loss = criterion(preds, y)
        optimizer.zero_grad()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_losses.append(loss.item())
    
    # Validate
    model.eval()
    val_losses = []
    with torch.no_grad():
        for x_ctx, x_fut, y in val_loader:
            x_ctx, x_fut, y = x_ctx.to(device), x_fut.to(device), y.to(device)
            preds, _, _ = model(x_ctx, x_fut)
            val_losses.append(criterion(preds, y).item())
    
    # The scheduler advances once per epoch, after validation.
    scheduler.step()
    # Epoch losses are plain means of the per-batch losses, so every batch
    # counts equally even if the final batch holds fewer samples.
    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Clone each tensor so later optimizer steps cannot alter the snapshot.
        best_state = {k: v.clone() for k, v in model.state_dict().items()}
        epochs_no_improve = 0
        marker = ' *'
    else:
        epochs_no_improve += 1
        marker = ''
    
    # Log the first epoch, every 10th epoch, and every epoch that set a new best.
    # get_last_lr() is read after scheduler.step(), so it shows the updated rate.
    if (epoch + 1) % 10 == 0 or epoch == 0 or epochs_no_improve == 0:
        print(f"Epoch {epoch+1:3d}/{n_epochs}, Train: {train_loss:.5f}, Val: {val_loss:.5f}, LR: {scheduler.get_last_lr()[0]:.6f}{marker}")
    
    if epochs_no_improve >= patience:
        print(f"\nEarly stopping at epoch {epoch+1} (no improvement for {patience} epochs)")
        break

# Restore best weights
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored best model (val loss: {best_val_loss:.5f})")

Epoch   1/100, Train: 0.08338, Val: 0.00621, LR: 0.001000 *


Epoch   2/100, Train: 0.00309, Val: 0.00525, LR: 0.000999 *


Epoch   3/100, Train: 0.00241, Val: 0.00502, LR: 0.000998 *


Epoch   4/100, Train: 0.00222, Val: 0.00494, LR: 0.000996 *


Epoch   5/100, Train: 0.00212, Val: 0.00493, LR: 0.000994 *


Epoch   6/100, Train: 0.00207, Val: 0.00493, LR: 0.000991 *


Epoch   7/100, Train: 0.00203, Val: 0.00484, LR: 0.000988 *


Epoch   8/100, Train: 0.00200, Val: 0.00482, LR: 0.000984 *


Epoch   9/100, Train: 0.00198, Val: 0.00482, LR: 0.000980 *


Epoch  10/100, Train: 0.00197, Val: 0.00480, LR: 0.000976 *


Epoch  12/100, Train: 0.00195, Val: 0.00479, LR: 0.000965 *


Epoch  14/100, Train: 0.00194, Val: 0.00478, LR: 0.000952 *


Epoch  16/100, Train: 0.00191, Val: 0.00476, LR: 0.000938 *


Epoch  17/100, Train: 0.00191, Val: 0.00475, LR: 0.000930 *


Epoch  20/100, Train: 0.00189, Val: 0.00476, LR: 0.000905


Epoch  21/100, Train: 0.00188, Val: 0.00472, LR: 0.000895 *


Epoch  23/100, Train: 0.00188, Val: 0.00471, LR: 0.000875 *


Epoch  29/100, Train: 0.00185, Val: 0.00470, LR: 0.000806 *


Epoch  30/100, Train: 0.00185, Val: 0.00473, LR: 0.000794


Epoch  36/100, Train: 0.00182, Val: 0.00470, LR: 0.000713 *


Epoch  39/100, Train: 0.00182, Val: 0.00469, LR: 0.000669 *


Epoch  40/100, Train: 0.00180, Val: 0.00470, LR: 0.000655


Epoch  43/100, Train: 0.00180, Val: 0.00468, LR: 0.000609 *


Epoch  47/100, Train: 0.00177, Val: 0.00468, LR: 0.000547 *


Epoch  50/100, Train: 0.00175, Val: 0.00466, LR: 0.000500 *


Epoch  54/100, Train: 0.00175, Val: 0.00466, LR: 0.000437 *


Epoch  56/100, Train: 0.00173, Val: 0.00465, LR: 0.000406 *


Epoch  60/100, Train: 0.00171, Val: 0.00472, LR: 0.000345


Epoch  70/100, Train: 0.00167, Val: 0.00472, LR: 0.000206



Early stopping at epoch 71 (no improvement for 15 epochs)
Restored best model (val loss: 0.00465)


Generate 24h-ahead forecasts on 2018 test set

In [6]:
# Roll over the test rows in non-overlapping windows of prediction_length:
# each window is predicted from the context_length rows immediately before it.
model.eval()
test_start = train_end
test_end = len(all_data)

all_preds = []     # one (prediction_length,) array of MW predictions per window
all_actuals = []   # matching actual MW values
all_times = []     # matching timestamps

with torch.no_grad():
    # The `+ 1` on the stop keeps a final window that ends exactly at test_end;
    # without it that window was dropped whenever the test length was an exact
    # multiple of prediction_length.
    for i in range(test_start, test_end - prediction_length + 1, prediction_length):
        # NOTE(review): the evaluation cell locates TSO values by window
        # ordinal (test_start + w * prediction_length), which assumes this
        # skip never triggers. It cannot unless test_start < context_length.
        if i - context_length < 0:
            continue

        x_ctx = torch.from_numpy(all_data[i - context_length : i]).unsqueeze(0).to(device)
        x_fut = torch.from_numpy(all_data[i : i + prediction_length, 1:]).unsqueeze(0).to(device)

        preds, _, _ = model(x_ctx, x_fut)

        # Denormalize. squeeze(0) drops only the batch axis, so the result
        # stays 1-D even when prediction_length == 1.
        pred_mw = preds.squeeze(0).cpu().numpy() * target_std + target_mean
        actual_mw = all_data[i : i + prediction_length, 0] * target_std + target_mean
        times = df['time'].iloc[i : i + prediction_length].values

        all_preds.append(pred_mw)
        all_actuals.append(actual_mw)
        all_times.append(times)

print(f"Generated {len(all_preds)} forecast windows across 2018")

Generated 364 forecast windows across 2018


Evaluate: MAE, RMSE, MAPE vs TSO baseline

In [7]:
def _error_summary(actual, forecast):
    """Return (MAE, RMSE, MAPE in %) of `forecast` against `actual`; MAPE denominators are floored at 1."""
    err = actual - forecast
    mae = np.mean(np.abs(err))
    rmse = np.sqrt(np.mean(err ** 2))
    mape = np.mean(np.abs(err / np.clip(actual, 1, None))) * 100
    return mae, rmse, mape


# Aggregate metrics are computed over all windows laid end to end.
flat_preds = np.concatenate(all_preds)
flat_actuals = np.concatenate(all_actuals)

# TSO baseline: window w is taken to start at test_start + w * prediction_length.
window_starts = [test_start + w * prediction_length for w in range(len(all_preds))]
flat_tso = np.concatenate(
    [df[tso_col].iloc[first : first + prediction_length].values for first in window_starts]
)

tft_mae, tft_rmse, tft_mape = _error_summary(flat_actuals, flat_preds)
tso_mae_val, tso_rmse_val, tso_mape_val = _error_summary(flat_actuals, flat_tso)

print(f"{'Metric':<10} {'TFT':>10} {'TSO':>10} {'Improvement':>12}")
print('-' * 44)
for label, tft_score, tso_score in (
    ('MAE (MW)', tft_mae, tso_mae_val),
    ('RMSE (MW)', tft_rmse, tso_rmse_val),
    ('MAPE (%)', tft_mape, tso_mape_val),
):
    print(f"{label:<10} {tft_score:>10.1f} {tso_score:>10.1f} {(1 - tft_score / tso_score) * 100:>+11.1f}%")

# MAE at each step of the forecast horizon, averaged across windows.
per_hour_mae = np.fromiter(
    (
        np.mean(np.abs(
            np.array([actual[h] for actual in all_actuals])
            - np.array([pred[h] for pred in all_preds])
        ))
        for h in range(prediction_length)
    ),
    dtype=np.float64,
    count=prediction_length,
)

print(f"\nPer-hour MAE (MW):")
print(f"  h+1:  {per_hour_mae[0]:.0f}  |  h+6:  {per_hour_mae[5]:.0f}  |  h+12: {per_hour_mae[11]:.0f}  |  h+24: {per_hour_mae[23]:.0f}")

Metric            TFT        TSO  Improvement
--------------------------------------------
MAE (MW)        448.0      448.1        +0.0%
RMSE (MW)       615.3      614.0        -0.2%
MAPE (%)         11.6       11.6        +0.1%

Per-hour MAE (MW):
  h+1:  368  |  h+6:  400  |  h+12: 459  |  h+24: 386


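Predicted vs actual — sample week, plotted from the XGBoost residual-corrected predictions (`corrected_preds`). Note: the residual-correction step itself (per-horizon XGBoost models trained on validation-set TFT errors) is not included in this notebook, so no cell above defines `corrected_preds`.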

In [None]:
# Plot one week of forecasts (windows 9-15 of the test set) against the
# actuals and the TSO forecast.
#
# `corrected_preds` (XGBoost residual-corrected predictions) is not defined by
# any cell visible in this notebook, so referencing it directly raises a
# NameError on a fresh kernel. Use it when present; otherwise fall back to the
# raw TFT predictions and label the line accordingly.
if 'corrected_preds' in globals():
    plot_preds, plot_label = corrected_preds, 'TFT + XGBoost'
else:
    plot_preds, plot_label = all_preds, 'TFT'

sample_windows = range(9, 16)

sample_corrected = []   # plotted predictions (corrected when available, else raw)
sample_actual = []
sample_tso = []
sample_time = []

for w in sample_windows:
    if w >= len(all_preds):
        break
    sample_corrected.extend(plot_preds[w])
    sample_actual.extend(all_actuals[w])
    sample_time.extend(pd.to_datetime(all_times[w]))
    # TSO values are located by window ordinal, as in the evaluation cell.
    idx = test_start + w * prediction_length
    sample_tso.extend(df[tso_col].iloc[idx : idx + prediction_length].values)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(sample_time, sample_actual, color='#2c3e50', linewidth=1.5, label='Actual')
ax.plot(sample_time, sample_corrected, color='coral', linewidth=1.3, label=plot_label)
ax.plot(sample_time, sample_tso, color='grey', linewidth=1.0, linestyle='--', alpha=0.7, label='TSO Forecast')
ax.set_ylabel('MW')
ax.set_title('TFT Wind Onshore — Predicted vs Actual (Sample Week, Jan 2018)')
ax.legend()
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()

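Export JSON for dashboard — XGBoost-corrected predictions for the sample week. Uses `corrected_preds` and the `corr_mae` / `corr_rmse` / `corr_mape` metrics, none of which are defined by a cell in this notebook.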

In [None]:
import os

# This export needs the outputs of the XGBoost residual-correction step. No
# cell visible in this notebook defines them, so check first and skip with a
# message instead of failing with a NameError on a fresh kernel.
# NOTE: the raw-TFT export cell further down writes the same tft_wind.json
# path and will replace whatever is written here.
required_names = ('corrected_preds', 'corr_mae', 'corr_rmse', 'corr_mape')
missing_names = [name for name in required_names if name not in globals()]

if missing_names:
    print(f"Skipping corrected export; not defined: {', '.join(missing_names)}")
else:
    os.makedirs('../dashboard/public/data', exist_ok=True)

    # Build sample week data using corrected predictions
    sample_data = []
    for w in range(9, 16):
        if w >= len(all_preds):
            break
        pred = corrected_preds[w]
        actual = all_actuals[w]
        times = all_times[w]
        # TSO values are located by window ordinal, as in the evaluation cell.
        tso = df[tso_col].iloc[test_start + w * prediction_length : test_start + w * prediction_length + prediction_length].values
        for h in range(prediction_length):
            t = pd.Timestamp(times[h])
            sample_data.append({
                'time': t.strftime('%Y-%m-%d %H:%M'),
                'actual': round(float(actual[h]), 1),
                'predicted': round(float(pred[h]), 1),
                'tso': round(float(tso[h]), 1),
            })

    output = {
        'target': 'wind_onshore',
        'model': 'TFT + XGBoost Residual Correction',
        'prediction_length_hours': prediction_length,
        'context_length_hours': context_length,
        'metrics': {
            'mae': round(float(corr_mae), 1),
            'rmse': round(float(corr_rmse), 1),
            'mape': round(float(corr_mape), 1),
            'tso_mae': round(float(tso_mae_val), 1),
            'tso_rmse': round(float(tso_rmse_val), 1),
            'raw_mae': round(float(tft_mae), 1),
            'raw_rmse': round(float(tft_rmse), 1),
        },
        'sample_forecast': sample_data,
    }

    with open('../dashboard/public/data/tft_wind.json', 'w') as f:
        json.dump(output, f, indent=2)

    print('Saved tft_wind.json')
    print(f"Raw  MAE: {tft_mae:.1f} MW → Corrected MAE: {corr_mae:.1f} MW (TSO: {tso_mae_val:.1f} MW)")

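Export JSON for dashboard — raw TFT predictions. This cell writes the same `tft_wind.json` path as the previous export cell, so it replaces any file written there.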

In [9]:
import os
os.makedirs('../dashboard/public/data', exist_ok=True)

# Sample week: windows 9-15 of the test set, or fewer if not that many exist.
sample_data = []
for w in range(9, min(16, len(all_preds))):
    first = test_start + w * prediction_length
    tso_window = df[tso_col].iloc[first : first + prediction_length].values
    # One record per forecast hour; MW values are rounded to 0.1.
    for stamp, actual_val, pred_val, tso_val in zip(
        all_times[w], all_actuals[w], all_preds[w], tso_window
    ):
        sample_data.append({
            'time': pd.Timestamp(stamp).strftime('%Y-%m-%d %H:%M'),
            'actual': round(float(actual_val), 1),
            'predicted': round(float(pred_val), 1),
            'tso': round(float(tso_val), 1),
        })

# Headline metrics for the raw TFT alongside the TSO baseline.
metrics = {
    'mae': round(float(tft_mae), 1),
    'rmse': round(float(tft_rmse), 1),
    'mape': round(float(tft_mape), 1),
    'tso_mae': round(float(tso_mae_val), 1),
    'tso_rmse': round(float(tso_rmse_val), 1),
}

output = {
    'target': 'wind_onshore',
    'model': 'Temporal Fusion Transformer',
    'prediction_length_hours': prediction_length,
    'context_length_hours': context_length,
    'metrics': metrics,
    'sample_forecast': sample_data,
}

with open('../dashboard/public/data/tft_wind.json', 'w') as f:
    json.dump(output, f, indent=2)

print('Saved tft_wind.json')

Saved tft_wind.json
