# 4.2 — Temporal Fusion Transformer: Solar Generation
Point forecasts with interpretable attention: 24 h ahead, trained on 2015–2017 and tested on 2018.

In [None]:
import json
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch up front so weight initialisation and DataLoader
# shuffling are repeatable under Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the cleaned dataset and parse the timestamp column as timezone-aware UTC.
df = pd.read_parquet('../cleaned_data.parquet')
df['time'] = pd.to_datetime(df['time'], utc=True)

# Use MPS (Apple Silicon GPU) if available, otherwise CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
try:
    # Smoke test: a tiny matmul on the chosen device. If the backend is reported
    # as available but cannot actually run this, fall back to CPU.
    probe = torch.randn(2, 2, device=device)
    torch.matmul(probe, probe)
except Exception as exc:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt / SystemExit
    # still propagate; the reason for the fallback is reported instead of hidden.
    print(f"Device {device} failed the smoke test ({exc!r}); falling back to CPU")
    device = torch.device('cpu')

print(f"Shape: {df.shape}")
print(f"PyTorch: {torch.__version__}")
print(f"Device: {device}")

Prepare features and normalize using training stats

In [None]:
# Column to forecast, and the day-ahead forecast column used as the comparison
# baseline in the evaluation cell.
target_col = 'generation solar'
tso_col = 'forecast solar day ahead'

# Weather covariates, one row per variable prefix and one entry per city.
weather_cols = [
    # clouds_all_*
    'clouds_all_madrid', 'clouds_all_bilbao', 'clouds_all_barcelona', 'clouds_all_seville', 'clouds_all_valencia',
    # temp_*
    'temp_madrid', 'temp_bilbao', 'temp_barcelona', 'temp_seville', 'temp_valencia',
    # temp_max_*
    'temp_max_madrid', 'temp_max_bilbao', 'temp_max_barcelona', 'temp_max_seville', 'temp_max_valencia',
    # humidity_*
    'humidity_madrid', 'humidity_bilbao', 'humidity_barcelona', 'humidity_seville', 'humidity_valencia',
]
time_cols = ['hour', 'month']
feature_cols = weather_cols + time_cols  # NO TSO covariate for solar

# Train/test split: rows timestamped 2017 or earlier are training rows.
train_mask = df['time'].dt.year <= 2017

# Standardisation statistics are computed on the training rows only, then
# applied to the full frame so the test rows never influence the scaling.
_train_rows = df.loc[train_mask]
target_mean = _train_rows[target_col].mean()
target_std = _train_rows[target_col].std()

feat_means = _train_rows[feature_cols].mean()
# A zero standard deviation (constant column) is replaced by 1 to avoid dividing by zero.
feat_stds = _train_rows[feature_cols].std().replace(0, 1)

target_norm = (df[target_col].values - target_mean) / target_std
# Missing feature values become 0 after scaling (i.e. the training mean).
features_norm = df[feature_cols].sub(feat_means).div(feat_stds).fillna(0).values

# Two float32 views for the TFT:
#   observed_data - target in column 0, followed by every feature (encoder input)
#   future_data   - features only (decoder input over the forecast horizon)
# NOTE(review): the weather columns are fed to the decoder as "future-known".
# If they hold observations rather than forecasts, the model sees information
# that would not be available at forecast time - confirm against the data source.
observed_data = np.column_stack([target_norm, features_norm]).astype(np.float32)
future_data = features_norm.astype(np.float32)

_, num_observed = observed_data.shape   # target + features
_, num_future = future_data.shape       # features only

print(f"Observed channels: {num_observed} (1 target + {len(feature_cols)} features)")
print(f"Future-known channels: {num_future} ({len(feature_cols)} features)")
print(f"Target mean: {target_mean:.0f} MW, std: {target_std:.0f} MW")

Sliding window dataset — 168h context, 24h prediction

In [None]:
# Window geometry, in hourly steps.
context_length = 7 * 24   # look-back: 7 days of history (168 h)
prediction_length = 24    # horizon: 24 h ahead

class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over pre-normalised NumPy arrays.

    Sample ``idx`` starts at row ``start_idx + idx`` and yields a tuple
    ``(x_obs, x_fut, y)``:

    * ``x_obs`` - ``ctx_len`` rows of ``obs_data`` (the context window),
    * ``x_fut`` - the next ``pred_len`` rows of ``fut_data``,
    * ``y``     - the next ``pred_len`` values of ``target_norm`` as float32.

    Args:
        obs_data: array indexed by row, sliced for the context window.
        fut_data: array indexed by row, sliced for the prediction window.
        target_norm: 1-D array of targets, sliced for the prediction window.
        ctx_len: number of context rows per sample.
        pred_len: number of prediction rows per sample.
        start_idx: first row (inclusive) a window may use.
        end_idx: last row (exclusive) a window may use.
    """

    def __init__(self, obs_data, fut_data, target_norm, ctx_len, pred_len, start_idx, end_idx):
        self.obs_data = obs_data
        self.fut_data = fut_data
        self.target_norm = target_norm
        self.ctx_len = ctx_len
        self.pred_len = pred_len
        self.start = start_idx
        self.end = end_idx

    def __len__(self):
        """Number of complete windows that fit in ``[start, end)``; 0 if none fit."""
        # Clamp at 0: a negative value would make len() itself raise ValueError.
        return max(0, self.end - self.start - self.ctx_len - self.pred_len + 1)

    def __getitem__(self, idx):
        """Return ``(x_obs, x_fut, y)`` for window ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        # Without this bound check an out-of-range index would silently return
        # truncated windows, and plain iteration over the dataset would never stop.
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index {idx} out of range for {n_windows} windows")

        ctx_begin = self.start + idx
        pred_begin = ctx_begin + self.ctx_len
        pred_stop = pred_begin + self.pred_len

        x_obs = self.obs_data[ctx_begin:pred_begin]     # ctx_len rows
        x_fut = self.fut_data[pred_begin:pred_stop]     # pred_len rows
        y = self.target_norm[pred_begin:pred_stop]      # pred_len values
        return (
            torch.from_numpy(x_obs),
            torch.from_numpy(x_fut),
            # Copy before converting so the returned tensor never aliases target_norm.
            torch.from_numpy(y.copy()).float(),
        )

# Positional split point: number of training rows, used as an exclusive row index.
train_end = int(train_mask.sum())

# The datasets slice rows by position, so the training rows must be exactly the
# first `train_end` rows of df. Fail loudly if the frame is not ordered that way,
# because otherwise test-year rows would silently end up in the training windows.
assert train_mask.to_numpy()[:train_end].all(), (
    "Training rows are not a contiguous prefix of df; sort df by 'time' before windowing"
)

train_ds = TimeSeriesDataset(observed_data, future_data, target_norm, context_length, prediction_length, 0, train_end)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=0)

print(f"Train samples: {len(train_ds)}")
print(f"Context: {context_length}h, Prediction: {prediction_length}h")

TFT model — variable selection, gated residual networks, LSTM, interpretable multi-head attention

In [None]:
class GatedResidualNetwork(nn.Module):
    """Gated feed-forward block with a skip connection and layer normalisation.

    Computes ``LayerNorm(sigmoid(gate_fc(h)) * fc2(h) + skip(x))`` where
    ``h = dropout(elu(fc1(x)))``.

    Args:
        input_size: size of the last dimension of the input.
        hidden_size: width of the hidden layer.
        output_size: size of the last dimension of the output.
        dropout: dropout probability applied to the hidden activation.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.gate_fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(output_size)
        # The skip path only needs a projection when the sizes differ.
        if input_size == output_size:
            self.skip = nn.Identity()
        else:
            self.skip = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the gated residual transform along the last dimension of ``x``."""
        shortcut = self.skip(x)
        hidden = self.dropout(F.elu(self.fc1(x)))
        # Candidate output and its gate are both derived from the same hidden activation.
        candidate = self.fc2(hidden)
        gate = torch.sigmoid(self.gate_fc(hidden))
        return self.layer_norm(gate * candidate + shortcut)

class VariableSelectionNetwork(nn.Module):
    """Embed each input channel separately and mix the embeddings with softmax weights.

    Args:
        num_vars: number of input channels (size of the last input dimension that is used).
        d_model: embedding width per channel and width of the mixed output.
        dropout: dropout probability passed to the weighting network.
    """

    def __init__(self, num_vars, d_model, dropout=0.1):
        super().__init__()
        self.num_vars = num_vars
        self.d_model = d_model
        # One scalar -> d_model projection per channel.
        self.var_transforms = nn.ModuleList([nn.Linear(1, d_model) for _ in range(num_vars)])
        # Maps the concatenated embeddings to one logit per channel.
        self.weight_network = GatedResidualNetwork(num_vars * d_model, d_model, num_vars, dropout)

    def forward(self, x):
        """Return ``(selected, weights)`` for a 3-D input indexed ``[batch, step, channel]``.

        ``selected`` is the weighted sum of the per-channel embeddings and
        ``weights`` holds the softmax weights over channels at every step.
        """
        batch, steps = x.shape[0], x.shape[1]
        embedded = [
            self.var_transforms[ch](x[:, :, ch:ch + 1])
            for ch in range(self.num_vars)
        ]
        var_stack = torch.stack(embedded, dim=2)          # (batch, steps, num_vars, d_model)
        logits = self.weight_network(var_stack.reshape(batch, steps, -1))
        weights = F.softmax(logits, dim=-1)               # (batch, steps, num_vars)
        selected = (var_stack * weights.unsqueeze(-1)).sum(dim=2)
        return selected, weights

class InterpretableMultiHeadAttention(nn.Module):
    """Multi-head attention with per-head queries/keys and one shared value projection.

    Every head attends over the same value vectors; the per-head context
    vectors are averaged before the output projection.

    Args:
        d_model: width of the query/key/value inputs and of the output.
        n_heads: number of attention heads; must be positive and divide ``d_model``.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``n_heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        # Validate here: otherwise the head split in forward() fails later with
        # an obscure shape error from .view() / matmul.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, self.d_k)
        self.out_proj = nn.Linear(self.d_k, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Return ``(output, attn)``.

        Args:
            q, k, v: tensors whose first dimension is the batch and whose last
                dimension is ``d_model``.
            mask: optional tensor broadcastable to the score tensor; positions
                where ``mask == 0`` are filled with -1e9 before the softmax.

        Returns:
            output: head-averaged context projected back to ``d_model``.
            attn: attention weights per head, after dropout (so in training
                mode they include dropout's zeroing and rescaling).
        """
        bs = q.size(0)
        # Split queries and keys into heads: (bs, n_heads, steps, d_k).
        Q = self.W_q(q).view(bs, -1, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(k).view(bs, -1, self.n_heads, self.d_k).transpose(1, 2)
        # One value projection, broadcast (not copied) across all heads.
        V = self.W_v(v).unsqueeze(1).expand(-1, self.n_heads, -1, -1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = F.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        context = torch.matmul(attn, V)
        # Average over the head dimension.
        context = context.mean(dim=1)
        return self.out_proj(context), attn

class TemporalFusionTransformer(nn.Module):
    """Compact TFT-style forecaster: variable selection, LSTM encoder/decoder, attention.

    Args:
        num_observed: channel count of the context-window input.
        num_known_future: channel count of the horizon input.
        d_model: hidden width used throughout the network.
        n_heads: number of attention heads.
        n_lstm_layers: layers in each LSTM (inter-layer dropout only when > 1).
        pred_len: number of trailing time steps returned as the forecast.
        dropout: dropout probability for the sub-modules.
    """

    def __init__(self, num_observed, num_known_future, d_model=32, n_heads=4,
                 n_lstm_layers=1, pred_len=24, dropout=0.1):
        super().__init__()
        self.pred_len = pred_len
        self.d_model = d_model
        # nn.LSTM only applies dropout between stacked layers.
        lstm_dropout = dropout if n_lstm_layers > 1 else 0

        self.obs_vsn = VariableSelectionNetwork(num_observed, d_model, dropout)
        self.fut_vsn = VariableSelectionNetwork(num_known_future, d_model, dropout)
        self.encoder_lstm = nn.LSTM(d_model, d_model, n_lstm_layers, batch_first=True,
                                     dropout=lstm_dropout)
        self.decoder_lstm = nn.LSTM(d_model, d_model, n_lstm_layers, batch_first=True,
                                     dropout=lstm_dropout)
        self.lstm_gate = GatedResidualNetwork(d_model, d_model, d_model, dropout)
        self.attention = InterpretableMultiHeadAttention(d_model, n_heads, dropout)
        self.attn_gate = GatedResidualNetwork(d_model, d_model, d_model, dropout)
        self.output_proj = nn.Linear(d_model, 1)

    def forward(self, x_observed, x_future):
        """Return ``(output, enc_weights, attn_weights)``.

        ``output`` is one value per step for the last ``pred_len`` steps,
        ``enc_weights`` are the variable-selection weights of the context
        window, and ``attn_weights`` are the attention weights over the
        concatenated context + horizon sequence.
        """
        past_emb, enc_weights = self.obs_vsn(x_observed)
        future_emb, _ = self.fut_vsn(x_future)  # horizon selection weights are not returned

        # The decoder LSTM starts from the encoder's final (h, c) state.
        enc_out, enc_state = self.encoder_lstm(past_emb)
        dec_out, _ = self.decoder_lstm(future_emb, enc_state)

        # Gate the LSTM outputs and add the pre-LSTM embeddings back as a residual.
        lstm_seq = torch.cat([enc_out, dec_out], dim=1)
        emb_seq = torch.cat([past_emb, future_emb], dim=1)
        temporal = self.lstm_gate(lstm_seq) + emb_seq

        # Self-attention over the whole sequence; no mask is passed.
        attended, attn_weights = self.attention(temporal, temporal, temporal)
        fused = self.attn_gate(attended) + temporal

        horizon = fused[:, -self.pred_len:, :]
        return self.output_proj(horizon).squeeze(-1), enc_weights, attn_weights

# Hyper-parameters for the TFT, collected in one place before construction.
_tft_kwargs = dict(
    num_observed=num_observed,
    num_known_future=num_future,
    d_model=32,
    n_heads=4,
    n_lstm_layers=1,
    pred_len=prediction_length,
    dropout=0.1,
)
model = TemporalFusionTransformer(**_tft_kwargs).to(device)

# Total element count over every registered parameter tensor.
_n_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {_n_params:,}")
print(f"On device: {device}")

Train with MSE loss, cosine annealing LR

In [None]:
n_epochs = 30

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# T_max is tied to n_epochs so the cosine schedule always spans exactly the
# training run; previously the two were separate literals that could drift apart.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)
criterion = nn.MSELoss()

for epoch in range(n_epochs):
    model.train()
    losses = []
    for x_obs, x_fut, y in train_loader:
        x_obs, x_fut, y = x_obs.to(device), x_fut.to(device), y.to(device)
        output, _, _ = model(x_obs, x_fut)
        loss = criterion(output, y)
        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        losses.append(loss.item())

    # Read the learning rate BEFORE stepping the scheduler so the log shows the
    # rate this epoch actually trained with, not the next epoch's rate.
    epoch_lr = optimizer.param_groups[0]['lr']
    scheduler.step()
    avg_loss = np.mean(losses)
    # Log the first epoch and every fifth epoch.
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss:.4f}, LR: {epoch_lr:.5f}")

print('Training complete')

Generate 24h-ahead forecasts on 2018 test set

In [None]:
model.eval()
# The test span is every row after the training rows.
test_start = train_end
test_end = len(observed_data)

all_preds = []
all_actuals = []
all_times = []

with torch.no_grad():
    # Non-overlapping windows of prediction_length rows. The "+ 1" keeps the last
    # full window when the test span is an exact multiple of prediction_length
    # (a window starting at i is complete when i + prediction_length <= test_end).
    for i in range(test_start, test_end - prediction_length + 1, prediction_length):
        # Skip windows without a full context. The evaluation, plotting and
        # export cells recompute window starts as test_start + k * prediction_length,
        # so they rely on this guard never firing.
        if i - context_length < 0:
            continue

        x_obs = torch.from_numpy(observed_data[i - context_length : i]).unsqueeze(0).to(device)
        x_fut = torch.from_numpy(future_data[i : i + prediction_length]).unsqueeze(0).to(device)

        pred_norm, _, _ = model(x_obs, x_fut)

        # Denormalize to MW. squeeze(0) removes only the batch dimension, so the
        # result stays 1-D even when prediction_length == 1.
        pred_mw = pred_norm.squeeze(0).cpu().numpy() * target_std + target_mean
        actual_mw = target_norm[i : i + prediction_length] * target_std + target_mean
        times = df['time'].iloc[i : i + prediction_length].values

        all_preds.append(pred_mw)
        all_actuals.append(actual_mw)
        all_times.append(times)

print(f"Generated {len(all_preds)} forecast windows across 2018")

Evaluate: MAE, RMSE, MAPE vs TSO baseline

In [None]:
def _point_metrics(actual, forecast, mape_mask):
    """Return ``(MAE, RMSE, MAPE %)`` of ``forecast`` against ``actual``.

    MAPE is computed only over the positions selected by ``mape_mask``.
    """
    err = actual - forecast
    mae = np.mean(np.abs(err))
    rmse = np.sqrt(np.mean(err ** 2))
    mape = np.mean(np.abs(err[mape_mask] / actual[mape_mask])) * 100
    return mae, rmse, mape


# Flatten the per-window arrays into one long series each.
flat_preds = np.concatenate(all_preds)
flat_actuals = np.concatenate(all_actuals)

# MAPE only uses hours with more than 10 MW of actual generation, which keeps
# near-zero denominators out of the percentage error.
mask_nonzero = flat_actuals > 10

# TFT metrics
tft_mae, tft_rmse, tft_mape = _point_metrics(flat_actuals, flat_preds, mask_nonzero)

# TSO baseline metrics. Window k is assumed to start at
# test_start + k * prediction_length, i.e. the forecasting loop skipped no windows.
tso_values = [
    df[tso_col].iloc[begin : begin + prediction_length].values
    for begin in range(test_start, test_start + len(all_preds) * prediction_length, prediction_length)
]
flat_tso = np.concatenate(tso_values)
tso_mae, tso_rmse, tso_mape = _point_metrics(flat_actuals, flat_tso, mask_nonzero)

# Improvement: relative error reduction of the TFT over the TSO baseline, in percent.
mae_imp = (1 - tft_mae / tso_mae) * 100
rmse_imp = (1 - tft_rmse / tso_rmse) * 100

print(f"{'Metric':<10} {'TFT':>10} {'TSO':>10} {'Improv.':>10}")
print('-' * 42)
print(f"{'MAE (MW)':<10} {tft_mae:>10.1f} {tso_mae:>10.1f} {mae_imp:>9.1f}%")
print(f"{'RMSE (MW)':<10} {tft_rmse:>10.1f} {tso_rmse:>10.1f} {rmse_imp:>9.1f}%")
print(f"{'MAPE (%)':<10} {tft_mape:>10.1f} {tso_mape:>10.1f}")

Predicted vs actual — sample week

In [None]:
# Forecast windows (indices into all_preds) shown in the sample plot; seven
# consecutive 24 h windows make one week.
sample_windows = range(9, 16)

fig, ax = plt.subplots(figsize=(14, 5))

hours_offset = 0
for w in sample_windows:
    if w >= len(all_preds):
        break
    # Only the first plotted window carries legend labels. Keying this on the
    # running offset (rather than the literal 9) keeps the legend correct when
    # sample_windows is changed.
    is_first = hours_offset == 0
    x_range = range(hours_offset, hours_offset + prediction_length)
    ax.plot(x_range, all_actuals[w], color='#1a1a2e', linewidth=1.5,
            label='Actual' if is_first else None)
    ax.plot(x_range, all_preds[w], color='coral', linewidth=1.5,
            label='TFT Predicted' if is_first else None)
    # TSO forecast for the same rows; assumes window w starts at
    # test_start + w * prediction_length.
    tso_idx = test_start + w * prediction_length
    tso_vals = df[tso_col].iloc[tso_idx : tso_idx + prediction_length].values
    ax.plot(x_range, tso_vals, color='grey', linestyle='--', linewidth=1,
            label='TSO Forecast' if is_first else None)
    hours_offset += prediction_length

ax.set_xlabel('Hours')
ax.set_ylabel('MW')
ax.set_title('TFT Solar Generation — Predicted vs Actual (Sample Week 2018)')
ax.legend()
plt.tight_layout()
plt.show()

Export JSON for dashboard

In [None]:
# Output directory for the exported JSON (`os` and `math` are imported in the
# notebook's top import cell).
out_dir = '../dashboard/public/data'
os.makedirs(out_dir, exist_ok=True)


def _json_number(value, ndigits=1):
    """Round ``value`` for export; map NaN/inf to None so the file is strict JSON.

    ``json.dump`` would otherwise write bare ``NaN``/``Infinity`` tokens, which
    strict JSON parsers reject.
    """
    value = float(value)
    return round(value, ndigits) if math.isfinite(value) else None


# Build sample data from sample windows
sample_data = []
for w in sample_windows:
    if w >= len(all_preds):
        break
    # TSO forecast for the same rows; assumes window w starts at
    # test_start + w * prediction_length.
    tso_idx = test_start + w * prediction_length
    tso_vals = df[tso_col].iloc[tso_idx : tso_idx + prediction_length].values
    times = all_times[w]
    for h in range(prediction_length):
        stamp = pd.Timestamp(times[h])
        sample_data.append({
            'time': stamp.strftime('%Y-%m-%d %H:%M'),
            'actual': _json_number(all_actuals[w][h]),
            'predicted': _json_number(all_preds[w][h]),
            'tso': _json_number(tso_vals[h]),
        })

output = {
    'target': 'solar',
    'model': 'TFT (Temporal Fusion Transformer)',
    'prediction_length_hours': prediction_length,
    'context_length_hours': context_length,
    'metrics': {
        'mae': _json_number(tft_mae),
        'rmse': _json_number(tft_rmse),
        'mape': _json_number(tft_mape),
        'tso_mae': _json_number(tso_mae),
        'tso_rmse': _json_number(tso_rmse),
        'tso_mape': _json_number(tso_mape),
        'mae_improvement_pct': _json_number(mae_imp),
        'rmse_improvement_pct': _json_number(rmse_imp),
    },
    'sample_data': sample_data,
}

# allow_nan=False makes json.dump raise instead of emitting non-standard tokens,
# should a non-finite float ever reach it by another route.
with open(os.path.join(out_dir, 'tft_solar.json'), 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, allow_nan=False)

print(f"Saved tft_solar.json ({len(sample_data)} sample hours)")
print(f"TFT MAE: {output['metrics']['mae']} MW")
print(f"TSO MAE: {output['metrics']['tso_mae']} MW")
print(f"Improvement: {output['metrics']['mae_improvement_pct']}%")