# 05 — Transformer global con exógenas

**Objetivo:** forecasting de `Weekly_Sales` semanal por `Store` usando Transformer global con covariables exógenas.

## Supuesto experimental (oracle exog)
Se asume disponibilidad de todas las covariables exógenas durante el horizonte de predicción (escenario oracle).

## Outputs estándar
- `outputs/predictions/transformer_exog_predictions.csv` con: `Store, Date, y_true, y_pred, model`
- `outputs/metrics/transformer_exog_metrics_global.csv`
- `outputs/metrics/transformer_exog_metrics_by_store.csv`
- `outputs/figures/transformer_exog_plot_*.png`

In [1]:
# 0) Imports y configuración
from __future__ import annotations

import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.common import (
    compute_metrics,
    load_data,
    make_features,
    save_outputs,
    temporal_split,
)

MODEL_NAME = 'transformer_exog'
SEED = 42
np.random.seed(SEED)

DATA_PATH = PROJECT_ROOT / 'data' / 'Walmart_Sales.csv'
METADATA_PATH = PROJECT_ROOT / 'outputs' / 'metadata.json'
OUTPUTS_DIR = PROJECT_ROOT / 'outputs'

## 1) Cargar metadata (split + features)
Esto garantiza consistencia entre modelos.

In [2]:
metadata = json.loads(METADATA_PATH.read_text(encoding='utf-8'))
split = metadata['split']
feature_cols = metadata['features']
print('Split:', split)
print('N features:', len(feature_cols))

Split: {'train_start': '2010-02-05', 'train_end': '2011-12-02', 'val_start': '2011-12-09', 'val_end': '2012-01-27', 'test_start': '2012-02-03', 'test_end': '2012-10-26'}
N features: 19


## 2) Carga de datos + features
- Parseo/orden
- Construcción de lags/rolling (sin leakage)
- Exógenas alineadas por fecha

In [3]:
df = load_data(DATA_PATH)
df_feat, _ = make_features(df, add_calendar=True)

# Importante: para entrenar, debes decidir cómo tratar NaNs creados por lags/rolling
# Opción típica: descartar filas con NaNs en features (por store al inicio)
model_df = df_feat.dropna(subset=feature_cols + ['Weekly_Sales']).copy()
model_df.shape

(4095, 22)

## 3) Split temporal
Reutiliza exactamente el split definido en el notebook 00.

In [4]:
train_df, val_df, test_df, split_cfg = temporal_split(df)

# Aplicar el split sobre model_df (ya sin NaNs por lags)
train = model_df[model_df['Date'].between(split_cfg.train_start, split_cfg.train_end)].copy()
val = model_df[model_df['Date'].between(split_cfg.val_start, split_cfg.val_end)].copy()
test = model_df[model_df['Date'].between(split_cfg.test_start, split_cfg.test_end)].copy()

print(len(train), len(val), len(test))

1980 360 1755


## 4) Entrenamiento del modelo
Implementación Transformer global con covariables exógenas.
Incluye representaciones de Store (one-hot o embedding).

In [5]:
# TODO: implementar entrenamiento Transformer con exógenas
# Debe producir predicciones para TEST (ideal: también para VAL).
y_pred_test = np.full(shape=len(test), fill_value=test['Weekly_Sales'].mean())

## 5) Métricas (MAE, RMSE, sMAPE)
Se reporta:
- Global
- Por store

In [6]:
from warnings import filterwarnings



filterwarnings("ignore")



try:

    import torch

    from torch import nn

    from torch.utils.data import DataLoader, TensorDataset

except Exception as exc:

    raise ImportError(

        "PyTorch no está instalado. Instala con: pip install torch"

    ) from exc



from sklearn.preprocessing import StandardScaler



torch.manual_seed(SEED)

np.random.seed(SEED)



# Helper para features autoregresivas (sin leakage)

def _compute_feature_vector(y_hist, date, exog_row, cfg):

    lags = cfg["lags"]

    rollings = cfg["rollings"]

    add_calendar = bool(cfg.get("add_calendar", True))

    exog_cols = cfg["exog_cols"]

    feat = {}

    for k in lags:

        feat[f"lag_{k}"] = y_hist[-k] if len(y_hist) >= k else np.nan

    for w in rollings:

        if len(y_hist) >= w:

            window = np.array(y_hist[-w:], dtype=float)

            feat[f"roll_mean_{w}"] = float(window.mean())

            feat[f"roll_std_{w}"] = float(window.std(ddof=0))

        else:

            feat[f"roll_mean_{w}"] = np.nan

            feat[f"roll_std_{w}"] = np.nan

    for c in exog_cols:

        feat[c] = float(exog_row[c])

    if add_calendar:
        iso = pd.Timestamp(date).isocalendar()
        week_val = iso.week if hasattr(iso, 'week') else iso[1]
        year_val = iso.year if hasattr(iso, 'year') else iso[0]
        feat["weekofyear"] = int(week_val)
        feat["month"] = int(pd.Timestamp(date).month)
        feat["year"] = int(year_val)

    vec = [feat.get(c, np.nan) for c in feature_cols]

    return vec



# Configuración base + search space

EPOCHS_MAX = 200

PATIENCE_ES = 15

MIN_DELTA = 1e-4

CLIP_NORM = 1.0

WEIGHT_DECAY = 1e-4

REDUCE_LR_FACTOR = 0.5

REDUCE_LR_PATIENCE = 5

EMB_DIM = 16

# Secuencia autoregresiva (ventana).
# Nota: con splits por fecha, una ventana demasiado grande puede dejar el train vacío (num_samples=0).
WINDOW_CANDIDATES = [52, 26, 13]
FORCE_WINDOW = 26  # ventana fija (pon None para auto)
WINDOW = int(FORCE_WINDOW) if FORCE_WINDOW is not None else int(WINDOW_CANDIDATES[0])



transformer_search = [

    {"d_model": 64, "nhead": 4, "num_layers": 2, "dropout": 0.1, "lr": 1e-3},

    {"d_model": 64, "nhead": 4, "num_layers": 2, "dropout": 0.2, "lr": 3e-4},

    {"d_model": 128, "nhead": 8, "num_layers": 4, "dropout": 0.1, "lr": 3e-4},

    {"d_model": 128, "nhead": 8, "num_layers": 2, "dropout": 0.2, "lr": 1e-3},

]



# Anti-leakage

assert split_cfg.train_end < split_cfg.val_start < split_cfg.val_end < split_cfg.test_start <= split_cfg.test_end



# Mapping Store -> idx para embeddings

stores_sorted = sorted(model_df["Store"].unique())

store_to_idx = {s: i for i, s in enumerate(stores_sorted)}

num_stores = len(store_to_idx)



feature_cols_model = feature_cols
DEFAULT_LAGS = sorted({int(c.split("_")[1]) for c in feature_cols if c.startswith("lag_")})
DEFAULT_ROLLINGS = sorted({int(c.split("_")[2]) for c in feature_cols if c.startswith("roll_mean_")})
EXOG_COLUMNS = [c for c in feature_cols if not c.startswith("lag_") and not c.startswith("roll_")]



# Escaladores SOLO con train

scaler_x = StandardScaler()

scaler_y = StandardScaler()

train_fit = train.copy()

scaler_x.fit(train_fit[feature_cols_model].values)

scaler_y.fit(train_fit[["Weekly_Sales"]].values)

assert hasattr(scaler_x, "mean_") and hasattr(scaler_y, "mean_")



def build_sequences(df_in: pd.DataFrame, window: int = WINDOW):

    sequences, targets, dates, stores_idx = [], [], [], []

    df_in = df_in.sort_values(["Store", "Date"]).copy()

    for store, g in df_in.groupby("Store"):

        g = g.sort_values("Date")

        X = scaler_x.transform(g[feature_cols_model].values)

        y = scaler_y.transform(g[["Weekly_Sales"]].values).ravel()

        date_arr = g["Date"].values

        s_idx = store_to_idx[int(store)]

        for t in range(window, len(g)):

            sequences.append(X[t - window : t])

            targets.append(y[t])

            dates.append(date_arr[t])

            stores_idx.append(s_idx)

    return (

        np.array(sequences),

        np.array(targets),

        np.array(dates),

        np.array(stores_idx, dtype=int),

    )



def _try_build_for_window(window: int):
    X_seq, y_seq, d_seq, s_seq = build_sequences(model_df, window=window)
    train_mask = (d_seq >= split_cfg.train_start) & (d_seq <= split_cfg.train_end)
    val_mask = (d_seq >= split_cfg.val_start) & (d_seq <= split_cfg.val_end)
    test_mask = (d_seq >= split_cfg.test_start) & (d_seq <= split_cfg.test_end)
    return X_seq, y_seq, d_seq, s_seq, train_mask, val_mask, test_mask

# Reporte de muestras por ventana (útil para justificar decisiones)
window_report = []
for w in WINDOW_CANDIDATES:
    _X, _y, _d, _s, _tr, _va, _te = _try_build_for_window(w)
    window_report.append({"WINDOW": w, "n_train": int(_tr.sum()), "n_val": int(_va.sum()), "n_test": int(_te.sum())})
window_report_df = pd.DataFrame(window_report)
display(window_report_df)

# Selección de WINDOW
if FORCE_WINDOW is not None:
    WINDOW = int(FORCE_WINDOW)
    if WINDOW not in WINDOW_CANDIDATES:
        raise ValueError(f"FORCE_WINDOW={WINDOW} no está en WINDOW_CANDIDATES={WINDOW_CANDIDATES}")
    row = window_report_df[window_report_df["WINDOW"] == WINDOW].iloc[0]
    if int(row.n_train) <= 0 or int(row.n_val) <= 0 or int(row.n_test) <= 0:
        raise ValueError(
            "WINDOW forzada no es viable (alguna partición queda vacía). "
            f"FORCE_WINDOW={WINDOW}; conteos: train/val/test={int(row.n_train)}/{int(row.n_val)}/{int(row.n_test)}"
        )
    print(f"[Transformer] WINDOW forzada: {WINDOW} (train/val/test: {int(row.n_train)}/{int(row.n_val)}/{int(row.n_test)})")
else:
    WINDOW = None
    for w in WINDOW_CANDIDATES:
        row = window_report_df[window_report_df["WINDOW"] == w].iloc[0]
        if int(row.n_train) > 0 and int(row.n_val) > 0 and int(row.n_test) > 0:
            WINDOW = int(w)
            break
    if WINDOW is None:
        msg = (
            "No hay muestras tras construir secuencias para ninguna WINDOW candidata.\n"
            f"Candidatas: {WINDOW_CANDIDATES}\n"
            f"Split train: {split_cfg.train_start} -> {split_cfg.train_end}\n"
            f"Split val:   {split_cfg.val_start} -> {split_cfg.val_end}\n"
            f"Split test:  {split_cfg.test_start} -> {split_cfg.test_end}\n"
            "Sugerencia: reduce WINDOW, revisa que Date sea datetime64, o ajusta el split."
        )
        raise ValueError(msg)
    row = window_report_df[window_report_df["WINDOW"] == WINDOW].iloc[0]
    print(f"[Transformer] WINDOW seleccionada automáticamente: {WINDOW} (train/val/test: {int(row.n_train)}/{int(row.n_val)}/{int(row.n_test)})")

# Construir secuencias con la WINDOW elegida
X_seq, y_seq, d_seq, s_seq, train_mask, val_mask, test_mask = _try_build_for_window(WINDOW)

# Split por fecha (sin leakage)
assert not (train_mask & val_mask).any() and not (train_mask & test_mask).any() and not (val_mask & test_mask).any()

X_train, y_train, s_train = X_seq[train_mask], y_seq[train_mask], s_seq[train_mask]

X_val, y_val, s_val = X_seq[val_mask], y_seq[val_mask], s_seq[val_mask]

X_test_seq, y_test_seq, s_test_seq = X_seq[test_mask], y_seq[test_mask], s_seq[test_mask]

test_dates = d_seq[test_mask]

test_stores = s_seq[test_mask]



train_loader = DataLoader(

    TensorDataset(

        torch.tensor(X_train, dtype=torch.float32),

        torch.tensor(s_train, dtype=torch.long),

        torch.tensor(y_train, dtype=torch.float32),

    ),

    batch_size=64,

    shuffle=True,

)



val_loader = DataLoader(

    TensorDataset(

        torch.tensor(X_val, dtype=torch.float32),

        torch.tensor(s_val, dtype=torch.long),

        torch.tensor(y_val, dtype=torch.float32),

    ),

    batch_size=64,

    shuffle=False,

)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



class TransformerRegressor(nn.Module):

    def __init__(self, input_size: int, d_model: int, nhead: int, num_layers: int, dropout: float, max_len: int, emb_dim: int, num_stores: int):

        super().__init__()

        self.input_proj = nn.Linear(input_size, d_model)

        self.store_emb = nn.Embedding(num_stores, emb_dim)

        self.store_proj = nn.Linear(emb_dim, d_model)

        self.pos_emb = nn.Embedding(max_len, d_model)

        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True)

        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(d_model, 1)

        self.max_len = max_len



    def forward(self, x, store_idx):

        b, t, _ = x.size()

        pos_idx = torch.arange(t, device=x.device)

        pos = self.pos_emb(pos_idx).unsqueeze(0).expand(b, t, -1)

        emb = self.store_proj(self.store_emb(store_idx)).unsqueeze(1).expand(b, t, -1)

        h = self.input_proj(x) + emb + pos

        h = self.encoder(h)

        out = h[:, -1, :]

        return self.fc(out).squeeze(-1)



def train_eval(config_run: dict):

    model = TransformerRegressor(

        input_size=X_train.shape[-1],

        d_model=config_run["d_model"],

        nhead=config_run["nhead"],

        num_layers=config_run["num_layers"],

        dropout=config_run["dropout"],

        max_len=WINDOW,

        emb_dim=EMB_DIM,

        num_stores=num_stores,

    ).to(device)



    criterion = nn.MSELoss()

    optimizer = torch.optim.Adam(

        model.parameters(), lr=config_run["lr"], weight_decay=WEIGHT_DECAY

    )

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(

        optimizer, mode="min", factor=REDUCE_LR_FACTOR, patience=REDUCE_LR_PATIENCE, verbose=False

    )



    best_val = float("inf")

    best_state = None

    epochs_no_improve = 0

    history = []



    for epoch in range(EPOCHS_MAX):

        model.train()

        train_losses = []

        for xb, sb, yb in train_loader:

            xb, sb, yb = xb.to(device), sb.to(device), yb.to(device)

            optimizer.zero_grad()

            preds = model(xb, sb)

            loss = criterion(preds, yb)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)

            optimizer.step()

            train_losses.append(loss.item())



        model.eval()

        val_losses = []

        with torch.no_grad():

            for xb, sb, yb in val_loader:

                xb, sb, yb = xb.to(device), sb.to(device), yb.to(device)

                preds = model(xb, sb)

                val_losses.append(criterion(preds, yb).item())



        train_loss = float(np.mean(train_losses))

        val_loss = float(np.mean(val_losses)) if val_losses else train_loss

        scheduler.step(val_loss)

        current_lr = optimizer.param_groups[0]["lr"]

        history.append((epoch + 1, train_loss, val_loss, current_lr))



        if val_loss + MIN_DELTA < best_val:

            best_val = val_loss

            best_state = {k: v.cpu() for k, v in model.state_dict().items()}

            epochs_no_improve = 0

        else:

            epochs_no_improve += 1



        if (epoch + 1) % 5 == 0:

            print(f"Epoch {epoch+1}/{EPOCHS_MAX} | train={train_loss:.4f} | val={val_loss:.4f} | lr={current_lr:.2e}")



        if epochs_no_improve >= PATIENCE_ES:

            print(f"Early stopping at epoch {epoch+1} (best val {best_val:.4f})")

            break



    if best_state is not None:

        model.load_state_dict({k: v.to(device) for k, v in best_state.items()})



    return model, best_val, history



def predict_test(model):

    model.eval()

    preds = []

    for store, g_future in test.groupby("Store"):

        g_future = g_future.sort_values("Date")

        g_hist = train[train["Store"] == store].sort_values("Date")

        if g_hist.empty:

            continue

        y_hist = g_hist["Weekly_Sales"].tolist()

        feat_hist = []

        for _, row in g_hist.iterrows():

            feat_hist.append(

                _compute_feature_vector(

                    y_hist[: g_hist.index.get_loc(row.name) + 1],

                    row["Date"],

                    row,

                    {'lags': DEFAULT_LAGS, 'rollings': DEFAULT_ROLLINGS, 'add_calendar': True, 'exog_cols': EXOG_COLUMNS, 'feature_cols': feature_cols},

                )

            )



        store_preds = []

        s_idx = store_to_idx[int(store)]

        for _, row in g_future.iterrows():

            feat_vec = _compute_feature_vector(

                y_hist,

                row["Date"],

                row,

                {'lags': DEFAULT_LAGS, 'rollings': DEFAULT_ROLLINGS, 'add_calendar': True, 'exog_cols': EXOG_COLUMNS, 'feature_cols': feature_cols},

            )

            feat_hist.append(feat_vec)

            seq = np.array(feat_hist[-WINDOW :], dtype=float)

            seq = np.nan_to_num(seq, nan=train["Weekly_Sales"].mean())

            seq_scaled = scaler_x.transform(seq)

            xb = torch.tensor(seq_scaled, dtype=torch.float32).unsqueeze(0).to(device)

            sb = torch.tensor([s_idx], dtype=torch.long).to(device)

            with torch.no_grad():

                yhat_scaled = model(xb, sb).cpu().numpy().ravel()[0]

            yhat = scaler_y.inverse_transform([[yhat_scaled]])[0][0]

            y_hist.append(float(yhat))

            store_preds.append(float(yhat))



        preds.append(

            pd.DataFrame(

                {

                    "Store": g_future["Store"].values,

                    "Date": g_future["Date"].values,

                    "y_pred": store_preds,

                }

            )

        )



    if not preds:

        return pd.DataFrame(columns=["Store", "Date", "y_pred"])

    return pd.concat(preds, ignore_index=True)



results = []

for cfg in transformer_search:

    print(f"\n=== Transformer config {cfg} ===")

    model, best_val, history = train_eval(cfg)

    pred_df = predict_test(model)

    pred_df = pred_df.merge(

        test[["Store", "Date", "Weekly_Sales"]], on=["Store", "Date"], how="left"

    ).rename(columns={"Weekly_Sales": "y_true"})

    name = f"transformer_exog__dm{cfg['d_model']}__nh{cfg['nhead']}__nl{cfg['num_layers']}__do{cfg['dropout']}__lr{cfg['lr']}"

    pred_df["model"] = name

    metrics = compute_metrics(pred_df["y_true"].values, pred_df["y_pred"].values)

    results.append({"cfg": cfg, "metrics": metrics, "name": name})

    save_outputs(

        model_name=name,

        predictions=pred_df,

        metrics_global=pd.DataFrame([{**{"model": name}, **metrics}]),

        metrics_by_store=pred_df.groupby("Store").apply(

            lambda g: pd.Series(compute_metrics(g["y_true"].values, g["y_pred"].values))

        ).reset_index().assign(model=name),

        output_dir=OUTPUTS_DIR,

    )



results_df = pd.DataFrame([

    {"model": r["name"], "MAE": r["metrics"]["MAE"], "RMSE": r["metrics"]["RMSE"], "sMAPE": r["metrics"]["sMAPE"]}

    for r in results

])

results_df = results_df.sort_values(["sMAPE", "MAE"]).reset_index(drop=True)

best_model_name = results_df.loc[0, "model"]

results_df.head()

Unnamed: 0,WINDOW,n_train,n_val,n_test
0,52,0,0,1755
1,26,810,360,1755
2,13,1395,360,1755


[Transformer] WINDOW forzada: 26 (train/val/test: 810/360/1755)

=== Transformer config {'d_model': 64, 'nhead': 4, 'num_layers': 2, 'dropout': 0.1, 'lr': 0.001} ===


Epoch 5/200 | train=0.0578 | val=0.5159 | lr=1.00e-03


Epoch 10/200 | train=0.0383 | val=0.5500 | lr=5.00e-04


Epoch 15/200 | train=0.0264 | val=0.5677 | lr=5.00e-04


Early stopping at epoch 19 (best val 0.3670)



=== Transformer config {'d_model': 64, 'nhead': 4, 'num_layers': 2, 'dropout': 0.2, 'lr': 0.0003} ===


Epoch 5/200 | train=0.1074 | val=0.3016 | lr=3.00e-04


Epoch 10/200 | train=0.0795 | val=0.2925 | lr=3.00e-04


Epoch 15/200 | train=0.0518 | val=0.3070 | lr=1.50e-04


Epoch 20/200 | train=0.0403 | val=0.3041 | lr=7.50e-05


Early stopping at epoch 23 (best val 0.2724)



=== Transformer config {'d_model': 128, 'nhead': 8, 'num_layers': 4, 'dropout': 0.1, 'lr': 0.0003} ===


Epoch 5/200 | train=0.0975 | val=0.3997 | lr=3.00e-04


Epoch 10/200 | train=0.0617 | val=0.5137 | lr=1.50e-04


Epoch 15/200 | train=0.0260 | val=0.4057 | lr=7.50e-05


Early stopping at epoch 18 (best val 0.3882)



=== Transformer config {'d_model': 128, 'nhead': 8, 'num_layers': 2, 'dropout': 0.2, 'lr': 0.001} ===


Epoch 5/200 | train=0.0808 | val=0.3927 | lr=1.00e-03


Epoch 10/200 | train=0.0507 | val=0.6803 | lr=1.00e-03


Epoch 15/200 | train=0.0275 | val=0.4293 | lr=5.00e-04


Epoch 20/200 | train=0.0222 | val=0.4883 | lr=2.50e-04


Early stopping at epoch 23 (best val 0.3446)


Unnamed: 0,model,MAE,RMSE,sMAPE
0,transformer_exog__dm64__nh4__nl2__do0.1__lr0.001,95112.183075,126195.599627,10.259245
1,transformer_exog__dm128__nh8__nl4__do0.1__lr0....,111218.728259,142691.357145,14.172786
2,transformer_exog__dm128__nh8__nl2__do0.2__lr0.001,135133.266824,193913.299095,14.27225
3,transformer_exog__dm64__nh4__nl2__do0.2__lr0.0003,127755.594059,168160.650048,15.426463


## 5) Métricas (MAE, RMSE, sMAPE)
Se reporta:
- Global
- Por store

In [7]:
# Re-entrenar mejor config para guardar outputs consistentes
best_cfg = next(r["cfg"] for r in results if r["name"] == best_model_name)
print("Re-entrenando mejor config para outputs:", best_cfg)

best_model, best_val, _ = train_eval(best_cfg)

pred_df = predict_test(best_model)
pred_df = pred_df.merge(
    test[["Store", "Date", "Weekly_Sales"]], on=["Store", "Date"], how="left"
).rename(columns={"Weekly_Sales": "y_true"})
pred_df["model"] = best_model_name


Re-entrenando mejor config para outputs: {'d_model': 64, 'nhead': 4, 'num_layers': 2, 'dropout': 0.1, 'lr': 0.001}


Epoch 5/200 | train=0.0812 | val=0.4016 | lr=1.00e-03


Epoch 10/200 | train=0.0433 | val=0.3636 | lr=1.00e-03


Epoch 15/200 | train=0.0253 | val=0.3848 | lr=5.00e-04


Epoch 20/200 | train=0.0245 | val=0.3977 | lr=2.50e-04


Early stopping at epoch 22 (best val 0.3320)


In [8]:
# Resumen rápido: mejor config y (si existe) baseline de 20 epochs
if "metrics_global_df" in globals():
    best_row = metrics_global_df.iloc[0]
elif "results_df" in globals():
    best_row = results_df.iloc[0]
else:
    raise ValueError("No hay resultados para mostrar.")

print("Mejor config (ordenada por sMAPE):")
print(best_row)

# Intentar cargar baseline (si ya existe en outputs)
if "baseline_path" not in globals():
    import pathlib
    baseline_path = pathlib.Path(OUTPUTS_DIR) / "metrics" / "transformer_exog_metrics_global.csv"

if baseline_path.exists():
    baseline_df = pd.read_csv(baseline_path)
    print("\nBaseline (EPOCHS=20) encontrado:")
    print(baseline_df)
    print("Comparación sMAPE delta:", float(best_row["sMAPE"]) - float(baseline_df.loc[0, "sMAPE"]))
else:
    print("\nBaseline (EPOCHS=20) no encontrado; ejecuta el baseline para comparar.")


Mejor config (ordenada por sMAPE):
model    transformer_exog__dm64__nh4__nl2__do0.1__lr0.001
MAE                                          95112.183075
RMSE                                        126195.599627
sMAPE                                           10.259245
Name: 0, dtype: object

Baseline (EPOCHS=20) encontrado:
                                               model           MAE  \
0  transformer_exog__dm128__nh8__nl4__do0.1__lr0....  62559.516655   

           RMSE     sMAPE  
0  80319.226581  7.016417  
Comparación sMAPE delta: 3.2428278221706


## 6) Guardado de outputs estándar

In [9]:
# Métricas globales y por tienda
metrics_global = compute_metrics(pred_df["y_true"].values, pred_df["y_pred"].values)
metrics_global_df = pd.DataFrame([
    {"model": best_model_name, **metrics_global}
])

metrics_by_store_df = (
    pred_df.groupby("Store")
    .apply(lambda g: pd.Series(compute_metrics(g["y_true"].values, g["y_pred"].values)))
    .reset_index()
    .rename(columns={0: "MAE", 1: "RMSE", 2: "sMAPE"})
)
metrics_by_store_df.insert(0, "model", best_model_name)
metrics_by_store_df = metrics_by_store_df.sort_values("Store")

metrics_global_df, metrics_by_store_df.head()


(                                              model            MAE  \
 0  transformer_exog__dm64__nh4__nl2__do0.1__lr0.001  102597.771243   
 
             RMSE      sMAPE      WAPE  
 0  137821.155509  10.849608  0.098326  ,
                                               model  Store            MAE  \
 0  transformer_exog__dm64__nh4__nl2__do0.1__lr0.001      1  141721.285126   
 1  transformer_exog__dm64__nh4__nl2__do0.1__lr0.001      2  115566.480548   
 2  transformer_exog__dm64__nh4__nl2__do0.1__lr0.001      3   36653.131510   
 3  transformer_exog__dm64__nh4__nl2__do0.1__lr0.001      4  109806.163420   
 4  transformer_exog__dm64__nh4__nl2__do0.1__lr0.001      5   54412.150799   
 
             RMSE      sMAPE      WAPE  
 0  170548.724544   9.079771  0.088465  
 1  145903.086920   6.088872  0.060433  
 2   48110.459505   8.094447  0.086407  
 3  138641.377560   4.987412  0.050472  
 4   67176.554778  14.642130  0.163465  )

In [10]:
paths = save_outputs(
    model_name=MODEL_NAME,
    predictions=pred_df,
    metrics_global=metrics_global_df,
    metrics_by_store=metrics_by_store_df,
    output_dir=OUTPUTS_DIR,
)
paths

{'predictions': '/home/sagemaker-user/TFMAXEL/outputs/predictions/transformer_exog_predictions.csv',
 'metrics_global': '/home/sagemaker-user/TFMAXEL/outputs/metrics/transformer_exog_metrics_global.csv',
 'metrics_by_store': '/home/sagemaker-user/TFMAXEL/outputs/metrics/transformer_exog_metrics_by_store.csv'}

## 7) Figuras
- 3 tiendas: real vs predicción en test
- Distribución del error (`y_true - y_pred`)

Guardar PNGs en `outputs/figures/`.

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

FIG_DIR = OUTPUTS_DIR / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Selección de 3 tiendas (mayor media de ventas en test)
top_stores = (
    pred_df.groupby("Store")["y_true"]
    .mean()
    .sort_values(ascending=False)
    .head(3)
    .index
    .tolist()
)

for store in top_stores:
    g = pred_df[pred_df["Store"] == store].sort_values("Date")
    plt.figure(figsize=(10, 4))
    plt.plot(g["Date"], g["y_true"], label="y_true")
    plt.plot(g["Date"], g["y_pred"], label="y_pred")
    plt.title(f"Store {store} — Transformer")
    plt.xlabel("Date")
    plt.ylabel("Weekly_Sales")
    plt.legend()
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"{MODEL_NAME}_plot_store_{store}.png", dpi=150)
    plt.close()

# Distribución de error
errors = pred_df["y_true"] - pred_df["y_pred"]
plt.figure(figsize=(8, 4))
sns.histplot(errors, bins=30, kde=True)
plt.title("Error distribution (y_true - y_pred)")
plt.xlabel("Error")
plt.tight_layout()
plt.savefig(FIG_DIR / f"{MODEL_NAME}_plot_error_dist.png", dpi=150)
plt.close()