# 04 — LSTM global con exógenas

**Objetivo:** forecasting de `Weekly_Sales` semanal por `Store` usando LSTM global con covariables exógenas.

## Supuesto experimental (oracle exog)
Se asume disponibilidad de todas las covariables exógenas durante el horizonte de predicción (escenario oracle).

## Outputs estándar
- `outputs/predictions/lstm_exog_predictions.csv` con: `Store, Date, y_true, y_pred, model`
- `outputs/metrics/lstm_exog_metrics_global.csv`
- `outputs/metrics/lstm_exog_metrics_by_store.csv`
- `outputs/figures/lstm_exog_plot_*.png`

In [1]:
# 0) Imports y configuración
from __future__ import annotations

import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.common import (
    compute_metrics,
    load_data,
    make_features,
    save_outputs,
    temporal_split,
)

MODEL_NAME = 'lstm_exog'
SEED = 42
np.random.seed(SEED)

DATA_PATH = PROJECT_ROOT / 'data' / 'Walmart_Sales.csv'
METADATA_PATH = PROJECT_ROOT / 'outputs' / 'metadata.json'
OUTPUTS_DIR = PROJECT_ROOT / 'outputs'

## 1) Cargar metadata (split + features)
Esto garantiza consistencia entre modelos.

In [2]:
metadata = json.loads(METADATA_PATH.read_text(encoding='utf-8'))
split = metadata['split']
feature_cols = metadata['features']
print('Split:', split)
print('N features:', len(feature_cols))

Split: {'train_start': '2010-02-05', 'train_end': '2012-07-06', 'val_start': '2012-07-13', 'val_end': '2012-08-31', 'test_start': '2012-09-07', 'test_end': '2012-10-26'}
N features: 16


## 2) Carga de datos + features
- Parseo/orden
- Construcción de lags/rolling (sin leakage)
- Exógenas alineadas por fecha

In [3]:
df = load_data(DATA_PATH)
df_feat, _ = make_features(df, add_calendar=True)

# Importante: para entrenar, debes decidir cómo tratar NaNs creados por lags/rolling
# Opción típica: descartar filas con NaNs en features (por store al inicio)
model_df = df_feat.dropna(subset=feature_cols + ['Weekly_Sales']).copy()
model_df.shape

(4095, 22)

## 3) Split temporal
Reutiliza exactamente el split definido en el notebook 00.

In [4]:
train_df, val_df, test_df, split_cfg = temporal_split(df)

# Aplicar el split sobre model_df (ya sin NaNs por lags)
train = model_df[model_df['Date'].between(split_cfg.train_start, split_cfg.train_end)].copy()
val = model_df[model_df['Date'].between(split_cfg.val_start, split_cfg.val_end)].copy()
test = model_df[model_df['Date'].between(split_cfg.test_start, split_cfg.test_end)].copy()

print(len(train), len(val), len(test))

3375 360 360


## 4) Entrenamiento del modelo
Implementación LSTM global con covariables exógenas.
Incluye representaciones de Store (one-hot o embedding).

In [5]:
# TODO: implementar entrenamiento LSTM con exógenas
# Debe producir predicciones para TEST (ideal: también para VAL).
y_pred_test = np.full(shape=len(test), fill_value=test['Weekly_Sales'].mean())

## 5) Métricas (MAE, RMSE, sMAPE)
Se reporta:
- Global
- Por store

In [7]:
# Helper para construir features autoregresivas (sin leakage)

def _compute_feature_vector(y_hist, date, exog_row, cfg):

    lags = cfg["lags"]

    rollings = cfg["rollings"]

    add_calendar = bool(cfg.get("add_calendar", True))

    exog_cols = cfg["exog_cols"]

    feat = {}

    for k in lags:

        feat[f"lag_{k}"] = y_hist[-k] if len(y_hist) >= k else np.nan

    for w in rollings:

        if len(y_hist) >= w:

            window = np.array(y_hist[-w:], dtype=float)

            feat[f"roll_mean_{w}"] = float(window.mean())

            feat[f"roll_std_{w}"] = float(window.std(ddof=0))

        else:

            feat[f"roll_mean_{w}"] = np.nan

            feat[f"roll_std_{w}"] = np.nan

    for c in exog_cols:

        feat[c] = float(exog_row[c])

    if add_calendar:

        iso = pd.Timestamp(date).isocalendar()

        feat["weekofyear"] = int(iso.week)

        feat["month"] = int(pd.Timestamp(date).month)

        feat["year"] = int(pd.Timestamp(date).year)

    vec = [feat.get(c, np.nan) for c in feature_cols]

    return vec



# ... resto del código previo ...



from warnings import filterwarnings

filterwarnings("ignore")



try:

    import torch

    from torch import nn

    from torch.utils.data import DataLoader, TensorDataset

except Exception as exc:

    raise ImportError("PyTorch no está instalado. Instala con: pip install torch") from exc



from sklearn.preprocessing import StandardScaler



torch.manual_seed(SEED)

np.random.seed(SEED)



# Configuración base + search space controlado

EPOCHS_MAX = 200

PATIENCE_ES = 15

MIN_DELTA = 1e-4

CLIP_NORM = 1.0

WEIGHT_DECAY = 1e-4

REDUCE_LR_FACTOR = 0.5

REDUCE_LR_PATIENCE = 5

EMB_DIM = 16

WINDOW = 52



# Grilla pequeña (<=8 configs)

lstm_search = [

    {"hidden_size": 64, "num_layers": 2, "dropout": 0.1, "lr": 1e-3},

    {"hidden_size": 64, "num_layers": 2, "dropout": 0.2, "lr": 3e-4},

    {"hidden_size": 128, "num_layers": 2, "dropout": 0.1, "lr": 3e-4},

    {"hidden_size": 128, "num_layers": 3, "dropout": 0.2, "lr": 1e-3},

]



# Anti-leakage: máscaras por fecha no deben solaparse

assert split_cfg.train_end < split_cfg.val_start < split_cfg.val_end < split_cfg.test_start <= split_cfg.test_end



# Mapping Store -> idx para embeddings

stores_sorted = sorted(model_df["Store"].unique())

store_to_idx = {s: i for i, s in enumerate(stores_sorted)}

num_stores = len(store_to_idx)



feature_cols_model = feature_cols  # lags/rolling/exógenas/calendario

DEFAULT_LAGS = sorted({int(c.split("_")[1]) for c in feature_cols if c.startswith("lag_")})

DEFAULT_ROLLINGS = sorted({int(c.split("_")[2]) for c in feature_cols if c.startswith("roll_mean_")})

EXOG_COLUMNS = [c for c in feature_cols if not c.startswith("lag_") and not c.startswith("roll_")]



# Escaladores SOLO con train

scaler_x = StandardScaler()

scaler_y = StandardScaler()

train_fit = train.copy()

scaler_x.fit(train_fit[feature_cols_model].values)

scaler_y.fit(train_fit[["Weekly_Sales"]].values)

assert hasattr(scaler_x, "mean_") and hasattr(scaler_y, "mean_")



def build_sequences(df_in: pd.DataFrame, window: int = WINDOW):

    sequences, targets, dates, stores_idx = [], [], [], []

    df_in = df_in.sort_values(["Store", "Date"]).copy()

    for store, g in df_in.groupby("Store"):

        g = g.sort_values("Date")

        X = scaler_x.transform(g[feature_cols_model].values)

        y = scaler_y.transform(g[["Weekly_Sales"]].values).ravel()

        date_arr = g["Date"].values

        s_idx = store_to_idx[int(store)]

        for t in range(window, len(g)):

            sequences.append(X[t - window : t])

            targets.append(y[t])

            dates.append(date_arr[t])

            stores_idx.append(s_idx)

    return (

        np.array(sequences),

        np.array(targets),

        np.array(dates),

        np.array(stores_idx, dtype=int),

    )



X_seq, y_seq, d_seq, s_seq = build_sequences(model_df, window=WINDOW)



# Split por fecha (sin leakage)

train_mask = (d_seq >= split_cfg.train_start) & (d_seq <= split_cfg.train_end)

val_mask = (d_seq >= split_cfg.val_start) & (d_seq <= split_cfg.val_end)

test_mask = (d_seq >= split_cfg.test_start) & (d_seq <= split_cfg.test_end)

assert not (train_mask & val_mask).any() and not (train_mask & test_mask).any() and not (val_mask & test_mask).any()



X_train, y_train, s_train = X_seq[train_mask], y_seq[train_mask], s_seq[train_mask]

X_val, y_val, s_val = X_seq[val_mask], y_seq[val_mask], s_seq[val_mask]

X_test_seq, y_test_seq, s_test_seq = X_seq[test_mask], y_seq[test_mask], s_seq[test_mask]

test_dates = d_seq[test_mask]

test_stores = s_seq[test_mask]



train_loader = DataLoader(

    TensorDataset(

        torch.tensor(X_train, dtype=torch.float32),

        torch.tensor(s_train, dtype=torch.long),

        torch.tensor(y_train, dtype=torch.float32),

    ),

    batch_size=64,

    shuffle=True,

)



val_loader = DataLoader(

    TensorDataset(

        torch.tensor(X_val, dtype=torch.float32),

        torch.tensor(s_val, dtype=torch.long),

        torch.tensor(y_val, dtype=torch.float32),

    ),

    batch_size=64,

    shuffle=False,

)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



class LSTMRegressor(nn.Module):

    def __init__(self, input_size: int, hidden_size: int, num_layers: int, dropout: float, emb_dim: int, num_stores: int):

        super().__init__()

        self.store_emb = nn.Embedding(num_stores, emb_dim)

        self.lstm = nn.LSTM(input_size + emb_dim, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, 1)



    def forward(self, x, store_idx):

        emb = self.store_emb(store_idx)

        emb_expanded = emb.unsqueeze(1).expand(-1, x.size(1), -1)

        x_cat = torch.cat([x, emb_expanded], dim=-1)

        out, _ = self.lstm(x_cat)

        last = out[:, -1, :]

        return self.fc(last).squeeze(-1)



def train_eval(config_run: dict):

    model = LSTMRegressor(

        input_size=X_train.shape[-1],

        hidden_size=config_run["hidden_size"],

        num_layers=config_run["num_layers"],

        dropout=config_run["dropout"],

        emb_dim=EMB_DIM,

        num_stores=num_stores,

    ).to(device)

    criterion = nn.MSELoss()


## 5) Métricas (MAE, RMSE, sMAPE)
Se reporta:
- Global
- Por store

In [8]:
pred_df = pd.DataFrame({
    'Store': test['Store'].astype(int).values,
    'Date': test['Date'].values,
    'y_true': test['Weekly_Sales'].values,
    'y_pred': np.asarray(y_pred_test, dtype=float),
    'model': MODEL_NAME,
})

global_metrics = compute_metrics(pred_df['y_true'].values, pred_df['y_pred'].values)
metrics_global_df = pd.DataFrame([{'model': MODEL_NAME, **global_metrics}])

by_store = []
for store, g in pred_df.groupby('Store'):
    m = compute_metrics(g['y_true'].values, g['y_pred'].values)
    by_store.append({'model': MODEL_NAME, 'Store': int(store), **m})
metrics_by_store_df = pd.DataFrame(by_store).sort_values('Store')

metrics_global_df, metrics_by_store_df.head()

(       model            MAE           RMSE      sMAPE
 0  lstm_exog  441124.924609  518987.216845  46.104407,
        model  Store           MAE          RMSE       sMAPE
 0  lstm_exog      1  5.320871e+05  5.377143e+05   41.427339
 1  lstm_exog      2  8.449641e+05  8.478403e+05   58.743041
 2  lstm_exog      3  6.008675e+05  6.010464e+05   84.246824
 3  lstm_exog      4  1.108559e+06  1.109581e+06   70.660465
 4  lstm_exog      5  6.927589e+05  6.929573e+05  103.826028)

## 6) Guardado de outputs estándar

In [14]:
paths = save_outputs(
    model_name=MODEL_NAME,
    predictions=pred_df,
    metrics_global=metrics_global_df,
    metrics_by_store=metrics_by_store_df,
    output_dir=OUTPUTS_DIR,
)
paths

{'predictions': 'c:\\Users\\usuario\\Documents\\Master AI\\TFM\\MEMORIA 2.0\\outputs\\predictions\\lstm_exog_predictions.csv',
 'metrics_global': 'c:\\Users\\usuario\\Documents\\Master AI\\TFM\\MEMORIA 2.0\\outputs\\metrics\\lstm_exog_metrics_global.csv',
 'metrics_by_store': 'c:\\Users\\usuario\\Documents\\Master AI\\TFM\\MEMORIA 2.0\\outputs\\metrics\\lstm_exog_metrics_by_store.csv'}

## 7) Figuras
- 3 tiendas: real vs predicción en test
- Distribución del error (`y_true - y_pred`)

Guardar PNGs en `outputs/figures/`.

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns

FIG_DIR = OUTPUTS_DIR / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Selección de 3 tiendas (mayor media de ventas en test)
top_stores = (
    pred_df.groupby("Store")["y_true"]
    .mean()
    .sort_values(ascending=False)
    .head(3)
    .index
    .tolist()
)

for store in top_stores:
    g = pred_df[pred_df["Store"] == store].sort_values("Date")
    plt.figure(figsize=(10, 4))
    plt.plot(g["Date"], g["y_true"], label="y_true")
    plt.plot(g["Date"], g["y_pred"], label="y_pred")
    plt.title(f"Store {store} — LSTM")
    plt.xlabel("Date")
    plt.ylabel("Weekly_Sales")
    plt.legend()
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"{MODEL_NAME}_plot_store_{store}.png", dpi=150)
    plt.close()

# Distribución de error
errors = pred_df["y_true"] - pred_df["y_pred"]
plt.figure(figsize=(8, 4))
sns.histplot(errors, bins=30, kde=True)
plt.title("Error distribution (y_true - y_pred)")
plt.xlabel("Error")
plt.tight_layout()
plt.savefig(FIG_DIR / f"{MODEL_NAME}_plot_error_dist.png", dpi=150)
plt.close()

In [12]:
# Reconstruir results_df si no está en memoria
import pandas as pd

if "results_df" not in globals():
    if "results" in globals() and len(results) > 0:
        rows = []
        for r in results:
            row = {k: v for k, v in r.items() if k not in {"state_dict", "model_state"}}
            cfg = row.pop("config", {})
            if isinstance(cfg, dict):
                row.update(cfg)
            rows.append(row)
        results_df = pd.DataFrame(rows)
    elif "metrics_global_df" in globals():
        # Fallback: usar métricas globales si no hubo búsqueda
        results_df = metrics_global_df.copy()
    else:
        raise ValueError("No hay datos para construir results_df; vuelve a correr la celda de entrenamiento.")

if not results_df.empty and all(col in results_df.columns for col in ["sMAPE", "MAE"]):
    results_df = results_df.sort_values(["sMAPE", "MAE"]).reset_index(drop=True)

results_df.head()


Unnamed: 0,model,MAE,RMSE,sMAPE
0,lstm_exog,441124.924609,518987.216845,46.104407


In [13]:
# Resumen rápido: mejor config y (si existe) baseline de 20 epochs

best_row = results_df.iloc[0]

print("Mejor config (ordenada por sMAPE):")

print(best_row)



import pathlib

baseline_path = pathlib.Path(OUTPUTS_DIR) / "metrics" / "lstm_exog_metrics_global.csv"

if baseline_path.exists():

    baseline_df = pd.read_csv(baseline_path)

    print("\nBaseline (EPOCHS=20) encontrado:")

    print(baseline_df)

    print("Comparación sMAPE delta:", float(best_row["sMAPE"]) - float(baseline_df.loc[0, "sMAPE"]))

else:

    print("\nBaseline (EPOCHS=20) no encontrado; ejecuta el baseline para comparar.")

Mejor config (ordenada por sMAPE):
model        lstm_exog
MAE      441124.924609
RMSE     518987.216845
sMAPE        46.104407
Name: 0, dtype: object

Baseline (EPOCHS=20) encontrado:
       model           MAE         RMSE     sMAPE
0  lstm_exog  75827.527682  98575.42425  8.016137
Comparación sMAPE delta: 38.088270612307355
