## Deep Sequence Models: LSTM vs GRU Bake-off


In [None]:
# ============================================
# Deep sequence models: LSTM vs GRU bake-off
# ============================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

plt.rcParams["figure.dpi"] = 130

# -----------------------------
# 0) Spark -> Pandas & features
# -----------------------------
# Use the same filtered dataset (no leakage after last failure)
train_df = df_final.filter(df_final["RUL_minutes"].isNotNull())
pdf = train_df.toPandas()

if "timestamp_bin" in pdf.columns:
    # Rows collected from Spark carry no ordering guarantee, yet the
    # shuffle=False split below (and any windowing done on top of it) is only
    # meaningful on chronologically ordered rows. mergesort is stable, so this
    # is a no-op when the frame already arrives in time order.
    # NOTE(review): if the classic models were fit on an unsorted frame, their
    # 80/20 cut will differ from this one -- confirm upstream ordering.
    pdf = pdf.sort_values("timestamp_bin", kind="mergesort").reset_index(drop=True)

    # Add cyclic time-of-day: minute-of-day mapped onto the unit circle so that
    # 23:59 and 00:00 end up next to each other.
    tod_min = pdf["timestamp_bin"].dt.hour * 60 + pdf["timestamp_bin"].dt.minute
    pdf["tod_sin"] = np.sin(2*np.pi * tod_min / 1440.0)
    pdf["tod_cos"] = np.cos(2*np.pi * tod_min / 1440.0)

# Columns that are either the target itself or derived from failure timing;
# they are excluded from the feature matrix.
drop_cols = [
    "timestamp_bin", "failure", "next_failure_time",
    "last_failure_time", "minutes_since_last_failure", "RUL_minutes"
]
y = pdf["RUL_minutes"].astype(float).to_numpy()

# Keep every remaining numeric column as a feature; +/-inf is treated as missing.
feat_cols = [c for c in pdf.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(pdf[c])]
X = pdf[feat_cols].replace([np.inf, -np.inf], np.nan).astype(float)

# time-ordered split identical to classic models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=False)

# Impute with TRAIN medians only. A feature that is entirely NaN in the train
# slice has a NaN median, which would leave NaNs in place; fall back to 0.0
# for those columns so no NaN reaches the network.
train_meds = X_train.median(numeric_only=True).fillna(0.0)
X_train = X_train.fillna(train_meds)
X_test  = X_test.fillna(train_meds)

# -----------------------------
# 1) Build sequences (same cut)
# -----------------------------
X_full = pd.concat([X_train, X_test], axis=0)[feat_cols].to_numpy(dtype=np.float32)
y_full = np.concatenate([y_train, y_test]).astype(np.float32)
cut = len(y_train)  # first index of the classical TEST set

SEQ_LEN = 30  # ~1 hour if your bins are 2 min; change if needed

if len(X_full) <= SEQ_LEN:
    raise ValueError(
        f"Need more than SEQ_LEN={SEQ_LEN} rows to build sequences, got {len(X_full)}"
    )

# Each sample is the SEQ_LEN rows *preceding* a target row: the window for
# target index t is X_full[t-SEQ_LEN:t] and the label is y_full[t].
tgt_idx = np.arange(SEQ_LEN, len(X_full))
Xs = np.stack([X_full[t - SEQ_LEN:t] for t in tgt_idx]).astype(np.float32, copy=False)
ys = y_full[tgt_idx].astype(np.float32, copy=False)

# Split sequences so that the target belongs to train/test by the same time cut
mask_test = tgt_idx >= cut
Xseq_tr, Yseq_tr = Xs[~mask_test], ys[~mask_test]
Xseq_te, Yseq_te = Xs[mask_test],  ys[mask_test]

if len(Xseq_tr) == 0:
    raise ValueError(
        f"No training sequences: the train split has {cut} rows but SEQ_LEN={SEQ_LEN}"
    )

# Normalize using TRAIN only (per-feature statistics pooled over samples and
# time steps); constant features get a tiny std to avoid division by zero.
mu  = Xseq_tr.mean(axis=(0,1), keepdims=True)
std = Xseq_tr.std(axis=(0,1), keepdims=True); std[std == 0] = 1e-6
Xseq_tr = (Xseq_tr - mu) / std
Xseq_te = (Xseq_te - mu) / std

# Create a small validation tail from training (time-ordered, no shuffle)
val_frac = 0.1
n_tr = len(Xseq_tr)
n_val = int(n_tr * val_frac)
if n_tr > 1:
    # Keep at least one validation and one training sample on tiny datasets.
    n_val = min(max(n_val, 1), n_tr - 1)
# Slice by an explicit split index: with n_val == 0 the negative-index form
# `[-0:]` / `[:-0]` would hand the whole set to validation and empty training.
split_at = n_tr - n_val
Xseq_val, Yseq_val = Xseq_tr[split_at:], Yseq_tr[split_at:]
Xseq_tr,  Yseq_tr  = Xseq_tr[:split_at], Yseq_tr[:split_at]

# ---------------------------------------------------
# 2) Bin-weighted loss to balance long vs short RUL
# ---------------------------------------------------
# RUL targets are bucketed by the edges below; each bucket gets an
# inverse-frequency weight estimated from the TRAIN targets only.
bin_edges = np.array([0, 5, 10, 20, 40, 80, 160, 1e9], dtype=np.float32)
n_bins = len(bin_edges) - 1

# digitize is 1-based for values at/above the first edge, hence the -1;
# out-of-range indices are folded into the first/last bucket before counting.
tr_bins = np.digitize(Yseq_tr, bin_edges) - 1
counts = np.bincount(np.clip(tr_bins, 0, n_bins - 1), minlength=n_bins)

# Class-balanced weighting: total / (#bins * count). Empty buckets are
# counted as 1 so the division stays finite.
safe_counts = np.maximum(counts, 1)
class_w = (len(Yseq_tr) / (n_bins * safe_counts)).astype(np.float32)

# Gentle cap so no bucket dominates or vanishes from the loss.
class_w = np.clip(class_w, 0.3, 5.0)

def make_sample_weights(y):
    """Map each target in ``y`` to the weight of the RUL bucket it falls into.

    Buckets come from the module-level ``bin_edges`` and the per-bucket
    weights from ``class_w``. Targets outside the edge range are assigned to
    the first or last bucket.

    Returns an array of weights with the same shape as ``y``.
    """
    last_bucket = len(class_w) - 1
    bucket_idx = np.clip(np.digitize(y, bin_edges) - 1, 0, last_bucket)
    return class_w[bucket_idx]

# Per-sample loss weights for every split. The test weights are not used in
# the loss; they are kept only for optional analysis.
w_tr, w_val, w_te = (
    make_sample_weights(targets) for targets in (Yseq_tr, Yseq_val, Yseq_te)
)

# ----------------------
# 3) PyTorch data pipes
# ----------------------
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def to_loader(X, y, w, batch=256, shuffle=False):
    """Wrap features, targets and sample weights in a ``DataLoader``.

    Args:
        X: array-like of inputs, converted to a float32 tensor.
        y: array-like of targets, converted to a float32 tensor.
        w: array-like of per-sample weights, converted to a float32 tensor.
        batch: batch size.
        shuffle: whether the loader reshuffles samples every epoch.

    Returns:
        A ``DataLoader`` yielding ``(X, y, w)`` batches.
    """
    X_t = torch.tensor(X, dtype=torch.float32)
    y_t = torch.tensor(y, dtype=torch.float32)
    w_t = torch.tensor(w, dtype=torch.float32)
    ds = TensorDataset(X_t, y_t, w_t)
    # Pinned host memory only helps host-to-GPU copies; requesting it on a
    # CPU-only machine just triggers a warning, so enable it only with CUDA.
    use_pinned = torch.cuda.is_available()
    return DataLoader(ds, batch_size=batch, shuffle=shuffle, pin_memory=use_pinned)

# Only the training loader is shuffled; validation and test keep time order.
train_ld = to_loader(X=Xseq_tr, y=Yseq_tr, w=w_tr, shuffle=True)
val_ld = to_loader(X=Xseq_val, y=Yseq_val, w=w_val, shuffle=False)
test_ld = to_loader(X=Xseq_te, y=Yseq_te, w=w_te, shuffle=False)

# Size of the last axis of the (samples, time steps, features) training array.
n_features = Xseq_tr.shape[2]

# ----------------------
# 4) Model definitions
# ----------------------
class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a linear head that emits one value per sequence.

    Args:
        n_features: number of input features per time step.
        hidden: hidden-state size of every LSTM layer.
        layers: number of stacked LSTM layers.
        dropout: dropout applied between stacked layers (ignored when
            ``layers == 1``).
    """

    def __init__(self, n_features, hidden=64, layers=2, dropout=0.2):
        super().__init__()
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is passed with a single layer, so pass 0.0 in that case.
        inter_layer_dropout = dropout if layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=n_features, hidden_size=hidden,
            num_layers=layers, batch_first=True, dropout=inter_layer_dropout
        )
        self.fc = nn.Linear(hidden, 1)

    def forward(self, x):
        """Return a ``(batch,)`` tensor for ``x`` of shape ``(batch, seq, n_features)``."""
        out, _ = self.lstm(x)
        # Regress from the top layer's output at the last time step.
        return self.fc(out[:, -1, :]).squeeze(1)

class GRURegressor(nn.Module):
    """Stacked GRU followed by a linear head that emits one value per sequence.

    Args:
        n_features: number of input features per time step.
        hidden: hidden-state size of every GRU layer.
        layers: number of stacked GRU layers.
        dropout: dropout applied between stacked layers (ignored when
            ``layers == 1``).
    """

    def __init__(self, n_features, hidden=64, layers=2, dropout=0.2):
        super().__init__()
        # nn.GRU only applies dropout between stacked layers and warns when a
        # non-zero value is passed with a single layer, so pass 0.0 in that case.
        inter_layer_dropout = dropout if layers > 1 else 0.0
        self.gru = nn.GRU(
            input_size=n_features, hidden_size=hidden,
            num_layers=layers, batch_first=True, dropout=inter_layer_dropout
        )
        self.fc = nn.Linear(hidden, 1)

    def forward(self, x):
        """Return a ``(batch,)`` tensor for ``x`` of shape ``(batch, seq, n_features)``."""
        out, _ = self.gru(x)
        # Regress from the top layer's output at the last time step.
        return self.fc(out[:, -1, :]).squeeze(1)

def weighted_mse(pred, target, weight):
    """Mean over all elements of the squared error scaled by ``weight``.

    The result is divided by the element count, not by the sum of weights.
    """
    scaled_sq_err = weight * (pred - target) ** 2
    return scaled_sq_err.mean()

# ----------------------
# 5) Train/eval routine
# ----------------------
def r2_score_np(y_true, y_pred):
    """Coefficient of determination computed with NumPy.

    Returns ``1 - SS_res / SS_tot``; a tiny constant is added to ``SS_tot``
    so a constant ``y_true`` does not divide by zero.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    residual_ss = ((actual - predicted) ** 2).sum()
    total_ss = ((actual - actual.mean()) ** 2).sum() + 1e-12
    return 1.0 - residual_ss / total_ss

def within_tol(y_true, y_pred, tol):
    """Percentage (0-100) of predictions whose absolute error is at most ``tol``."""
    abs_err = np.abs(y_pred - y_true)
    hit_rate = np.mean(abs_err <= tol)
    return 100.0 * hit_rate

@torch.no_grad()
def predict_model(model, loader):
    """Run ``model`` over ``loader`` in eval mode and collect targets and predictions.

    Args:
        model: a module that is put in eval mode and called on each input batch.
        loader: iterable of ``(x, y, w)`` batches; the weights are ignored.

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays concatenated in loader order.
        Two empty float32 arrays are returned when the loader yields no batch.
    """
    model.eval()

    # Send inputs to wherever the model's own weights live instead of relying
    # on a module-level device variable, so the function also works for a
    # model that sits on a different device. A model with neither parameters
    # nor buffers has no device of its own; inputs are then left untouched.
    anchor = next(model.parameters(), None)
    if anchor is None:
        anchor = next(model.buffers(), None)

    preds, trues = [], []
    for xb, yb, _ in loader:
        if anchor is not None:
            xb = xb.to(anchor.device)
        preds.append(model(xb).cpu().numpy())
        trues.append(yb.numpy())

    if not preds:
        # np.concatenate rejects an empty list; return empty results instead.
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32)
    return np.concatenate(trues), np.concatenate(preds)

def train_one(model, epochs=50, lr=1e-3, patience=6, clip=1.0):
    """Train ``model`` with Adam and early stopping on the validation loss.

    Batches come from the module-level ``train_ld`` / ``val_ld`` loaders, the
    model is moved to the module-level ``device``, and the loss is
    ``weighted_mse``.

    Args:
        model: network to train; moved to ``device`` before training.
        epochs: maximum number of epochs.
        lr: Adam learning rate.
        patience: number of consecutive non-improving epochs tolerated before
            training stops.
        clip: max gradient norm; ``None`` disables gradient clipping.

    Returns:
        ``(model, train_hist, val_hist)`` where the model carries the weights
        of the best validation epoch (when any epoch improved) and the two
        lists hold the per-epoch mean batch losses.
    """
    model = model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    best_state, best_val = None, float("inf")
    train_hist, val_hist = [], []
    # Must exist before the loop: if the first validation loss is NaN or inf,
    # the "no improvement" branch runs before any improvement was recorded.
    wait = 0

    for ep in range(1, epochs+1):
        # ---- train
        model.train()
        running = 0.0
        for xb, yb, wb in train_ld:
            xb, yb, wb = xb.to(device), yb.to(device), wb.to(device)
            opt.zero_grad(set_to_none=True)
            pred = model(xb)
            loss = weighted_mse(pred, yb, wb)
            loss.backward()
            if clip is not None:
                nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()
            running += loss.item()
        # Mean of per-batch losses; max(1, ...) guards an empty loader.
        train_loss = running / max(1, len(train_ld))
        train_hist.append(train_loss)

        # ---- validate
        model.eval()
        v_running = 0.0
        with torch.no_grad():
            for xb, yb, wb in val_ld:
                xb, yb, wb = xb.to(device), yb.to(device), wb.to(device)
                pred = model(xb)
                v_running += weighted_mse(pred, yb, wb).item()
        val_loss = v_running / max(1, len(val_ld))
        val_hist.append(val_loss)

        print(f"Epoch {ep:02d} | train {train_loss:.4f}  val {val_loss:.4f}")

        # early stopping: an epoch counts as an improvement only when it beats
        # the best validation loss by more than 1e-6.
        if val_loss + 1e-6 < best_val:
            best_val = val_loss
            # Snapshot on CPU so the copy is independent of later updates.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {ep} (best val {best_val:.4f})")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, train_hist, val_hist

def evaluate(name, model):
    """Score ``model`` on the module-level ``test_ld`` loader.

    Args:
        name: label stored under the ``"Model"`` key of the result.
        model: trained network passed to ``predict_model``.

    Returns:
        ``(metrics, y_true, y_pred)`` where ``metrics`` is a dict with MAE,
        RMSE, R², the share of predictions within 5 and 10 of the target,
        the mean signed error (prediction minus target) and the sample count.
    """
    yt, yp = predict_model(model, test_ld)
    # RMSE is taken as sqrt(MSE): the `squared=False` flag of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
    # whereas this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(yt, yp)))
    out = {
        "Model": name,
        "MAE (min)": mean_absolute_error(yt, yp),
        "RMSE (min)": rmse,
        "R²": r2_score_np(yt, yp),
        "≤5 min %": within_tol(yt, yp, 5),
        "≤10 min %": within_tol(yt, yp, 10),
        "Bias (min)": float(np.mean(yp - yt)),
        "N": len(yt)
    }
    return out, yt, yp

# ----------------------
# 6) Run small grid
# ----------------------
# Every architecture is tried with three hidden sizes and two stacked layers.
configs = [
    (arch, width, 2)
    for arch in ("LSTM", "GRU")
    for width in (32, 64, 128)
]

results = []
pred_cache = {}   # name -> (yt, yp)
for kind, hidden, layers in configs:
    print(f"\n=== Training {kind} hidden={hidden} layers={layers} ===")

    # Anything that is not "lstm" (case-insensitive) is built as a GRU.
    net_cls = LSTMRegressor if kind.lower() == "lstm" else GRURegressor
    net = net_cls(n_features, hidden=hidden, layers=layers, dropout=0.2)

    net, tr_hist, va_hist = train_one(net, epochs=50, lr=1e-3, patience=6, clip=1.0)

    # Score on the test loader and keep the raw predictions for plotting.
    name = f"{kind}-{hidden}x{layers}"
    res, yt, yp = evaluate(name, net)
    pred_cache[name] = (yt, yp)
    results.append(res)

# ----------------------
# 7) Show results
# ----------------------
# One row per trained model, best (lowest) RMSE first.
metrics_df = pd.DataFrame(results)
metrics_df = metrics_df.sort_values("RMSE (min)").reset_index(drop=True)
metric_formats = {
    "MAE (min)": "{:.2f}",
    "RMSE (min)": "{:.2f}",
    "R²": "{:.3f}",
    "≤5 min %": "{:.1f}",
    "≤10 min %": "{:.1f}",
    "Bias (min)": "{:.2f}",
}
display(metrics_df.style.format(metric_formats))

# The top row after sorting is the model with the lowest test RMSE.
best_name = metrics_df.iloc[0]["Model"]
yt_best, yp_best = pred_cache[best_name]

# Quick visual: best deep model Pred vs Actual + error histogram
fig, ax = plt.subplots(1, 2, figsize=(13,5))
ax_scatter, ax_hist = ax

# Left panel: predictions against targets with the identity line drawn over
# the 1st-99th percentile range of the targets (plus a 5% margin).
mn, mx = (float(q) for q in np.percentile(yt_best, [1, 99]))
pad = 0.05*(mx - mn + 1e-6)
diag = [mn-pad, mx+pad]
ax_scatter.scatter(yt_best, yp_best, s=10, alpha=0.5)
ax_scatter.plot(diag, diag, "k--", lw=1)
ax_scatter.set_title(f"{best_name}: Pred vs Actual")
ax_scatter.set_xlabel("Actual RUL (min)")
ax_scatter.set_ylabel("Predicted RUL (min)")
ax_scatter.grid(True, alpha=0.3)

# Right panel: distribution of signed errors; the dashed red line marks the
# mean error, the solid black line marks zero.
err = yp_best - yt_best
mean_err = err.mean()
ax_hist.hist(err, bins=60, alpha=0.8)
ax_hist.axvline(0, color="k", lw=1)
ax_hist.axvline(mean_err, color="tab:red", ls="--", lw=1, label=f"bias={mean_err:.2f}")
ax_hist.set_title(f"{best_name}: Error (Pred-Actual)")
ax_hist.set_xlabel("Error (min)")
ax_hist.set_ylabel("Count")
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
