## Model Comparison: RandomForest vs XGBoost vs LSTM

In [None]:
# ========= RF vs XGB vs LSTM (sequence) — unified comparison =========
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from io import BytesIO
import boto3, joblib, types

# ------------------------------
# 0) Load RF/XGB from S3 (joblib)
# ------------------------------
# S3 location of the serialized models: one bucket, one object key per model.
bucket = "hqpsusu-ml-data-bucket"
rf_key  = "final_project/models/rf_model.pkl"   # Random Forest artifact
xgb_key = "final_project/models/xgb_model.pkl"  # XGBoost artifact

def load_joblib_from_s3(bucket, key):
    """Download one S3 object into memory and deserialize it with joblib.

    Args:
        bucket: Name of the S3 bucket that holds the object.
        key: Object key inside ``bucket``.

    Returns:
        The object that ``joblib.load`` reconstructs from the downloaded bytes.
    """
    # NOTE(review): joblib.load unpickles its input, and unpickling can run
    # arbitrary code — only point this at buckets/keys you trust.
    s3 = boto3.client("s3")
    payload = BytesIO()
    s3.download_fileobj(bucket, key, payload)
    payload.seek(0)  # rewind so joblib starts reading at the first byte
    return joblib.load(payload)

rf_loaded = load_joblib_from_s3(bucket, rf_key)
xgb_loaded = load_joblib_from_s3(bucket, xgb_key)

# --------------------------------------
# 1) Produce predictions for each model
# --------------------------------------
# A) Tabular models: both are scored on the same X_test / y_test, which must
#    already be defined in the notebook before this cell runs.
y_true_tab = np.asarray(y_test)
y_pred_rf = rf_loaded.predict(X_test)
y_pred_xgb = xgb_loaded.predict(X_test)

# Display name -> (y_true, y_pred); the metrics and plots below iterate over it.
models = {
    "RandomForest (S3)": (y_true_tab, y_pred_rf),
    "XGBoost (S3)": (y_true_tab, y_pred_xgb),
}

# B) LSTM (sequence) — optional, only if the vars exist
# Best-effort by design: any failure is reported and the LSTM is left out, so
# the RF/XGB comparison below still runs.
try:
    import torch
    have = all(v in globals() for v in ["model", "X_test_t"])
    if have:
        # Ground truth may be `y_test` (numpy / list / Series) or the tensor `y_test_t`.
        if "y_test" in globals() and isinstance(globals()["y_test"], (np.ndarray, list, pd.Series)):
            y_true_lstm = np.asarray(globals()["y_test"])
        elif "y_test_t" in globals():
            y_true_lstm = globals()["y_test_t"].detach().cpu().numpy()
        else:
            y_true_lstm = None

        # no_grad() only turns off autograd; dropout keeps firing unless the
        # module is in eval mode. Switch for inference, then restore the mode
        # the model was in so later training code is unaffected.
        is_module = isinstance(model, torch.nn.Module)
        was_training = is_module and model.training
        if is_module:
            model.eval()
        try:
            with torch.no_grad():
                y_pred_lstm = model(X_test_t).detach().cpu().numpy()
        finally:
            if was_training:
                model.train()

        # Flatten (N, 1) -> (N,): otherwise `yp - yt` against an (N,) target
        # broadcasts to (N, N) in the metrics and plots that follow.
        y_pred_lstm = y_pred_lstm.reshape(-1)
        if y_true_lstm is not None:
            y_true_lstm = y_true_lstm.reshape(-1)

        if y_true_lstm is not None and len(y_true_lstm) == len(y_pred_lstm):
            models["LSTM (seq)"] = (y_true_lstm, y_pred_lstm)
        else:
            print("ℹ️ LSTM found but y_test shapes not aligned — skipping LSTM in comparison.")
    else:
        print("ℹ️ LSTM variables not found — skipping LSTM in comparison.")
except Exception as e:
    print(f"ℹ️ Skipping LSTM due to: {e}")

# --------------------------------------
# 2) Metrics
# --------------------------------------
def metrics(y_true, y_pred):
    """Compute the summary regression metrics for one model.

    Both inputs are flattened to 1-D, so an (N, 1) prediction array is scored
    against an (N,) target element by element. Pairs where either value is
    NaN or infinite are dropped before scoring.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like) with the same number of elements.

    Returns:
        dict with keys "MAE", "RMSE", "R²", "Bias" (mean of pred - actual),
        "sMAPE %" (0–200 scale), "≤5 min %", "≤10 min %" (share of absolute
        errors within 5 / 10) and "N" (number of finite pairs scored).
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    mask   = np.isfinite(y_true) & np.isfinite(y_pred)
    yt, yp = y_true[mask], y_pred[mask]
    err, ae = (yp - yt), np.abs(yp - yt)
    eps = 1e-9  # keeps sMAPE finite when actual and predicted are both 0
    return {
        "MAE": mean_absolute_error(yt, yp),
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; taking the root of the MSE works on every version.
        "RMSE": float(np.sqrt(mean_squared_error(yt, yp))),
        "R²": r2_score(yt, yp),
        "Bias": float(err.mean()),
        "sMAPE %": 100.0 * np.mean(2*ae / (np.abs(yt)+np.abs(yp)+eps)),
        "≤5 min %": 100.0 * np.mean(ae <= 5),
        "≤10 min %": 100.0 * np.mean(ae <= 10),
        "N": int(yt.size),
    }

# One metrics row per model, indexed by model name and ranked by MAE (best first).
summary_rows = []
for name, (yt, yp) in models.items():
    summary_rows.append({**metrics(yt, yp), "Model": name})

results = pd.DataFrame(summary_rows)
results = results.set_index("Model").round(3).sort_values("MAE")
print(results)

# --------------------------------------
# 3) Helpers for plots
# --------------------------------------
def binned_xy(y_true, y_pred, nbins=40):
    """Average actual and predicted values inside quantile bins of the actuals.

    Used for the calibration curve: each returned point is
    (mean actual, mean predicted) for one bin.

    Args:
        y_true: Actual values (array-like; flattened to 1-D).
        y_pred: Predicted values (array-like; flattened to 1-D, so an (N, 1)
            model output is accepted).
        nbins: Upper limit on the number of quantile bins. The number used is
            ``min(nbins, max(2, int(sqrt(n_rows))))``; bins with duplicate
            edges are merged by ``pd.qcut``.

    Returns:
        Tuple ``(mean_actual, mean_predicted)`` of 1-D arrays, one entry per bin.
    """
    df = pd.DataFrame({
        "y": np.asarray(y_true).ravel(),
        "yp": np.asarray(y_pred).ravel(),
    }).dropna()
    q = min(nbins, max(2, int(df.shape[0] ** 0.5)))
    df["bin"] = pd.qcut(df["y"], q=q, duplicates="drop")
    # observed=True: only bins that actually hold rows; also avoids the pandas
    # FutureWarning about the changing default for categorical groupers.
    g = df.groupby("bin", observed=True).agg(y_mean=("y", "mean"), yp_mean=("yp", "mean"))
    return g["y_mean"].values, g["yp_mean"].values

plt.rcParams["figure.dpi"] = 140

# One 2x3 dashboard; each panel below claims one cell of `grid`.
fig = plt.figure(figsize=(16, 9))
grid = fig.add_gridspec(2, 3, hspace=0.35, wspace=0.3)

# Fixed palette, paired with the models in insertion order.
colors = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple"]
names = list(models.keys())

# 3A) Calibration (binned)
ax1 = fig.add_subplot(grid[0, 0])
# The reference diagonal spans the full range of actual values over all models.
mn = min(np.min(truth) for truth, _ in models.values())
mx = max(np.max(truth) for truth, _ in models.values())
ax1.plot([mn, mx], [mn, mx], "k--", lw=1, label="Ideal")
for name, c in zip(names, colors):
    yt, yp = models[name]
    xb, yb = binned_xy(yt, yp)
    ax1.plot(xb, yb, marker="o", ms=3, lw=1.8, label=name, color=c)
ax1.set_title("Calibration (binned): Predicted vs Actual")
ax1.set_xlabel("Actual RUL (min)")
ax1.set_ylabel("Predicted RUL (min)")
ax1.legend()
ax1.grid(alpha=0.25)

# 3B) Accuracy vs Tolerance (CDF style)
ax2 = fig.add_subplot(grid[0, 1])
tol = np.arange(0, 61, 1)
for name, c in zip(names, colors):
    yt, yp = models[name]
    ae = np.abs(yp - yt)
    # Share of samples whose absolute error is within each tolerance value.
    acc = [(ae <= t).mean() * 100.0 for t in tol]
    ax2.plot(tol, acc, lw=2, label=name, color=c)
ax2.set_title("Accuracy vs Tolerance (higher is better)")
ax2.set_xlabel("Tolerance (± minutes)")
ax2.set_ylabel("Within-tolerance (%)")
ax2.set_ylim(0, 100)
ax2.legend()
ax2.grid(alpha=0.25)

# 3C) Absolute error distributions (boxplot)
ax3 = fig.add_subplot(grid[0,2])
# Keep only finite errors so NaN/inf rows cannot leak into the box statistics
# (metrics() applies the same filter).
box_data = []
for n in names:
    abs_err = np.abs(np.asarray(models[n][1]) - np.asarray(models[n][0]))
    box_data.append(abs_err[np.isfinite(abs_err)])
ax3.boxplot(box_data, showmeans=True)
# Label the boxes after drawing: boxplot's `labels=` keyword is deprecated
# since Matplotlib 3.9, while set_xticklabels works on old and new versions.
ax3.set_xticklabels(names)
ax3.set_title("Absolute Error Distribution"); ax3.set_ylabel("|Error| (min)")
ax3.grid(axis="y", alpha=0.25)

# 3D) Residuals vs Actual
ax4 = fig.add_subplot(grid[1,0])
for (name, (yt, yp)), c in zip(models.items(), colors):
    # Residual = prediction minus actual, so points above 0 are over-predictions.
    ax4.scatter(yt, (yp - yt), s=6, alpha=0.22, label=name, color=c)
ax4.axhline(0, color="k", ls="--", lw=1)
ax4.set_title("Residuals vs Actual"); ax4.set_xlabel("Actual RUL (min)"); ax4.set_ylabel("Pred - Actual (min)")
ax4.legend(); ax4.grid(alpha=0.25)

# 3E) MAE by Actual-RUL bins
ax5 = fig.add_subplot(grid[1,1])
# Open-ended last edge so every value >= 160 lands in the ">160" bar.
bins = [0, 5, 10, 20, 40, 80, 160, np.inf]
labels = ["0–5","5–10","10–20","20–40","40–80","80–160",">160"]
x = np.arange(len(labels)); w = 0.8 / max(1, len(names))
for i, ((name, (yt, yp)), c) in enumerate(zip(models.items(), colors)):
    # Left-closed bins [lo, hi). The former right=True variant mapped an actual
    # value of exactly 0 to bin -1, which reindex() then silently dropped.
    idx = np.digitize(yt, bins) - 1
    dfb = pd.DataFrame({"bin": idx, "ae": np.abs(yp - yt)})
    # reindex keeps all label positions; bins without samples become NaN (no bar).
    mae_bins = dfb.groupby("bin")["ae"].mean().reindex(range(len(labels))).values
    # Offset each model's bars so the group is centred on the tick.
    ax5.bar(x + (i - (len(names)-1)/2)*w, mae_bins, width=w, label=name, color=c)
ax5.set_xticks(x, labels); ax5.set_ylabel("MAE (min)")
ax5.set_title("MAE by Actual-RUL Bin (lower is better)")
ax5.legend(); ax5.grid(axis="y", alpha=0.25)

# 3F) Metrics table
ax6 = fig.add_subplot(grid[1,2]); ax6.axis("off")
# reset_index() turns the "Model" index back into the first table column.
tbl = ax6.table(cellText=results.reset_index().values,
                colLabels=["Model"] + list(results.columns),
                cellLoc="center", loc="center")
tbl.auto_set_font_size(False); tbl.set_fontsize(9)
ax6.set_title("Summary Metrics", pad=8)

plt.suptitle("Model Comparison: RF vs XGBoost vs LSTM", fontsize=16, y=0.98)
plt.show()


## Final Model Comparison (Presentation-Ready)

In [None]:
# ============================
# Presentation-ready comparison
# ============================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ---- Helper: register any (name, y_true, y_pred) that truly align ----
pairs = []


def add_pair(name, y_true, y_pred):
    """Register one model's (actual, predicted) arrays in the global ``pairs``.

    Both inputs are flattened to 1-D. If their lengths differ, the model is
    skipped with a printed warning. Rows where either value is non-finite are
    dropped (with a printed note) before the pair is stored.

    Args:
        name: Display name of the model.
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like).

    Returns:
        None. On success a dict with keys "name", "y_true", "y_pred" is
        appended to ``pairs``.
    """
    truth = np.asarray(y_true).ravel()
    preds = np.asarray(y_pred).ravel()
    n_true, n_pred = len(truth), len(preds)
    if n_true != n_pred:
        print(f"⚠️ Skipping {name}: length mismatch y_true={n_true} vs y_pred={n_pred}")
        return
    finite = np.isfinite(truth) & np.isfinite(preds)
    if not finite.all():
        print(f"ℹ️ {name}: dropping {np.size(finite)-finite.sum()} non-finite rows")
        truth, preds = truth[finite], preds[finite]
    pairs.append({"name": name, "y_true": truth, "y_pred": preds})

# ---- Try to collect models that may exist in your notebook ----
# Classical models (RF/XGB) are scored against the shared tabular `y_test`.
# globals() is used throughout: at cell level it is the same mapping as
# locals(), and it stays correct inside the comprehensions below.
if "y_pred_rf" in globals() and "y_test" in globals():
    add_pair("Random Forest", y_test, y_pred_rf)
if "y_pred_xgb" in globals() and "y_test" in globals():
    add_pair("XGBoost", y_test, y_pred_xgb)

# LSTM: the other cells of this notebook store the LSTM targets as
# `y_true_lstm` or `Yseq_te`, so accept those next to `y_test_lstm`.
# Prefer the candidate whose size matches the predictions; stale variables
# from an earlier run are then not picked by accident.
_lstm_truth_names = [n for n in ("y_test_lstm", "y_true_lstm", "Yseq_te") if n in globals()]
if "y_pred_lstm" in globals() and _lstm_truth_names:
    _n_pred = np.asarray(y_pred_lstm).size
    _truth_name = next(
        (n for n in _lstm_truth_names if np.asarray(globals()[n]).size == _n_pred),
        _lstm_truth_names[0],  # nothing matches: let add_pair report the mismatch
    )
    add_pair("LSTM", globals()[_truth_name], y_pred_lstm)
elif "y_pred" in globals() and "y_test" in globals():
    # Fallback for notebooks where the LSTM reused the generic names.
    add_pair("LSTM", y_test, y_pred)

if not pairs:
    raise RuntimeError("No valid (y_true, y_pred) pairs found. Make sure prediction variables are in memory.")

# ---- Metrics table ----
def smape(y_true, y_pred, eps=1e-6):
    """Symmetric mean absolute percentage error on the 0–200 % scale.

    Computes ``100 * mean(2 * |pred - actual| / (|actual| + |pred| + eps))``,
    the same definition the "sMAPE %" column of the first comparison cell
    uses, so the numbers are comparable across sections.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like, same shape).
        eps: Small constant added to the denominator so a pair where both
            values are 0 contributes 0 instead of dividing by zero.

    Returns:
        sMAPE in percent.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred) + eps
    return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) / denom)

# One row of summary metrics per registered pair.
rows = []
for p in pairs:
    yt, yp = p["y_true"], p["y_pred"]
    mae  = mean_absolute_error(yt, yp)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(yt, yp)))
    bias = float(np.mean(yp - yt))  # > 0 means the model over-predicts on average
    s_mape = smape(yt, yp)
    rows.append({"Model": p["name"], "MAE (min)": mae, "RMSE (min)": rmse, "Bias (min)": bias, "sMAPE (%)": s_mape})

# Rank by RMSE, best model first.
metrics_df = pd.DataFrame(rows).sort_values("RMSE (min)").reset_index(drop=True)
print("✅ Summary metrics (lower is better):")
display(metrics_df.style.format({"MAE (min)": "{:.2f}", "RMSE (min)": "{:.2f}", "Bias (min)": "{:.2f}", "sMAPE (%)": "{:.1f}"}))

# ---- Global ranges for comparable axes ----
# Trim to the 1st–99th percentile of the actual values, then widen by 5 %.
all_true = np.concatenate([entry["y_true"] for entry in pairs])
low, high = np.percentile(all_true, [1, 99])
pad = 0.05 * (high - low + 1e-6)
xy_min = low - pad
xy_max = high + pad

# Same treatment for the signed errors (pred - actual); used by the error histogram.
all_err = np.concatenate([entry["y_pred"] - entry["y_true"] for entry in pairs])
e_low, e_high = np.percentile(all_err, [1, 99])
e_pad = 0.05 * (e_high - e_low + 1e-6)
err_min = e_low - e_pad
err_max = e_high + e_pad

# ---- Figure 1: Per-model Predicted vs Actual ----
cols = len(pairs)
fig, axes = plt.subplots(1, cols, figsize=(6*cols, 5))
axes = np.atleast_1d(axes)  # a single model yields a bare Axes, not an array

for ax, p in zip(axes, pairs):
    name = p["name"]
    yt = p["y_true"]
    yp = p["y_pred"]
    # Plot at most 300 evenly spaced samples to keep the scatter readable.
    n = len(yt)
    idx = np.linspace(0, n-1, min(n, 300)).astype(int)
    ax.scatter(yt[idx], yp[idx], alpha=0.45, s=20, edgecolor="k", linewidths=0.2)
    ax.plot([xy_min, xy_max], [xy_min, xy_max], "r--", lw=1.5, label="Ideal = Predicted")
    # Identical limits on every panel so the models can be compared by eye.
    ax.set_xlim(xy_min, xy_max)
    ax.set_ylim(xy_min, xy_max)
    ax.set_xlabel("Actual RUL (minutes)")
    ax.set_ylabel("Predicted RUL (minutes)")
    ax.set_title(f"{name}: Predicted vs Actual")
    ax.grid(True, alpha=0.3)
    ax.legend(loc="upper left")

plt.tight_layout()
plt.show()

# ---- Figure 2: Overlaid Error Histogram (all models) ----
plt.figure(figsize=(7,5))
bins = 60
# Shared bin edges: with per-model edges the bar heights of the overlaid
# histograms are not comparable.
bin_edges = np.linspace(err_min, err_max, bins + 1)
for i, p in enumerate(pairs):
    err = p["y_pred"] - p["y_true"]
    # Errors outside [err_min, err_max] are clipped into the outermost bins.
    plt.hist(np.clip(err, err_min, err_max), bins=bin_edges, alpha=0.45,
             color=f"C{i % 10}", label=p["name"])
# vertical line at 0 and per-model mean bias
plt.axvline(0, color="k", lw=1)
for i, p in enumerate(pairs):
    mu = float(np.mean(p["y_pred"] - p["y_true"]))
    # Same colour as the model's histogram; axvline does not follow the colour
    # cycle, so without this every bias line is drawn in the default colour.
    plt.axvline(mu, color=f"C{i % 10}", linestyle="--", lw=1, label=f"{p['name']} mean={mu:.1f}")
plt.xlim(err_min, err_max)
plt.title("Prediction Error Distribution (Pred - Actual)")
plt.xlabel("Error (minutes)")
plt.ylabel("Count")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

# ---- Figure 3: Time-slice (first N samples) ----
N = 200
n_rows = len(pairs)
fig, axes = plt.subplots(n_rows, 1, figsize=(12, 3.2*n_rows), sharex=True)
axes = np.atleast_1d(axes)  # a single model yields a bare Axes, not an array
for ax, p in zip(axes, pairs):
    # Slicing past the end is safe: shorter series are simply plotted in full.
    yt = p["y_true"][:N]
    yp = p["y_pred"][:N]
    ax.plot(yt, marker="o", ms=3, lw=1, label="Actual RUL")
    ax.plot(yp, marker="x", ms=3, lw=1, label="Predicted RUL")
    ax.set_ylabel("RUL (min)")
    ax.set_title(f"{p['name']}: First {N} samples")
    ax.grid(True, alpha=0.3)
# The x-axis is shared, so only the bottom panel is labelled.
axes[-1].set_xlabel("Sample Index")
axes[0].legend(loc="upper right")
plt.tight_layout()
plt.show()

# ---- Optional: LSTM training loss curve if you tracked epoch_losses ----
if "epoch_losses" not in locals() or len(epoch_losses) == 0:
    print("ℹ️ No LSTM epoch loss history found. To record it, append the mean batch loss each epoch to `epoch_losses` during training.")
else:
    epochs = range(1, len(epoch_losses)+1)
    plt.figure(figsize=(7,4))
    plt.plot(epochs, epoch_losses, marker="o")
    plt.title("LSTM Training Loss (per epoch)")
    plt.xlabel("Epoch")
    plt.ylabel("MSE Loss")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


## Full Model Comparison: RF vs XGB vs LSTM (Same Data)

In [None]:
# ============================================================
# Full Model Comparison: RF vs XGB vs LSTM
# ============================================================
# This notebook section:
# 1) Uses the same features and temporal split for ALL models.
# 2) Trains Random Forest, XGBoost, and LSTM.
# 3) Evaluates with multiple metrics (MAE, RMSE, R², Bias, sMAPE, tolerance%).
# 4) Produces visual comparisons: scatter, histograms, tolerance curves, residuals, bins, time slices.
# 5) Includes LSTM training loss for transparency.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

plt.rcParams["figure.dpi"] = 130

# -------------------------------------------------------
# 0) Spark → Pandas & feature matrix used by ALL models
# -------------------------------------------------------
# Keep only rows with a defined RUL (no leakage from "after last failure")
train_df = df_final.filter(df_final["RUL_minutes"].isNotNull())
pdf = train_df.toPandas()

if "timestamp_bin" in pdf.columns:
    # Spark does not guarantee row order after toPandas(), but the split below
    # (shuffle=False) and the LSTM windows rely on chronological order.
    # NOTE(review): assumes `timestamp_bin` is the time axis of one continuous
    # series — confirm if several machines are interleaved in df_final.
    pdf = pdf.sort_values("timestamp_bin", kind="stable").reset_index(drop=True)

    # Cyclic time-of-day (optional but helpful and consistent across models)
    tod_min = pdf["timestamp_bin"].dt.hour * 60 + pdf["timestamp_bin"].dt.minute
    pdf["tod_sin"] = np.sin(2*np.pi * tod_min / 1440.0)
    pdf["tod_cos"] = np.cos(2*np.pi * tod_min / 1440.0)

# Drop non-features / leakage columns
drop_cols = [
    "timestamp_bin", "failure", "next_failure_time",
    "last_failure_time", "minutes_since_last_failure", "RUL_minutes"
]
y = pdf["RUL_minutes"].astype(float).to_numpy()

# numeric features only, exclude leakage
feat_cols = [c for c in pdf.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(pdf[c])]
# +/-inf becomes NaN so it is imputed together with the missing values below.
X = pdf[feat_cols].replace([np.inf, -np.inf], np.nan).astype(float)

# time-ordered split (same for all models): the last 20 % of rows form the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=False)

# Impute with TRAIN medians (no leakage)
train_meds = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_meds)
X_test  = X_test.fillna(train_meds)

print(f"Features: {len(feat_cols)} | Train {X_train.shape} | Test {X_test.shape}")

# -------------------------------------------------------
# 1) Classical models (RF, XGB)
# -------------------------------------------------------
# Random Forest: fit on the training split, predict the held-out rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
rf = RandomForestRegressor(
    n_estimators=300,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# XGBoost: same training and test rows as the Random Forest.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# -------------------------------------------------------
# 2) LSTM using the *same* features & same split
# -------------------------------------------------------
# Build full matrix in the same order so we can slice a test set that aligns in time
# Build full matrix in the same order so we can slice a test set that aligns in time
X_full = pd.concat([X_train, X_test], axis=0)[feat_cols].to_numpy(dtype=np.float32)
y_full = np.concatenate([y_train, y_test])
cut = len(y_train)   # index where test begins

SEQ_LEN = 30  # 30 two-minute bins ~ 1 hour, adjust if you like

if len(X_full) < SEQ_LEN:
    raise ValueError(f"Need at least SEQ_LEN={SEQ_LEN} rows to build sequences, got {len(X_full)}")

# Sliding-window sequences (target at the window's end): the window for target
# row t covers rows t-SEQ_LEN+1 .. t, so the LSTM sees the same current-row
# features RF/XGB use. Previously the label was taken one row AFTER the window.
tgt_idx = np.arange(SEQ_LEN - 1, len(X_full))
# sliding_window_view returns (n_windows, n_features, SEQ_LEN); reorder to the
# (batch, time, feature) layout expected by a batch_first LSTM and materialize it.
Xs = np.lib.stride_tricks.sliding_window_view(X_full, SEQ_LEN, axis=0)
Xs = np.ascontiguousarray(Xs.transpose(0, 2, 1))
ys = np.asarray(y_full[tgt_idx], np.float32)

# Train/test split for sequences aligned by the same temporal cut
mask_test = tgt_idx >= cut
Xseq_tr, Yseq_tr = Xs[~mask_test], ys[~mask_test]
Xseq_te, Yseq_te = Xs[mask_test],  ys[mask_test]

# Normalize with TRAIN sequences only (per-feature mean/std over windows and time steps)
mu  = Xseq_tr.mean(axis=(0,1), keepdims=True)
std = Xseq_tr.std(axis=(0,1), keepdims=True); std[std == 0] = 1e-6  # constant features: avoid /0
Xseq_tr = (Xseq_tr - mu) / std
Xseq_te = (Xseq_te - mu) / std

# LSTM model
class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a linear head that outputs one value per sequence.

    Args:
        n_features: Number of input features per time step.
        hidden: Hidden-state size of every LSTM layer.
        layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers. Ignored
            (set to 0) when ``layers == 1``, because ``nn.LSTM`` only applies
            dropout between layers and warns about a non-zero value otherwise.
    """

    def __init__(self, n_features, hidden=64, layers=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            n_features,
            hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden, 1)

    def forward(self, x):
        """Map a (batch, time, n_features) tensor to a (batch,) prediction tensor."""
        out, _ = self.lstm(x)
        # Regress from the last time step's output only.
        return self.fc(out[:, -1, :]).squeeze(-1)

# Seed torch so weight init and batch shuffling are repeatable, matching the
# random_state=42 used for the Random Forest and XGBoost models.
torch.manual_seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTMRegressor(n_features=Xseq_tr.shape[2]).to(device)
opt   = torch.optim.Adam(model.parameters(), lr=1e-3)
lossf = nn.MSELoss()

train_ds = TensorDataset(torch.tensor(Xseq_tr), torch.tensor(Yseq_tr))
train_ld = DataLoader(train_ds, batch_size=256, shuffle=True)

epoch_losses = []  # mean batch loss per epoch; plotted at the end of this section
model.train()
for ep in range(6):
    total = 0.0
    for xb, yb in train_ld:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        pred = model(xb)
        loss = lossf(pred, yb)
        loss.backward(); opt.step()
        total += loss.item()
    epoch_losses.append(total/len(train_ld))
    print(f"Epoch {ep+1}: loss={epoch_losses[-1]:.4f}")

# eval() turns dropout off; no_grad() skips autograd bookkeeping for inference.
model.eval()
with torch.no_grad():
    y_pred_lstm = model(torch.tensor(Xseq_te, device=device)).cpu().numpy()

# -------------------------------------------------------
# 3) Metrics & comparison table
# -------------------------------------------------------
def r2_score_(y_true, y_pred):
    """Return the coefficient of determination, 1 - SS_res / SS_tot.

    A constant of 1e-12 is added to SS_tot so a constant ``y_true`` does not
    divide by zero.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like, same shape).

    Returns:
        The R² score.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    residual_ss = np.square(actual - predicted).sum()
    total_ss = np.square(actual - actual.mean()).sum() + 1e-12
    return 1.0 - residual_ss / total_ss

def smape(y_true, y_pred, eps=1e-6):
    """Symmetric mean absolute percentage error on the 0–200 % scale.

    Computes ``100 * mean(2 * |pred - actual| / (|actual| + |pred| + eps))``,
    the same definition the "sMAPE %" column of the first comparison cell
    uses, so the numbers are comparable across sections.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like, same shape).
        eps: Small constant added to the denominator so a pair where both
            values are 0 contributes 0 instead of dividing by zero.

    Returns:
        sMAPE in percent.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred) + eps
    return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) / denom)

def within_tol(y_true, y_pred, tol):
    """Return the percentage of predictions whose absolute error is <= ``tol``.

    Args:
        y_true: Actual values (array).
        y_pred: Predicted values (array, same shape).
        tol: Inclusive tolerance on the absolute error.

    Returns:
        Share of samples within tolerance, in percent (0–100).
    """
    abs_err = np.abs(y_pred - y_true)
    hit_rate = np.mean(abs_err <= tol)
    return 100.0 * hit_rate

def row(name, yt, yp):
    """Build one row of the comparison table for a single model.

    Args:
        name: Display name of the model.
        yt: Actual values (array).
        yp: Predicted values (array, same length).

    Returns:
        dict with the model name, MAE, RMSE, R², bias (mean of pred - actual),
        sMAPE, the share of predictions within 5 and 10 of the actual value,
        and the sample count "N".
    """
    return {
        "Model": name,
        "MAE (min)": mean_absolute_error(yt, yp),
        # `squared=False` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; taking the root of the MSE works on every version.
        "RMSE (min)": float(np.sqrt(mean_squared_error(yt, yp))),
        "R²": r2_score_(yt, yp),
        "Bias (min)": float(np.mean(yp - yt)),
        "sMAPE (%)": smape(yt, yp),
        "≤5 min %": within_tol(yt, yp, 5),
        "≤10 min %": within_tol(yt, yp, 10),
        "N": len(yt)
    }

# Note: LSTM test horizon equals the classical test horizon (aligned by `cut`)
# One metrics row per model; the LSTM is scored on its own sequence targets.
results = [
    row(label, truth, pred)
    for label, truth, pred in (
        ("RandomForest", y_test, y_pred_rf),
        ("XGBoost", y_test, y_pred_xgb),
        ("LSTM (seq)", Yseq_te, y_pred_lstm),
    )
]

# Rank by RMSE (best first) and render with fixed decimals per column.
metrics_df = pd.DataFrame(results)
metrics_df = metrics_df.sort_values("RMSE (min)").reset_index(drop=True)
display(
    metrics_df.style.format(
        {
            "MAE (min)": "{:.2f}",
            "RMSE (min)": "{:.2f}",
            "R²": "{:.3f}",
            "Bias (min)": "{:.2f}",
            "sMAPE (%)": "{:.1f}",
            "≤5 min %": "{:.1f}",
            "≤10 min %": "{:.1f}",
        }
    )
)

# -------------------------------------------------------
# 4) Visual comparisons
# -------------------------------------------------------
# (name, y_true, y_pred) triples consumed by every figure below.
pairs = list(zip(
    ("RandomForest", "XGBoost", "LSTM (seq)"),
    (y_test, y_test, Yseq_te),
    (y_pred_rf, y_pred_xgb, y_pred_lstm),
))

# Common axis limits for Pred vs Actual: 1st–99th percentile of the actual
# values, widened by 5 % on each side.
all_true = np.concatenate([truth for _, truth, _ in pairs])
a1, a2 = np.percentile(all_true, [1, 99])
pad = 0.05*(a2-a1+1e-6)
xy_min = a1 - pad
xy_max = a2 + pad

# --- Figure A: Predicted vs Actual (per model)
fig, axes = plt.subplots(1, 3, figsize=(17, 5))
for ax, (name, yt, yp) in zip(axes, pairs):
    # Plot at most 500 evenly spaced samples per model.
    n = len(yt)
    idx = np.linspace(0, n-1, min(n, 500)).astype(int)
    ax.scatter(yt[idx], yp[idx], s=14, alpha=0.5, edgecolors="none")
    ax.plot([xy_min, xy_max], [xy_min, xy_max], "k--", lw=1)  # y = x reference
    ax.set_xlim(xy_min, xy_max)
    ax.set_ylim(xy_min, xy_max)
    ax.set_xlabel("Actual RUL (min)")
    ax.set_ylabel("Predicted RUL (min)")
    ax.set_title(f"{name} — Pred vs Actual")
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# --- Figure B: Error hist overlay + bias lines
plt.figure(figsize=(9,5))
bins = 70
all_err = [yp - yt for _, yt, yp in pairs]
# Shared bin edges over the pooled errors: with per-model edges the bar
# heights of the overlaid histograms are not comparable.
bin_edges = np.histogram_bin_edges(np.concatenate(all_err), bins=bins)
for i, ((name, yt, yp), err) in enumerate(zip(pairs, all_err)):
    plt.hist(err, bins=bin_edges, alpha=0.45, color=f"C{i % 10}", label=name)
plt.axvline(0, color="k", lw=1)
for i, ((name, yt, yp), err) in enumerate(zip(pairs, all_err)):
    mu = float(np.mean(err))
    # Same colour as the model's histogram; axvline does not follow the colour
    # cycle, so without this every bias line is drawn in the default colour.
    plt.axvline(mu, color=f"C{i % 10}", ls="--", lw=1, label=f"{name} bias={mu:.1f}")
plt.title("Prediction Error (Pred - Actual)"); plt.xlabel("Error (min)"); plt.ylabel("Count")
plt.grid(True, alpha=0.3); plt.legend(); plt.tight_layout(); plt.show()

# --- Figure C: Accuracy vs tolerance curve
plt.figure(figsize=(8,5))
tols = np.arange(0, 61, 2)
for name, yt, yp in pairs:
    # Percentage of predictions within +/- t of the actual value, per tolerance t.
    acc = [within_tol(yt, yp, t) for t in tols]
    plt.plot(tols, acc, label=name)
plt.xlabel("Tolerance (± minutes)")
plt.ylabel("Within tolerance (%)")
plt.title("Accuracy vs Tolerance (higher is better)")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

# --- Figure D: Residuals vs Actual (bias pattern check)
fig, axes = plt.subplots(1, 3, figsize=(17,5), sharey=True)
for ax, (name, yt, yp) in zip(axes, pairs):
    res = yp - yt
    # Plot at most 500 evenly spaced samples per model.
    n = len(yt)
    idx = np.linspace(0, n-1, min(n, 500)).astype(int)
    ax.scatter(yt[idx], res[idx], s=12, alpha=0.5, edgecolors="none")
    ax.axhline(0, color="k", lw=1)  # zero-residual reference
    ax.set_xlabel("Actual RUL (min)")
    ax.set_ylabel("Residual (min)")
    ax.set_title(f"{name} — Residuals vs Actual")
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# --- Figure E: MAE by actual-RUL bins
def binned_mae(yt, yp, edges):
    """Mean absolute error of ``yp`` within each bin of the actual values ``yt``.

    Bins are left-closed: bin ``b`` holds samples with
    ``edges[b] <= yt < edges[b + 1]``. Samples outside ``edges`` are ignored.

    Args:
        yt: Actual values (array).
        yp: Predicted values (array, same length).
        edges: Increasing bin edges; ``len(edges) - 1`` bins are produced.

    Returns:
        1-D array with one MAE per bin; NaN for bins that hold no samples.
    """
    bin_of = np.digitize(yt, edges) - 1
    abs_err = np.abs(yp - yt)
    per_bin = []
    for b in range(len(edges) - 1):
        members = abs_err[bin_of == b]
        per_bin.append(members.mean() if members.size else np.nan)
    return np.array(per_bin)

# Bin edges and tick labels for the MAE-by-bin chart.
edges = np.array([0,5,10,20,40,80,160,1e9])
xt = ["0–5","5–10","10–20","20–40","40–80","80–160",">160"]
width = 0.25
positions = np.arange(len(xt))
fig, ax = plt.subplots(figsize=(10,5))
for i, (name, yt, yp) in enumerate(pairs):
    mae_bins = binned_mae(yt, yp, edges)
    # Shift each model's bars by one bar width so they sit side by side.
    ax.bar(positions + i*width, mae_bins, width=width, label=name)
ax.set_xticks(positions + width)
ax.set_xticklabels(xt)
ax.set_ylabel("MAE (min)")
ax.set_title("MAE by Actual-RUL Bin (lower is better)")
ax.grid(True, axis="y", alpha=0.3)
ax.legend()
plt.tight_layout()
plt.show()

# --- Figure F: First N time steps (per model)
N = 220
fig, axes = plt.subplots(3, 1, figsize=(14, 8), sharex=True)
for ax, (name, yt, yp) in zip(axes, pairs):
    ax.plot(yt[:N], lw=1.2, label="Actual")
    ax.plot(yp[:N], lw=1.2, label="Pred")
    ax.set_title(f"{name} — First {N} samples")
    ax.set_ylabel("RUL (min)")
    ax.grid(True, alpha=0.3)
# The x-axis is shared, so only the bottom panel is labelled.
axes[-1].set_xlabel("Sample index")
axes[0].legend(loc="upper right")
plt.tight_layout()
plt.show()

# --- (Optional) LSTM epoch losses
epochs = range(1, len(epoch_losses)+1)
plt.figure(figsize=(6.5,3.5))
plt.plot(epochs, epoch_losses, marker="o")
plt.title("LSTM training loss")
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
