# 06 - Time Series (ED vitals) - LSTM


In [None]:
import os, glob, pandas as pd, warnings

warnings.filterwarnings("ignore")

# === Project-wide dataset base (Windows path provided by user) ===
DATA_BASE = r"D:/HealthAI Project/data"

def _glob_rel(rel_pattern):
    pat = os.path.join(DATA_BASE, "**", rel_pattern)
    return glob.glob(pat, recursive=True)

def _ci_filename_search(basenames):
    hits = []
    for root, _, files in os.walk(DATA_BASE):
        lowfiles = [f.lower() for f in files]
        for name in basenames:
            name_low = name.lower()
            for i,f in enumerate(lowfiles):
                if f == name_low:
                    hits.append(os.path.join(root, files[i]))
    return hits

def find_one_any(rel_candidates, must=False, friendly=""):
    # 1) Exact-rel glob pass
    for rel in rel_candidates:
        hits = _glob_rel(rel)
        if hits:
            print("[found] %s -> %s" % (rel, hits[0]))
            return hits[0]
    # 2) CI basename pass
    basenames = [os.path.basename(r) for r in rel_candidates]
    ci_hits = _ci_filename_search(basenames)
    if ci_hits:
        print("[found-ci] one of %s -> %s" % (basenames, ci_hits[0]))
        return ci_hits[0]
    if must:
        raise FileNotFoundError(f"Could not find: {friendly or rel_candidates} under {DATA_BASE}")
    return None

def read_csv_auto(path):
    comp = "gzip" if str(path).lower().endswith(".gz") else None
    return pd.read_csv(path, compression=comp, low_memory=False)

print("Using DATA_BASE:", DATA_BASE)

Using DATA_BASE: D:/HealthAI Project/data


In [3]:
import pandas as pd, numpy as np, os

# assumes you already have these helpers in your env:
#   find_one_any([...]), read_csv_auto(path)
vs_p = find_one_any(["vitalsign.csv"])
es_p = find_one_any(["edstays.csv"])

print("Using:", vs_p)
print("Using:", es_p)

vs = read_csv_auto(vs_p)
es = read_csv_auto(es_p)

# -------------------------
# normalize columns & detect keys
# -------------------------
vs.columns = [c.lower() for c in vs.columns]
es.columns = [c.lower() for c in es.columns]

time_candidates = ["charttime","edcharttime","time","chart_time"]
stay_candidates = ["stay_id","edstay_id","ed_stay_id"]
time_col = next((c for c in time_candidates if c in vs.columns), None)
stay_col = next((c for c in stay_candidates if c in vs.columns), None)
if time_col is None or stay_col is None:
    raise RuntimeError(f"Could not detect time/stay columns in vitalsign: {vs.columns.tolist()}")

# -------------------------
# standardize vital names
# -------------------------
canon_map = {
    "heart_rate":"heart_rate", "heartrate":"heart_rate", "hr":"heart_rate",
    "sbp":"sbp", "systolic_bp":"sbp", "systolic":"sbp",
    "dbp":"dbp", "diastolic_bp":"dbp", "diastolic":"dbp",
    "resp_rate":"resp_rate", "respiratory_rate":"resp_rate", "rr":"resp_rate",
    "spo2":"spo2", "o2sat":"spo2", "oxygen_saturation":"spo2",
    "temperature":"temperature", "temp":"temperature", "temp_c":"temperature", "temp_f":"temp_f",
}
ren = {}
for c in list(vs.columns):
    lc = c.lower()
    if lc in canon_map:
        ren[c] = canon_map[lc]
vs = vs.rename(columns=ren)

candidate_vitals = ["heart_rate","sbp","dbp","resp_rate","spo2","temperature","temp_f"]
present_vitals = [c for c in candidate_vitals if c in vs.columns]

# if no heart rate recognized, try fuzzy fallback
if "heart_rate" not in present_vitals:
    hr_alt = next((c for c in vs.columns if ("heart" in c and "rate" in c)), None)
    if hr_alt:
        vs = vs.rename(columns={hr_alt: "heart_rate"})
        present_vitals.append("heart_rate")
if "heart_rate" not in present_vitals:
    raise RuntimeError("No heart-rate column detected after normalization.")

# convert temp_f -> temperature (°C) if needed
if "temp_f" in vs.columns and "temperature" not in vs.columns:
    vs["temperature"] = (pd.to_numeric(vs["temp_f"], errors="coerce") - 32.0) * (5.0/9.0)
    vs = vs.drop(columns=["temp_f"])
    present_vitals = [c for c in present_vitals if c != "temp_f"] + (["temperature"] if "temperature" not in present_vitals else [])

# -------------------------
# parse times, align to known stays
# -------------------------
vs[time_col] = pd.to_datetime(vs[time_col], errors="coerce")
vs = vs.dropna(subset=[time_col, stay_col])

stay_col_es = "stay_id" if "stay_id" in es.columns else ("edstay_id" if "edstay_id" in es.columns else ("ed_stay_id" if "ed_stay_id" in es.columns else None))
if stay_col_es and stay_col_es != stay_col:
    es = es.rename(columns={stay_col_es: stay_col})

valid = set(es[stay_col].unique())
vs = vs[vs[stay_col].isin(valid)]

# -------------------------
# coerce vitals to numeric BEFORE resampling
# -------------------------
for c in present_vitals:
    vs[c] = pd.to_numeric(vs[c], errors="coerce")

# -------------------------
# resample each stay to 15-min grid using ONLY numeric columns
# -------------------------
def resample_group(g):
    num_cols = [c for c in present_vitals if c in g.columns]
    if not num_cols:
        return pd.DataFrame(columns=[time_col, stay_col, "heart_rate"])  # empty safe frame
    g_num = g[[time_col] + num_cols].copy()
    g_num = g_num.set_index(time_col).sort_index()
    r = g_num.resample("15min").mean()
    # in-stay fill (time-aware, then directional)
    r = r.interpolate(method="time").ffill().bfill()
    r = r.reset_index()
    return r

res = []
for sid, g in vs.groupby(stay_col, sort=False):
    r = resample_group(g)
    if not r.empty:
        r[stay_col] = sid
        r = r.rename(columns={time_col: "time"})
        res.append(r)

if not res:
    raise RuntimeError("No resampled data produced; check input columns and data quality.")
df = pd.concat(res, ignore_index=True)

# -------------------------
# GLOBAL fill after concatenation (handles entire-stay-missing cases)
# -------------------------
vital_cols = [c for c in ["sbp","dbp","resp_rate","spo2","temperature","heart_rate"] if c in df.columns]
# drop rows without stay id or time
df = df.dropna(subset=[stay_col, "time"])

# fill remaining NaNs per column with global medians
for c in vital_cols:
    med = pd.to_numeric(df[c], errors="coerce").median(skipna=True)
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(med)

# ensure target available
if "heart_rate" not in df.columns:
    raise RuntimeError("Target 'heart_rate' is missing after preprocessing.")
df = df.dropna(subset=["heart_rate"])

# -------------------------
# build windows (filter out any windows containing non-finite values)
# -------------------------
preferred_features = ["sbp", "dbp", "resp_rate", "spo2", "temperature", "heart_rate"]
features = [c for c in preferred_features if c in df.columns and c != "heart_rate"]
target = "heart_rate"
SEQ_LEN = 8

def make_windows(df, seq_len=8):
    Xs, ys, sids = [], [], []
    for sid, g in df.sort_values([stay_col, "time"]).groupby(stay_col, sort=False):
        g = g.dropna(subset=[target])
        cols = [c for c in features + [target] if c in g.columns]
        if (not cols) or target not in cols:
            continue
        G = g[cols].to_numpy(dtype=float)
        for i in range(len(G) - seq_len):
            X_win = G[i:i+seq_len, :]
            y_val = G[i+seq_len, cols.index(target)]
            # keep only all-finite windows
            if np.isfinite(X_win).all() and np.isfinite(y_val):
                Xs.append(X_win)
                ys.append(y_val)
                sids.append(sid)
    if len(Xs) == 0:
        return np.empty((0, seq_len, len(features)+1)), np.empty((0,), dtype="float32"), np.empty((0,))
    return np.array(Xs), np.array(ys, dtype="float32"), np.array(sids)

X_all, y_all, sid_all = make_windows(df, SEQ_LEN)
if X_all.shape[0] == 0:
    raise RuntimeError("No training windows were created; not enough clean sequential data per stay.")

# -------------------------
# split by stay
# -------------------------
rng = np.random.default_rng(42)
stays = np.unique(sid_all); rng.shuffle(stays)
n = len(stays)
train_ids = set(stays[:int(0.7*n)])
val_ids   = set(stays[int(0.7*n):int(0.85*n)])
test_ids  = set(stays[int(0.85*n):])

def pick(ids):
    m = np.array([s in ids for s in sid_all])
    return X_all[m], y_all[m]

X_tr, y_tr = pick(train_ids)
X_va, y_va = pick(val_ids)
X_te, y_te = pick(test_ids)

if X_tr.size == 0 or X_va.size == 0 or X_te.size == 0:
    raise RuntimeError(f"Empty split(s): train {X_tr.shape[0]}, val {X_va.shape[0]}, test {X_te.shape[0]}.")

# -------------------------
# impute any non-finite in 3D arrays by feature medians (last safety net)
# -------------------------
def impute_3d_inplace(X):
    F = X.shape[-1]
    flat = X.reshape(-1, F)
    med = np.nanmedian(flat, axis=0)
    # replace non-finite with medians
    mask = ~np.isfinite(X)
    if mask.any():
        idx_feat = np.where(mask)[2]
        X[mask] = np.take(med, idx_feat)

impute_3d_inplace(X_tr)
impute_3d_inplace(X_va)
impute_3d_inplace(X_te)

# -------------------------
# scale features
# -------------------------
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_tr2 = X_tr.reshape(-1, X_tr.shape[-1])
scaler.fit(X_tr2)
X_tr = scaler.transform(X_tr2).reshape(X_tr.shape)
X_va = scaler.transform(X_va.reshape(-1, X_va.shape[-1])).reshape(X_va.shape)
X_te = scaler.transform(X_te.reshape(-1, X_te.shape[-1])).reshape(X_te.shape)

# -------------------------
# normalize targets
# -------------------------
# compute on finite entries only
finite_mask = np.isfinite(y_tr)
y_mean = float(y_tr[finite_mask].mean()) if finite_mask.any() else 0.0
y_std  = float(y_tr[finite_mask].std()) if finite_mask.any() else 1.0
if not np.isfinite(y_std) or y_std == 0.0:
    y_std = 1.0

y_trn = (y_tr - y_mean) / y_std
y_van = (y_va - y_mean) / y_std
y_ten = (y_te - y_mean) / y_std

[found] vitalsign.csv -> D:/HealthAI Project/data\MIMIC IV\vitalsign.csv
[found] edstays.csv -> D:/HealthAI Project/data\MIMIC IV\edstays.csv
Using: D:/HealthAI Project/data\MIMIC IV\vitalsign.csv
Using: D:/HealthAI Project/data\MIMIC IV\edstays.csv


In [None]:
# -------------------------
# LSTM regressor
# -------------------------
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"
Xtr_t = torch.tensor(X_tr, dtype=torch.float32, device=device)
Xva_t = torch.tensor(X_va, dtype=torch.float32, device=device)
Xte_t = torch.tensor(X_te, dtype=torch.float32, device=device)
ytr_t = torch.tensor(y_trn.reshape(-1, 1), dtype=torch.float32, device=device)
yva_t = torch.tensor(y_van.reshape(-1, 1), dtype=torch.float32, device=device)
yte_t = torch.tensor(y_ten.reshape(-1, 1), dtype=torch.float32, device=device)

train_loader = DataLoader(TensorDataset(Xtr_t, ytr_t), batch_size=64, shuffle=True)
val_loader   = DataLoader(TensorDataset(Xva_t, yva_t), batch_size=128)
test_loader  = DataLoader(TensorDataset(Xte_t, yte_t), batch_size=128)

input_dim = X_tr.shape[-1]
hidden = 64
layers = 1
dropout = 0.1

class LSTMReg(nn.Module):
    def __init__(self, input_dim, hidden, layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden, num_layers=layers,
                            batch_first=True, dropout=dropout if layers > 1 else 0.0)
        self.head = nn.Sequential(nn.Linear(hidden, 64), nn.ReLU(), nn.Linear(64, 1))
    def forward(self, x):
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.head(last)

model = LSTMReg(input_dim, hidden, layers, dropout).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

def eval_rmse(loader):
    model.eval(); se = 0.0; n = 0
    with torch.no_grad():
        for xb, yb in loader:
            pred = model(xb)
            # ensure no NaNs creep in
            pred = torch.nan_to_num(pred)
            yb   = torch.nan_to_num(yb)
            se += ((pred - yb) ** 2).sum().item()
            n  += yb.numel()
    return (se / max(n, 1)) ** 0.5 * float(y_std)

best = float("inf"); best_state = None
for epoch in range(10):
    model.train()
    for xb, yb in train_loader:
        opt.zero_grad()
        pred = model(xb)
        pred = torch.nan_to_num(pred)
        yb   = torch.nan_to_num(yb)
        loss = loss_fn(pred, yb)
        if not torch.isfinite(loss):
            continue  
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        opt.step()
    val_rmse = eval_rmse(val_loader)
    if val_rmse < best and np.isfinite(val_rmse):
        best = val_rmse
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    print(f"Epoch {epoch+1:02d} | Val RMSE (bpm): {val_rmse:.2f}")

if best_state:
    model.load_state_dict(best_state)

test_rmse = eval_rmse(test_loader)
print(f"Test RMSE (bpm): {test_rmse:.2f}")

Epoch 01 | Val RMSE (bpm): 1.36
Epoch 02 | Val RMSE (bpm): 1.27
Epoch 03 | Val RMSE (bpm): 1.31
Epoch 04 | Val RMSE (bpm): 1.28
Epoch 05 | Val RMSE (bpm): 1.27
Epoch 06 | Val RMSE (bpm): 1.27
Epoch 07 | Val RMSE (bpm): 1.25
Epoch 08 | Val RMSE (bpm): 1.23
Epoch 09 | Val RMSE (bpm): 1.26
Epoch 10 | Val RMSE (bpm): 1.23
Test RMSE (bpm): 1.41


In [None]:
# =====================
# SAVE INFERENCE ARTIFACTS
# =====================
from pathlib import Path
import joblib
import json
import torch

ART_DIR = Path("./Models/Vitals_Model")
ART_DIR.mkdir(parents=True, exist_ok=True)

# 1) Save raw PyTorch state_dict (+ training metadata)
torch.save(
    {
        "state_dict": model.state_dict(),
        "input_dim": int(input_dim),
        "hidden": int(hidden),
        "layers": int(layers),
        "dropout": float(dropout),
        "seq_len": int(SEQ_LEN),
        "features": list(features),           # order matters
        "target": str(target),
        "y_mean": float(y_mean),
        "y_std": float(y_std),
    },
    ART_DIR / "lstm_reg.pt",
)
print(f"[saved] {ART_DIR/'lstm_reg.pt'}")

# 2) Save TorchScript (portable, no class definition needed at load)
try:
    model_eval = model.to("cpu").eval()
    example = torch.zeros(1, SEQ_LEN, input_dim, dtype=torch.float32)
    traced = torch.jit.trace(model_eval, example)
    traced.save(str(ART_DIR / "lstm_reg_ts.pt"))
    print(f"[saved] {ART_DIR/'lstm_reg_ts.pt'}")
except Exception as e:
    print(f"[warn] TorchScript export failed: {e}")

# 3) Save the feature scaler
joblib.dump(scaler, ART_DIR / "scaler.joblib")
print(f"[saved] {ART_DIR/'scaler.joblib'}")

# 4) Save extra metadata as JSON (handy for quick checks)
meta = {
    "features": list(features),
    "seq_len": int(SEQ_LEN),
    "target": str(target),
    "y_mean": float(y_mean),
    "y_std": float(y_std),
    "vital_columns_used": [c for c in ["sbp","dbp","resp_rate","spo2","temperature","heart_rate"] if c in df.columns],
}
with open(ART_DIR / "meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)
print(f"[saved] {ART_DIR/'meta.json'}")

[saved] vitals_artifacts\lstm_reg.pt
[saved] vitals_artifacts\lstm_reg_ts.pt
[saved] vitals_artifacts\scaler.joblib
[saved] vitals_artifacts\meta.json
