# LSTM Baseline: Patient Label Prediction

Train a unidirectional LSTM on lagged vital signs from `train_data_lagged.csv` and evaluate ROC AUC on `test_data_lagged.csv`.

**Lags and LSTM**: With LSTM, you have two options for temporal input:
1. **Raw time series** – Create sliding windows of consecutive rows (no explicit lag columns); the LSTM learns from the raw sequence.
2. **Pre-computed lags** (this notebook) – The lagged CSV already encodes a lookback window. Each row has `lag6..lag1` and current values per vital. We reshape these into `(timesteps, channels)` so the LSTM receives a sequence of shape `(7, 5)` – 7 time points (oldest→newest) and 5 sensor channels.

## 0. Install Dependencies (run once)

Run the cell below to install required packages.

**From terminal (alternative):**
```bash
pip install tensorflow gdown pandas scikit-learn
```
If using **Google Colab**, dependencies are preinstalled; you can skip this step.

In [None]:
%pip install -q tensorflow gdown pandas scikit-learn

## Load Data from Google Drive

Loads data from the [shared Drive folder](https://drive.google.com/drive/folders/13NvOvSW1W0shkAxnYyZJlHFqokhHjUeI).
- **Google Colab**: Mounts Drive; set `COLAB_DATA_PATH` to your folder path after mounting.
- **Local/Jupyter**: Downloads the folder with gdown. The folder must be shared as "Anyone with the link can view", otherwise the download fails.

In [None]:
from pathlib import Path
import gdown

# Colab only: location of the data folder once Drive is mounted
# (e.g. /content/drive/MyDrive/Medhack_data).
# On a Shared Drive the pattern is /content/drive/Shareddrives/<YourDriveName>/<folder_name>.
COLAB_DATA_PATH = "/content/drive/MyDrive/Medhack_data"

# Shared Drive folder that holds the lagged CSV files, and the URL derived from its ID.
GOOGLE_DRIVE_FOLDER_ID = "13NvOvSW1W0shkAxnYyZJlHFqokhHjUeI"
GOOGLE_DRIVE_URL = f"https://drive.google.com/drive/folders/{GOOGLE_DRIVE_FOLDER_ID}"

def _find_dir_with(root, filenames) -> "Path | None":
    """Return `root`, or its first immediate subfolder (sorted), that contains every name in `filenames`; else None."""
    if not root.is_dir():
        return None
    # Sorted so the chosen subfolder is deterministic when several would match.
    candidates = [root] + sorted(p for p in root.iterdir() if p.is_dir())
    for candidate in candidates:
        if all((candidate / name).exists() for name in filenames):
            return candidate
    return None


def get_data_dir():
    """Locate the lagged CSV files and return the folder that contains them.

    In Google Colab, Drive is mounted and ``COLAB_DATA_PATH`` is used.
    Elsewhere, the shared Drive folder is downloaded with gdown into
    ``data_from_drive``. A previous download that already holds the train and
    test CSVs is reused instead of being downloaded again.

    Returns:
        Path to the folder containing ``train_data_lagged.csv``.

    Raises:
        FileNotFoundError: if ``train_data_lagged.csv`` cannot be found.
    """
    # Keep the try body to the one statement that signals "not running in Colab".
    try:
        from google.colab import drive
    except ImportError:
        drive = None

    if drive is not None:
        drive.mount("/content/drive", force_remount=False)
        data_dir = Path(COLAB_DATA_PATH)
        if not (data_dir / "train_data_lagged.csv").exists():
            raise FileNotFoundError(
                f"train_data_lagged.csv not found in {data_dir}. "
                f"Update COLAB_DATA_PATH to match your Drive folder."
            )
        return data_dir

    # Not in Colab: use a local copy, downloading it only when needed.
    output_dir = Path("data_from_drive")
    cached = _find_dir_with(output_dir, ("train_data_lagged.csv", "test_data_lagged.csv"))
    if cached is not None:
        # Re-running the cell should not re-download the whole folder.
        print(f"Using previously downloaded data in {cached}")
        return cached

    output_dir.mkdir(exist_ok=True)
    gdown.download_folder(url=GOOGLE_DRIVE_URL, output=str(output_dir), quiet=False)
    # gdown may place the files directly in output_dir or in a subfolder of it.
    found = _find_dir_with(output_dir, ("train_data_lagged.csv",))
    if found is None:
        raise FileNotFoundError(
            f"train_data_lagged.csv not found in {output_dir}. "
            f"Ensure the Drive folder contains train_data_lagged.csv, test_data_lagged.csv, holdout_data_lagged.csv."
        )
    return found


# Resolved once here; the cells below read every CSV relative to DATA_DIR.
DATA_DIR = Path(get_data_dir())
print(f"DATA_DIR = {DATA_DIR}")

## Setup

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import tensorflow as tf

# Layout of the lagged CSVs: each vital has N_LAGS lag columns plus its current value.
N_LAGS = 6
VITAL_COLS = [
    'heart_rate',
    'systolic_bp',
    'diastolic_bp',
    'respiratory_rate',
    'oxygen_saturation',
]

# Number of training rows to sample. The LSTM copes with more rows than the SVM;
# raise this value for a full training run.
TRAIN_SAMPLE_SIZE = 500_000

# DATA_DIR is provided by the "Load Data from Google Drive" cell above.
# Trained models are written to ../models, created here if it does not exist yet.
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(exist_ok=True)

ModuleNotFoundError: No module named 'tensorflow'

## Load and Reshape Data

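Reshape the flat lag columns into an array of shape `(n_samples, timesteps, features)`. Each sample becomes a sequence of 7 timesteps ordered oldest to newest (`lag6, lag5, lag4, lag3, lag2, lag1, current`), with the 5 vitals as feature channels, so one sample has shape `(7, 5)`.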

In [None]:
def reshape_lagged_to_sequences(df: pd.DataFrame, vital_cols: list, n_lags: int) -> np.ndarray:
    """Convert flat lag columns into a 3-D array of per-sample sequences.

    For each vital in ``vital_cols`` the columns ``<vital>_lag{n_lags}``, ...,
    ``<vital>_lag1`` and ``<vital>`` are read in that order, i.e. oldest to
    newest.

    Returns:
        Array of shape (n_samples, n_lags + 1, len(vital_cols)).
    """
    def ordered_columns(vital):
        # Lag columns counted down from n_lags to 1, followed by the current value.
        return [f'{vital}_lag{k}' for k in range(n_lags, 0, -1)] + [vital]

    # One (n_samples, n_lags + 1) block per vital, in the order given by vital_cols.
    per_vital = [df[ordered_columns(vital)].values for vital in vital_cols]
    # Stacking on a new last axis makes the vitals the feature dimension.
    return np.stack(per_vital, axis=-1)


# Read both splits first, then derive targets and sequence inputs from each frame.
train, test = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('train_data_lagged.csv', 'test_data_lagged.csv')
)

# Targets come straight from the 'label' column of each split.
y_train_full = train['label'].values
y_test = test['label'].values

# Inputs: lag columns folded into (n_samples, timesteps, vitals).
X_train_full = reshape_lagged_to_sequences(train, VITAL_COLS, N_LAGS)
X_test = reshape_lagged_to_sequences(test, VITAL_COLS, N_LAGS)

print(f"Train sequences: {X_train_full.shape}")  # (n, 7, 5)
print(f"Test sequences: {X_test.shape}")

## Stratified Sample and Standardization

In [None]:
# Draw a stratified subsample of TRAIN_SAMPLE_SIZE rows from the training set.
n_available = len(y_train_full)
n_label_values = len(np.unique(y_train_full))

if n_available - TRAIN_SAMPLE_SIZE < n_label_values:
    # StratifiedShuffleSplit rejects a train_size that is not smaller than the
    # dataset, and it needs at least one left-over row per class. In that case
    # train on everything. The rows are still shuffled (same seed) so that a
    # later validation_split does not simply cut off the tail of the file.
    idx = np.random.default_rng(42).permutation(n_available)
else:
    splitter = StratifiedShuffleSplit(n_splits=1, train_size=TRAIN_SAMPLE_SIZE, random_state=42)
    # Only the train-side indices are needed; the complement is discarded.
    idx, _ = next(splitter.split(X_train_full, y_train_full))

X_train = X_train_full[idx]
y_train = y_train_full[idx]

print(f"Sampled train: {X_train.shape}")
print(pd.Series(y_train).value_counts().sort_index())

In [None]:
# Standardization: statistics are estimated on the training sample only and then
# applied unchanged to both the training sample and the test set.
n_samples, n_timesteps, n_features = X_train.shape

# StandardScaler works on 2-D input, so (sample, timestep) pairs are collapsed
# into rows with one column per feature channel.
X_train_flat = X_train.reshape(-1, n_features)
X_test_flat = X_test.reshape(-1, n_features)

scaler = StandardScaler()
scaler.fit(X_train_flat)

# Restore the 3-D sequence layout after scaling.
train_shape = (n_samples, n_timesteps, n_features)
test_shape = (X_test.shape[0], n_timesteps, n_features)
X_train_scaled = scaler.transform(X_train_flat).reshape(train_shape)
X_test_scaled = scaler.transform(X_test_flat).reshape(test_shape)

print(f"Scaled train: {X_train_scaled.shape}, test: {X_test_scaled.shape}")

## Build LSTM Model

In [None]:
def build_lstm_model(n_timesteps: int, n_features: int, n_classes: int = 4, class_weights: dict | None = None):
    """Build and compile a stacked unidirectional LSTM classifier.

    Architecture: LSTM(64, returning the full sequence) -> Dropout(0.3) ->
    LSTM(32) -> Dropout(0.3) -> Dense(32, relu) -> Dense(n_classes, softmax).

    Args:
        n_timesteps: Length of each input sequence.
        n_features: Number of feature channels per timestep.
        n_classes: Number of softmax output units.
        class_weights: Not used by this function. Kept so existing calls that
            pass it keep working; class weighting is applied through the
            ``class_weight`` argument of ``model.fit``.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric). Labels are expected
        as integer class ids, not one-hot vectors.
    """
    model = tf.keras.Sequential([
        # Explicit Input object instead of the deprecated `input_shape=` layer argument.
        tf.keras.Input(shape=(n_timesteps, n_features)),
        # return_sequences=True so the second LSTM receives one vector per timestep.
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(n_classes, activation='softmax'),
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


# Balanced class weights: total / (number of classes present * class count).
class_counts = np.bincount(y_train)
total = len(y_train)

# bincount has one slot per label id from 0 up to the largest id, so its length
# is the number of softmax units needed for every label to index a valid output,
# even if some smaller id does not occur in the sample.
n_classes = len(class_counts)
n_present = int(np.count_nonzero(class_counts))

# Ids with no samples get a neutral weight of 1.0. Dividing by their zero count
# would produce an infinite weight, and the weight is never applied anyway
# because no training row carries that label. Keeping the key keeps the ids
# contiguous from 0.
class_weights = {
    label: float(total / (n_present * count)) if count else 1.0
    for label, count in enumerate(class_counts)
}
print(f"Class weights: {class_weights}")

model = build_lstm_model(n_timesteps, n_features, n_classes)

## Train

In [None]:
# Training configuration: 10% of the sampled training data is held out for
# validation, and the per-class weights are passed to the loss.
fit_options = {
    'validation_split': 0.1,
    'epochs': 10,
    'batch_size': 512,
    'class_weight': class_weights,
    'verbose': 1,
}
history = model.fit(X_train_scaled, y_train, **fit_options)

# Persist the trained model in the native Keras format.
model.save(MODELS_DIR / 'lstm_baseline.keras')
print(f"Model saved to {MODELS_DIR / 'lstm_baseline.keras'}")

## Evaluate ROC AUC on Test

In [None]:
# Class probabilities for the test set, scored as macro-averaged one-vs-rest ROC AUC.
y_proba = model.predict(X_test_scaled)
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_proba,
    average='macro',
    multi_class='ovr',
)
print(f"Test ROC AUC (macro, OVR): {roc_auc:.4f}")

## Holdout Predictions (optional)

In [None]:
# The holdout set gets the same reshaping as train/test and the scaler fitted on train.
holdout = pd.read_csv(DATA_DIR / 'holdout_data_lagged.csv')
X_holdout = reshape_lagged_to_sequences(holdout, VITAL_COLS, N_LAGS)

X_holdout_flat = X_holdout.reshape(-1, n_features)
holdout_shape = (X_holdout.shape[0], n_timesteps, n_features)
X_holdout_scaled = scaler.transform(X_holdout_flat).reshape(holdout_shape)

y_holdout_proba = model.predict(X_holdout_scaled)
print(f"Holdout predictions shape: {y_holdout_proba.shape}")

# Scoring is only possible when the holdout file ships with a 'label' column.
if 'label' not in holdout.columns:
    print("Holdout has no labels; predictions ready for submission.")
else:
    roc_auc_holdout = roc_auc_score(holdout['label'], y_holdout_proba, multi_class='ovr', average='macro')
    print(f"Holdout ROC AUC (macro, OVR): {roc_auc_holdout:.4f}")