# Create Time Lagged Features

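This notebook creates 6 time-lagged features for each vital sign in `train_data.csv`, `holdout_data.csv`, and `test_data.csv`. Missing vitals are imputed before lagging, and the missing lag values at the start of each encounter are imputed afterwards.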

Lags are computed **within each encounter** (by `encounter_id`), with rows ordered by the `timestamp` column to preserve temporal ordering. Each lag is a row shift, so, assuming one reading every 5 seconds with no gaps, lag-1 is the value from 5 seconds ago, lag-2 from 10 seconds ago, etc.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# Vital-sign columns that are imputed and lagged by the functions below.
VITAL_COLS = ['heart_rate', 'systolic_bp', 'diastolic_bp', 'respiratory_rate', 'oxygen_saturation']
# Number of lag columns created per vital sign (lag1 .. lagN).
N_LAGS = 6
# Directory the input CSVs are read from and the lagged CSVs are written to.
# Relative path: it resolves against the kernel's current working directory.
DATA_DIR = Path('../data')

## Lagging Function

In [None]:
def add_lagged_features(df: pd.DataFrame, vital_cols: list, n_lags: int) -> pd.DataFrame:
    """Add time lagged features within each encounter.

    Rows are sorted by ``encounter_id`` then ``timestamp`` and the index is
    reset. For every column in ``vital_cols`` and every lag ``k`` in
    ``1..n_lags`` a column ``{col}_lag{k}`` is added holding the value from
    ``k`` rows earlier in the same encounter. The first ``k`` rows of each
    encounter have no such predecessor and are left as NaN.

    Args:
        df: Frame with ``encounter_id``, ``timestamp`` and the vital columns.
            It is not modified.
        vital_cols: Names of the columns to lag.
        n_lags: Number of lags per column; values below 1 add no columns.

    Returns:
        A new, sorted frame containing the original columns followed by the
        lag columns (ordered by vital, then by lag).
    """
    # sort_values/reset_index already return a fresh frame, so no extra copy is needed.
    ordered = df.sort_values(['encounter_id', 'timestamp']).reset_index(drop=True)

    # Build the groupby once: every lag column shares the same encounter grouping,
    # so there is no reason to re-derive it for each vital/lag pair.
    by_encounter = ordered.groupby('encounter_id')

    lagged = {
        f'{col}_lag{lag}': by_encounter[col].shift(lag)
        for col in vital_cols
        for lag in range(1, n_lags + 1)
    }

    # assign() appends new columns in dict order and overwrites same-named ones in
    # place, matching column-by-column assignment.
    return ordered.assign(**lagged)

## Imputation Functions

In [None]:
def impute_vitals(df: pd.DataFrame, vital_cols: list) -> pd.DataFrame:
    """Impute missing vitals with the median of the time step before and after (within each encounter).

    Rows are sorted by ``encounter_id`` then ``timestamp`` and the index is
    reset. Each missing value is filled in two passes:

    1. The median of the previous and next reading in the same encounter
       (for two values this is their mean; if only one neighbour is present
       that neighbour is used).
    2. If still missing (first/last row, or both neighbours missing), the
       median of that encounter's values after pass 1.

    An encounter whose values for a column are all missing has no median, so
    those values stay NaN.

    Args:
        df: Frame with ``encounter_id``, ``timestamp`` and the vital columns.
            It is not modified.
        vital_cols: Names of the columns to impute.

    Returns:
        A new, sorted frame with the vital columns imputed.
    """
    # sort_values/reset_index already return a fresh frame, so it is safe to write into.
    out = df.sort_values(['encounter_id', 'timestamp']).reset_index(drop=True)
    encounter = out['encounter_id']

    for col in vital_cols:
        raw_by_encounter = out[col].groupby(encounter)
        prev = raw_by_encounter.shift(1)
        next_ = raw_by_encounter.shift(-1)
        # Median of before and after (NaN-safe: if one is NaN, the other is used)
        neighbour_med = pd.concat([prev, next_], axis=1).median(axis=1)
        filled = out[col].fillna(neighbour_med)

        # Fallback: encounter median of the neighbour-filled values. The built-in
        # 'median' avoids calling a Python lambda once per encounter.
        encounter_med = filled.groupby(encounter).transform('median')
        out[col] = filled.fillna(encounter_med)

    return out

In [9]:
def impute_lagged_features(df: pd.DataFrame, vital_cols: list, n_lags: int) -> pd.DataFrame:
    """Impute missing lagged features using within-encounter median.

    The first N rows of each encounter will have NaN lag values due to shifting.
    These are filled with the median of the available (non-NaN) lag values
    for the same column within that encounter. If an encounter has no non-NaN
    value for a lag column (e.g. it has no more rows than the lag), there is
    no median and those values stay NaN.

    Args:
        df: Frame with ``encounter_id`` and the ``{col}_lag{k}`` columns for
            every ``col`` in ``vital_cols`` and ``k`` in ``1..n_lags``. It is
            not modified.
        vital_cols: Names of the vital columns whose lag columns are imputed.
        n_lags: Number of lag columns per vital.

    Returns:
        A copy of ``df`` with the lag columns imputed; row order and index
        are unchanged.
    """
    out = df.copy()
    lag_cols = [f'{col}_lag{lag}' for col in vital_cols for lag in range(1, n_lags + 1)]

    # Group the untouched input once and reuse it for every lag column; results
    # are written to `out`, so the grouped data never changes under the loop.
    by_encounter = df.groupby('encounter_id')

    for col in lag_cols:
        # The built-in 'median' avoids calling a Python lambda once per encounter.
        encounter_med = by_encounter[col].transform('median')
        out[col] = df[col].fillna(encounter_med)

    return out

In [12]:
# Train pipeline: load -> impute vitals -> add lag columns -> impute lag columns -> save.
# The cell re-reads the CSV on every run, so re-running it starts from the raw data.
train = pd.read_csv(DATA_DIR / 'train_data.csv', parse_dates=['timestamp'])
print(f"Train shape before: {train.shape}")
print(f"Missing values in vitals before imputation:\n{train[VITAL_COLS].isna().sum()}\n")

# 1. Impute missing vitals before lagging
train = impute_vitals(train, VITAL_COLS)
# Stop here if any vital is still missing, so gaps are not copied into the lag columns.
assert train[VITAL_COLS].isna().sum().sum() == 0, "Vital imputation failed: missing values remain"
print("Vital imputation complete - no missing values in vitals")

# 2. Compute lagged features
train_lagged = add_lagged_features(train, VITAL_COLS, N_LAGS)
print(f"Train shape after lagging: {train_lagged.shape}")
print(f"New columns: {[c for c in train_lagged.columns if c not in train.columns]}")

# 3. Impute missing lag values (NaN at start of each encounter)
train_lagged = impute_lagged_features(train_lagged, VITAL_COLS, N_LAGS)

# Persist the result without the positional index.
train_lagged.to_csv(DATA_DIR / 'train_data_lagged.csv', index=False)
print(f"Saved to {DATA_DIR / 'train_data_lagged.csv'}")

Train shape before: (2109600, 8)
Missing values in vitals before imputation:
heart_rate           42252
systolic_bp          42197
diastolic_bp         42320
respiratory_rate     42278
oxygen_saturation    42618
dtype: int64

Vital imputation complete - no missing values in vitals
Train shape after lagging: (2109600, 38)
New columns: ['heart_rate_lag1', 'heart_rate_lag2', 'heart_rate_lag3', 'heart_rate_lag4', 'heart_rate_lag5', 'heart_rate_lag6', 'systolic_bp_lag1', 'systolic_bp_lag2', 'systolic_bp_lag3', 'systolic_bp_lag4', 'systolic_bp_lag5', 'systolic_bp_lag6', 'diastolic_bp_lag1', 'diastolic_bp_lag2', 'diastolic_bp_lag3', 'diastolic_bp_lag4', 'diastolic_bp_lag5', 'diastolic_bp_lag6', 'respiratory_rate_lag1', 'respiratory_rate_lag2', 'respiratory_rate_lag3', 'respiratory_rate_lag4', 'respiratory_rate_lag5', 'respiratory_rate_lag6', 'oxygen_saturation_lag1', 'oxygen_saturation_lag2', 'oxygen_saturation_lag3', 'oxygen_saturation_lag4', 'oxygen_saturation_lag5', 'oxygen_saturation_lag6

KeyboardInterrupt: 

## Process Holdout Data

In [11]:
# Holdout pipeline: load -> impute vitals -> add lag columns -> impute lag columns -> save.
holdout = pd.read_csv(DATA_DIR / 'holdout_data.csv', parse_dates=['timestamp'])
# Fill missing vitals first so the lag columns are built from imputed values.
holdout = impute_vitals(holdout, VITAL_COLS)
holdout_lagged = add_lagged_features(holdout, VITAL_COLS, N_LAGS)
# Fill the NaNs that shifting leaves at the start of each encounter.
holdout_lagged = impute_lagged_features(holdout_lagged, VITAL_COLS, N_LAGS)
holdout_lagged.to_csv(DATA_DIR / 'holdout_data_lagged.csv', index=False)
print(f"Holdout: shape={holdout_lagged.shape}, saved to {DATA_DIR / 'holdout_data_lagged.csv'}")

=== Final validation: no missing values ===



NameError: name 'test_lagged' is not defined

In [None]:
# Rebuild the train features with the full impute -> lag -> impute pipeline.
# This cell overwrites `train_lagged` and train_data_lagged.csv, so it has to apply
# both imputation steps; lagging the raw data alone leaves NaNs in the saved file
# and makes the final "no missing values" check fail.
train = pd.read_csv(DATA_DIR / 'train_data.csv', parse_dates=['timestamp'])
print(f"Train shape before: {train.shape}")

# Fill missing vitals first so the lag columns are built from imputed values.
train = impute_vitals(train, VITAL_COLS)
train_lagged = add_lagged_features(train, VITAL_COLS, N_LAGS)
# Fill the NaNs that shifting leaves at the start of each encounter.
train_lagged = impute_lagged_features(train_lagged, VITAL_COLS, N_LAGS)
print(f"Train shape after: {train_lagged.shape}")
print(f"New columns: {[c for c in train_lagged.columns if c not in train.columns]}")

train_lagged.to_csv(DATA_DIR / 'train_data_lagged.csv', index=False)
print(f"Saved to {DATA_DIR / 'train_data_lagged.csv'}")

## Process Test Data

In [None]:
# Test pipeline: load -> impute vitals -> add lag columns -> impute lag columns -> save.
test = pd.read_csv(DATA_DIR / 'test_data.csv', parse_dates=['timestamp'])
# Fill missing vitals first so the lag columns are built from imputed values.
test = impute_vitals(test, VITAL_COLS)
test_lagged = add_lagged_features(test, VITAL_COLS, N_LAGS)
# Fill the NaNs that shifting leaves at the start of each encounter.
test_lagged = impute_lagged_features(test_lagged, VITAL_COLS, N_LAGS)
test_lagged.to_csv(DATA_DIR / 'test_data_lagged.csv', index=False)
print(f"Test: shape={test_lagged.shape}, saved to {DATA_DIR / 'test_data_lagged.csv'}")

## Verify

In [None]:
# Validate no missing values across all datasets
# The check covers every column of each frame, not only the vitals and their lags.
print("=== Final validation: no missing values ===\n")
for name, df in [('train', train_lagged), ('test', test_lagged), ('holdout', holdout_lagged)]:
    missing = df.isna().sum().sum()
    assert missing == 0, f"{name}: {missing} missing values remain after imputation"
    # The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252); restored here.
    print(f"{name}: shape={df.shape}, missing values={missing} ✓")

# Quick sanity check: first encounter, first 10 rows
print("\n--- Sample: first encounter (train) ---")
enc_id = train_lagged['encounter_id'].iloc[0]
sample = train_lagged[train_lagged['encounter_id'] == enc_id].head(10)
# Show the vital next to its nearest and furthest lags so the shift is easy to eyeball.
cols_to_show = ['timestamp', 'heart_rate', 'heart_rate_lag1', 'heart_rate_lag2', 'heart_rate_lag6']
display(sample[cols_to_show])