In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    confusion_matrix,
    classification_report
)

import lightgbm as lgb
import shap
import matplotlib.pyplot as plt

# Single seed reused below by the patient-level split, the LightGBM params,
# and the row sampling done for the SHAP analysis.
RANDOM_SEED = 42
# Seeds NumPy's legacy global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_SEED)


# Data Ingestion and Structural Inspection


In [None]:
# Read the raw ICU time-series CSV and normalise its schema in one pass:
#   1. drop the exported row-index column if the file carries one,
#   2. rename the patient / hour columns to snake_case,
#   3. force the hour column to numeric (unparseable entries become NaN).
df = (
    pd.read_csv("/content/Sepsis Prediction.csv")
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .rename(columns={"Patient_ID": "patient_id", "Hour": "icu_hour"})
    .assign(
        icu_hour=lambda frame: pd.to_numeric(frame["icu_hour"], errors="coerce")
    )
)

# Quick structural check: row/column counts and the resulting column names.
print(df.shape)
print(df.columns)


(1552210, 43)
Index(['icu_hour', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel', 'patient_id'],
      dtype='object')


# Temporal Alignment and Patient-Level Ordering

In [None]:
# Order rows by patient, then by ICU hour within each patient, and renumber
# the index 0..n-1 so later positional / index-based steps see one contiguous
# block of chronologically ordered rows per patient.
df = df.sort_values(
    ["patient_id", "icu_hour"],
    kind="mergesort",
    ignore_index=True,
)

# Row-level class balance of the raw sepsis indicator.
sepsis_label_counts = df["SepsisLabel"].value_counts()
print(sepsis_label_counts)



SepsisLabel
0    1524294
1      27916
Name: count, dtype: int64


# Prospective Early Warning Label Construction

In [None]:
HORIZON = 6  # hours ahead

# Early-warning target: EW_Label is 1 for rows lying strictly before, and at
# most HORIZON hours before, the patient's first row with SepsisLabel == 1.
# Every other row gets 0, including all rows of patients who never have
# SepsisLabel == 1.
#
# The earliest sepsis hour per patient is all that matters: the minimum of
# (sepsis_hour - hour) over a patient's sepsis rows equals
# min(sepsis_hour) - hour. Computing it with one grouped transform avoids a
# Python-level loop over patients and a per-patient (sepsis rows x rows) matrix.

# icu_hour where the row is septic, NaN elsewhere.
sepsis_hours = df["icu_hour"].where(df["SepsisLabel"] == 1)

# Earliest septic hour of each patient, broadcast back onto every row of that
# patient. "min" skips NaN, so a missing icu_hour on one septic row does not
# wipe out the patient's labels; patients with no septic row get NaN.
first_onset = sepsis_hours.groupby(df["patient_id"], sort=False).transform("min")

# Positive values = hours remaining until onset; NaN compares False below,
# which yields label 0 for never-septic patients and rows with unknown hour.
time_to_sepsis = first_onset - df["icu_hour"]

df["EW_Label"] = (
    (time_to_sepsis > 0) & (time_to_sepsis <= HORIZON)
).astype(int)

# NOTE(review): rows at or after onset (time_to_sepsis <= 0) are labelled 0 and
# remain in the frame, so downstream they count as negatives - confirm this is
# intended rather than excluding them from the modelling cohort.

print(df["EW_Label"].value_counts())


EW_Label
0    1538131
1      14079
Name: count, dtype: int64


# Feature Definition and Cohort Construction

In [None]:
# Model input columns, in the order they are fed to the imputer and the model.
# The grouping comments are a reading aid inferred from the column names only;
# confirm against the dataset's data dictionary.
feature_cols = [
    # bedside monitor readings
    "HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "EtCO2",
    # blood-gas / acid-base style measurements
    "BaseExcess", "HCO3", "FiO2", "pH", "PaCO2", "SaO2",
    # chemistry panel
    "AST", "BUN", "Alkalinephos", "Calcium", "Chloride", "Creatinine",
    "Bilirubin_direct", "Glucose", "Lactate", "Magnesium",
    "Phosphate", "Potassium", "Bilirubin_total", "TroponinI",
    # haematology / coagulation
    "Hct", "Hgb", "PTT", "WBC", "Fibrinogen", "Platelets",
    # patient attributes
    "Age", "Gender",
]



# Patient-Level Train–Test Partitioning

In [None]:
# Partition by patient rather than by row: every hourly observation of a given
# patient lands on the same side of the split, so correlated time points from
# one trajectory cannot appear in both train and test.
patients = df["patient_id"].unique()
train_p, test_p = train_test_split(
    patients, test_size=0.2, random_state=RANDOM_SEED
)

# Row-level membership masks derived from the patient-level assignment.
patient_of_row = df["patient_id"]
train_mask = patient_of_row.isin(train_p)
test_mask = patient_of_row.isin(test_p)

# Feature matrix / early-warning target pairs for each side of the split.
X_train, y_train = df.loc[train_mask, feature_cols], df.loc[train_mask, "EW_Label"]
X_test, y_test = df.loc[test_mask, feature_cols], df.loc[test_mask, "EW_Label"]


# Missing-Value Imputation and Feature Preparation

In [None]:
# Fill missing feature values with per-column medians. The medians are learned
# from the training rows only and then applied unchanged to the test rows.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train)

# The imputer returns bare arrays; wrap each one back into a DataFrame that
# keeps the feature names and the original row index of its source frame.
X_train, X_test = (
    pd.DataFrame(
        imputer.transform(part),
        columns=feature_cols,
        index=part.index,
    )
    for part in (X_train, X_test)
)


# Gradient Boosted Tree Model Development

In [None]:
# Early stopping needs a monitoring set, and whichever set plays that role
# decides how many boosting rounds the final model keeps. The held-out test
# patients are therefore kept out of `valid_sets`; a validation subset is
# carved out of the *training* patients instead, again at patient level so no
# trajectory straddles the fit / validation boundary.
VALID_FRACTION = 0.2

# X_train keeps df's row index, so patient ids can be looked up by index.
train_patient_ids = df.loc[X_train.index, "patient_id"]
fit_p, valid_p = train_test_split(
    train_patient_ids.unique(),
    test_size=VALID_FRACTION,
    random_state=RANDOM_SEED
)
fit_rows = train_patient_ids.isin(fit_p)
valid_rows = train_patient_ids.isin(valid_p)

train_data = lgb.Dataset(X_train.loc[fit_rows], label=y_train.loc[fit_rows])
valid_data = lgb.Dataset(
    X_train.loc[valid_rows],
    label=y_train.loc[valid_rows],
    reference=train_data
)
# Kept only so the name stays defined for any later cell; it is deliberately
# not passed to lgb.train, so the test set plays no part in model selection.
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    "objective": "binary",
    "metric": ["auc", "average_precision"],
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": 7,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "is_unbalance": True,   # reweight classes; EW_Label positives are rare
    "seed": RANDOM_SEED,
    "verbosity": -1
}

# Up to 500 rounds; training stops once the validation metrics have not
# improved for 50 consecutive rounds.
model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, valid_data],
    valid_names=["train", "valid"],
    callbacks=[lgb.early_stopping(50)]
)


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[77]	train's auc: 0.728721	train's average_precision: 0.0319378	test's auc: 0.648488	test's average_precision: 0.0161258


# Discriminative Performance Evaluation

In [None]:
# Score the test rows and report two threshold-free ranking metrics; no
# decision threshold is chosen or tuned in this cell.
y_pred = model.predict(X_test)

ranking_metrics = (
    ("AUROC:", roc_auc_score),
    ("PR-AUC:", average_precision_score),
)
for metric_label, metric_fn in ranking_metrics:
    print(metric_label, metric_fn(y_test, y_pred))


AUROC: 0.6484882205669746
PR-AUC: 0.016125848726270695


# Global Model Interpretability Analysis

In [None]:
# Global feature importance via SHAP: mean absolute SHAP value per feature,
# computed on a random sample of test rows and saved as a bar chart.

SHAP_SAMPLE_SIZE = 5000
FIGURE_DIR = "figures"

explainer = shap.TreeExplainer(model)

# Cap the sample at the number of available rows; DataFrame.sample raises when
# asked for more rows than exist (without replacement).
X_sample = X_test.sample(
    min(SHAP_SAMPLE_SIZE, len(X_test)),
    random_state=RANDOM_SEED
)
shap_values = explainer.shap_values(X_sample)

# LightGBM binary output handling: some SHAP versions return a per-class list,
# in which case index 1 holds the positive-class values.
shap_vals = shap_values[1] if isinstance(shap_values, list) else shap_values

# savefig does not create missing directories, so make sure the target exists
# before plotting (a fresh kernel / fresh checkout will not have it).
os.makedirs(FIGURE_DIR, exist_ok=True)

shap.summary_plot(shap_vals, X_sample, plot_type="bar", show=False)
plt.savefig("figures/shap_global_bar.png", dpi=300)
plt.close()



  shap.summary_plot(shap_vals, X_sample, plot_type="bar", show=False)
