# ICU Mortality

## Import Data

In [3]:
import pandas as pd

adm = pd.read_csv("ADMISSIONS.csv.gz")
icu = pd.read_csv("ICUSTAYS.csv.gz")
diag = pd.read_csv("DIAGNOSES_ICD.csv.gz")
d_diag = pd.read_csv("D_ICD_DIAGNOSES.csv.gz")

## Look at columns

In [24]:
print(adm.columns.tolist())
print(icu.columns.tolist())
print(diag.columns.tolist())
print(d_diag.columns.tolist())


['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA']
['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'DBSOURCE', 'FIRST_CAREUNIT', 'LAST_CAREUNIT', 'FIRST_WARDID', 'LAST_WARDID', 'INTIME', 'OUTTIME', 'LOS']
['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE']
['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE']


## Build a cohort with a label (using icu and adm only)

In [19]:
# Define outcome (in-hospital mortality)
adm["MORTALITY"] = adm["HOSPITAL_EXPIRE_FLAG"]  # 0/1 label

# Merge ICU info + outcome
cohort = icu.merge(
    adm[["SUBJECT_ID", "HADM_ID", "MORTALITY", "ADMITTIME", "DISCHTIME"]],
    on=["SUBJECT_ID", "HADM_ID"],
    how="left"
)

# Optional: keep only first ICU stay per hospital admission (simpler)
cohort = cohort.sort_values(["SUBJECT_ID", "HADM_ID", "INTIME"])

cohort_first = cohort.groupby(["SUBJECT_ID", "HADM_ID"]).first().reset_index()


In [None]:
# Join a few admission-level predictors
features_cols = [
    "SUBJECT_ID", "HADM_ID", "ICUSTAY_ID",
    "MORTALITY",
    "ADMISSION_TYPE",
    "ADMISSION_LOCATION",
    "DISCHARGE_LOCATION",
    "INSURANCE",
    "MARITAL_STATUS",
    "ETHNICITY",
    "LOS"   # ICU length of stay
]

data = cohort_first.merge(
    adm[[
        "SUBJECT_ID",
        "HADM_ID",
        "ADMISSION_TYPE",
        "ADMISSION_LOCATION",
        "DISCHARGE_LOCATION",
        "INSURANCE",
        "MARITAL_STATUS",
        "ETHNICITY"
    ]],
    on=["SUBJECT_ID", "HADM_ID"],
    how="left"
)

data = data[features_cols]

# Remove leakage variable
bad_cols = ["DISCHARGE_LOCATION"]
data = data.drop(columns=bad_cols)


Discharge location is giving away if the patient is dead or alive, so we are dropping that.

## One-hot encode + split

In [21]:
# One-hot encode categorical variables
cat_cols = [
    "ADMISSION_TYPE",
    "ADMISSION_LOCATION",
    "INSURANCE",
    "MARITAL_STATUS",
    "ETHNICITY"
]

# X = features, include LOS (numeric)
X = pd.get_dummies(data[cat_cols + ["LOS"]], drop_first=True)

# y = label
y = data["MORTALITY"]

# replace any remaining missing values
X = X.fillna(0)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# prediction
pred = model.predict_proba(X_test)[:, 1]

# performance
auc = roc_auc_score(y_test, pred)
print("AUC:", auc)


AUC: 0.7150404420891374


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=200).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
X.isna().sum().sum()


np.int64(0)