In [9]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import mutual_info_classif


In [66]:
DATA_DIR = ""

adm_path = os.path.join(DATA_DIR, "ADMISSIONS.csv.gz")
icu_path = os.path.join(DATA_DIR, "ICUSTAYS.csv.gz")
patients_path = os.path.join(DATA_DIR, "PATIENTS.csv.gz")

adm = pd.read_csv(adm_path, compression="gzip")
icu = pd.read_csv(icu_path, compression="gzip")
patients = pd.read_csv(patients_path, compression="gzip")

adm["ADMITTIME"] = pd.to_datetime(adm["ADMITTIME"], errors="coerce")
adm["DISCHTIME"] = pd.to_datetime(adm["DISCHTIME"], errors="coerce")
icu["INTIME"] = pd.to_datetime(icu["INTIME"], errors="coerce")
icu["OUTTIME"] = pd.to_datetime(icu["OUTTIME"], errors="coerce")

adm["MORTALITY"] = adm["HOSPITAL_EXPIRE_FLAG"].astype(int)


In [11]:
cohort = icu.merge(
    adm[["SUBJECT_ID","HADM_ID","MORTALITY","ADMITTIME"]],
    on=["SUBJECT_ID","HADM_ID"],
    how="left"
)

cohort = cohort.sort_values(["SUBJECT_ID","HADM_ID","INTIME"])

cohort_first = cohort.groupby(["SUBJECT_ID","HADM_ID"]).first().reset_index()

cohort_key = cohort_first[["SUBJECT_ID","HADM_ID","ICUSTAY_ID","INTIME"]]


In [12]:
data_base = cohort_first.merge(
    adm[[
        "SUBJECT_ID","HADM_ID",
        "ADMISSION_TYPE","ADMISSION_LOCATION",
        "DISCHARGE_LOCATION",
        "INSURANCE","MARITAL_STATUS","ETHNICITY"
    ]],
    on=["SUBJECT_ID","HADM_ID"],
    how="left"
)


In [16]:
# =============================
# NEW MEMORY-SAFE VITALS BLOCK
# =============================

# 1. Map ICUSTAY_ID → INTIME (small, safe)
icu_time_map = cohort_first[["ICUSTAY_ID","INTIME"]].drop_duplicates()
icu_time_map = icu_time_map.set_index("ICUSTAY_ID")["INTIME"]
icu_ids = set(icu_time_map.index)
print("Number of ICU stays:", len(icu_ids))

# 2. Identify vital ITEMIDs
d_items = pd.read_csv(os.path.join(DATA_DIR, "D_ITEMS.csv.gz"), compression="gzip")
d_items["LABEL_UP"] = d_items["LABEL"].str.upper()

target_vitals = {
    "HEART RATE",
    "SYSTOLIC BLOOD PRESSURE",
    "DIASTOLIC BLOOD PRESSURE",
    "RESPIRATORY RATE",
    "TEMPERATURE",
    "SPO2",
    "O2 SATURATION",
    "GCS"
}

vital_items = d_items[d_items["LABEL_UP"].isin(target_vitals)]
vital_itemids = set(vital_items["ITEMID"])
print("Vital ITEMIDs:", vital_itemids)

# 3. Stream CHARTEVENTS in chunks
chart_path = os.path.join(DATA_DIR, "CHARTEVENTS.csv.gz")  # change extension if needed
usecols = ["ICUSTAY_ID","ITEMID","CHARTTIME","VALUENUM"]

kept = []
total_kept = 0
chunk_idx = 0

MAX_CHUNKS = 200          # <- hard cap on how many chunks we read
MAX_ROWS_KEPT = 2_000_000 # <- optional cap on how many rows we keep

for chunk in pd.read_csv(chart_path, usecols=usecols, chunksize=300_000):
    chunk_idx += 1
    if chunk_idx > MAX_CHUNKS:
        print(f"Reached MAX_CHUNKS={MAX_CHUNKS}, stopping early.")
        break

    # Filter early: only our ICU stays + vital itemids
    sub = chunk[
        chunk["ICUSTAY_ID"].isin(icu_ids) &
        chunk["ITEMID"].isin(vital_itemids)
    ].copy()

    if sub.empty:
        if chunk_idx % 25 == 0:
            print(f"chunk {chunk_idx}: kept 0 rows (total {total_kept})")
        continue

    sub["CHARTTIME"] = pd.to_datetime(sub["CHARTTIME"], errors="coerce")
    sub["INTIME"] = sub["ICUSTAY_ID"].map(icu_time_map)
    sub["HOUR"] = (sub["CHARTTIME"] - sub["INTIME"]).dt.total_seconds()/3600

    sub = sub[(sub["HOUR"] >= 0) & (sub["HOUR"] <= 24)]

    if not sub.empty:
        kept.append(sub[["ICUSTAY_ID","ITEMID","VALUENUM"]])
        total_kept += len(sub)

    if chunk_idx % 25 == 0:
        print(f"chunk {chunk_idx}: kept {len(sub)} rows (total {total_kept})")

    if total_kept >= MAX_ROWS_KEPT:
        print(f"Reached MAX_ROWS_KEPT={MAX_ROWS_KEPT}, stopping early.")
        break


chart_events = pd.concat(kept, ignore_index=True) if kept else pd.DataFrame(
    columns=["ICUSTAY_ID","ITEMID","VALUENUM"]
)

print("chart_events shape:", chart_events.shape)
print(chart_events.head())

# 4. Aggregate to wide vitals feature table
chart_agg = chart_events.groupby(["ICUSTAY_ID","ITEMID"])["VALUENUM"] \
                        .agg(["mean","min","max"]) \
                        .reset_index()

itemid_to_label = dict(zip(vital_items["ITEMID"], vital_items["LABEL_UP"]))

pivot = chart_agg.pivot_table(
    index="ICUSTAY_ID",
    columns="ITEMID",
    values=["mean","min","max"]
)

pivot.columns = [
    f"VITAL_{itemid_to_label.get(item,'UNK').replace(' ','_').upper()}_{stat.upper()}"
    for stat, item in pivot.columns
]

chart_wide = pivot.reset_index()
print("chart_wide:", chart_wide.shape)
print(chart_wide.head())


Number of ICU stays: 57786
Vital ITEMIDs: {646, 618, 220045, 220210, 211}
chunk 25: kept 13101 rows (total 279012)
chunk 50: kept 6944 rows (total 563884)
chunk 75: kept 12965 rows (total 877218)
chunk 100: kept 9882 rows (total 1196853)
chunk 125: kept 4435 rows (total 1432415)
chunk 150: kept 4615 rows (total 1541671)
chunk 175: kept 3717 rows (total 1643698)
chunk 200: kept 3476 rows (total 1758717)
Reached MAX_CHUNKS=200, stopping early.
chart_events shape: (1758717, 3)
   ICUSTAY_ID  ITEMID  VALUENUM
0    241249.0  220045      86.0
1    241249.0  220210      21.0
2    241249.0  220045      85.0
3    241249.0  220210      19.0
4    241249.0  220045      87.0
chart_wide: (27104, 16)
   ICUSTAY_ID  VITAL_HEART_RATE_MAX  VITAL_RESPIRATORY_RATE_MAX  \
0    200001.0                   NaN                         NaN   
1    200010.0                   NaN                         NaN   
2    200011.0                   NaN                         NaN   
3    200016.0                   NaN  

In [19]:
output = pd.read_csv(os.path.join(DATA_DIR, "OUTPUTEVENTS.csv.gz"), compression="gzip",
                     usecols=["SUBJECT_ID","HADM_ID","ICUSTAY_ID","CHARTTIME","VALUE"])

output["CHARTTIME"] = pd.to_datetime(output["CHARTTIME"], errors="coerce")
output = output[output["HADM_ID"].isin(cohort_key["HADM_ID"])]

output = output.merge(cohort_key, on=["SUBJECT_ID","HADM_ID","ICUSTAY_ID"], how="inner")
output["HOUR"] = (output["CHARTTIME"] - output["INTIME"]).dt.total_seconds()/3600
output = output[(output["HOUR"]>=0) & (output["HOUR"]<=24)]

urine_agg = output.groupby("ICUSTAY_ID")["VALUE"].agg(["sum","mean"]).reset_index()
urine_agg.columns = ["ICUSTAY_ID","URINE_SUM_24H","URINE_MEAN_24H"]


In [21]:
input_cv = pd.read_csv(os.path.join(DATA_DIR,"INPUTEVENTS_CV.csv.gz"), compression="gzip",
                       usecols=["SUBJECT_ID","HADM_ID","ICUSTAY_ID","ITEMID","CHARTTIME","AMOUNT"])
input_mv = pd.read_csv(os.path.join(DATA_DIR,"INPUTEVENTS_MV.csv.gz"), compression="gzip",
                       usecols=["SUBJECT_ID","HADM_ID","ICUSTAY_ID","ITEMID","STARTTIME","AMOUNT"])

input_cv["CHARTTIME"] = pd.to_datetime(input_cv["CHARTTIME"], errors="coerce")
input_mv["STARTTIME"] = pd.to_datetime(input_mv["STARTTIME"], errors="coerce")

input_cv = input_cv[input_cv["HADM_ID"].isin(cohort_key["HADM_ID"])]
input_mv = input_mv[input_mv["HADM_ID"].isin(cohort_key["HADM_ID"])]

input_cv = input_cv.merge(cohort_key, on=["SUBJECT_ID","HADM_ID","ICUSTAY_ID"])
input_mv = input_mv.merge(cohort_key, on=["SUBJECT_ID","HADM_ID","ICUSTAY_ID"])

input_cv["HOUR"] = (input_cv["CHARTTIME"] - input_cv["INTIME"]).dt.total_seconds()/3600
input_mv["HOUR"] = (input_mv["STARTTIME"] - input_mv["INTIME"]).dt.total_seconds()/3600

input_cv = input_cv[(input_cv["HOUR"]>=0)&(input_cv["HOUR"]<=24)]
input_mv = input_mv[(input_mv["HOUR"]>=0)&(input_mv["HOUR"]<=24)]

fluids = pd.concat([
    input_cv[["ICUSTAY_ID","AMOUNT"]],
    input_mv[["ICUSTAY_ID","AMOUNT"]]
])

fluids_agg = fluids.groupby("ICUSTAY_ID")["AMOUNT"].agg(["sum","mean"]).reset_index()
fluids_agg.columns = ["ICUSTAY_ID","FLUID_SUM_24H","FLUID_MEAN_24H"]


In [22]:
diag = pd.read_csv(os.path.join(DATA_DIR,"DIAGNOSES_ICD.csv.gz"), compression="gzip")

def root(code):
    if pd.isna(code): return None
    return str(code).replace(".","")[:3]

diag["ICD3"] = diag["ICD9_CODE"].apply(root)

prefixes = {
    "CMI_MI": {"410","412"},
    "CMI_CHF": {"428"},
    "CMI_COPD": {"490","491","492","494","496"},
    "CMI_DIAB": {"250"},
    "CMI_RENAL": {"585"},
    "CMI_LIVER": {"571"},
}

def map_flags(icd):
    out = {}
    for name, pref in prefixes.items():
        out[name] = int(icd in pref)
    return pd.Series(out)

diag_flags = diag["ICD3"].apply(map_flags)
diag_with_flags = pd.concat([diag["HADM_ID"], diag_flags], axis=1)
comorb = diag_with_flags.groupby("HADM_ID").max().reset_index()


In [37]:
# --- ORIGINAL MERGES ---
data_full = data_base.copy()
data_full = data_full.merge(chart_wide, on="ICUSTAY_ID", how="left")
data_full = data_full.merge(urine_agg, on="ICUSTAY_ID", how="left")
data_full = data_full.merge(fluids_agg, on="ICUSTAY_ID", how="left")
data_full = data_full.merge(comorb, on="HADM_ID",  how="left")

print("After merges:", data_full.shape)

# ===============================================================
# 1) FORCE ALL COLUMN NAMES TO STRINGS (fixes MultiIndex issues)
# ===============================================================
data_full.columns = [str(c) for c in data_full.columns]

# ===============================================================
# 2) DROP ALL DATETIME COLUMNS (massively important)
# ===============================================================
datetime_cols = data_full.select_dtypes(
    include=["datetime64[ns]", "datetime64[ns, UTC]"]
).columns.tolist()

print("Dropping datetime columns:", datetime_cols)
data_full = data_full.drop(columns=datetime_cols)

# ===============================================================
# 3) DROP DUPLICATE COLUMNS (caused by merges)
# ===============================================================
data_full = data_full.loc[:, ~data_full.columns.duplicated()]

print("After cleaning:", data_full.shape)
data_full.head()


After merges: (57786, 45)
Dropping datetime columns: ['INTIME', 'OUTTIME', 'ADMITTIME']
After cleaning: (57786, 36)


Unnamed: 0,SUBJECT_ID,HADM_ID,ROW_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,LOS,...,URINE_SUM_24H,URINE_MEAN_24H,FLUID_SUM_24H,FLUID_MEAN_24H,CMI_MI,CMI_CHF,CMI_COPD,CMI_DIAB,CMI_RENAL,CMI_LIVER
0,2,163353,1,243653,carevue,NICU,NICU,56,56,0.0918,...,,,,,0,0,0,0,0,0
1,3,145834,2,211552,carevue,MICU,MICU,12,12,6.0646,...,497.0,38.230769,15866.802737,82.639598,1,1,0,0,0,0
2,4,185777,3,294638,carevue,MICU,MICU,52,52,1.6785,...,2150.0,537.5,3165.0,166.578947,0,0,0,0,0,1
3,5,178980,4,214757,carevue,NICU,NICU,56,56,0.0844,...,,,,,0,0,0,0,0,0
4,6,107064,5,228232,carevue,SICU,SICU,33,33,3.6729,...,2095.0,72.241379,13315.0,154.825581,0,0,0,0,0,0


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# 0) Force all column names to strings
data_full.columns = [str(c) for c in data_full.columns]

# 1) Identify datetime columns and drop them from features
datetime_cols = data_full.select_dtypes(
    include=["datetime64[ns]", "datetime64[ns, UTC]"]
).columns.tolist()
print("Datetime columns (will be dropped from features):", datetime_cols)

# 2) Define target and columns to drop
target_col = "MORTALITY"

id_cols = ["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID"]
leakage_cols = ["DISCHARGE_LOCATION"]   # if not present, it's ignored

drop_cols = [c for c in id_cols + leakage_cols + datetime_cols if c in data_full.columns]
print("Dropping from features:", drop_cols)

# 3) Feature columns = everything except ids, leakage, datetime, target
feature_cols = [c for c in data_full.columns if c not in drop_cols + [target_col]]
print("Number of candidate feature columns:", len(feature_cols))

# 4) Build X_raw, y
X_raw = data_full[feature_cols].copy()
y = data_full[target_col].astype(int)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

print("X_train_raw:", X_train_raw.shape)
print("X_test_raw:", X_test_raw.shape)
print("y_train mean:", y_train.mean())
print("y_test mean:", y_test.mean())

# 5) Categorical vs numeric
cat_cols = X_train_raw.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = [c for c in X_train_raw.columns if c not in cat_cols]

print("Categorical columns:", cat_cols)
print("Numeric columns count:", len(num_cols))

# 6) One-hot encode categoricals
X_train = pd.get_dummies(X_train_raw, columns=cat_cols, drop_first=True)
X_test  = pd.get_dummies(X_test_raw,  columns=cat_cols, drop_first=True)

# Align columns
X_train, X_test = X_train.align(X_test, join="left", axis=1)
X_test = X_test.fillna(0)

# 7) Impute numeric missing with median
for c in num_cols:
    if c in X_train.columns:
        med = X_train[c].median()
        X_train[c] = X_train[c].fillna(med)
        X_test[c]  = X_test[c].fillna(med)

print("After encoding + imputation:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

feature_names = X_train.columns.tolist()

# 8) Scale (now ONLY numeric/bool columns, no datetime)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


Datetime columns (will be dropped from features): []
Dropping from features: ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'DISCHARGE_LOCATION']
Number of candidate feature columns: 31
X_train_raw: (46228, 31)
X_test_raw: (11558, 31)
y_train mean: 0.1005883879899628
y_test mean: 0.10062294514621907
Categorical columns: ['DBSOURCE', 'FIRST_CAREUNIT', 'LAST_CAREUNIT', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE', 'MARITAL_STATUS', 'ETHNICITY']
Numeric columns count: 23
After encoding + imputation:
X_train: (46228, 96)
X_test: (11558, 96)


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

base_clf = LogisticRegression(
    max_iter=4000,
    penalty="l2",
    solver="liblinear"
)
base_clf.fit(X_train_scaled, y_train)

base_pred = base_clf.predict_proba(X_test_scaled)[:, 1]
base_auc = roc_auc_score(y_test, base_pred)
print("Baseline AUC (all features):", base_auc)


Baseline AUC (all features): 0.7246818593336221


In [41]:
# Turn scaled arrays back into DataFrames
X_train_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_df  = pd.DataFrame(X_test_scaled,  columns=feature_names)

# 1) Drop duplicate columns
X_train_df = X_train_df.loc[:, ~X_train_df.columns.duplicated()]
X_test_df  = X_test_df.loc[:, X_train_df.columns]  # ensure identical set

feature_names = X_train_df.columns.tolist()

# 2) Drop constant columns (vectorized for speed + correctness)
nunique_series = X_train_df.nunique(dropna=False)
const_cols = nunique_series[nunique_series <= 1].index.tolist()

print("Number of constant columns:", len(const_cols))

X_train_fs = X_train_df.drop(columns=const_cols)
X_test_fs  = X_test_df.drop(columns=const_cols, errors="ignore")

print("After dropping constant cols:")
print("X_train_fs:", X_train_fs.shape)
print("X_test_fs:", X_test_fs.shape)

feature_names = X_train_fs.columns.tolist()


Number of constant columns: 0
After dropping constant cols:
X_train_fs: (46228, 96)
X_test_fs: (11558, 96)


In [44]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

# Candidate K values to try
max_features = X_train_fs.shape[1]
candidate_ks = [10, 20, 40, 60, 80, max_features]  # adjust if you want

print("Number of features available:", max_features)
print("Candidate K values:", candidate_ks)

results = []

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for k in candidate_ks:
    k_eff = min(k, max_features)

    mi_pipe = Pipeline([
        ("mi", SelectKBest(mutual_info_classif, k=k_eff)),
        ("clf", LogisticRegression(
            max_iter=4000,
            penalty="l2",
            solver="liblinear"
        ))
    ])

    cv_scores = cross_val_score(
        mi_pipe,
        X_train_fs,
        y_train,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1
    )

    mean_auc = cv_scores.mean()
    std_auc = cv_scores.std()
    results.append((k_eff, mean_auc, std_auc))

    print(f"k={k_eff:3d} | CV AUC={mean_auc:.4f} ± {std_auc:.4f}")

# Pick best K based on mean CV AUC
best_k, best_auc, best_std = max(results, key=lambda x: x[1])
print("\nBest K from CV:", best_k)
print("Best CV AUC   :", best_auc, "±", best_std)


Number of features available: 96
Candidate K values: [10, 20, 40, 60, 80, 96]
k= 10 | CV AUC=0.7285 ± 0.0045
k= 20 | CV AUC=0.7522 ± 0.0130
k= 40 | CV AUC=0.7714 ± 0.0104
k= 60 | CV AUC=0.7746 ± 0.0124
k= 80 | CV AUC=0.7755 ± 0.0130
k= 96 | CV AUC=0.7759 ± 0.0120

Best K from CV: 96
Best CV AUC   : 0.7759390045647165 ± 0.012023242665891968


In [59]:
final_mi_pipe = Pipeline([
    ("mi", SelectKBest(mutual_info_classif, k=20)),
    ("clf", LogisticRegression(
        max_iter=4000,
        penalty="l2",
        solver="liblinear"
    ))
])

final_mi_pipe.fit(X_train_fs, y_train)
pred_mi_test = final_mi_pipe.predict_proba(X_test_fs)[:, 1]
auc_mi_test = roc_auc_score(y_test, pred_mi_test)

print(f"Final MI + LR model with k=20")
print("Test AUC:", auc_mi_test)


Final MI + LR model with k=20
Test AUC: 0.7653065892102864


In [67]:
# ---- Top 20 MI features ----
selector = final_mi_pipe.named_steps["mi"]
support_mask = selector.get_support()

mi_scores_all = selector.scores_
all_features = X_train_fs.columns

selected_features = all_features[support_mask]
selected_scores = mi_scores_all[support_mask]

# Sort selected features by MI score descending
mi_ranked = sorted(
    zip(selected_features, selected_scores),
    key=lambda x: x[1],a
    reverse=True
)

print("\nTop MI-selected features (up to 20):")
for name, score in mi_ranked[:20]:
    print(f"{name}: {score:.4f}")



Top MI-selected features (up to 20):
ADMISSION_TYPE_EMERGENCY: 0.0210
ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI: 0.0172
URINE_SUM_24H: 0.0169
LOS: 0.0162
FIRST_CAREUNIT_NICU: 0.0147
ADMISSION_TYPE_NEWBORN: 0.0144
LAST_WARDID: 0.0132
INSURANCE_Medicare: 0.0131
FIRST_WARDID: 0.0128
LAST_CAREUNIT_NICU: 0.0128
FIRST_CAREUNIT_MICU: 0.0125
LAST_CAREUNIT_MICU: 0.0101
INSURANCE_Private: 0.0089
ADMISSION_LOCATION_EMERGENCY ROOM ADMIT: 0.0087
URINE_MEAN_24H: 0.0086
LAST_CAREUNIT_CSRU: 0.0085
FLUID_SUM_24H: 0.0079
FLUID_MEAN_24H: 0.0059
CMI_CHF: 0.0055
CMI_LIVER: 0.0055


In [75]:
clf_l1 = LogisticRegression(
    penalty="l1",
    C=1.0,
    solver="liblinear",
    max_iter=5000
)
clf_l1.fit(X_train_fs, y_train)

coef = clf_l1.coef_.ravel()
coef_series = pd.Series(coef, index=X_train_fs.columns)

nonzero_features = coef_series[coef_series != 0].index.tolist()
print("L1-selected feature count: 20")
print("Sample L1-selected features:", nonzero_features[:30])

X_train_l1 = X_train_fs[nonzero_features[:90]]
X_test_l1  = X_test_fs[nonzero_features[:90]]

clf_l1_small = LogisticRegression(
    penalty="l2",
    C=1.0,
    solver="liblinear",
    max_iter=4000
)
clf_l1_small.fit(X_train_l1, y_train)

pred_l1 = clf_l1_small.predict_proba(X_test_l1)[:, 1]
auc_l1 = roc_auc_score(y_test, pred_l1)
print("AUC with L1-selected features:", auc_l1)


L1-selected feature count: 20
Sample L1-selected features: ['ROW_ID', 'FIRST_WARDID', 'LAST_WARDID', 'LOS', 'VITAL_RESPIRATORY_RATE_MAX', 'VITAL_SPO2_MAX', 'VITAL_HEART_RATE_MEAN', 'VITAL_RESPIRATORY_RATE_MEAN', 'VITAL_SPO2_MEAN', 'VITAL_HEART_RATE_MIN', 'VITAL_RESPIRATORY_RATE_MIN', 'VITAL_SPO2_MIN', 'URINE_SUM_24H', 'URINE_MEAN_24H', 'FLUID_SUM_24H', 'FLUID_MEAN_24H', 'CMI_MI', 'CMI_CHF', 'CMI_COPD', 'CMI_DIAB', 'CMI_RENAL', 'CMI_LIVER', 'DBSOURCE_carevue', 'DBSOURCE_metavision', 'FIRST_CAREUNIT_CSRU', 'FIRST_CAREUNIT_MICU', 'FIRST_CAREUNIT_NICU', 'FIRST_CAREUNIT_SICU', 'FIRST_CAREUNIT_TSICU', 'LAST_CAREUNIT_CSRU']
AUC with L1-selected features: 0.725084774783829


In [71]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)
rf.fit(X_train_fs, y_train)

rf_importances = pd.Series(rf.feature_importances_, index=X_train_fs.columns).sort_values(ascending=False)

print("Top 20 RF features:")
print(rf_importances.head(20))

K_rf = 20
top_rf_features = rf_importances.head(K_rf).index.tolist()
print("Using top-K RF features:", K_rf)

X_train_rf = X_train_fs[top_rf_features]
X_test_rf  = X_test_fs[top_rf_features]

clf_rf_fs = LogisticRegression(
    max_iter=4000,
    penalty="l2",
    solver="liblinear"
)
clf_rf_fs.fit(X_train_rf, y_train)

pred_rf_fs = clf_rf_fs.predict_proba(X_test_rf)[:, 1]
auc_rf_fs = roc_auc_score(y_test, pred_rf_fs)
print("AUC with top-%d RF features:" % K_rf, auc_rf_fs)


Top 20 RF features:
LOS                                        0.121074
URINE_MEAN_24H                             0.104842
URINE_SUM_24H                              0.103733
FLUID_SUM_24H                              0.093736
FLUID_MEAN_24H                             0.090573
ROW_ID                                     0.079156
FIRST_WARDID                               0.030270
LAST_WARDID                                0.029841
CMI_DIAB                                   0.013679
CMI_CHF                                    0.013119
MARITAL_STATUS_MARRIED                     0.013078
CMI_RENAL                                  0.011050
CMI_COPD                                   0.010795
ETHNICITY_WHITE                            0.010671
ADMISSION_LOCATION_EMERGENCY ROOM ADMIT    0.010669
CMI_MI                                     0.010567
INSURANCE_Medicare                         0.010342
MARITAL_STATUS_SINGLE                      0.010199
MARITAL_STATUS_WIDOWED                     0

In [72]:
final_features = list(
    set(top_mi_features) |
    set(nonzero_features) |
    set(top_rf_features)
)

print("Baseline AUC (all features):", base_auc)
print("AUC with MI-selected features:", auc_mi_test)
print("AUC with L1-selected features:", auc_l1)
print("AUC with RF-selected features:", auc_rf_fs)


Baseline AUC (all features): 0.7246818593336221
AUC with MI-selected features: 0.7653065892102864
AUC with L1-selected features: 0.72466043558047
AUC with RF-selected features: 0.7166088266690159
