In [1]:
import gc
import gzip
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Input file locations.

# Cohort-level CSV files (not compressed)
mimic_file_path = "CSV/Exports/Temp/Doctors_Dataset/o01_mimic_for_ext_val.csv"
first_stay_path = "CSV/Imports/o03_icu_first_stay.csv"

# Tables under the mimic-iv-3_1/hosp directory (gzip-compressed CSV)
patients_path = "../00_Datasets/mimic-iv-3_1/hosp/patients.csv.gz"
admissions_path = "../00_Datasets/mimic-iv-3_1/hosp/admissions.csv.gz"
diagnoses_path = "../00_Datasets/mimic-iv-3_1/hosp/diagnoses_icd.csv.gz"

In [3]:
# Cohort frames (plain CSV)
mimic_df = pd.read_csv(mimic_file_path)
stroke_first_stay = pd.read_csv(first_stay_path)

# hosp tables (gzip-compressed CSV), read in the same order as the path tuple
patients_df, admissions_df, diagnosis_df = (
    pd.read_csv(gz_path, compression="gzip")
    for gz_path in (patients_path, admissions_path, diagnoses_path)
)

# Add stay_id

In [4]:
# 1) Keep only the join keys + stay_id from stroke_first_stay.
map_stay = (
    stroke_first_stay[["subject_id", "hadm_id", "stay_id"]]
    .dropna(subset=["subject_id", "hadm_id", "stay_id"])
    # one stay_id per (subject_id, hadm_id); if several exist, the first row wins
    .drop_duplicates(subset=["subject_id", "hadm_id"])
)

# 2) Bring stay_id into mimic_df.
# Any stay_id already present is dropped first so that re-running this cell does
# not produce stay_id_x / stay_id_y (which would make the reordering below fail).
# validate="many_to_one" makes pandas raise if map_stay ever has duplicate keys,
# i.e. the merge can never silently multiply rows of mimic_df.
mimic_df = mimic_df.drop(columns=["stay_id"], errors="ignore").merge(
    map_stay,
    on=["subject_id", "hadm_id"],
    how="left",
    validate="many_to_one",
)

# Move stay_id so it sits right after hadm_id
cols = [c for c in mimic_df.columns if c != "stay_id"]
hadm_pos = cols.index("hadm_id")
cols.insert(hadm_pos + 1, "stay_id")

mimic_df = mimic_df[cols]

# 3) (optional) count the rows for which no stay_id was found
missing = mimic_df["stay_id"].isna().sum()
print("Missing stay_id rows:", missing)

Missing stay_id rows: 0


# Bring ICU intime & outtime

In [5]:
# Join keys used for the intime/outtime merge
KEYS = ["subject_id", "hadm_id", "stay_id"]

# Source of intime/outtime is stroke_first_stay (not patients_df, despite the
# variable name); keep only the needed columns and one row per key combination.
patients_times = stroke_first_stay[[*KEYS, "intime", "outtime"]].drop_duplicates(subset=KEYS)

def merge_and_place_intime_outtime(stroke_df, times_df, keys=None):
    """Left-merge ``intime``/``outtime`` onto ``stroke_df`` and place them right after ``stay_id``.

    Parameters
    ----------
    stroke_df : pandas.DataFrame
        Frame holding the join-key columns. Any ``intime``/``outtime`` columns it
        already has are dropped first, so the merge never yields ``intime_x`` /
        ``intime_y``. The input frame itself is not modified.
    times_df : pandas.DataFrame
        Frame holding the join-key columns plus ``intime`` and ``outtime``.
    keys : sequence of str, optional
        Columns to join on. Defaults to ``["subject_id", "hadm_id", "stay_id"]``.
        The keys are resolved inside the function rather than read from the
        notebook-level ``KEYS`` variable, because that global is rebound to a
        different key list further down the notebook.

    Returns
    -------
    pandas.DataFrame
        The merged frame with ``intime`` and ``outtime`` immediately after ``stay_id``.

    Raises
    ------
    ValueError
        If ``stay_id`` is not a column of the merged frame.
    KeyError
        If ``intime`` or ``outtime`` is missing from the merged frame.
    """
    join_keys = ["subject_id", "hadm_id", "stay_id"] if keys is None else list(keys)
    time_cols = ["intime", "outtime"]

    # drop() already returns a new frame, so the caller's frame stays untouched
    base = stroke_df.drop(columns=time_cols, errors="ignore")
    out = base.merge(times_df, on=join_keys, how="left")

    # Rebuild the column order: everything except the time columns, with the
    # time columns re-inserted directly after stay_id.
    others = [c for c in out.columns if c not in time_cols]
    stay_idx = others.index("stay_id")
    ordered = others[:stay_idx + 1] + time_cols + others[stay_idx + 1:]
    return out[ordered]

mimic_df = merge_and_place_intime_outtime(mimic_df, patients_times)

# (optional) quick check: fraction of rows left without intime / outtime
for name, df in [("stroke01", mimic_df)]:
    intime_na = df["intime"].isna().mean()
    outtime_na = df["outtime"].isna().mean()
    print(name, "missing intime:", intime_na, "| missing outtime:", outtime_na)

stroke01 missing intime: 0.0 | missing outtime: 0.0


# Bring DOD
## Date of death after discharge

In [6]:
# 1) Keep only subject_id and dod from patients_df; de-duplicate on subject_id as a safety net
patients_dod = patients_df[["subject_id", "dod"]].drop_duplicates(subset=["subject_id"])

# 2) Merge helper
def merge_dod(stroke_df, patients_dod_df):
    """Return ``stroke_df`` left-joined with ``patients_dod_df`` on ``subject_id``.

    Every row of ``stroke_df`` is kept; rows whose ``subject_id`` has no match in
    ``patients_dod_df`` get missing values in the columns brought in by the merge.
    """
    return stroke_df.merge(patients_dod_df, how="left", on="subject_id")

# 3) Apply to the cohort frame
mimic_df = merge_dod(mimic_df, patients_dod)

# 4) Check: fraction of rows without a dod value
for name, df in [("mimic_df", mimic_df)]:
    dod_na = df["dod"].isna().mean()
    print(name, "dod missing:", dod_na)

mimic_df dod missing: 0.5614420062695925


In [7]:
def place_dod_next_to_outtime(df):
    """Return ``df`` with the ``dod`` column moved to sit right after ``outtime``.

    When either ``dod`` or ``outtime`` is not a column, ``df`` is returned as is
    (the very same object, no copy).
    """
    if not ("dod" in df.columns and "outtime" in df.columns):
        return df

    ordered = list(df.columns)
    ordered.remove("dod")  # take it out of its current position
    ordered.insert(ordered.index("outtime") + 1, "dod")
    return df[ordered]

# Reorder the columns of mimic_df so that dod sits right after outtime
mimic_df = place_dod_next_to_outtime(mimic_df)

# Bring Dischtime

In [8]:
# Join keys for the admissions merge.
# NOTE: this rebinds the notebook-level KEYS, which an earlier cell defined with
# stay_id included; code that reads the global KEYS sees this two-key list from here on.
KEYS = ["subject_id", "hadm_id"]

# Keep only the keys + dischtime, one row per (subject_id, hadm_id)
adm_dischtime = admissions_df[[*KEYS, "dischtime"]].drop_duplicates(subset=KEYS)

def merge_dischtime_and_place(df, adm_df, keys=None):
    """Left-merge ``dischtime`` onto ``df`` and place it right after ``outtime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the join-key columns. An existing ``dischtime`` column is
        dropped first so the merge never yields ``dischtime_x`` / ``dischtime_y``.
        The input frame itself is not modified.
    adm_df : pandas.DataFrame
        Frame holding the join-key columns plus ``dischtime``.
    keys : sequence of str, optional
        Columns to join on. Defaults to ``["subject_id", "hadm_id"]``. The keys
        are resolved inside the function rather than read from the notebook-level
        ``KEYS`` variable, which is assigned different values in different cells.

    Returns
    -------
    pandas.DataFrame
        The merged frame. When both ``outtime`` and ``dischtime`` are present,
        ``dischtime`` is moved directly after ``outtime``; otherwise the column
        order produced by the merge is kept.
    """
    join_keys = ["subject_id", "hadm_id"] if keys is None else list(keys)

    # drop() already returns a new frame, so the caller's frame stays untouched
    base = df.drop(columns=["dischtime"], errors="ignore")
    out = base.merge(adm_df, on=join_keys, how="left")

    cols = list(out.columns)
    if "outtime" in cols and "dischtime" in cols:
        cols.remove("dischtime")
        cols.insert(cols.index("outtime") + 1, "dischtime")
        out = out[cols]

    return out

mimic_df = merge_dischtime_and_place(mimic_df, adm_dischtime)

# (optional) check: fraction of rows without dischtime
for name, df in [("mimic_df", mimic_df)]:
    dischtime_na = df["dischtime"].isna().mean()
    print(name, "missing dischtime:", dischtime_na)

mimic_df missing dischtime: 0.0


# Rename Columns

In [9]:
# Old column name -> new column name
rename_map = {
    "intime": "icu_intime",
    "outtime": "icu_outtime",
    "dischtime": "hosp_dischtime",
}

# Apply the mapping to the column labels
mimic_df = mimic_df.rename(rename_map, axis="columns")

# Create 30/180/360-day mortality labels from DOD

In [10]:
# 1) Make sure both columns are datetime; values that cannot be parsed become NaT
for ts_col in ("icu_intime", "dod"):
    mimic_df[ts_col] = pd.to_datetime(mimic_df[ts_col], errors="coerce")

def mort_within_days(df, days: int, intime_col="icu_intime", dod_col="dod"):
    """Flag rows whose date of death falls on or before ``intime + days``.

    Returns an integer Series: 1 where ``dod_col`` is present and
    ``dod_col <= intime_col + days`` days, 0 otherwise (including rows with no
    ``dod_col`` value).
    """
    death_date = df[dod_col]
    deadline = df[intime_col] + pd.Timedelta(days=days)
    died_by_deadline = death_date.notna() & (death_date <= deadline)
    return died_by_deadline.astype(int)

# One label column per horizon: mort_30d, mort_180d, mort_360d
for horizon_days in (30, 180, 360):
    mimic_df[f"mort_{horizon_days}d"] = mort_within_days(mimic_df, horizon_days)

print("Δημιουργήθηκαν οι mort_30d, mort_180d, mort_360d.")

Δημιουργήθηκαν οι mort_30d, mort_180d, mort_360d.


# Event durations for the 30-, 180-, and 360-day horizons

In [11]:
# 1) Make sure both columns are datetime (values that cannot be parsed become NaT)
for ts_col in ("icu_intime", "dod"):
    mimic_df[ts_col] = pd.to_datetime(mimic_df[ts_col], errors="coerce")

# 2) Time to death in days, measured from icu_intime (NaN when dod is missing)
elapsed = mimic_df["dod"] - mimic_df["icu_intime"]
mimic_df["time_to_death_days"] = elapsed.dt.total_seconds() / 86400.0

# Negative durations (dod earlier than icu_intime) are set to zero
neg = mimic_df["time_to_death_days"].notna() & (mimic_df["time_to_death_days"] < 0)
if neg.any():
    print(f"Βρέθηκαν {neg.sum()} αρνητικά time_to_death_days. Θα τα κάνω 0.")
    mimic_df.loc[neg, "time_to_death_days"] = 0.0

# 3) event + duration for each horizon
for H in [30, 180, 360]:
    mort_col = f"mort_{H}d"
    horizon = float(H)

    # event: taken from the labels that already exist (non-numeric / missing -> 0)
    event_flags = pd.to_numeric(mimic_df[mort_col], errors="coerce")
    mimic_df[f"event_{H}d"] = event_flags.fillna(0).astype(int)

    # duration: rows without dod are censored at H; everything else is the
    # time to death limited to the range [0, H]
    ttd = mimic_df["time_to_death_days"]
    mimic_df[f"duration_{H}d"] = ttd.fillna(horizon).clip(lower=0, upper=horizon)

# 4) Quick summary
print("N:", len(mimic_df))
print("Deaths within 30d :", mimic_df["event_30d"].sum())
print("Deaths within 180d:", mimic_df["event_180d"].sum())
print("Deaths within 360d:", mimic_df["event_360d"].sum())

Βρέθηκαν 1280 αρνητικά time_to_death_days. Θα τα κάνω 0.
N: 51040
Deaths within 30d : 12672
Deaths within 180d: 16272
Deaths within 360d: 18048


In [12]:
def sanity_check_labels(mimic_df, H):
    """Compare ``mort_{H}d`` with a label recomputed from ``time_to_death_days``.

    The recomputed label is 1 where ``time_to_death_days`` is present and ``<= H``.
    Prints how many rows disagree; when at least one does, the first ten
    disagreeing rows are shown with ``display``. Returns nothing.
    """
    mort_col = f"mort_{H}d"
    ttd = mimic_df["time_to_death_days"]
    event_from_dod = (ttd.notna() & (ttd <= H)).astype(int)

    disagrees = mimic_df[mort_col].astype(int) != event_from_dod
    mismatch = disagrees.sum()
    print(f"H={H}d mismatch mort_{H}d vs (dod<=H from icu_intime): {mismatch} / {len(mimic_df)}")

    if mismatch > 0:
        preview_cols = ["subject_id", "hadm_id", "stay_id", "icu_intime", "dod", "time_to_death_days", mort_col]
        display(mimic_df.loc[disagrees, preview_cols].head(10))

# Run the consistency check for every horizon
for H in (30, 180, 360):
    sanity_check_labels(mimic_df, H)

H=30d mismatch mort_30d vs (dod<=H from icu_intime): 0 / 51040
H=180d mismatch mort_180d vs (dod<=H from icu_intime): 0 / 51040
H=360d mismatch mort_360d vs (dod<=H from icu_intime): 0 / 51040


In [13]:
# Label, identifier and timestamp columns to remove from the frame
cols_to_drop = [
    "mort_30d", "mort_180d", "mort_360d",
    "stay_id", "icu_intime", "icu_outtime", "hosp_dischtime", "dod",
]

# errors="ignore" skips any listed name that is not a column
mimic_df = mimic_df.drop(columns=cols_to_drop, errors="ignore")

In [14]:
# Show the frame as it stands after the columns above were dropped
display(mimic_df)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,race,Alanine_Aminotransferase_(ALT)_(Max),Alanine_Aminotransferase_(ALT)_(Mean),Alanine_Aminotransferase_(ALT)_(Median),...,SOFA,los,hospital_expire_flag,time_to_death_days,event_30d,duration_30d,event_180d,duration_180d,event_360d,duration_360d
0,1,10004733,27411876,1,M,51,UNKNOWN,46.0,46.0,46.0,...,3,8.357373,0,,0,30.0,0,180.0,0,360.0
1,2,10004733,27411876,2,M,51,UNKNOWN,46.0,46.0,46.0,...,3,8.357373,0,,0,30.0,0,180.0,0,360.0
2,3,10004733,27411876,3,M,51,UNKNOWN,46.0,46.0,46.0,...,3,8.357373,0,,0,30.0,0,180.0,0,360.0
3,4,10004733,27411876,4,M,51,UNKNOWN,46.0,46.0,46.0,...,3,8.357373,0,,0,30.0,0,180.0,0,360.0
4,5,10004733,27411876,5,M,51,UNKNOWN,46.0,46.0,46.0,...,3,8.357373,0,,0,30.0,0,180.0,0,360.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51035,58140,19999987,23865745,12,F,57,UNKNOWN,63.0,63.0,63.0,...,6,1.937847,0,,0,30.0,0,180.0,0,360.0
51036,58141,19999987,23865745,13,F,57,UNKNOWN,63.0,63.0,63.0,...,6,1.937847,0,,0,30.0,0,180.0,0,360.0
51037,58142,19999987,23865745,14,F,57,UNKNOWN,63.0,63.0,63.0,...,6,1.937847,0,,0,30.0,0,180.0,0,360.0
51038,58143,19999987,23865745,15,F,57,UNKNOWN,63.0,63.0,63.0,...,6,1.937847,0,,0,30.0,0,180.0,0,360.0


# Save

In [15]:
# Create the export directory if needed, then write the frame without its index
out_dir = "CSV/Exports/Temp/Doctors_Dataset/"
os.makedirs(out_dir, exist_ok=True)

out_path = os.path.join(out_dir, "o01_mimic_out_of_hospital.csv")
mimic_df.to_csv(out_path, index=False)