In [1]:
import gc
import gzip
import pandas as pd
import numpy as np
import os
from pathlib import Path
from collections import Counter

In [2]:
# Input file locations, all relative to the notebook's working directory.
# Feature tables, one per source database (read in a later cell).
eicu_file_path = r"../04_ANN/CSV/exports/whole_set/o3_hour_overlap_window_eicu.csv"
mimic_file_path = r"../04_ANN/CSV/exports/whole_set/o3_hour_overlap_window_mimic.csv"

# SOFA tables and the first-ICU-stay table (read in the next cell).
file_sofa_path= r"CSV/Imports/sofa.csv"
file_first_day_sofa_path= r"CSV/Imports/first_day_sofa.csv"
# NOTE(review): first_stay_path is reassigned later in this notebook to a file
# under CSV/Exports — confirm which copy is the intended one.
first_stay_path= r"CSV/Imports/o03_icu_first_stay.csv"

In [3]:
# SOFA data
# Load the SOFA table, the first-day SOFA table and the first-ICU-stay table
# from the CSV paths defined in the previous cell.
stroke_sofa = pd.read_csv(file_sofa_path)
stroke_first_day_sofa = pd.read_csv(file_first_day_sofa_path)
stroke_first_stay = pd.read_csv(first_stay_path)

In [4]:
# Load both feature tables, then keep only the rows whose `los` is <= 10.
# Rows with a missing `los` fail the comparison and are dropped as well.
eicu_df = pd.read_csv(eicu_file_path)
mimic_df = pd.read_csv(mimic_file_path)

mimic_df = mimic_df.loc[mimic_df["los"].le(10)].copy()
eicu_df = eicu_df.loc[eicu_df["los"].le(10)].copy()

# Keep the clinician-selected features

In [5]:
# ----------------------------
# 1) Columns to keep (as provided)
# ----------------------------
wanted_cols_text = """
row_count
subject_id
hadm_id
Time_Zone
gender
age
race
Alanine_Aminotransferase_(ALT)_(Max)
Alanine_Aminotransferase_(ALT)_(Mean)
Alanine_Aminotransferase_(ALT)_(Median)
Alanine_Aminotransferase_(ALT)_(Min)
Albumin_(Max)
Albumin_(Mean)
Albumin_(Median)
Albumin_(Min)
Ammonia_(Max)
Ammonia_(Mean)
Ammonia_(Median)
Ammonia_(Min)
Anion_Gap_(Max)
Anion_Gap_(Mean)
Anion_Gap_(Median)
Anion_Gap_(Min)
Arterial_Blood_Pressure_diastolic_(mmHg)_(Max)
Arterial_Blood_Pressure_diastolic_(mmHg)_(Mean)
Arterial_Blood_Pressure_diastolic_(mmHg)_(Median)
Arterial_Blood_Pressure_diastolic_(mmHg)_(Min)
Arterial_Blood_Pressure_mean_(mmHg)_(Max)
Arterial_Blood_Pressure_mean_(mmHg)_(Mean)
Arterial_Blood_Pressure_mean_(mmHg)_(Median)
Arterial_Blood_Pressure_mean_(mmHg)_(Min)
Arterial_Blood_Pressure_systolic_(mmHg)_(Max)
Arterial_Blood_Pressure_systolic_(mmHg)_(Mean)
Arterial_Blood_Pressure_systolic_(mmHg)_(Median)
Arterial_Blood_Pressure_systolic_(mmHg)_(Min)
Asparate_Aminotransferase_(AST)_(Max)
Asparate_Aminotransferase_(AST)_(Mean)
Asparate_Aminotransferase_(AST)_(Median)
Asparate_Aminotransferase_(AST)_(Min)
Base_Excess_(Max)
Base_Excess_(Mean)
Base_Excess_(Median)
Base_Excess_(Min)
Bicarbonate_(Max)
Bicarbonate_(Mean)
Bicarbonate_(Median)
Bicarbonate_(Min)
BUN_(Max)
BUN_(Mean)
BUN_(Median)
BUN_(Min)
C_Reactive_Protein_(CRP)_(Max)
C_Reactive_Protein_(CRP)_(Mean)
C_Reactive_Protein_(CRP)_(Min)
C_Reactive_Protein_(CRP)_(Median)
Central_Venous_Pressure_(mmHg)_(Max)
Central_Venous_Pressure_(mmHg)_(Mean)
Central_Venous_Pressure_(mmHg)_(Median)
Central_Venous_Pressure_(mmHg)_(Min)
Chloride_(Max)
Chloride_(Mean)
Chloride_(Median)
Chloride_(Min)
Chloride_(serum)_(Max)
Chloride_(serum)_(Mean)
Chloride_(serum)_(Median)
Chloride_(serum)_(Min)
CK_(CPK)_(Max)
CK_(CPK)_(Mean)
CK_(CPK)_(Median)
CK_(CPK)_(Min)
CK-MB_(Max)
CK-MB_(Mean)
CK-MB_(Median)
CK-MB_(Min)
Creatinine_(Max)
Creatinine_(Mean)
Creatinine_(Median)
Creatinine_(Min)
Differential-Lymphs_(Max)
Differential-Lymphs_(Mean)
Differential-Lymphs_(Median)
Differential-Lymphs_(Min)
Differential-Neuts_(Max)
Differential-Neuts_(Mean)
Differential-Neuts_(Median)
Differential-Neuts_(Min)
Fibrinogen_(Max)
Fibrinogen_(Mean)
Fibrinogen_(Median)
Fibrinogen_(Min)
GCS_(Max)
GCS_(Mean)
GCS_(Median)
GCS_(Min)
Glucose_finger_stick_(range_70-100)_(Max)
Glucose_finger_stick_(range_70-100)_(Mean)
Glucose_finger_stick_(range_70-100)_(Median)
Glucose_finger_stick_(range_70-100)_(Min)
Heart_Rate_(bpm)_(Max)
Heart_Rate_(bpm)_(Mean)
Heart_Rate_(bpm)_(Median)
Heart_Rate_(bpm)_(Min)
Hemoglobin_(Max)
Hemoglobin_(Mean)
Hemoglobin_(Median)
Hemoglobin_(Min)
INR(PT)_(Max)
INR(PT)_(Mean)
INR(PT)_(Median)
INR(PT)_(Min)
Inspired_O2_Fraction_(Max)
Inspired_O2_Fraction_(Mean)
Inspired_O2_Fraction_(Median)
Inspired_O2_Fraction_(Min)
Lactate_(Max)
Lactate_(Mean)
Lactate_(Median)
Lactate_(Min)
LDH_(Max)
LDH_(Mean)
LDH_(Median)
LDH_(Min)
MCH_(Max)
MCH_(Mean)
MCH_(Median)
MCH_(Min)
MCHC_(Max)
MCHC_(Mean)
MCHC_(Median)
MCHC_(Min)
MCV_(Max)
MCV_(Mean)
MCV_(Median)
MCV_(Min)
Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Max)
Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Mean)
Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Median)
Non_Invasive_Blood_Pressure_diastolic_(mmHg)_(Min)
Non_Invasive_Blood_Pressure_mean_(mmHg)_(Max)
Non_Invasive_Blood_Pressure_mean_(mmHg)_(Mean)
Non_Invasive_Blood_Pressure_mean_(mmHg)_(Median)
Non_Invasive_Blood_Pressure_mean_(mmHg)_(Min)
Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Max)
Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Mean)
Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Median)
Non_Invasive_Blood_Pressure_systolic_(mmHg)_(Min)
O2_saturation_pulseoxymetry_(%)_(Max)
O2_saturation_pulseoxymetry_(%)_(Mean)
O2_saturation_pulseoxymetry_(%)_(Median)
O2_saturation_pulseoxymetry_(%)_(Min)
Platelet_Count_(Max)
Platelet_Count_(Mean)
Platelet_Count_(Median)
Platelet_Count_(Min)
pO2_(Max)
pO2_(Mean)
pO2_(Median)
pO2_(Min)
Potassium_(Max)
Potassium_(Mean)
Potassium_(Median)
Potassium_(Min)
PT_(Max)
PT_(Mean)
PT_(Median)
PT_(Min)
PTT_(Max)
PTT_(Mean)
PTT_(Median)
PTT_(Min)
RDW_(Max)
RDW_(Mean)
RDW_(Median)
RDW_(Min)
Respiratory_Rate_(insp/min)_(Max)
Respiratory_Rate_(insp/min)_(Mean)
Respiratory_Rate_(insp/min)_(Median)
Respiratory_Rate_(insp/min)_(Min)
Sodium_(Max)
Sodium_(Mean)
Sodium_(Median)
Sodium_(Min)
Temperature_Fahrenheit_(F)_(Max)
Temperature_Fahrenheit_(F)_(Mean)
Temperature_Fahrenheit_(F)_(Median)
Temperature_Fahrenheit_(F)_(Min)
Total_Bilirubin_(Max)
Total_Bilirubin_(Mean)
Total_Bilirubin_(Median)
Total_Bilirubin_(Min)
Troponin-T_(Max)
Troponin-T_(Mean)
Troponin-T_(Median)
Troponin-T_(Min)
White_Blood_Cells_(Max)
White_Blood_Cells_(Mean)
White_Blood_Cells_(Median)
White_Blood_Cells_(Min)
Glucose_(Max)
Glucose_(Mean)
Glucose_(Median)
Glucose_(Min)
hospital_expire_flag
los
""".strip()

# One column name per non-blank line of the pasted block, whitespace trimmed.
WANTED_COLS = list(filter(None, map(str.strip, wanted_cols_text.splitlines())))

# Quick duplicate check (just in case): report names that were pasted twice.
seen_counts = Counter(WANTED_COLS)
dup = [col for col in seen_counts if seen_counts[col] > 1]
if dup:
    print("Duplicate names in WANTED_COLS:", dup)

# ----------------------------
# 2) Optional aliases (common naming differences)
# ----------------------------
# Maps a canonical column name to alternative spellings. The alternatives are
# tried in list order, and only when the canonical name itself is absent from
# the dataframe being subset.
# NOTE(review): "unitdischargestatus" is presumably a text status rather than a
# 0/1 flag — confirm before relying on it as a hospital_expire_flag substitute.
ALIASES = {
    "los": ["LOS", "icu_los", "unit_los"],
    "hospital_expire_flag": ["hospital_expired_flag", "hospital_expired", "mortality", "unitdischargestatus"],
    "Time_Zone": ["time_zone", "TimeZone", "timeZone"],
    "race": ["ethnicity"],
}

def subset_keep_columns(df, wanted_cols, df_name="df", aliases=None):
    """
    Return a copy of ``df`` restricted to ``wanted_cols``, in that order.

    A wanted column that is absent from ``df`` is looked up through ``aliases``
    (canonical name -> list of alternative names, tried in order). When an alias
    is used, the column is renamed to its canonical name in the returned frame,
    so frames built from differently-named sources share one schema.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; it is not modified.
    wanted_cols : list[str]
        Canonical column names to keep.
    df_name : str
        Label used in the printed summary.
    aliases : dict[str, list[str]] | None
        Optional alternative names per canonical column.

    Returns
    -------
    (out, missing, resolved)
        ``out``      - the subset frame, columns named canonically;
        ``missing``  - wanted names found neither directly nor through an alias;
        ``resolved`` - the names as they appear in ``df`` (aliases included),
                       aligned position-by-position with ``out.columns``.
    """
    aliases = aliases or {}
    available = set(df.columns)

    resolved = []      # source names actually present in df
    final_names = []   # canonical names, aligned with `resolved`
    missing = []

    for col in wanted_cols:
        if col in available:
            resolved.append(col)
            final_names.append(col)
            continue

        # try aliases, first hit wins
        found = next((alt for alt in aliases.get(col, []) if alt in available), None)
        if found is None:
            missing.append(col)
        else:
            resolved.append(found)
            final_names.append(col)

    out = df[resolved].copy()
    # Assign by position so only the alias-selected columns change name.
    out.columns = final_names

    print(f"\n[{df_name}] kept {out.shape[1]}/{len(wanted_cols)} columns | missing: {len(missing)}")
    if missing:
        # show a manageable preview
        preview = missing[:30]
        print("Missing (first 30):", preview)
        if len(missing) > 30:
            print(f"... plus {len(missing)-30} more")

    return out, missing, resolved

# ----------------------------
# 3) Apply to your dataframes
# ----------------------------
mimic_df_small, mimic_missing, mimic_resolved = subset_keep_columns(
    df=mimic_df, wanted_cols=WANTED_COLS, df_name="mimic_df", aliases=ALIASES
)

eicu_df_small, eicu_missing, eicu_resolved = subset_keep_columns(
    df=eicu_df, wanted_cols=WANTED_COLS, df_name="eicu_df", aliases=ALIASES
)

# ----------------------------
# 4) Sanity checks
# ----------------------------
print("\nShapes:")
print("mimic_df_small:", mimic_df_small.shape)
print("eicu_df_small :", eicu_df_small.shape)

# Both target columns must have survived the subsetting in each frame.
print("\nTargets check:")
for name, d in (("mimic", mimic_df_small), ("eicu", eicu_df_small)):
    cols = d.columns
    has_flag = "hospital_expire_flag" in cols
    has_los = ("los" in cols) or ("LOS" in cols)
    print(f"{name}: has hospital_expire_flag? {has_flag} | has los? {has_los}")


[mimic_df] kept 205/205 columns | missing: 0

[eicu_df] kept 205/205 columns | missing: 0

Shapes:
mimic_df_small: (51040, 205)
eicu_df_small : (78256, 205)

Targets check:
mimic: has hospital_expire_flag? True | has los? True
eicu: has hospital_expire_flag? True | has los? True


In [6]:
# Preview the reduced MIMIC frame (the rich display shows head/tail rows only).
display(mimic_df_small)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,race,Alanine_Aminotransferase_(ALT)_(Max),Alanine_Aminotransferase_(ALT)_(Mean),Alanine_Aminotransferase_(ALT)_(Median),...,White_Blood_Cells_(Max),White_Blood_Cells_(Mean),White_Blood_Cells_(Median),White_Blood_Cells_(Min),Glucose_(Max),Glucose_(Mean),Glucose_(Median),Glucose_(Min),hospital_expire_flag,los
0,1,10004733,27411876,1,M,51,UNKNOWN,46.0,46.0,46.0,...,7.4,7.4,7.4,7.4,86.0,86.0,86.0,86.0,0,8.357373
1,2,10004733,27411876,2,M,51,UNKNOWN,46.0,46.0,46.0,...,7.4,7.4,7.4,7.4,86.0,86.0,86.0,86.0,0,8.357373
2,3,10004733,27411876,3,M,51,UNKNOWN,46.0,46.0,46.0,...,7.4,7.4,7.4,7.4,86.0,86.0,86.0,86.0,0,8.357373
3,4,10004733,27411876,4,M,51,UNKNOWN,46.0,46.0,46.0,...,7.4,7.4,7.4,7.4,94.0,90.0,90.0,86.0,0,8.357373
4,5,10004733,27411876,5,M,51,UNKNOWN,46.0,46.0,46.0,...,7.4,7.4,7.4,7.4,94.0,90.0,90.0,86.0,0,8.357373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58139,58140,19999987,23865745,12,F,57,UNKNOWN,63.0,63.0,63.0,...,12.9,12.9,12.9,12.9,113.0,113.0,113.0,113.0,0,1.937847
58140,58141,19999987,23865745,13,F,57,UNKNOWN,63.0,63.0,63.0,...,12.9,12.9,12.9,12.9,113.0,113.0,113.0,113.0,0,1.937847
58141,58142,19999987,23865745,14,F,57,UNKNOWN,63.0,63.0,63.0,...,12.9,12.9,12.9,12.9,113.0,113.0,113.0,113.0,0,1.937847
58142,58143,19999987,23865745,15,F,57,UNKNOWN,63.0,63.0,63.0,...,12.9,12.9,12.9,12.9,113.0,113.0,113.0,113.0,0,1.937847


# NLR, ANC & PLR Calculation

In [7]:
# Summary statistics of the differential percentages in both cohorts, as a
# check before deriving counts and ratios from them.
cols_check = ["Differential-Lymphs_(Median)", "Differential-Neuts_(Median)"]
for frame in (mimic_df_small, eicu_df_small):
    display(frame[cols_check].describe(percentiles=[.5, .9, .95]).T)

Unnamed: 0,count,mean,std,min,50%,90%,95%,max
Differential-Lymphs_(Median),13648.0,14.277345,11.799012,0.0,11.4,26.4,34.8,100.0
Differential-Neuts_(Median),13648.0,78.272018,14.11146,0.0,81.9,90.4,92.3,100.0


Unnamed: 0,count,mean,std,min,50%,90%,95%,max
Differential-Lymphs_(Median),47008.0,17.059932,10.707496,0.0,15.0,31.0,35.5,100.0
Differential-Neuts_(Median),41136.0,72.690297,12.908184,0.0,74.5,87.5,90.0,98.25


In [8]:
def convert_wbc_and_platelets_to_per_uL(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which every White_Blood_Cells_(*) and
    Platelet_Count_(*) aggregate column (Max, Mean, Median, Min) that exists is
    coerced to numeric and multiplied by 1000.

    Intended to take counts from (x10^3/μL) to (/μL), e.g. WBC 10 -> 10,000 /μL.
    The incoming unit is not checked: applying the function twice to the same
    data scales it twice. Non-numeric entries become NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; left unmodified.
    verbose : bool
        When True, print the list of converted columns.
    """
    out = df.copy()

    # Aggregate-major order, so the printed list reads Max, Mean, Median, Min.
    candidates = [
        f"{base}_({agg})"
        for agg in ("Max", "Mean", "Median", "Min")
        for base in ("White_Blood_Cells", "Platelet_Count")
    ]
    converted = [name for name in candidates if name in out.columns]

    for name in converted:
        out[name] = pd.to_numeric(out[name], errors="coerce") * 1000.0

    if verbose:
        print("Converted to /μL (x1000):", converted)
    return out


def add_anc_nlr_plr_then_drop_differentials(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """
    Derive ANC, NLR and PLR per aggregate, then drop the differential columns.

    Expected inputs (not verified here):
      - White_Blood_Cells_(agg) and Platelet_Count_(agg) already in /μL
      - Differential-Neuts_(agg) / Differential-Lymphs_(agg) as percentages (0–100)

    For each agg in {Max, Mean, Median, Min}:
      - ALC       = WBC * (Lymphs% / 100)      (internal only, not stored)
      - ANC_(agg) = WBC * (Neuts% / 100)
      - NLR_(agg) = ANC / ALC
      - PLR_(agg) = Platelet_Count / ALC
    An ALC of exactly 0 is treated as missing, so the ratios are NaN there
    instead of inf. An aggregate is skipped entirely when its WBC or Lymphs
    column is absent; ANC/NLR need the Neuts column, PLR the platelet column.

    Returns a new frame; ``df`` is not modified. With ``verbose`` the skipped
    pieces and the dropped columns are printed.
    """
    out = df.copy()
    aggs = ("Max", "Mean", "Median", "Min")

    def _numeric(col_name):
        # Coerce on read so stray strings become NaN rather than raising.
        return pd.to_numeric(out[col_name], errors="coerce")

    for agg in aggs:
        wbc_col = f"White_Blood_Cells_({agg})"
        neut_col = f"Differential-Neuts_({agg})"
        lymph_col = f"Differential-Lymphs_({agg})"
        plt_col = f"Platelet_Count_({agg})"

        # ALC is the denominator of both ratios: without it nothing is derivable.
        absent = [c for c in (wbc_col, lymph_col) if c not in out.columns]
        if absent:
            if verbose:
                print(f"[skip {agg}] missing for ALC: {absent}")
            continue

        wbc = _numeric(wbc_col)                       # /μL
        alc = wbc * (_numeric(lymph_col) / 100.0)     # /μL
        safe_alc = alc.replace(0, np.nan)             # guard the divisions below

        # ---------- ANC + NLR ----------
        if neut_col in out.columns:
            anc = wbc * (_numeric(neut_col) / 100.0)  # /μL
            out[f"ANC_({agg})"] = anc
            out[f"NLR_({agg})"] = anc / safe_alc
        elif verbose:
            print(f"[{agg}] missing {neut_col} -> ANC/NLR not created")

        # ---------- PLR ----------
        if plt_col in out.columns:
            out[f"PLR_({agg})"] = _numeric(plt_col) / safe_alc
        elif verbose:
            print(f"[{agg}] missing {plt_col} -> PLR not created")

    # The percentage columns are only removed once every ratio has been computed.
    drop_cols = [
        f"{base}_({agg})"
        for agg in aggs
        for base in ("Differential-Lymphs", "Differential-Neuts")
        if f"{base}_({agg})" in out.columns
    ]
    out = out.drop(columns=drop_cols)

    if verbose:
        print("Dropped:", drop_cols)

    return out


# -------------------------
# USAGE (2 steps)
# -------------------------
# Step 1 rescales WBC/platelets, step 2 derives ANC/NLR/PLR from the rescaled
# values, so the order of the two steps matters.
# NOTE: this cell overwrites its own inputs. Running it a second time multiplies
# WBC/platelets by 1000 again, and step 2 then skips every aggregate because the
# differential columns were already dropped. Re-run from the subsetting cell.
mimic_df_small = convert_wbc_and_platelets_to_per_uL(mimic_df_small, verbose=True)
eicu_df_small  = convert_wbc_and_platelets_to_per_uL(eicu_df_small,  verbose=True)

mimic_df_small = add_anc_nlr_plr_then_drop_differentials(mimic_df_small, verbose=True)
eicu_df_small  = add_anc_nlr_plr_then_drop_differentials(eicu_df_small,  verbose=True)


Converted to /μL (x1000): ['White_Blood_Cells_(Max)', 'Platelet_Count_(Max)', 'White_Blood_Cells_(Mean)', 'Platelet_Count_(Mean)', 'White_Blood_Cells_(Median)', 'Platelet_Count_(Median)', 'White_Blood_Cells_(Min)', 'Platelet_Count_(Min)']
Converted to /μL (x1000): ['White_Blood_Cells_(Max)', 'Platelet_Count_(Max)', 'White_Blood_Cells_(Mean)', 'Platelet_Count_(Mean)', 'White_Blood_Cells_(Median)', 'Platelet_Count_(Median)', 'White_Blood_Cells_(Min)', 'Platelet_Count_(Min)']
Dropped: ['Differential-Lymphs_(Max)', 'Differential-Neuts_(Max)', 'Differential-Lymphs_(Mean)', 'Differential-Neuts_(Mean)', 'Differential-Lymphs_(Median)', 'Differential-Neuts_(Median)', 'Differential-Lymphs_(Min)', 'Differential-Neuts_(Min)']
Dropped: ['Differential-Lymphs_(Max)', 'Differential-Neuts_(Max)', 'Differential-Lymphs_(Mean)', 'Differential-Neuts_(Mean)', 'Differential-Lymphs_(Median)', 'Differential-Neuts_(Median)', 'Differential-Lymphs_(Min)', 'Differential-Neuts_(Min)']


# Calculate RAR (RDW-to-albumin ratio)

In [9]:
def add_rar_columns_only(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """
    Add RAR_(agg) = RDW_(agg) / Albumin_(agg) for agg in Max, Min, Mean, Median.

    Both inputs are coerced to numeric. An albumin of exactly 0 is treated as
    missing, so the ratio is NaN there rather than inf. An aggregate whose RDW
    or Albumin column is absent is skipped (with a message when ``verbose``)
    instead of raising KeyError, matching the other derived-feature helpers in
    this notebook.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; left unmodified.
    verbose : bool
        When True, report aggregates that were skipped.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the RAR columns that could be computed appended.
    """
    out = df.copy()
    for suf in ["Max", "Min", "Mean", "Median"]:
        rdw_col = f"RDW_({suf})"
        alb_col = f"Albumin_({suf})"
        rar_col = f"RAR_({suf})"

        absent = [c for c in (rdw_col, alb_col) if c not in out.columns]
        if absent:
            if verbose:
                print(f"[skip {suf}] missing for RAR: {absent}")
            continue

        rdw = pd.to_numeric(out[rdw_col], errors="coerce")
        # zero albumin -> NaN so the division cannot yield inf
        alb = pd.to_numeric(out[alb_col], errors="coerce").replace(0, np.nan)

        out[rar_col] = rdw / alb

    return out

# apply: add the RAR_(Max/Min/Mean/Median) columns to both cohorts
mimic_df_small = add_rar_columns_only(mimic_df_small)
eicu_df_small  = add_rar_columns_only(eicu_df_small)

# eICU BMI

In [10]:
# eICU patient table (gzip-compressed CSV).
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash form only resolved on Windows.
compressed_file_path = r"../00_Datasets/eicu-2_0/patient.csv.gz"

patients_df = pd.read_csv(compressed_file_path, compression='gzip')

In [11]:
# Columns related to ICU
# Identifier, demographic and admission height/weight columns taken from the
# eICU patient table; only the stay id, height and weight are used for BMI below.
icu_columns = [
    'uniquepid', # ID for a unique patient
    'unitvisitnumber', # identifies the visit number of the patient
    'patienthealthsystemstayid', # surrogate key for hospital Stay
    'patientunitstayid', # surrogate key for ICU Stay
    'gender', # gender of the patient
    'age', # age of the patient in full years
    'ethnicity', # ethnicity of the patient
    'admissionheight', # admission height of the patient in cm
    'admissionweight', # admission weight of the patient in kilograms
]

# Create a new DataFrame with only the ICU-related columns
temp_df = patients_df[icu_columns]

In [12]:
def add_bmi_from_temp_using_patientunitstayid(
    eicu_df_small: pd.DataFrame,
    temp_df: pd.DataFrame,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Attach admission height/weight from ``temp_df`` and derive ``BMI``.

    Rows are matched with ``eicu_df_small["hadm_id"]`` on the left and
    ``temp_df["patientunitstayid"]`` on the right (left join, so every input
    row is kept). BMI = weight / (height / 100) ** 2, i.e. height is taken to
    be in cm and weight in kg; zero height or weight is treated as missing.

    Safe to call again on its own output: height/weight columns left over from
    an earlier call are replaced rather than duplicated with _x/_y suffixes.
    Stay ids that cannot be parsed as numbers are dropped from the lookup so
    they cannot pair up with unparseable ``hadm_id`` values.

    NOTE(review): no plausibility range is applied to height/weight here —
    confirm whether implausible values should be filtered before computing BMI.

    Parameters
    ----------
    eicu_df_small : pd.DataFrame
        Frame with a ``hadm_id`` column; not modified. The merge resets the
        row index of the result.
    temp_df : pd.DataFrame
        Must contain patientunitstayid, admissionheight and admissionweight.
    verbose : bool
        When True, print NaN counts and rates for height, weight and BMI.

    Returns
    -------
    pd.DataFrame
        Input rows plus admissionheight, admissionweight and BMI.
    """
    measure_cols = ["admissionheight", "admissionweight"]
    key_col = "__bmi_stay_key__"   # private name: cannot clash with real columns

    # Remove leftovers from a previous call before merging.
    out = eicu_df_small.drop(columns=measure_cols + [key_col], errors="ignore").copy()

    # mapping table from temp_df
    tmp = temp_df[["patientunitstayid"] + measure_cols].copy()

    # align dtypes for merge
    out["hadm_id"] = pd.to_numeric(out["hadm_id"], errors="coerce")
    tmp[key_col] = pd.to_numeric(tmp["patientunitstayid"], errors="coerce")

    # one row per ICU stay; NaN keys are dropped because merge matches NaN to NaN
    tmp = (
        tmp.drop(columns=["patientunitstayid"])
        .dropna(subset=[key_col])
        .drop_duplicates(subset=[key_col])
    )

    # merge: hadm_id (left) <-> patientunitstayid (right)
    out = out.merge(
        tmp,
        left_on="hadm_id",
        right_on=key_col,
        how="left",
        validate="many_to_one",   # fail loudly if the lookup would duplicate rows
    ).drop(columns=[key_col])

    # BMI
    h_cm = pd.to_numeric(out["admissionheight"], errors="coerce")
    w_kg = pd.to_numeric(out["admissionweight"], errors="coerce")

    h_m = (h_cm / 100.0).replace(0, np.nan)
    w_kg = w_kg.replace(0, np.nan)

    out["BMI"] = w_kg / (h_m ** 2)

    if verbose:
        cols = ["admissionheight", "admissionweight", "BMI"]
        print("NaN counts:\n", out[cols].isna().sum())
        print("\nNaN rates (%):\n", (out[cols].isna().mean() * 100).round(2))

    return out

# usage: join eICU admission height/weight onto the feature frame and add BMI
eicu_df_small = add_bmi_from_temp_using_patientunitstayid(eicu_df_small, temp_df, verbose=True)

NaN counts:
 admissionheight    1312
admissionweight    2032
BMI                2928
dtype: int64

NaN rates (%):
 admissionheight    1.68
admissionweight    2.60
BMI                3.74
dtype: float64


In [13]:
# Remove the height and weight columns from the eICU frame (BMI is kept).
eicu_df_small = eicu_df_small.drop(columns=["admissionheight", "admissionweight"], errors="ignore")

# MIMIC BMI

In [14]:
# Preview the eICU frame with the derived ratio and BMI columns in place.
display(eicu_df_small)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,race,Alanine_Aminotransferase_(ALT)_(Max),Alanine_Aminotransferase_(ALT)_(Mean),Alanine_Aminotransferase_(ALT)_(Median),...,NLR_(Median),PLR_(Median),ANC_(Min),NLR_(Min),PLR_(Min),RAR_(Max),RAR_(Min),RAR_(Mean),RAR_(Median),BMI
0,1,233029,142974,1,F,54,BLACK/AFRICAN AMERICAN,,,,...,7.166667,192.192192,9546.0,7.166667,192.192192,,,,,47.528345
1,2,233029,142974,2,F,54,BLACK/AFRICAN AMERICAN,,,,...,7.166667,192.192192,9546.0,7.166667,192.192192,,,,,47.528345
2,3,233029,142974,3,F,54,BLACK/AFRICAN AMERICAN,,,,...,7.166667,192.192192,9546.0,7.166667,192.192192,,,,,47.528345
3,4,233029,142974,4,F,54,BLACK/AFRICAN AMERICAN,,,,...,7.166667,192.192192,9546.0,7.166667,192.192192,,,,,47.528345
4,5,233029,142974,5,F,54,BLACK/AFRICAN AMERICAN,,,,...,7.166667,192.192192,9546.0,7.166667,192.192192,,,,,47.528345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78251,86652,3520548,3353094,12,F,78,BLACK/AFRICAN AMERICAN,,,,...,,77.922078,,,74.018692,,,,,23.824641
78252,86653,3520548,3353094,13,F,78,BLACK/AFRICAN AMERICAN,,,,...,,82.592593,,,74.018692,,,,,23.824641
78253,86654,3520548,3353094,14,F,78,BLACK/AFRICAN AMERICAN,,,,...,,77.922078,,,74.018692,,,,,23.824641
78254,86655,3520548,3353094,15,F,78,BLACK/AFRICAN AMERICAN,,,,...,,63.870968,,,63.870968,,,,,23.824641


In [15]:
# First-ICU-stay table; supplies the stay_id -> (subject_id, hadm_id) mapping
# used when streaming chartevents below.
# NOTE(review): this reassigns first_stay_path, which earlier pointed at
# CSV/Imports/o03_icu_first_stay.csv — confirm the two files are the same export.
first_stay_path = r"CSV/Exports/o03_icu_first_stay.csv"

first_stay_df = pd.read_csv(first_stay_path)

In [16]:
# -------------------------
# Paths
# -------------------------
CHARTEVENTS_PATH = r"../00_Datasets/mimic-iv-3_1/icu/chartevents.csv.gz"
DITEMS_PATH      = r"../00_Datasets/mimic-iv-3_1/icu/d_items.csv.gz"

# -------------------------
# 1) Load d_items and find itemids for height/weight
# -------------------------
d_items = pd.read_csv(DITEMS_PATH, compression="gzip")
# Cast to str so the .str.contains calls below work on every row; note that a
# missing label/unit becomes the literal text "nan" after this cast.
d_items["label"] = d_items["label"].astype(str)
d_items["unitname"] = d_items.get("unitname", pd.Series([np.nan]*len(d_items))).astype(str)

# Height itemids: label contains the whole word "height", excluding labels that
# mention estimated / source / stated.
height_mask = (
    d_items["label"].str.contains(r"\bheight\b", case=False, na=False)
    & ~d_items["label"].str.contains(r"estimated|source|stated", case=False, na=False)
)

# Weight itemids: label contains the whole word "weight", excluding labels that
# mention change / gain / loss / dry.
# NOTE(review): any other label containing "weight" is also selected — check the
# matched labels in `meta` to make sure only measured body weights are included.
weight_mask = (
    d_items["label"].str.contains(r"\bweight\b", case=False, na=False)
    & ~d_items["label"].str.contains(r"change|gain|loss|dry", case=False, na=False)
)

height_itemids = set(d_items.loc[height_mask, "itemid"].unique())
weight_itemids = set(d_items.loc[weight_mask, "itemid"].unique())
need_itemids   = height_itemids | weight_itemids

print(f"Found itemids -> height: {len(height_itemids)}, weight: {len(weight_itemids)}, total: {len(need_itemids)}")

# Meta for merge (helps with units)
meta = d_items.loc[d_items["itemid"].isin(need_itemids), ["itemid", "label", "unitname"]].copy()

# -------------------------
# 2) Helpers for unit conversion
# -------------------------
def to_height_cm(val, uom):
    """
    Convert one height value to centimetres according to its unit label.

    The unit is matched case-insensitively after trimming whitespace and a
    trailing period, so "Inch", "in." and " IN " are all read as inches.
    Recognised units: cm / centimeters / centimetres, in / inch / inches,
    m / meter / meters.

    Returns NaN when ``val`` is missing. When the unit is missing or not
    recognised the value is returned unchanged (i.e. assumed to be cm already).
    """
    if pd.isna(val):
        return np.nan
    # rstrip(".") so abbreviations written with a period still match
    u = ("" if pd.isna(uom) else str(uom)).strip().lower().rstrip(".")
    if u in ["cm", "centimeters", "centimetres"]:
        return val
    if u in ["in", "inch", "inches"]:
        return val * 2.54
    if u in ["m", "meter", "meters"]:
        return val * 100.0
    # fallback: unknown unit, keep the value as it is
    return val

def to_weight_kg(val, uom):
    """
    Convert one weight value to kilograms according to its unit label.

    The unit is matched case-insensitively after trimming whitespace and a
    trailing period, so "lbs." is read as pounds instead of falling through
    to the unconverted fallback. Recognised units: kg / kilogram(s),
    lb / lbs / pound(s), g / gram(s).

    Returns NaN when ``val`` is missing. When the unit is missing or not
    recognised the value is returned unchanged (i.e. assumed to be kg already).
    """
    if pd.isna(val):
        return np.nan
    # rstrip(".") so abbreviations written with a period still match
    u = ("" if pd.isna(uom) else str(uom)).strip().lower().rstrip(".")
    if u in ["kg", "kilogram", "kilograms"]:
        return val
    if u in ["lb", "lbs", "pound", "pounds"]:
        return val * 0.45359237
    if u in ["g", "gram", "grams"]:
        return val / 1000.0
    # fallback: unknown unit, keep the value as it is
    return val

# -------------------------
# 3) Stream read chartevents, keep only height/weight, map to subject_id/hadm_id
# -------------------------
chunksize = 200_000

# Only these chartevents columns are used below. Restricting the parser to them
# avoids decoding every other column of a very large file. A callable is used
# so that an absent optional column (valueuom) is skipped rather than raising.
chartevents_cols = {"stay_id", "itemid", "valuenum", "valueuom"}

# keep only minimal mapping from first_stay_df
stay_map = first_stay_df[["stay_id", "subject_id", "hadm_id"]].drop_duplicates()

kept = []

# The context manager closes the gzip handle even if a chunk raises midway.
with pd.read_csv(
    CHARTEVENTS_PATH,
    compression="gzip",
    chunksize=chunksize,
    usecols=lambda col_name: col_name in chartevents_cols,
) as reader:
    for chunk in reader:
        # keep only itemids we care about + numeric values
        chunk = chunk[chunk["itemid"].isin(need_itemids) & chunk["valuenum"].notna()]
        if chunk.empty:
            continue

        # attach metadata (label/unitname) to help conversion if valueuom missing
        chunk = chunk.merge(meta, on="itemid", how="left")

        # choose unit source: prefer valueuom, else unitname
        if "valueuom" in chunk.columns:
            chunk["uom"] = chunk["valueuom"].where(chunk["valueuom"].notna(), chunk["unitname"])
        else:
            chunk["uom"] = chunk["unitname"]

        # merge to get subject_id/hadm_id (inner: stays outside the cohort are dropped)
        chunk = chunk.merge(stay_map, on="stay_id", how="inner")
        if chunk.empty:
            continue

        # mark measure
        is_height = chunk["itemid"].isin(height_itemids)
        chunk["measure"] = np.where(is_height, "height", "weight")

        # convert to metric: each row goes through exactly one converter
        # NOTE(review): both converters return the value unchanged when the unit
        # is missing or unrecognised — verify that every selected itemid carries
        # a unit they understand, otherwise non-metric values pass as metric.
        chunk["value_metric"] = [
            to_height_cm(value, unit) if height_row else to_weight_kg(value, unit)
            for value, unit, height_row in zip(chunk["valuenum"], chunk["uom"], is_height)
        ]

        kept.append(chunk[["subject_id", "hadm_id", "measure", "value_metric"]])

# combine
if kept:
    long_df = pd.concat(kept, ignore_index=True)
else:
    long_df = pd.DataFrame(columns=["subject_id", "hadm_id", "measure", "value_metric"])

gc.collect()

# -------------------------
# 4) Clean unrealistic values + aggregate to one height/weight per (subject_id, hadm_id)
# -------------------------
# Split by measure, then keep only broadly plausible values:
# heights within 100–250 and weights within 30–350 (bounds inclusive).
is_height_row = long_df["measure"] == "height"
is_weight_row = long_df["measure"] == "weight"

h = long_df[is_height_row].copy()
w = long_df[is_weight_row].copy()

h = h[h["value_metric"].between(100, 250)]
w = w[w["value_metric"].between(30, 350)]

# One value per admission: the median of all retained measurements.
group_keys = ["subject_id", "hadm_id"]
height_by = h.groupby(group_keys)["value_metric"].median().rename("admissionheight_cm")
weight_by = w.groupby(group_keys)["value_metric"].median().rename("admissionweight_kg")

# Side-by-side on the shared (subject_id, hadm_id) index; an admission that has
# only one of the two measures keeps NaN for the other.
temp_chart_df = pd.concat([height_by, weight_by], axis=1).reset_index()

print("temp_chart_df shape:", temp_chart_df.shape)
print(temp_chart_df.head())


Found itemids -> height: 2, weight: 8, total: 10
temp_chart_df shape: (3621, 4)
   subject_id   hadm_id  admissionheight_cm  admissionweight_kg
0    10004733  27411876              180.17              112.50
1    10027602  28166872              162.53               65.75
2    10029224  25729446              167.82               69.50
3    10032725  20611640              182.94               84.50
4    10044916  21484626              175.13              155.20


In [17]:
# Preview the per-admission height/weight table built from chartevents.
display(temp_chart_df)

Unnamed: 0,subject_id,hadm_id,admissionheight_cm,admissionweight_kg
0,10004733,27411876,180.17,112.50
1,10027602,28166872,162.53,65.75
2,10029224,25729446,167.82,69.50
3,10032725,20611640,182.94,84.50
4,10044916,21484626,175.13,155.20
...,...,...,...,...
3616,19978774,22382691,,216.00
3617,19992365,20220175,,102.85
3618,19994233,29338696,,112.20
3619,19997293,26366652,,173.00


In [18]:
# 0) (optional but good practice) make sure the merge keys have numeric dtypes
hw_cols = ["admissionheight_cm", "admissionweight_kg"]

# Drop height/weight columns left over from an earlier run of this cell, so the
# merge below cannot produce *_x / *_y duplicates and break the checks in step 3.
mimic_df_small = mimic_df_small.drop(columns=hw_cols, errors="ignore").copy()
temp_chart_df  = temp_chart_df.copy()

mimic_df_small["subject_id"] = pd.to_numeric(mimic_df_small["subject_id"], errors="coerce")
mimic_df_small["hadm_id"]    = pd.to_numeric(mimic_df_small["hadm_id"], errors="coerce")

temp_chart_df["subject_id"]  = pd.to_numeric(temp_chart_df["subject_id"], errors="coerce")
temp_chart_df["hadm_id"]     = pd.to_numeric(temp_chart_df["hadm_id"], errors="coerce")

# 1) keep only the columns we want to bring over (and remove any duplicate keys)
temp_hw = temp_chart_df[["subject_id","hadm_id"] + hw_cols].drop_duplicates(
    subset=["subject_id","hadm_id"]
)

# 2) merge (left join keeps every feature row; validate guards against a lookup
#    table that would multiply rows)
mimic_df_small = mimic_df_small.merge(
    temp_hw,
    on=["subject_id","hadm_id"],
    how="left",
    validate="many_to_one",
)

# 3) quick checks
cols = ["admissionheight_cm", "admissionweight_kg"]
print("NaN counts:\n", mimic_df_small[cols].isna().sum())
print("\nNaN rates (%):\n", (mimic_df_small[cols].isna().mean() * 100).round(2))
print("\nExample rows with height/weight:")
print(mimic_df_small.loc[mimic_df_small["admissionheight_cm"].notna(), ["subject_id","hadm_id"] + cols].head(10))

NaN counts:
 admissionheight_cm    29376
admissionweight_kg      208
dtype: int64

NaN rates (%):
 admissionheight_cm    57.55
admissionweight_kg     0.41
dtype: float64

Example rows with height/weight:
   subject_id   hadm_id  admissionheight_cm  admissionweight_kg
0    10004733  27411876              180.17               112.5
1    10004733  27411876              180.17               112.5
2    10004733  27411876              180.17               112.5
3    10004733  27411876              180.17               112.5
4    10004733  27411876              180.17               112.5
5    10004733  27411876              180.17               112.5
6    10004733  27411876              180.17               112.5
7    10004733  27411876              180.17               112.5
8    10004733  27411876              180.17               112.5
9    10004733  27411876              180.17               112.5


In [19]:
# BMI = weight [kg] / (height [m]) ** 2.
# Zero height or weight is turned into NaN first, so such rows get a missing BMI
# instead of inf or 0. Rows without a height (or weight) end up with NaN as well.
h_m = (pd.to_numeric(mimic_df_small["admissionheight_cm"], errors="coerce") / 100.0).replace(0, np.nan)
w_kg = pd.to_numeric(mimic_df_small["admissionweight_kg"], errors="coerce").replace(0, np.nan)
mimic_df_small["BMI"] = w_kg / (h_m ** 2)

In [20]:
# Remove the height and weight columns from the MIMIC frame (BMI is kept).
mimic_df_small = mimic_df_small.drop(columns=["admissionheight_cm", "admissionweight_kg"], errors="ignore")

# SOFA eICU

In [21]:
# eICU tables used for the SOFA components (gzip-compressed CSVs).
# Forward slashes keep these relative paths valid on Windows, Linux and macOS;
# the previous backslash form only resolved on Windows.
APACHE_PRED_PATH = r"../00_Datasets/eicu-2_0/apachePredVar.csv.gz"
APACHE_APS_PATH  = r"../00_Datasets/eicu-2_0/apacheApsVar.csv.gz"
RESPCARE_PATH    = r"../00_Datasets/eicu-2_0/respiratoryCare.csv.gz"

In [22]:
# Load the three eICU tables in full from the paths defined above.
apache_pred = pd.read_csv(APACHE_PRED_PATH, compression="gzip")
apache_aps = pd.read_csv(APACHE_APS_PATH, compression="gzip")
# NOTE(review): the recorded output echoes this line, which looks like a pandas
# warning raised while parsing this file — confirm, and pin dtypes if so.
respcare = pd.read_csv(RESPCARE_PATH, compression="gzip")

  respcare = pd.read_csv(RESPCARE_PATH, compression="gzip")


In [23]:
# -------------------------
# Output folder
# -------------------------
# Created on demand (parents included); an already existing folder is accepted.
OUT_DIR = Path("CSV/Exports/Temp/Doctors_Dataset/headers")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def export_headers(df: pd.DataFrame, name: str):
    """
    Write the column names of ``df`` to ``OUT_DIR / "<name>_headers.csv"``.

    The file has a single column called "column" with one row per header,
    no index, and is encoded as UTF-8 with BOM ("utf-8-sig"). The saved path
    is printed; nothing is returned.
    """
    all_path = OUT_DIR / f"{name}_headers.csv"

    column_names = df.columns.astype(str)
    pd.DataFrame({"column": column_names}).to_csv(
        all_path, index=False, encoding="utf-8-sig"
    )

    print(f"[OK] Saved: {all_path}")

# -------------------------
# Run exports
# -------------------------
# One header-list CSV per eICU table loaded above, for inspecting what each
# table offers.
export_headers(apache_pred, "apache_pred")
export_headers(apache_aps, "apache_aps")
export_headers(respcare, "respcare")

[OK] Saved: CSV\Exports\Temp\Doctors_Dataset\headers\apache_pred_headers.csv
[OK] Saved: CSV\Exports\Temp\Doctors_Dataset\headers\apache_aps_headers.csv
[OK] Saved: CSV\Exports\Temp\Doctors_Dataset\headers\respcare_headers.csv


In [24]:
def _to_fio2_frac(x):
    x = pd.to_numeric(x, errors="coerce")
    # αν είναι 21–100, το κάνουμε 0.21–1.0
    return x.where(x <= 1.0, x / 100.0).replace(0, np.nan)

def sofa_resp(pao2, fio2, support):
    pf = pd.to_numeric(pao2, errors="coerce") / _to_fio2_frac(fio2)
    s  = pd.to_numeric(support, errors="coerce").fillna(0).astype(int)

    out = pd.Series(np.nan, index=pf.index, dtype="float")
    out = np.where(pf >= 400, 0, out)
    out = np.where((pf < 400) & (pf >= 300), 1, out)
    out = np.where((pf < 300) & (pf >= 200), 2, out)
    out = np.where((pf < 200) & (pf >= 100) & (s == 1), 3, out)
    out = np.where((pf < 100) & (s == 1), 4, out)
    return pd.Series(out, index=pf.index)

def sofa_cns(gcs):
    """
    SOFA central-nervous-system sub-score from the Glasgow Coma Scale.

        GCS 15 -> 0,  13–<15 -> 1,  10–<13 -> 2,  6–<10 -> 3,  <6 -> 4

    The bands are contiguous, so fractional inputs (e.g. a mean or median GCS
    of 14.5 or 12.5) fall into a band instead of into the gap between two
    integer ranges. Values above 15 and missing values give NaN.

    NOTE(review): values below the scale minimum (including negative
    missing-value sentinels) still score 4 — confirm that callers clean them.

    ``gcs`` is a Series; the result is a float Series on the same index.
    """
    g = pd.to_numeric(gcs, errors="coerce")
    conditions = [
        g == 15,
        (g >= 13) & (g < 15),
        (g >= 10) & (g < 13),
        (g >= 6) & (g < 10),
        g < 6,
    ]
    scores = np.select(
        [cond.to_numpy() for cond in conditions],
        [0.0, 1.0, 2.0, 3.0, 4.0],
        default=np.nan,
    )
    return pd.Series(scores, index=g.index)

def sofa_liver(bili_mgdl):
    """
    SOFA liver sub-score from total bilirubin (mg/dL).

        < 1.2 -> 0,  1.2–<2.0 -> 1,  2.0–<6.0 -> 2,  6.0–<12.0 -> 3,  >= 12.0 -> 4

    The bands are half-open and contiguous, so values such as 1.95 or 5.95
    (which lie between the one-decimal cut-offs 1.9/2.0 and 5.9/6.0) are scored
    instead of being left as NaN. Missing input gives NaN.

    NOTE(review): negative values (possible missing-value sentinels) score 0 —
    confirm that callers clean them.

    ``bili_mgdl`` is a Series; the result is a float Series on the same index.
    """
    b = pd.to_numeric(bili_mgdl, errors="coerce")
    # First match wins, so each "<" bound implicitly excludes the bands before it.
    conditions = [b < 1.2, b < 2.0, b < 6.0, b < 12.0, b >= 12.0]
    scores = np.select(
        [cond.to_numpy() for cond in conditions],
        [0.0, 1.0, 2.0, 3.0, 4.0],
        default=np.nan,
    )
    return pd.Series(scores, index=b.index)

def sofa_renal(creat_mgdl, urine_ml_day=None, dialysis=None):
    """Renal SOFA sub-score from creatinine, optional urine output and dialysis.

    Parameters
    ----------
    creat_mgdl : pandas.Series
        Creatinine (the name suggests mg/dL - confirm at the call site).
    urine_ml_day : pandas.Series, optional
        Urine output, assumed to be a 24-hour total in mL.
    dialysis : pandas.Series, optional
        1 when the patient is on dialysis; anything else counts as no.

    Returns
    -------
    pandas.Series
        Float scores 0-4 on the creatinine index; NaN when nothing usable
        is available.

    Creatinine bands are half-open (< 1.2, [1.2, 2.0), [2.0, 3.5),
    [3.5, 5.0), >= 5.0) so values such as 1.95 or 3.45 are no longer left
    unscored. The urine criterion raises the score to at least 3 (< 500) or
    4 (< 200) and now also applies when creatinine is missing. Dialysis
    forces a score of 4.
    """
    c = pd.to_numeric(creat_mgdl, errors="coerce")
    score = np.select(
        [c < 1.2, c < 2.0, c < 3.5, c < 5.0, c >= 5.0],
        [0, 1, 2, 3, 4],
        default=np.nan,
    )

    # Urine criteria (valid only if the urine value is a 24h total).
    if urine_ml_day is not None:
        u = pd.to_numeric(urine_ml_day, errors="coerce")
        # A negative volume is not a measurement (e.g. a -1 placeholder);
        # without this guard it would satisfy "< 200" and score 4.
        u = u.where(u >= 0)
        # np.fmax ignores NaN, so a missing creatinine score does not erase
        # the urine-based score (np.maximum would return NaN here).
        score = np.where(u < 500, np.fmax(score, 3), score)
        score = np.where(u < 200, np.fmax(score, 4), score)

    # Dialysis -> treated as the most severe category (commonly scored as 4).
    if dialysis is not None:
        d = pd.to_numeric(dialysis, errors="coerce").fillna(0).astype(int)
        score = np.where(d == 1, 4, score)

    return pd.Series(score, index=c.index)

def sofa_coag(plt):
    """Coagulation SOFA sub-score from a platelet count.

    Mapping: >= 150 -> 0, [100, 150) -> 1, [50, 100) -> 2, [20, 50) -> 3,
    below 20 -> 4. Non-numeric or missing input yields NaN. Returns a float
    Series on the input's index.
    """
    count = pd.to_numeric(plt, errors="coerce")
    # Conditions are evaluated top-down and the first hit wins, so each
    # lower bound implicitly carries the upper bound of the band above it.
    points = np.select(
        [count >= 150, count >= 100, count >= 50, count >= 20, count < 20],
        [0, 1, 2, 3, 4],
        default=np.nan,
    )
    return pd.Series(points, index=count.index)

def sofa_cardio(map_mmHg, pressor_level=None):
    """Cardiovascular SOFA sub-score, version 1: mean arterial pressure only.

    MAP >= 70 -> 0, MAP < 70 -> 1; missing or non-numeric MAP -> NaN.
    ``pressor_level`` is accepted but not used, so scores 2-4 (which depend
    on vasopressor doses) are never produced. Returns a float Series on the
    input's index.
    """
    pressure = pd.to_numeric(map_mmHg, errors="coerce")
    points = np.select([pressure >= 70, pressure < 70], [0, 1], default=np.nan)
    return pd.Series(points, index=pressure.index)

def build_sofa_eicu_day1(apache_aps, apache_pred, respcare, eicu_df_small):
    """Build one row of SOFA sub-scores per eICU unit stay.

    Parameters
    ----------
    apache_aps : pandas.DataFrame
        Must contain patientunitstayid, pao2, fio2, bilirubin, creatinine,
        urine, dialysis, meanbp, eyes, verbal, motor, vent, intubated.
    apache_pred : pandas.DataFrame
        Must contain patientunitstayid, oobventday1, oobintubday1.
    respcare : pandas.DataFrame
        Must contain patientunitstayid and ventstartoffset.
    eicu_df_small : pandas.DataFrame
        Must contain hadm_id (matched against patientunitstayid) and
        "Platelet_Count_(Min)".

    Returns
    -------
    pandas.DataFrame
        Columns: patientunitstayid, the six SOFA_* sub-scores,
        SOFA_total_v1 (sum of the available sub-scores, NaN only when all
        are missing) and the combined ``support`` flag.

    Notes
    -----
    * Negative values in the APACHE measurement columns are treated as
      missing. The eICU APACHE tables are understood to encode "not
      available" as -1 (confirm against your extract); scored as real
      measurements they would produce a GCS sum of -3 (CNS 4), a urine
      output of -1 (renal 4) and a MAP of -1 (cardio 1).
    * NOTE(review): the respiratory-care flag counts any ventilation episode
      of the stay, not only those starting on day 1 - confirm this is
      intended for a "day 1" score.
    * The platelet minimum is taken over all rows of the stay in
      ``eicu_df_small``, not only the first day.
    """
    # Work on copies so the caller's frames are never modified.
    aps = apache_aps.copy()
    pred = apache_pred.copy()
    rc = respcare.copy()

    for frame in (aps, pred, rc):
        frame["patientunitstayid"] = pd.to_numeric(frame["patientunitstayid"], errors="coerce")

    # --- support flag (day1/baseline)
    # pred: oobventday1 / oobintubday1 (per the original note: intubated on day 1
    # is taken to mean mechanically ventilated).
    # .copy() so the column assignments below act on an independent frame,
    # matching how support_aps is built.
    support_pred = (
        pred[["patientunitstayid", "oobventday1", "oobintubday1"]]
        .drop_duplicates("patientunitstayid")
        .copy()
    )
    for c in ["oobventday1", "oobintubday1"]:
        support_pred[c] = pd.to_numeric(support_pred[c], errors="coerce").fillna(0).astype(int)
    support_pred["support_pred"] = (
        (support_pred["oobventday1"] == 1) | (support_pred["oobintubday1"] == 1)
    ).astype(int)

    # aps: vent / intubated
    support_aps = aps[["patientunitstayid", "vent", "intubated"]].drop_duplicates("patientunitstayid").copy()
    for c in ["vent", "intubated"]:
        support_aps[c] = pd.to_numeric(support_aps[c], errors="coerce").fillna(0).astype(int)
    support_aps["support_aps"] = ((support_aps["vent"] == 1) | (support_aps["intubated"] == 1)).astype(int)

    # respcare: a non-null ventstartoffset means a ventilation episode exists.
    support_rc = (
        rc["ventstartoffset"].notna()
        .groupby(rc["patientunitstayid"]).any()
        .astype(int)
        .rename("support_rc")
        .reset_index()
    )

    # Outer merges keep a stay that appears in any one of the three sources;
    # a flag missing from a source counts as 0.
    support = support_pred[["patientunitstayid", "support_pred"]].merge(
        support_aps[["patientunitstayid", "support_aps"]], on="patientunitstayid", how="outer"
    ).merge(
        support_rc, on="patientunitstayid", how="outer"
    ).fillna(0)

    support["support"] = (
        (support["support_pred"] == 1) | (support["support_aps"] == 1) | (support["support_rc"] == 1)
    ).astype(int)

    # --- base sofa frame
    base = aps[
        ["patientunitstayid", "pao2", "fio2", "bilirubin", "creatinine", "urine",
         "dialysis", "meanbp", "eyes", "verbal", "motor"]
    ].drop_duplicates("patientunitstayid").copy()

    # Negative measurements are placeholders, not data: blank them out before
    # any scoring so that they propagate as NaN instead of as extreme values.
    measured = ["pao2", "fio2", "bilirubin", "creatinine", "urine", "meanbp", "eyes", "verbal", "motor"]
    base[measured] = base[measured].apply(pd.to_numeric, errors="coerce")
    base[measured] = base[measured].mask(base[measured] < 0)

    base = base.merge(
        support[["patientunitstayid", "support"]], on="patientunitstayid", how="left"
    ).fillna({"support": 0})

    # GCS from components; NaN if any component is missing.
    base["gcs"] = base["eyes"] + base["verbal"] + base["motor"]

    # Platelets: worst (minimum) value per stay across the whole dataset
    # (restrict to day 1 upstream if that is what you need).
    plt_rows = eicu_df_small[["hadm_id", "Platelet_Count_(Min)"]].copy()
    plt_rows["hadm_id"] = pd.to_numeric(plt_rows["hadm_id"], errors="coerce")
    plt_by_stay = (
        plt_rows.groupby("hadm_id")["Platelet_Count_(Min)"].min()
        .reset_index()
        .rename(columns={"hadm_id": "patientunitstayid", "Platelet_Count_(Min)": "platelets_worst"})
    )

    base = base.merge(plt_by_stay, on="patientunitstayid", how="left")

    # Sub-scores
    base["SOFA_resp"] = sofa_resp(base["pao2"], base["fio2"], base["support"])
    base["SOFA_cns"] = sofa_cns(base["gcs"])
    base["SOFA_liver"] = sofa_liver(base["bilirubin"])
    base["SOFA_renal"] = sofa_renal(base["creatinine"], base["urine"], base["dialysis"])
    base["SOFA_coag"] = sofa_coag(base["platelets_worst"])
    base["SOFA_cardio"] = sofa_cardio(base["meanbp"])

    # min_count=1: the total is NaN only when every sub-score is missing.
    base["SOFA_total_v1"] = base[
        ["SOFA_resp", "SOFA_cns", "SOFA_liver", "SOFA_renal", "SOFA_coag", "SOFA_cardio"]
    ].sum(axis=1, min_count=1)

    return base[
        ["patientunitstayid", "SOFA_resp", "SOFA_cns", "SOFA_liver", "SOFA_renal",
         "SOFA_coag", "SOFA_cardio", "SOFA_total_v1", "support"]
    ]

# ---- usage ----
# Compute the per-stay scores, then attach them to every row of the stay;
# hadm_id in eicu_df_small is matched against patientunitstayid.
sofa_day1 = build_sofa_eicu_day1(apache_aps, apache_pred, respcare, eicu_df_small)
eicu_df_small = pd.merge(
    eicu_df_small,
    sofa_day1,
    how="left",
    left_on="hadm_id",
    right_on="patientunitstayid",
).drop(columns="patientunitstayid")

In [25]:
# Intermediate columns that are discarded once the single score is in place.
drop_cols = ["SOFA_resp","SOFA_cns","SOFA_liver","SOFA_renal","SOFA_coag","SOFA_cardio","SOFA_total_v1","support"]

# Keep one unified SOFA column, then drop whichever intermediates are present.
eicu_df_small = (
    eicu_df_small
    .assign(SOFA=lambda frame: frame["SOFA_total_v1"])
    .drop(columns=drop_cols, errors="ignore")
)

# MIMIC SOFA

In [26]:
# 1) Keep only the join keys + stay_id from stroke_first_stay
map_stay = stroke_first_stay.loc[:, ["subject_id", "hadm_id", "stay_id"]]
map_stay = map_stay.dropna(subset=["subject_id", "hadm_id", "stay_id"])
# Keeps the first stay_id per (subject_id, hadm_id); assumes there is only one.
map_stay = map_stay.drop_duplicates(subset=["subject_id", "hadm_id"])

# 2) Bring stay_id into mimic_df_small
mimic_df_small = pd.merge(
    mimic_df_small,
    map_stay,
    how="left",
    on=["subject_id", "hadm_id"],
)

# Reposition stay_id so that it sits right after hadm_id
cols = list(mimic_df_small.columns)
cols.pop(cols.index("stay_id"))

hadm_pos = cols.index("hadm_id")
cols[hadm_pos + 1:hadm_pos + 1] = ["stay_id"]

mimic_df_small = mimic_df_small.loc[:, cols]

# 3) (optional) check how many rows found no stay_id
missing = mimic_df_small["stay_id"].isna().sum()
print("Missing stay_id rows:", missing)

Missing stay_id rows: 0


In [27]:
# Keep only the first-day SOFA rows whose stay_id belongs to one of the stroke admissions

target_stay_ids = mimic_df_small["stay_id"].dropna().unique()

stroke_first_day_sofa_filtered = (
    stroke_first_day_sofa
    .loc[stroke_first_day_sofa["stay_id"].isin(target_stay_ids)]
    .reset_index(drop=True)
)

In [28]:
keys = ["subject_id", "hadm_id", "stay_id"]

# 1) One SOFA value per key combination; when several records exist, the highest one is kept
sofa_lookup = (
    stroke_first_day_sofa_filtered
    .loc[:, [*keys, "sofa"]]
    .groupby(keys, as_index=False)["sofa"]
    .agg("max")
)

# 2) Left merge so that no row of mimic_df_small is lost; validate guards against duplicated keys in the lookup
mimic_df_small = pd.merge(
    mimic_df_small,
    sofa_lookup,
    how="left",
    on=keys,
    validate="m:1",
)

# Number of rows left without a SOFA value
mimic_df_small["sofa"].isna().sum()

np.int64(0)

In [29]:
# Remove the helper key column (no error if it is already gone)
mimic_df_small = mimic_df_small.drop("stay_id", axis="columns", errors="ignore")

In [30]:
# Visual check of the MIMIC frame after the SOFA merge; the rendered output is
# truncated by pandas (middle rows and columns are elided).
display(mimic_df_small)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,race,Alanine_Aminotransferase_(ALT)_(Max),Alanine_Aminotransferase_(ALT)_(Mean),Alanine_Aminotransferase_(ALT)_(Median),...,PLR_(Median),ANC_(Min),NLR_(Min),PLR_(Min),RAR_(Max),RAR_(Min),RAR_(Mean),RAR_(Median),BMI,sofa
0,1,10004733,27411876,1,M,51,UNKNOWN,46.0,46.0,46.0,...,265.663391,6053.2,9.295455,265.663391,5.586207,5.586207,5.586207,5.586207,34.656729,3
1,2,10004733,27411876,2,M,51,UNKNOWN,46.0,46.0,46.0,...,265.663391,6053.2,9.295455,265.663391,5.586207,5.586207,5.586207,5.586207,34.656729,3
2,3,10004733,27411876,3,M,51,UNKNOWN,46.0,46.0,46.0,...,265.663391,6053.2,9.295455,265.663391,5.586207,5.586207,5.586207,5.586207,34.656729,3
3,4,10004733,27411876,4,M,51,UNKNOWN,46.0,46.0,46.0,...,265.663391,6053.2,9.295455,265.663391,5.586207,5.586207,5.586207,5.586207,34.656729,3
4,5,10004733,27411876,5,M,51,UNKNOWN,46.0,46.0,46.0,...,265.663391,6053.2,9.295455,265.663391,5.586207,5.586207,5.586207,5.586207,34.656729,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51035,58140,19999987,23865745,12,F,57,UNKNOWN,63.0,63.0,63.0,...,40.430366,8849.4,2.956897,40.430366,,,,,,6
51036,58141,19999987,23865745,13,F,57,UNKNOWN,63.0,63.0,63.0,...,40.430366,8849.4,2.956897,40.430366,,,,,,6
51037,58142,19999987,23865745,14,F,57,UNKNOWN,63.0,63.0,63.0,...,40.430366,8849.4,2.956897,40.430366,,,,,,6
51038,58143,19999987,23865745,15,F,57,UNKNOWN,63.0,63.0,63.0,...,40.430366,8849.4,2.956897,40.430366,,,,,,6


# Move `los` and `hospital_expire_flag` to the end

In [35]:
def move_targets_to_end(df, targets=("los", "hospital_expire_flag")):
    """Return a copy of ``df`` with the target columns moved to the far right.

    Targets that are not columns of ``df`` are ignored; the remaining
    columns keep their relative order and the targets follow in the order
    given. The input frame is left untouched.
    """
    trailing = [name for name in targets if name in df.columns]
    leading = [name for name in df.columns if name not in trailing]
    return df[leading + trailing].copy()

# Align the MIMIC score column name with eICU ("SOFA"), then push the targets to the end of both frames.
mimic_df_small = (
    mimic_df_small
    .rename(columns={"sofa": "SOFA"})
    .pipe(move_targets_to_end)
)
eicu_df_small = eicu_df_small.pipe(move_targets_to_end)

# quick check
print("MIMIC last cols:", mimic_df_small.columns[-5:].tolist())
print("eICU  last cols:", eicu_df_small.columns[-5:].tolist())

MIMIC last cols: ['RAR_(Median)', 'BMI', 'SOFA', 'los', 'hospital_expire_flag']
eICU  last cols: ['RAR_(Median)', 'BMI', 'SOFA', 'los', 'hospital_expire_flag']


# Save the MIMIC and eICU sets

In [36]:
out_dir = "CSV/Exports/Temp/Doctors_Dataset"
os.makedirs(out_dir, exist_ok=True)

# Write both external-validation sets without the index column.
for _csv_name, _csv_frame in (
    ("o01_mimic_for_ext_val.csv", mimic_df_small),
    ("o01_eicu_for_ext_val.csv", eicu_df_small),
):
    _csv_frame.to_csv(os.path.join(out_dir, _csv_name), index=False)

# Test

In [33]:
# Ad-hoc lookup: find out which identifier column a given id belongs to.
# NOTE(review): relies on temp_df, which is created in an earlier cell.
sid = 3353144
sid_str = f"{sid}"

print("---- eicu_df_small: matches in subject_id ----")
m_eicu = eicu_df_small.loc[eicu_df_small["subject_id"].astype(str).eq(sid_str)]
print("rows:", m_eicu.shape[0])
print(m_eicu.loc[:, ["subject_id","hadm_id","Time_Zone"]].head(10))

print("\n---- temp_df: matches in uniquepid ----")
m_u = temp_df.loc[temp_df["uniquepid"].astype(str).eq(sid_str)]
print("rows:", m_u.shape[0])
print(m_u.head(5))

print("\n---- temp_df: matches in patienthealthsystemstayid ----")
m_h = temp_df.loc[temp_df["patienthealthsystemstayid"].astype(str).eq(sid_str)]
print("rows:", m_h.shape[0])
print(m_h.loc[:, ["uniquepid","patienthealthsystemstayid","patientunitstayid","admissionheight","admissionweight"]].head(10))

print("\n---- temp_df: matches in patientunitstayid ----")
m_pu = temp_df.loc[temp_df["patientunitstayid"].astype(str).eq(sid_str)]
print("rows:", m_pu.shape[0])
print(m_pu.loc[:, ["uniquepid","patienthealthsystemstayid","patientunitstayid","admissionheight","admissionweight"]].head(10))


---- eicu_df_small: matches in subject_id ----
rows: 0
Empty DataFrame
Columns: [subject_id, hadm_id, Time_Zone]
Index: []

---- temp_df: matches in uniquepid ----
rows: 0
Empty DataFrame
Columns: [uniquepid, unitvisitnumber, patienthealthsystemstayid, patientunitstayid, gender, age, ethnicity, admissionheight, admissionweight]
Index: []

---- temp_df: matches in patienthealthsystemstayid ----
rows: 0
Empty DataFrame
Columns: [uniquepid, patienthealthsystemstayid, patientunitstayid, admissionheight, admissionweight]
Index: []

---- temp_df: matches in patientunitstayid ----
rows: 1
        uniquepid  patienthealthsystemstayid  patientunitstayid  \
200838  035-20783                    2743005            3353144   

        admissionheight  admissionweight  
200838            180.3             39.2  
