# **This notebook acts as the dataloader of MIMIC-III**

TESTING

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import copy
from sklearn.metrics import mean_absolute_error, r2_score
import os
import glob

# **MIMIC-III**

In [None]:
# Directory that holds the MIMIC-III CSV files read by load_csv(); the path is
# relative, so it is resolved against the kernel's current working directory.
path_client_1 = "../../Datasets/mimic_iii_data"

In [3]:
def load_csv(name, base_dir=None):
    """Load one table from ``<base_dir>/<name>.csv`` into a DataFrame.

    Parameters
    ----------
    name : str
        Table name without the ``.csv`` extension (e.g. ``"ICUSTAYS"``).
    base_dir : str or os.PathLike, optional
        Directory containing the CSV files. When omitted, the notebook-level
        ``path_client_1`` is used, so existing ``load_csv(name)`` calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, read with ``low_memory=False``.

    Raises
    ------
    FileNotFoundError
        If ``<base_dir>/<name>.csv`` does not exist.
    """
    # Resolve the default lazily so the global is only needed when no
    # directory is passed explicitly.
    if base_dir is None:
        base_dir = path_client_1

    file_path = os.path.join(base_dir, f"{name}.csv")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")
    print(f"Loading {file_path} ...")
    return pd.read_csv(file_path, low_memory=False)

We chose the following CSV files based on the reasoning documented in our Overleaf write-up.

In [4]:
# Load every selected table into a name -> DataFrame mapping. The dict is
# filled one table at a time, in the order listed, so a failed load leaves the
# tables read so far available for inspection.
dataframes = {}
for table in (
    "ADMISSIONS",
    "DATETIMEEVENTS",
    "ICUSTAYS",
    "INPUTEVENTS_CV",
    "INPUTEVENTS_MV",
    "LABEVENTS",
    "MICROBIOLOGYEVENTS",
    "NOTEEVENTS",
    "OUTPUTEVENTS",
    "PATIENTS",
    "PROCEDUREEVENTS_MV",
    "SERVICES",
):
    dataframes[table] = load_csv(table)

Loading ../../Datasets/mimic_iii_data/ADMISSIONS.csv ...
Loading ../../Datasets/mimic_iii_data/DATETIMEEVENTS.csv ...
Loading ../../Datasets/mimic_iii_data/ICUSTAYS.csv ...
Loading ../../Datasets/mimic_iii_data/INPUTEVENTS_CV.csv ...
Loading ../../Datasets/mimic_iii_data/INPUTEVENTS_MV.csv ...
Loading ../../Datasets/mimic_iii_data/LABEVENTS.csv ...
Loading ../../Datasets/mimic_iii_data/MICROBIOLOGYEVENTS.csv ...
Loading ../../Datasets/mimic_iii_data/NOTEEVENTS.csv ...
Loading ../../Datasets/mimic_iii_data/OUTPUTEVENTS.csv ...
Loading ../../Datasets/mimic_iii_data/PATIENTS.csv ...
Loading ../../Datasets/mimic_iii_data/PROCEDUREEVENTS_MV.csv ...
Loading ../../Datasets/mimic_iii_data/SERVICES.csv ...


In [5]:
# Inspect the ICU stays table; its INTIME column is what the next cell turns
# into each stay's admission time for event windowing.
dataframes["ICUSTAYS"]

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS
0,365,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,2198-02-18 05:26:11,3.2490
1,366,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788
2,367,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939
3,368,271,173727,249196,carevue,MICU,SICU,52,23,2120-08-07 23:12:42,2120-08-10 00:39:04,2.0600
4,369,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202
...,...,...,...,...,...,...,...,...,...,...,...,...
61527,59806,94944,143774,201233,metavision,CSRU,CSRU,15,15,2104-04-15 10:18:16,2104-04-17 14:51:00,2.1894
61528,59807,94950,123750,283653,metavision,CCU,CCU,7,7,2155-12-08 05:33:16,2155-12-10 17:24:58,2.4942
61529,59808,94953,196881,241585,metavision,SICU,SICU,57,57,2160-03-03 16:09:11,2160-03-04 14:22:33,0.9259
61530,59809,94954,118475,202802,metavision,CSRU,CSRU,15,15,2183-03-25 09:53:10,2183-03-27 17:55:03,2.3346


In [6]:
# Windowing configuration: events are kept for the first 24 h after admission
# and bucketed into 1-hour bins.
OBS_WINDOW = pd.Timedelta(hours=24)
BIN_SIZE = pd.Timedelta(hours=1)
id_col = "ICUSTAY_ID"

# Base table for client 1: the stay identifiers plus both timestamps parsed to
# datetimes (errors="coerce" turns unparseable values into NaT).
client_1 = dataframes["ICUSTAYS"][
    ["ICUSTAY_ID", "SUBJECT_ID", "INTIME", "OUTTIME"]
].assign(
    INTIME=lambda stays: pd.to_datetime(stays["INTIME"], errors="coerce"),
    OUTTIME=lambda stays: pd.to_datetime(stays["OUTTIME"], errors="coerce"),
)

# Per-stay admission time, exposed as `admit_time` for windowing the event tables.
stays_for_event_windowing = client_1[[id_col, "INTIME"]].rename(
    columns={"INTIME": "admit_time"}
)

print(client_1.shape, stays_for_event_windowing.shape, sep="\n")

(61532, 4)
(61532, 2)


In [7]:
def _first_mode(values):
    """Return the first modal value of ``values``, or ``pd.NA`` when there is none."""
    modes = values.mode()  # evaluate mode() only once per group
    return modes.iloc[0] if not modes.empty else pd.NA


# (table name, aggregation kind) for every table merged into client_1
merge_log = []

for name, df in dataframes.items():
    # We only process tables that have an ICUSTAY_ID
    if id_col not in df.columns:
        print(f"Skipping {name}: no {id_col}")
        continue

    print(f"\nProcessing {name}.csv")

    # 1) Find & parse eventtime
    # The first column whose name contains "time" is taken as the event
    # timestamp; failing that, the first column whose name contains "offset"
    # is added to the admission time as a number of minutes.
    # NOTE(review): ICUSTAYS itself has ICUSTAY_ID and a "time" column (INTIME),
    # so it is processed here like an event table and its remaining columns are
    # aggregated into features (the table defines a LOS column). Confirm this is
    # intended and does not leak post-window information.
    dt_cols  = [c for c in df.columns if "time" in c.lower()]
    off_cols = [c for c in df.columns if "offset" in c.lower()]
    if not dt_cols and not off_cols:
        print(f"  No datetime or offset in {name}, skipping")
        continue

    # merge() returns a new frame, so the table stored in `dataframes` is never
    # mutated below and no separate full-table copy is needed.
    df = df.merge(stays_for_event_windowing, on=id_col, how="left")

    if dt_cols:
        first_dt = dt_cols[0]
        # Overwrite the raw column with parsed datetimes so it is no longer an
        # object column (and therefore not picked up by the categorical step).
        df[first_dt] = pd.to_datetime(df[first_dt], errors="coerce")
        df["eventtime"] = df[first_dt]
    else:
        first_off = off_cols[0]
        df["eventtime"] = (
            df["admit_time"]
            + pd.to_timedelta(df[first_off].astype(float), unit="m")
        )

    # 2) Filter to first 24 h and bin into hours
    # Rows with a missing eventtime/admit_time fail both comparisons and are dropped.
    before = len(df)
    df = df.loc[
        (df["eventtime"] >= df["admit_time"]) &
        (df["eventtime"] <  df["admit_time"] + OBS_WINDOW)
    ].copy()
    after = len(df)
    print(f"  Kept {after}/{before} rows in first 24 h")

    # Whole bins elapsed since admission (0-based).
    df["hour_from_admit"] = (
        (df["eventtime"] - df["admit_time"])
        .dt.total_seconds()
        .floordiv(BIN_SIZE.total_seconds())
        .astype(int)
    )

    # 3a) Numeric aggregation
    # NOTE(review): every numeric column is aggregated, including identifier-like
    # ones (the saved column listing shows e.g. DATETIMEEVENTS_CGID_mean) —
    # confirm whether such columns should be excluded.
    numeric_cols = df.select_dtypes(include="number")\
                     .columns.difference([id_col,"hour_from_admit"])
    num_wide = None
    if len(numeric_cols):
        grp = df.groupby([id_col,"hour_from_admit"])[numeric_cols] \
                .agg(["mean","std","min","max","count"])
        # One column per (source column, statistic, hour); hours without events stay NaN.
        num_wide = grp.unstack(level="hour_from_admit", fill_value=np.nan)
        num_wide.columns = [
            f"{name}_{orig}_{stat}_h{hour}"
            for orig, stat, hour in num_wide.columns
        ]
        num_wide = num_wide.reset_index()

    # 3b) Categorical aggregation (most frequent value per stay and hour)
    cat_cols = df.select_dtypes(include=["object","category","bool"])\
                 .columns.difference([id_col,"hour_from_admit"])
    cat_wide = None
    if len(cat_cols):
        grp = df.groupby([id_col,"hour_from_admit"])[cat_cols] \
                .agg(_first_mode)
        cat_wide = grp.unstack(level="hour_from_admit", fill_value=pd.NA)
        cat_wide.columns = [
            f"{name}_{orig}_mode_h{hour}"
            for orig, hour in cat_wide.columns
        ]
        cat_wide = cat_wide.reset_index()

    # 4) Combine and prune all-NaN
    parts = [t for t in (num_wide, cat_wide) if t is not None]
    if not parts:
        print(f"  Skipping {name}: nothing to aggregate")
        continue

    hourly_wide = parts[0]
    for t in parts[1:]:
        hourly_wide = hourly_wide.merge(t, on=id_col, how="outer")

    keep = [c for c in hourly_wide.columns
            if c == id_col or not hourly_wide[c].isna().all()]
    hourly_wide = hourly_wide[keep]

    # 5) Merge into client_1
    # Columns already present in client_1 (other than the join key) are dropped
    # from the new block so the merge never produces _x/_y suffixed duplicates.
    overlap = (set(hourly_wide.columns) & set(client_1.columns)) - {id_col}
    if overlap:
        hourly_wide = hourly_wide.drop(columns=list(overlap))

    client_1 = client_1.merge(hourly_wide, on=id_col, how="left")
    merge_log.append((name, "hourly"))

print("\nMIMIC merged shape:", client_1.shape)
print("Merge log:", merge_log)

Skipping ADMISSIONS: no ICUSTAY_ID

Processing DATETIMEEVENTS.csv
  Kept 797085/4485937 rows in first 24 h

Processing ICUSTAYS.csv
  Kept 61532/61532 rows in first 24 h

Processing INPUTEVENTS_CV.csv
  Kept 3158746/17527935 rows in first 24 h

Processing INPUTEVENTS_MV.csv
  Kept 1010925/3618991 rows in first 24 h
Skipping LABEVENTS: no ICUSTAY_ID
Skipping MICROBIOLOGYEVENTS: no ICUSTAY_ID
Skipping NOTEEVENTS: no ICUSTAY_ID

Processing OUTPUTEVENTS.csv
  Kept 1045350/4349218 rows in first 24 h
Skipping PATIENTS: no ICUSTAY_ID

Processing PROCEDUREEVENTS_MV.csv
  Kept 143908/258066 rows in first 24 h
Skipping SERVICES: no ICUSTAY_ID

MIMIC merged shape: (61532, 7304)
Merge log: [('DATETIMEEVENTS', 'hourly'), ('ICUSTAYS', 'hourly'), ('INPUTEVENTS_CV', 'hourly'), ('INPUTEVENTS_MV', 'hourly'), ('OUTPUTEVENTS', 'hourly'), ('PROCEDUREEVENTS_MV', 'hourly')]


In [11]:
# Static patient attributes to attach to every ICU stay via SUBJECT_ID.
demo_cols = ["GENDER", "DOB", "DOD"]
patients = dataframes["PATIENTS"][["SUBJECT_ID", *demo_cols]].copy()

patients["DOB"] = pd.to_datetime(patients["DOB"], errors="coerce")
patients["DOD"] = pd.to_datetime(patients["DOD"], errors="coerce")

# Drop demographics left over from an earlier run of this cell before merging;
# otherwise a re-run would create GENDER_x/GENDER_y-style duplicates and the
# preview below would raise a KeyError. On a first run the drop is a no-op.
# validate="many_to_one" makes the merge fail loudly if PATIENTS ever contains
# a duplicated SUBJECT_ID, which would otherwise silently duplicate stays.
client_1 = client_1.drop(columns=demo_cols, errors="ignore").merge(
    patients, on="SUBJECT_ID", how="left", validate="many_to_one"
)

print("With demographics:", client_1.shape)
print(client_1[["SUBJECT_ID","GENDER","DOB","DOD"]].head())

With demographics: (61532, 7307)
   SUBJECT_ID GENDER        DOB        DOD
0         268      F 2132-02-21 2198-02-18
1         269      M 2130-09-30        NaT
2         270      M 2048-05-26        NaT
3         271      F 2074-11-30        NaT
4         272      M 2119-11-21        NaT


In [12]:
# Persist the merged table (stay identifiers, hourly event features and patient
# demographics) as CSV in the current working directory, without the index.
client_1.to_csv("client_1_ICUSTAY_raw_hour.csv", index=False)

In [12]:
# Display the merged frame.
# NOTE(review): the saved output below shows feature names without the
# `_h{hour}` suffix that the hourly aggregation cell generates, so it appears
# to come from an earlier version of the pipeline — re-run to refresh it.
client_1

Unnamed: 0,ICUSTAY_ID,SUBJECT_ID,INTIME,OUTTIME,DATETIMEEVENTS_CGID_mean,DATETIMEEVENTS_CGID_std,DATETIMEEVENTS_CGID_min,DATETIMEEVENTS_CGID_max,DATETIMEEVENTS_CGID_count,DATETIMEEVENTS_ERROR_mean,...,PROCEDUREEVENTS_MV_LOCATION_mode,PROCEDUREEVENTS_MV_LOCATIONCATEGORY_mode,PROCEDUREEVENTS_MV_ORDERCATEGORYDESCRIPTION_mode,PROCEDUREEVENTS_MV_ORDERCATEGORYNAME_mode,PROCEDUREEVENTS_MV_STATUSDESCRIPTION_mode,PROCEDUREEVENTS_MV_STORETIME_mode,PROCEDUREEVENTS_MV_VALUEUOM_mode,GENDER,DOB,DOD
0,280836,268,2198-02-14 23:27:38,2198-02-18 05:26:11,,,,,,,...,,,,,,,,F,2132-02-21 00:00:00,2198-02-18 00:00:00
1,206613,269,2170-11-05 11:05:29,2170-11-08 17:46:57,,,,,,,...,,,,,,,,M,2130-09-30 00:00:00,
2,220345,270,2128-06-24 15:05:20,2128-06-27 12:32:29,,,,,,,...,,,,,,,,M,2048-05-26 00:00:00,
3,249196,271,2120-08-07 23:12:42,2120-08-10 00:39:04,,,,,,,...,,,,,,,,F,2074-11-30 00:00:00,
4,210407,272,2186-12-25 21:08:04,2186-12-27 12:01:13,,,,,,,...,,,,,,,,M,2119-11-21 00:00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61527,201233,94944,2104-04-15 10:18:16,2104-04-17 14:51:00,19631.509804,2435.027485,15285.0,20971.0,51.0,0.0,...,Right IJ,Peripheral,Task,Invasive Lines,FinishedRunning,2104-04-16 05:50:00,min,M,2027-03-02 00:00:00,
61528,283653,94950,2155-12-08 05:33:16,2155-12-10 17:24:58,17206.933333,1609.722889,15858.0,20889.0,15.0,0.0,...,RL Ant Forearm,Peripheral - old,Electrolytes,Peripheral Lines,FinishedRunning,2155-12-08 09:26:00,min,F,1855-12-07 00:00:00,
61529,241585,94953,2160-03-03 16:09:11,2160-03-04 14:22:33,19442.363636,1039.553867,17873.0,20889.0,22.0,0.0,...,Left Cephalic Lower Arm,Invasive Arterial,Task,Invasive Lines,FinishedRunning,2160-03-04 10:52:00,min,F,2107-01-29 00:00:00,2162-01-05 00:00:00
61530,202802,94954,2183-03-25 09:53:10,2183-03-27 17:55:03,14903.763636,1284.098098,14332.0,20889.0,55.0,0.0,...,Right IJ,Invasive Venous,Electrolytes,Invasive Lines,FinishedRunning,2183-03-25 11:04:00,min,F,2115-05-23 00:00:00,


In [13]:
# NOTE(review): `client_1` is not modified between the previous export and this
# one, so this writes the same hourly table a second time under a file name
# without the `_hour` suffix — confirm this second file is still wanted.
client_1.to_csv("client_1_ICUSTAY_raw.csv", index=False)

In [14]:
# Check all column names.
# NOTE(review): the saved output below reports length=310, while the hourly
# merge cell reported 7304 columns before demographics were added — the stored
# output looks stale and should be regenerated.
client_1.columns

Index(['ICUSTAY_ID', 'SUBJECT_ID', 'INTIME', 'OUTTIME',
       'DATETIMEEVENTS_CGID_mean', 'DATETIMEEVENTS_CGID_std',
       'DATETIMEEVENTS_CGID_min', 'DATETIMEEVENTS_CGID_max',
       'DATETIMEEVENTS_CGID_count', 'DATETIMEEVENTS_ERROR_mean',
       ...
       'PROCEDUREEVENTS_MV_LOCATION_mode',
       'PROCEDUREEVENTS_MV_LOCATIONCATEGORY_mode',
       'PROCEDUREEVENTS_MV_ORDERCATEGORYDESCRIPTION_mode',
       'PROCEDUREEVENTS_MV_ORDERCATEGORYNAME_mode',
       'PROCEDUREEVENTS_MV_STATUSDESCRIPTION_mode',
       'PROCEDUREEVENTS_MV_STORETIME_mode', 'PROCEDUREEVENTS_MV_VALUEUOM_mode',
       'GENDER', 'DOB', 'DOD'],
      dtype='object', length=310)