# **This notebook acts as the dataloader of eICU**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import copy
from sklearn.metrics import mean_absolute_error, r2_score
import os
import glob

## **eICU**

In [None]:
path_client_2 = "../../Datasets/eICU_data"

def load_csv(name):
    file_path = os.path.join(path_client_2, f"{name}.csv")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")
    print(f"Loading {file_path} ...")
    return pd.read_csv(file_path, low_memory=False)

dataframes = {}

dataframes["admissionDx"] = load_csv("admissionDx")
dataframes["patient"] = load_csv("patient")
dataframes["diagnosis"] = load_csv("diagnosis")
dataframes["lab"] = load_csv("lab")
dataframes["medication"] = load_csv("medication")
dataframes["infusionDrug"] = load_csv("infusionDrug")
dataframes["intakeOutput"] = load_csv("intakeOutput")
dataframes["microLab"] = load_csv("microLab")
dataframes["nurseAssessment"] = load_csv("nurseAssessment")
dataframes["respiratoryCharting"] = load_csv("respiratoryCharting")
dataframes["customLab"] = load_csv("customLab")
# dataframes["carePlanGeneral"] = load_csv("carePlanGeneral")
# dataframes["nurseCharting"] = load_csv("nurseCharting")

# print("Loaded:", list(dataframes.keys()))

Loading ../../Datasets/eICU_data/admissionDx.csv ...
Loading ../../Datasets/eICU_data/patient.csv ...
Loading ../../Datasets/eICU_data/diagnosis.csv ...
Loading ../../Datasets/eICU_data/lab.csv ...
Loading ../../Datasets/eICU_data/medication.csv ...
Loading ../../Datasets/eICU_data/infusionDrug.csv ...
Loading ../../Datasets/eICU_data/intakeOutput.csv ...
Loading ../../Datasets/eICU_data/microLab.csv ...
Loading ../../Datasets/eICU_data/nurseAssessment.csv ...
Loading ../../Datasets/eICU_data/respiratoryCharting.csv ...
Loading ../../Datasets/eICU_data/customLab.csv ...
Loading ../../Datasets/eICU_data/carePlanGeneral.csv ...


Check here.

In [9]:
print(dataframes["patient"].columns.tolist())

['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age', 'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx', 'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset', 'hospitaladmitsource', 'hospitaldischargeyear', 'hospitaldischargetime24', 'hospitaldischargeoffset', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype', 'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype', 'admissionweight', 'dischargeweight', 'unitdischargetime24', 'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus', 'uniquepid']


In [None]:
OBS_WINDOW = pd.Timedelta(hours=24)
BIN_SIZE = pd.Timedelta(hours=1)
id_col = "patientunitstayid"
client_2 = dataframes["patient"].copy()
 
print(f"Initial client_2 shape from patient table: {client_2.shape}")
print(f"Columns in client_2 after loading patient table: {client_2.columns.tolist()}")

if "unitadmittime24" in client_2.columns:
    client_2["unitadmittime"] = pd.to_datetime(client_2["unitadmittime24"], errors="coerce")
if "unitdischargetime24" in client_2.columns:
    client_2["unitdischtime"] = pd.to_datetime(client_2["unitdischargetime24"], errors="coerce")

stays_for_event_windowing = client_2[[id_col, "unitadmittime"]].copy()

Initial client_2 shape from patient table: (200859, 29)
Columns in client_2 after loading patient table: ['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age', 'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx', 'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset', 'hospitaladmitsource', 'hospitaldischargeyear', 'hospitaldischargetime24', 'hospitaldischargeoffset', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype', 'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype', 'admissionweight', 'dischargeweight', 'unitdischargetime24', 'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus', 'uniquepid']


  client_2["unitadmittime"] = pd.to_datetime(client_2["unitadmittime24"], errors="coerce")
  client_2["unitdischtime"] = pd.to_datetime(client_2["unitdischargetime24"], errors="coerce")


In [11]:
client_2.shape[1]

31

In [12]:
dataframes["patient"]

Unnamed: 0,patientunitstayid,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,...,unitadmitsource,unitvisitnumber,unitstaytype,admissionweight,dischargeweight,unitdischargetime24,unitdischargeoffset,unitdischargelocation,unitdischargestatus,uniquepid
0,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,Direct Admit,1,admit,84.3,85.8,03:50:00,3596,Death,Expired,002-34851
1,141178,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,Emergency Department,1,admit,54.4,54.4,09:18:00,8,Step-Down Unit (SDU),Alive,002-33870
2,141179,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,ICU to SDU,2,stepdown/other,,60.4,19:20:00,2042,Home,Alive,002-33870
3,141194,128941,Male,68,Caucasian,73,92,"Sepsis, renal/UTI (including bladder)",180.3,18:18:40,...,Floor,1,admit,73.9,76.7,15:31:00,4813,Floor,Alive,002-5276
4,141196,128943,Male,71,Caucasian,67,109,,162.6,20:21:00,...,ICU to SDU,2,stepdown/other,,63.2,22:23:00,1463,Floor,Alive,002-37665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200854,3353235,2743084,Male,50,Caucasian,458,1109,"CHF, congestive heart failure",175.3,04:55:00,...,Emergency Department,1,admit,90.0,99.2,23:18:00,1069,Telemetry,Alive,035-16382
200855,3353237,2743086,Female,79,Caucasian,458,1106,"Embolus, pulmonary",162.6,01:45:00,...,Direct Admit,1,admit,78.4,81.4,23:08:00,1269,Step-Down Unit (SDU),Alive,035-751
200856,3353251,2743099,Male,73,African American,458,1104,Cardiac arrest (with or without respiratory ar...,177.8,12:51:00,...,Emergency Department,1,admit,102.0,96.2,23:16:00,16259,Telemetry,Alive,035-5166
200857,3353254,2743102,Male,81,Caucasian,459,1108,"Bleeding, lower GI",185.4,07:43:00,...,Emergency Department,1,admit,83.9,92.9,19:25:00,431,Step-Down Unit (SDU),Alive,035-19511


This is a sanity check, here 31 is the number of columns in client 2. We just have the 29 columns from original patient csv file, and now we added 2 new columns from the "unitadmittime24" and "unitdischargetime24" columns - because we use these two give us the 24 hour window.

In [13]:
for name, df in dataframes.items():
    print(name, df.columns.tolist())

admissionDx ['admissiondxid', 'patientunitstayid', 'admitdxenteredoffset', 'admitdxpath', 'admitdxname', 'admitdxtext']
patient ['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age', 'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx', 'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset', 'hospitaladmitsource', 'hospitaldischargeyear', 'hospitaldischargetime24', 'hospitaldischargeoffset', 'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype', 'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype', 'admissionweight', 'dischargeweight', 'unitdischargetime24', 'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus', 'uniquepid']
diagnosis ['diagnosisid', 'patientunitstayid', 'activeupondischarge', 'diagnosisoffset', 'diagnosisstring', 'icd9code', 'diagnosispriority']
lab ['labid', 'patientunitstayid', 'labresultoffset', 'labtypeid', 'labname', 'labresult', 'labresulttext', 'labmeasurenamesystem', 'labmeasurenameinter

In [14]:
merge_log = []

for name, df in dataframes.items():
    if name == "patient":
        continue
    if id_col not in df.columns:
        print(f"Skipping {name}: no {id_col}")
        continue

    print(f"\nProcessing {name}.csv")
    df = df.copy()

    # 1) Reconstruct eventtime from either datetime or offset
    dt_cols  = [c for c in df.columns if "datetime" in c.lower()]
    off_cols = [c for c in df.columns if "offset"   in c.lower()]

    if dt_cols:
        # use the first datetime column
        df[dt_cols[0]] = pd.to_datetime(df[dt_cols[0]], errors="coerce")
        df = df.merge(stays_for_event_windowing, on=id_col, how="left")
        df["eventtime"] = df[dt_cols[0]]

    elif off_cols:
        # use the first offset column (minutes since admit)
        df = df.merge(stays_for_event_windowing, on=id_col, how="left")
        df["eventtime"] = (
            df["unitadmittime"]
            + pd.to_timedelta(df[off_cols[0]].astype(float), unit="m")
        )

    else:
        print(f"  No datetime or offset in {name}, skipping")
        continue

    # 2) Filter to the first 24 h and assign hour bins 0…23
    before = len(df)
    df = df.loc[
        (df["eventtime"] >= df["unitadmittime"]) &
        (df["eventtime"] <  df["unitadmittime"] + OBS_WINDOW)
    ].copy()
    after = len(df)
    print(f"  Kept {after}/{before} rows in first 24 h")

    df["hour_from_intime"] = (
        (df["eventtime"] - df["unitadmittime"])
        .dt.total_seconds()
        .floordiv(BIN_SIZE.total_seconds())
        .astype(int)
    )

    # 3) Aggregate per (patientunitstayid, hour_from_intime)
    # 3a) Numeric stats
    numeric_cols = df.select_dtypes(include="number") \
                     .columns.difference([id_col, "hour_from_intime"])
    if len(numeric_cols):
        num_grp = df.groupby([id_col, "hour_from_intime"])[numeric_cols] \
                    .agg(["mean","std","min","max","count"])
        num_wide = num_grp.unstack(level="hour_from_intime", fill_value=np.nan)
        num_wide.columns = [
            f"{name}_{orig}_{stat}_h{hour}"
            for orig, stat, hour in num_wide.columns
        ]
        num_wide = num_wide.reset_index()
    else:
        num_wide = None

    # 3b) Categorical mode
    cat_cols = df.select_dtypes(include=["object","category","bool"]) \
                 .columns.difference([id_col, "hour_from_intime"])
    if len(cat_cols):
        cat_grp = df.groupby([id_col, "hour_from_intime"])[cat_cols] \
                    .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else pd.NA)
        cat_wide = cat_grp.unstack(level="hour_from_intime", fill_value=pd.NA)
        cat_wide.columns = [
            f"{name}_{orig}_mode_h{hour}"
            for orig, hour in cat_wide.columns
        ]
        cat_wide = cat_wide.reset_index()
    else:
        cat_wide = None

    # 4) Merge the numeric & categorical wide tables and drop all-NaN columns
    parts = [t for t in (num_wide, cat_wide) if t is not None]
    if not parts:
        print(f"  Skipping {name}: nothing to aggregate")
        continue

    hourly_wide = parts[0]
    for t in parts[1:]:
        hourly_wide = hourly_wide.merge(t, on=id_col, how="outer")

    keep_cols = [c for c in hourly_wide.columns
                 if c == id_col or not hourly_wide[c].isna().all()]
    hourly_wide = hourly_wide[keep_cols]

    # 5) Final merge into client_2
    client_2 = client_2.merge(hourly_wide, on=id_col, how="left")
    merge_log.append((name, "hourly"))

print("\nFinal shape:", client_2.shape)
print("Merge log:", merge_log)


Processing admissionDx.csv
  Kept 572006/626858 rows in first 24 h

Processing diagnosis.csv
  Kept 1098773/2710672 rows in first 24 h

Processing lab.csv
  Kept 8733243/39132531 rows in first 24 h

Processing medication.csv
  Kept 4152212/7301853 rows in first 24 h

Processing infusionDrug.csv
  Kept 1349234/4803719 rows in first 24 h

Processing intakeOutput.csv
  Kept 2903401/12030289 rows in first 24 h

Processing microLab.csv
  Kept 4240/16996 rows in first 24 h

Processing nurseAssessment.csv
  Kept 4232705/15602498 rows in first 24 h

Processing respiratoryCharting.csv
  Kept 4710117/20168176 rows in first 24 h

Processing customLab.csv
  Kept 390/1082 rows in first 24 h

Processing carePlanGeneral.csv
  Kept 2048636/3115018 rows in first 24 h

Final shape: (200859, 6231)
Merge log: [('admissionDx', 'hourly'), ('diagnosis', 'hourly'), ('lab', 'hourly'), ('medication', 'hourly'), ('infusionDrug', 'hourly'), ('intakeOutput', 'hourly'), ('microLab', 'hourly'), ('nurseAssessment', 

In [18]:
client_2.shape

(200859, 6231)

In [19]:
client_2

Unnamed: 0,patientunitstayid,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,...,carePlanGeneral_cplitemvalue_mode_h14,carePlanGeneral_cplitemvalue_mode_h15,carePlanGeneral_cplitemvalue_mode_h16,carePlanGeneral_cplitemvalue_mode_h17,carePlanGeneral_cplitemvalue_mode_h18,carePlanGeneral_cplitemvalue_mode_h19,carePlanGeneral_cplitemvalue_mode_h20,carePlanGeneral_cplitemvalue_mode_h21,carePlanGeneral_cplitemvalue_mode_h22,carePlanGeneral_cplitemvalue_mode_h23
0,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,,,,,,,,,,
1,141178,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,,,,,,,,,,
2,141179,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,,,,,,,,,,
3,141194,128941,Male,68,Caucasian,73,92,"Sepsis, renal/UTI (including bladder)",180.3,18:18:40,...,,,,,,,,,,Combined device and drug therapy
4,141196,128943,Male,71,Caucasian,67,109,,162.6,20:21:00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200854,3353235,2743084,Male,50,Caucasian,458,1109,"CHF, congestive heart failure",175.3,04:55:00,...,,,,,,,,,,
200855,3353237,2743086,Female,79,Caucasian,458,1106,"Embolus, pulmonary",162.6,01:45:00,...,,,,,,,,,,
200856,3353251,2743099,Male,73,African American,458,1104,Cardiac arrest (with or without respiratory ar...,177.8,12:51:00,...,,,,,,Continuous infusion - with daily holiday,,,,
200857,3353254,2743102,Male,81,Caucasian,459,1108,"Bleeding, lower GI",185.4,07:43:00,...,,,,,,,,,,


In [22]:
# optional: save
client_2.to_csv("client_2_raw_hour.csv", index=False)
print("Saved client_2_raw_hour.csv")

Saved client_2_raw_hour.csv
