## Show the tables in schema

In [2]:
import sys
import os
sys.path.append(os.path.abspath(".."))  # adding the parent directory of 'notebooks' to sys.path
from db_utils.db_setup import Database
from sqlalchemy import text
engine = Database()
schema_list = ["mimiciv_icu", "mimiciv_hosp"]
for schema in schema_list:
    print(f"Schema: {schema}")
    tables = Database.show_tables_in_schema(engine, schema)
    print(tables)

2025-06-21 18:57:21,045 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-06-21 18:57:21,046 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-06-21 18:57:21,047 INFO sqlalchemy.engine.Engine select current_schema()
2025-06-21 18:57:21,047 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-06-21 18:57:21,048 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-06-21 18:57:21,048 INFO sqlalchemy.engine.Engine [raw sql] {}
Schema: mimiciv_icu
2025-06-21 18:57:21,050 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-06-21 18:57:21,050 INFO sqlalchemy.engine.Engine SELECT table_name FROM information_schema.tables WHERE table_schema = %(schema)s
2025-06-21 18:57:21,050 INFO sqlalchemy.engine.Engine [generated in 0.00076s] {'schema': 'mimiciv_icu'}
['caregiver', 'chartevents', 'datetimeevents', 'd_items', 'icustays', 'ingredientevents', 'inputevents', 'outputevents', 'procedureevents']
Schema: mimiciv_hosp
2025-06-21 18:57:21,063 INFO sqlalchemy.engine.Engine SELECT 

# Getting all `labevents` data and filtering

##### Fetching `demographic` data

In [3]:
from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine, text
from psycopg2.extras import execute_values

# Load environment variables
load_dotenv()
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Create SQLAlchemy engine
url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)
conn = engine.connect()
cursor = conn.connection.cursor()

# Creating a TEMPORARY table
cursor.execute("""
    CREATE TEMP TABLE temp_cohort (
        subject_id INT,
        hadm_id INT,
        admittime TIMESTAMP,
        dischtime TIMESTAMP,
        target  INT
    );
""")

# Loading the CSV and insert into temp_cohort
cohort_df = pd.read_csv('../assets/cohort1_target.csv')
cohort_df['admittime'] = pd.to_datetime(cohort_df['admittime'], errors='coerce')
cohort_df['dischtime'] = pd.to_datetime(cohort_df['dischtime'], errors='coerce')

values = list(cohort_df.itertuples(index=False, name=None))
execute_values(cursor,
    "INSERT INTO temp_cohort (subject_id, hadm_id, admittime, dischtime, target) VALUES %s",
    values
)

# Fetching demographic data from admissions table
cursor.execute("""
    SELECT DISTINCT
        c.subject_id,
        c.hadm_id,
        c.admittime,
        c.dischtime,
        c.target,
        p.gender,
        p.anchor_age,
        a.race
    FROM temp_cohort c
    JOIN mimiciv_hosp.admissions a ON c.hadm_id = a.hadm_id
    JOIN mimiciv_hosp.patients p ON a.subject_id = p.subject_id

""")

rows = cursor.fetchall()

# Finally creating DataFrame 
columns = ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'target', 'gender', 'anchor_age', 'race']
final_df = pd.DataFrame(rows, columns=columns)
# Save the final DataFrame to a Parquet file
final_df['admittime'] = pd.to_datetime(final_df['admittime'], errors='coerce')
final_df['dischtime'] = pd.to_datetime(final_df['dischtime'], errors='coerce')
final_df['anchor_age'] = pd.to_numeric(final_df['anchor_age'], errors='coerce')
final_df['target'] = pd.to_numeric(final_df['target'], errors='coerce')
final_df.to_parquet("../dataset/raw/cohort_with_demographic_data.parquet", index=False)

# Finalize
conn.connection.commit()
cursor.close()
conn.close()

In [6]:
missing_ids = [
    (14634633, 28387252),
    (17957482, 20759114),
    (12956096, 27116694),
    (11101913, 27589462),
    (12593003, 25929337),
    (13474206, 21575924),
    (12956096, 21897330),
    (11338207, 23582798),
    (13474206, 24246939),
    (18215560, 22709370)
]


engine = create_engine(url)
conn = engine.connect()
cursor = conn.connection.cursor()
# Build a SQL-friendly string
id_tuple_str = ",".join(f"({sid},{hid})" for sid, hid in missing_ids)

query = f"""
    SELECT le.subject_id, le.hadm_id, COUNT(*) as lab_count
    FROM mimiciv_hosp.labevents le
    WHERE (le.subject_id, le.hadm_id) IN ({id_tuple_str})
    GROUP BY le.subject_id, le.hadm_id
"""

cursor.execute(query)
data = cursor.fetchall()
df = pd.DataFrame(data, columns=['subject_id', 'hadm_id', 'lab_count'])
df.head()


Unnamed: 0,subject_id,hadm_id,lab_count


In [40]:
# Load files
demog_df = pd.read_parquet("../dataset/raw/lab_event_data_with_demographics.parquet")
cohort_df = pd.read_csv('../assets/extracted.csv')

# Ensure consistent dtypes
cohort_df['subject_id'] = cohort_df['subject_id'].astype(int)
cohort_df['admid'] = cohort_df['admid'].astype(int)
demog_df['subject_id'] = demog_df['subject_id'].astype(int)
demog_df['hadm_id'] = demog_df['hadm_id'].astype(int)

# Create sets of (subject_id, hadm_id) pairs
cohort_pairs = set(zip(cohort_df['subject_id'], cohort_df['admid']))
demog_pairs = set(zip(demog_df['subject_id'], demog_df['hadm_id']))

# Identify missing pairs
missing_pairs = sorted(cohort_pairs - demog_pairs)

print(f"✅ Total missing (subject_id, hadm_id) pairs: {len(missing_pairs)}")
print("First 10 missing:")
for pair in missing_pairs[:10]:
    print(pair)


✅ Total missing (subject_id, hadm_id) pairs: 372
First 10 missing:
(10275673, 20335833)
(10470555, 23490435)
(10472840, 22711151)
(10540938, 22151813)
(10540938, 23239998)
(10540938, 24081400)
(10540938, 26471128)
(10540938, 26728577)
(10540938, 28150003)
(10540938, 28412117)


Unnamed: 0,itemid,label,valuenum,charttime,admid,subject_id,minute,hour,day
0,50861,AlanineAminotransferase(ALT),22.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1
1,50862,Albumin,3.9,2117-12-18 10:12:00,21586397,10010231,2481,41,1
2,50863,AlkalinePhosphatase,68.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1
3,50868,AnionGap,15.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1
4,50878,AsparateAminotransferase(AST),18.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1


In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
from tqdm import tqdm

# Replace with your actual DB connection URL
url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)
cohort_df = pd.read_csv('../assets/cohort1_target.csv')

cohort_df = cohort_df.rename(columns={'admid': 'hadm_id'})

dischtime_lookup = {
    (int(row.subject_id), int(row.hadm_id)): pd.to_datetime(row.dischtime)
    for row in cohort_df.itertuples(index=False)
}

no_lab = []
lab_but_not_in_window = []
lab_in_window = []

with engine.connect() as conn:
    for sid, hid in tqdm(missing_pairs, desc="Checking missing patients"):
        # 1. Check if any lab events exist at all
        query = text("""
            SELECT charttime FROM mimiciv_hosp.labevents 
            WHERE subject_id = :sid AND hadm_id = :hid
        """)
        result = conn.execute(query, {"sid": sid, "hid": hid}).fetchall()

        if not result:
            no_lab.append((sid, hid))
            continue

        # 2. Check if any lab event is within 7 days before discharge
        if (sid, hid) not in dischtime_lookup:
            print(f"⚠️ Missing dischtime for {sid}, {hid}")
            continue

        disch_time = dischtime_lookup[(sid, hid)]
        in_window = any(
            disch_time - pd.Timedelta(days=7) <= pd.to_datetime(r[0]) <= disch_time
            for r in result
        )

        if in_window:
            lab_in_window.append((sid, hid))
        else:
            lab_but_not_in_window.append((sid, hid))

# Summary
print("\n✅ Summary of Missing Patients Classification:")
print(f"Total missing patients checked: {len(missing_pairs)}")
print(f"➤ No lab events at all: {len(no_lab)}")
print(f"➤ Lab events, but none within 7-day window: {len(lab_but_not_in_window)}")
print(f"➤ Lab events in window (unexpectedly excluded): {len(lab_in_window)}")

# Optional: Save for inspection
pd.DataFrame(no_lab, columns=["subject_id", "hadm_id"]).to_csv("no_lab_events.csv", index=False)
pd.DataFrame(lab_but_not_in_window, columns=["subject_id", "hadm_id"]).to_csv("lab_outside_window.csv", index=False)


Checking missing patients:  31%|███       | 114/372 [2:04:02<4:46:45, 66.69s/it]

In [None]:
from sklearn.preprocessing import LabelEncoder
def map_race(race):
    if pd.isna(race):
        return 'Unknown or Not Reported'
    
    race = race.upper()
    
    if 'HISPANIC' in race or 'LATINO' in race or 'SOUTH AMERICAN' in race:
        return 'Hispanic or Latino'
    elif 'WHITE' in race:
        return 'White'
    elif 'BLACK' in race or 'AFRICAN' in race:
        return 'Black or African American'
    elif 'ASIAN' in race:
        return 'Asian'
    elif 'PACIFIC ISLANDER' in race or 'NATIVE HAWAIIAN' in race:
        return 'Native Hawaiian or Other Pacific Islander'
    elif 'AMERICAN INDIAN' in race or 'ALASKA NATIVE' in race:
        return 'American Indian or Alaska Native'
    elif 'DECLINED' in race or 'UNABLE' in race or 'UNKNOWN' in race:
        return 'Unknown or Not Reported'
    else:
        return 'Other'

le = LabelEncoder()
demog_df['race_grouped'] = demog_df['race'].apply(map_race)  # apply your earlier grouping
demog_df['race_target'] = le.fit_transform(demog_df['race_grouped'])


In [None]:
# unique ages and races
print("Unique ages:", final_df['anchor_age'].unique())
print("Unique races:", final_df['race'].unique())

##### Fetching labevents data prior `7` or `14` days

In [None]:
from loguru import logger
import pandas as pd

logger.info("Starting lab data extraction process.")

# First get all unique patient IDs
patient_ids = pd.read_sql("SELECT DISTINCT subject_id FROM public.temp_cohort ORDER BY subject_id", engine)
logger.info(f"Fetched {len(patient_ids)} unique patient IDs from temp_cohort.")

lab_df = pd.DataFrame()

batch_size = 100
total_batches = (len(patient_ids) + batch_size - 1) // batch_size
logger.info(f"Processing patient data in batches of {batch_size}, total batches: {total_batches}")

for i in range(0, len(patient_ids), batch_size):
    batch_num = i // batch_size + 1
    batch = patient_ids.iloc[i:i+batch_size]
    batch_list = tuple(batch['subject_id'])
    
    logger.info(f"Processing batch {batch_num}/{total_batches} with {len(batch)} patient IDs.")
    
    query = f"""
        SELECT DISTINCT
            le.subject_id, 
            le.hadm_id, 
            le.itemid, 
            le.charttime, 
            le.valuenum,
            tc.dischtime,
            tc.target
        FROM mimiciv_hosp.labevents le
        JOIN public.temp_cohort tc
          ON le.subject_id = tc.subject_id
         AND le.hadm_id = tc.hadm_id
        WHERE le.charttime BETWEEN (tc.dischtime - INTERVAL '7 days') AND tc.dischtime
        AND le.subject_id IN {batch_list}
    """
    
    chunk = pd.read_sql(query, engine)
    logger.info(f"Batch {batch_num} fetched {len(chunk)} lab event records.")
    
    lab_df = pd.concat([lab_df, chunk], ignore_index=True)

lab_df.reset_index(drop=True, inplace=True)
logger.info(f"Lab data extraction complete. Total records collected: {len(lab_df)}")
logger.info("Saving lab data to Parquet file.")
lab_df.to_parquet("../dataset/raw/lab_event_data_with_demographics.parquet", index=False)


# Pre-processing for tabular data

### Aggregating on an `hourly` basis

In [None]:
import polars as pl 
import numpy as np
import pandas as pd
patient_data_df = pd.read_parquet("../dataset/raw/lab_event_data_with_demographics.parquet")
sup_df = pd.read_csv("../assets/ts.csv")  
sup_extracted_df = pd.read_csv("../assets/extracted.csv")
cohort_df = pd.read_csv("../assets/cohort1_target.csv")
# len(patient_data_df)

In [None]:
len(patient_data_df['hadm_id'].unique()), len(sup_df), len(sup_extracted_df), len(cohort_df)

(4908, 239990, 1092498, 5308)

In [None]:
patient_data_df.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime,target,gender,anchor_age,race
0,10014354,26486158,50822,2148-09-01 18:47:00,3.8,2148-09-08 12:00:00,0,M,60,WHITE
1,10014354,26486158,50861,2148-09-01 14:05:00,7.0,2148-09-08 12:00:00,0,M,60,WHITE
2,10014354,26486158,50861,2148-09-02 00:00:00,8.0,2148-09-08 12:00:00,0,M,60,WHITE
3,10014354,26486158,50861,2148-09-02 04:29:00,7.0,2148-09-08 12:00:00,0,M,60,WHITE
4,10014354,26486158,50861,2148-09-02 07:37:00,7.0,2148-09-08 12:00:00,0,M,60,WHITE


In [None]:
# Read ts.csv (daily binned data per (admid, itemid))
ts_df = sup_df.copy()

# Set MultiIndex
ts_df.set_index(["admid", "itemid"], inplace=True)

# Unstack to reshape so each row = 1 admission
reshaped = ts_df.unstack(level=1)

# Flatten MultiIndex columns: (day, itemid) → itemid_day
reshaped.columns = [f"itemid_{item}_{day}d" for day, item in reshaped.columns]

# Reset index to make admid a column
reshaped = reshaped.reset_index()

# Impute missing values (example: fillna with 0 or use KNNImputer)
# Option 1: Fill NaNs with 0
imputed = reshaped.fillna(0)

# Option 2: KNN Imputer
# from sklearn.impute import KNNImputer
# imputer = KNNImputer(n_neighbors=5)
# imputed = pd.DataFrame(imputer.fit_transform(reshaped.iloc[:, 1:]), columns=reshaped.columns[1:])
# imputed.insert(0, 'admid', reshaped['admid'].values)

# imputed is now ready for model training


In [None]:
len(patient_data_df['hadm_id'].unique()) == len(sup_extracted_df['admid'].unique())  # Number of unique patients in the patient 

False

In [None]:
cohort_df['hadm_id'].unique()

In [None]:
# patient_data_df.drop_duplicates(subset=['hadm_id', 'subject_id', 'itemid'], inplace=True)
hadm_ids = set(patient_data_df['hadm_id'].unique())
admid_ids = set(sup_extracted_df['admid'].unique())

print("Only in patient_data_df:", hadm_ids - admid_ids)
print("Only in sup_df:", admid_ids - hadm_ids)


Only in patient_data_df: set()
Only in sup_df: {np.int64(28049409), np.int64(24512520), np.int64(29966344), np.int64(23732248), np.int64(26859546), np.int64(24348699), np.int64(25581599), np.int64(29601847), np.int64(23566395), np.int64(28069955), np.int64(24105031), np.int64(23582798), np.int64(23343183), np.int64(27936855), np.int64(27109464), np.int64(21479513), np.int64(24238176), np.int64(28002410), np.int64(29954155), np.int64(29978734), np.int64(21897330), np.int64(22063223), np.int64(27164792), np.int64(27801728), np.int64(26728577), np.int64(29429890), np.int64(28745870), np.int64(27752592), np.int64(26646673), np.int64(27390097), np.int64(24320151), np.int64(21457047), np.int64(29726889), np.int64(28928169), np.int64(21547199), np.int64(21080270), np.int64(28412117), np.int64(29712617), np.int64(28150003), np.int64(21575924), np.int64(27189494), np.int64(29303040), np.int64(28754176), np.int64(28461313), np.int64(27646210), np.int64(25872649), np.int64(20220172), np.int64(249

In [None]:
len(patient_data_df['hadm_id'].unique())  # Number of unique admissions in the patient data

4908

In [None]:
from sklearn.impute import KNNImputer


# Create a copy and convert timestamps
new_df = patient_data_df.copy()
new_df["charttime"] = pd.to_datetime(new_df["charttime"])
new_df["dischtime"] = pd.to_datetime(new_df["dischtime"])

# Calculate hours before discharge
new_df["hours_before_discharge"] = (new_df["dischtime"] - new_df["charttime"]).dt.total_seconds() / 3600

# Filter to 12-hour window (0 to 12 hours inclusive)
new_df_filtered = new_df[
    (new_df["hours_before_discharge"] >= 0) & (new_df["hours_before_discharge"] <= 168)
].copy()    

print(f"Processing {len(new_df_filtered)} records within 12-hour window...")

# creating 7 day bins
new_df_filtered["hour_bin"] = (np.floor(new_df_filtered["hours_before_discharge"]/6) + 1).astype(int)
new_df_filtered["hour_bin"] = new_df_filtered["hour_bin"].clip(upper=27)  # Cap at 7

# Create feature_id with hour bin
new_df_filtered["feature_id"] = (
    "itemid_" + 
    new_df_filtered["itemid"].astype(str) + 
    "_last_" + 
    new_df_filtered["hour_bin"].astype(str) + 
    "h"
)

# Pivot numeric features (mean aggregation)
numeric_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="valuenum",
    aggfunc="mean",
    # fill_value=np.nan,
)
# Pivot binary features (existence indicator)
new_df_filtered["has_measurement"] = 1
binary_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="has_measurement",
    aggfunc="max",  # 1 if any measurement exists
    fill_value=0,
)
binary_pivot.columns = [col + "_measured" for col in binary_pivot.columns]

# Step 3: Impute missing values using KNN
imputer = KNNImputer(n_neighbors=5)
ts_user_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_pivot),
    columns=numeric_pivot.columns,
    index=numeric_pivot.index
).reset_index()


# Get targets
targets = new_df_filtered[["hadm_id", "target"]].drop_duplicates().set_index("hadm_id")

# Combine features with targets (NO forward/backward fill)
numeric_features = numeric_pivot.join(targets).reset_index()
binary_features = binary_pivot.join(targets).reset_index()

print(f"Created numeric features: {numeric_features.shape}")
print(f"Created binary features: {binary_features.shape}")
print(f"Filtered down to {len(new_df_filtered)} rows from {len(new_df)}")
print(f"Number of unique hadm_ids: {new_df_filtered['hadm_id'].nunique()}")

In [None]:
numeric_features.head()

In [None]:
28 // 2

In [None]:
print("Supervisor shape:", ts_supervisor_final.shape)
print("User shape:", binary_features.shape)

# Optional: inspect overlapping features
common_columns = set(ts_supervisor_final.columns) & set(binary_features.columns)
print("Common features:", len(common_columns))

# Check hadm_id consistency
print("Same hadm_ids:", set(ts_supervisor_final['admid']) == set(binary_features['hadm_id']))


# Preprocessing for Temporal Data

In [None]:
import pandas as pd
patient_data_df = pd.read_parquet("../dataset/raw/lab_event_data_with_demographics.parquet")
len(patient_data_df)

1170853

In [None]:
temp_df = patient_data_df.copy()

# Drop unnecessary columns ["race", "gender", "anchor_age", "target"]
temp_df = temp_df.drop(columns=["race", "gender", "anchor_age", "target"])

# Then drop duplicates based on ["subject_id", "hadm_id", "itemid", "charttime"]
temp_df = temp_df.drop_duplicates(subset=["subject_id", "hadm_id", "itemid", "charttime"])

temp_df.head(10)


In [None]:
temp_df.shape

In [None]:
import numpy as np
def assign_time_bin(hours_before_discharge, window_hours=6):
    """Assign records to fixed time bins (e.g., 0-6h, 6-12h).
    Example: For a 6-hour window:
        0.5h → bin 0, 6.1h → bin 6, 23h → bin 18
    """
    return (np.floor(hours_before_discharge / window_hours) * window_hours)

In [None]:
import numpy as np
df = temp_df.copy()
max_window_days = 7
time_bin_hours = 12

# Convert charttime and dischtime to datetime
unique_items = df["itemid"].unique()
# creating a dictionary to map itemid to index 
# because the itemid can be large and sparse
inputdict = {item: idx for idx, item in enumerate(unique_items)}
n_features = len(inputdict)

# calculating hours before discharge and filter window
df["hours_before_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600
df = df[(df["hours_before_discharge"] >= 0) & 
        (df["hours_before_discharge"] <= max_window_days * 24)]

# 3. Assign time bins (aligned to discharge)
df['time_bin'] = (np.floor(df['hours_before_discharge'] / time_bin_hours) 
                    * time_bin_hours)

# Grouping by patient and time bin
grouped = df.sort_values(["subject_id", "hadm_id", "time_bin"])\
            .groupby(["subject_id", "hadm_id", "time_bin"])

# Initializing arrays to hold features, masking, timestamps, and patient IDs
n_timesteps = len(grouped)
x = np.zeros((n_features, n_timesteps))
masking = np.zeros_like(x)
timestamps = np.zeros(n_timesteps)
patient_ids = []

# Populating arrays to  hold features, masking, timestamps, and patient IDs
for i, ((subj_id, adm_id, time_bin), group) in enumerate(grouped):
    # get the time bin as a timestamp
    timestamps[i] = time_bin
    patient_ids.append(f"{subj_id}_{adm_id}")
    
    for _, row in group.iterrows():
        # get the feature index from the inputdict
        feat_idx = inputdict[row["itemid"]]
        # Fill the feature value and masking
        x[feat_idx, i] = row["valuenum"]
        # Set masking to 1 if the feature is present
        masking[feat_idx, i] = 1

# Calculating delta (time since last observation)
delta = np.zeros_like(x)
for i in range(1, n_timesteps):
    # calculate the time gap between the current and previous time bin
    time_gap = timestamps[i-1] - timestamps[i]  # Note: reversed for "before discharge"
    # if the previous time bin was missing, accumulate the time gap
    # else use the actual time gap
    delta[:, i] = np.where(
        masking[:, i-1] == 0,
        time_gap + delta[:, i-1],  # Accumulate if missing
        time_gap                   # Else use actual gap
    )

In [None]:
df.head(15)

In [None]:
delta[0, 0:10]  

In [None]:
masking[0, 0:10]  

In [None]:
x[3, 0:30] 

In [None]:
x.shape, masking.shape, delta.shape, timestamps.shape

In [None]:
np.unique(x)

In [None]:
import numpy as np
import pandas as pd

def df_to_x_m_d(df, max_window_days=7):
    """
    Convert DataFrame to GRU-D inputs (x, masking, delta), using raw `itemid` as indices.
    
    Args:
        df: DataFrame with columns ['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum', 'dischtime'].
        max_window_days: Maximum days before discharge to include.
    
    Returns:
        x: Feature matrix of shape (n_features, n_timesteps).
        masking: Binary mask of observed values (same shape as x).
        delta: Time gaps since last observation (same shape as x).
        timestamps: Hours since discharge for each timestep.
        ids: DataFrame with ['subject_id', 'hadm_id'] for each timestep.
    """

    # --- 1. Preprocess Timestamps ---
    df["hours_since_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600

    # Filter to keep only within the max window
    df = df[(df["hours_since_discharge"] >= 0) & (df["hours_since_discharge"] <= max_window_days * 24)].copy()

    # --- 2. Group by Patient and Time ---
    # Sort dataframe
    df = df.sort_values(by=["subject_id", "hadm_id", "charttime"])

    # Group by 'subject_id', 'hadm_id', 'charttime'
    grouped = df.groupby(["subject_id", "hadm_id", "charttime"])

    n_timesteps = len(grouped)

    # --- 3. Initialize Arrays ---
    n_features = df["itemid"].max() + 1  # Assumes itemids start at 0
    x = np.zeros((n_features, n_timesteps))
    masking = np.zeros_like(x)
    timestamps = np.zeros(n_timesteps)
    ids = []

    # --- 4. Populate x, masking, and timestamps ---
    for i, ((subj_id, adm_id, time), group) in enumerate(grouped):
        timestamps[i] = (time - group["dischtime"].iloc[0]).total_seconds() / 3600
        ids.append({"subject_id": subj_id, "hadm_id": adm_id})
        for _, row in group.iterrows():
            x[int(row["itemid"]), i] = row["valuenum"]
            masking[int(row["itemid"]), i] = 1

    # --- 5. Calculate delta ---
    delta = np.zeros_like(x)
    for i in range(1, n_timesteps):
        time_gap = timestamps[i] - timestamps[i-1]
        delta[:, i] = np.where(
            masking[:, i-1] == 0,
            time_gap + delta[:, i-1],  # Accumulate if previous value was missing
            time_gap                   # Else use actual time gap
        )

    return x, masking, delta, timestamps, pd.DataFrame(ids)


In [None]:
x, masking, delta, timestamps, ids = df_to_x_m_d(temp_df, max_window_days=7)

print("x shape:", x.shape)          # (max_itemid + 1, n_timesteps)
print("masking shape:", masking.shape)  # Same as x
print("delta shape:", delta.shape)    # Same as x
print("Timestamps (hours before discharge):", timestamps)
# print("Patient IDs:", ids)

In [None]:
ids.head()

In [None]:
timestamps # Display first 5 timestamps