## Show the tables in schema

In [13]:
import sys
import os
sys.path.append(os.path.abspath(".."))  # adding the parent directory of 'notebooks' to sys.path
from db_utils.db_setup import Database
from sqlalchemy import text
engine = Database()
schema_list = ["mimiciv_icu", "mimiciv_hosp"]
for schema in schema_list:
    print(f"Schema: {schema}")
    tables = Database.show_tables_in_schema(engine, schema)
    print(tables)

2025-06-13 14:23:13,287 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-06-13 14:23:13,287 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-06-13 14:23:13,288 INFO sqlalchemy.engine.Engine select current_schema()
2025-06-13 14:23:13,288 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-06-13 14:23:13,289 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-06-13 14:23:13,289 INFO sqlalchemy.engine.Engine [raw sql] {}
Schema: mimiciv_icu
2025-06-13 14:23:13,291 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-06-13 14:23:13,291 INFO sqlalchemy.engine.Engine SELECT table_name FROM information_schema.tables WHERE table_schema = %(schema)s
2025-06-13 14:23:13,291 INFO sqlalchemy.engine.Engine [generated in 0.00060s] {'schema': 'mimiciv_icu'}
['caregiver', 'chartevents', 'datetimeevents', 'd_items', 'icustays', 'ingredientevents', 'inputevents', 'outputevents', 'procedureevents']
Schema: mimiciv_hosp
2025-06-13 14:23:13,295 INFO sqlalchemy.engine.Engine SELECT 

# Getting all `labevents` data and filtering

##### Fetching `demographic` data

In [None]:
from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine, text
from psycopg2.extras import execute_values

# Load environment variables
load_dotenv()
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Create SQLAlchemy engine
url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)
conn = engine.connect()
cursor = conn.connection.cursor()

# Creating a TEMPORARY table
cursor.execute("""
    CREATE TEMP TABLE temp_cohort (
        subject_id INT,
        hadm_id INT,
        admittime TIMESTAMP,
        dischtime TIMESTAMP,
        target  INT
    );
""")

# Loading the CSV and insert into temp_cohort
cohort_df = pd.read_csv('../assets/cohort1_target.csv')
cohort_df['admittime'] = pd.to_datetime(cohort_df['admittime'], errors='coerce')
cohort_df['dischtime'] = pd.to_datetime(cohort_df['dischtime'], errors='coerce')

values = list(cohort_df.itertuples(index=False, name=None))
execute_values(cursor,
    "INSERT INTO temp_cohort (subject_id, hadm_id, admittime, dischtime, target) VALUES %s",
    values
)

# Fetching demographic data from admissions table
cursor.execute("""
    SELECT 
        c.subject_id,
        c.hadm_id,
        c.admittime,
        c.dischtime,
        c.target,
        p.gender,
        p.anchor_age,
        a.race
    FROM temp_cohort c
    JOIN mimiciv_hosp.admissions a ON c.hadm_id = a.hadm_id
    JOIN mimiciv_hosp.patients p ON a.subject_id = p.subject_id

""")

rows = cursor.fetchall()

# Finally creating DataFrame 
columns = ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'target', 'gender', 'anchor_age', 'race']
final_df = pd.DataFrame(rows, columns=columns)
# Save the final DataFrame to a Parquet file
final_df['admittime'] = pd.to_datetime(final_df['admittime'], errors='coerce')
final_df['dischtime'] = pd.to_datetime(final_df['dischtime'], errors='coerce')
final_df['anchor_age'] = pd.to_numeric(final_df['anchor_age'], errors='coerce')
final_df['target'] = pd.to_numeric(final_df['target'], errors='coerce')
final_df.to_parquet("../dataset/raw/cohort_with_demographic_data.parquet", index=False)

# Finalize
conn.connection.commit()
cursor.close()
conn.close()

In [4]:
demog_df = pd.read_parquet("../dataset/raw/cohort_with_demographic_data.parquet")
demog_df.head(5)  # Display the first 5 rows of the raw Parquet file

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,target,gender,anchor_age,race
0,10010231,23835132,2118-04-02 11:54:00,2118-04-07 11:26:00,0,M,57,HISPANIC/LATINO - GUATEMALAN
1,10010231,23835132,2118-04-02 11:54:00,2118-04-07 11:26:00,0,M,57,HISPANIC/LATINO - GUATEMALAN
2,10010231,23835132,2118-04-02 11:54:00,2118-04-07 11:26:00,0,M,57,HISPANIC/LATINO - GUATEMALAN
3,10010231,24995642,2118-02-21 13:30:00,2118-02-26 16:50:00,0,M,57,HISPANIC/LATINO - GUATEMALAN
4,10010231,24995642,2118-02-21 13:30:00,2118-02-26 16:50:00,0,M,57,HISPANIC/LATINO - GUATEMALAN


In [14]:
demog_df['gender'].unique()  # Check unique values in the

array(['M', 'F'], dtype=object)

In [5]:
demog_df['anchor_age'].max(), demog_df['anchor_age'].min()

(np.int64(91), np.int64(18))

In [6]:
from sklearn.preprocessing import LabelEncoder
def map_race(race):
    if pd.isna(race):
        return 'Unknown or Not Reported'
    
    race = race.upper()
    
    if 'HISPANIC' in race or 'LATINO' in race or 'SOUTH AMERICAN' in race:
        return 'Hispanic or Latino'
    elif 'WHITE' in race:
        return 'White'
    elif 'BLACK' in race or 'AFRICAN' in race:
        return 'Black or African American'
    elif 'ASIAN' in race:
        return 'Asian'
    elif 'PACIFIC ISLANDER' in race or 'NATIVE HAWAIIAN' in race:
        return 'Native Hawaiian or Other Pacific Islander'
    elif 'AMERICAN INDIAN' in race or 'ALASKA NATIVE' in race:
        return 'American Indian or Alaska Native'
    elif 'DECLINED' in race or 'UNABLE' in race or 'UNKNOWN' in race:
        return 'Unknown or Not Reported'
    else:
        return 'Other'

le = LabelEncoder()
demog_df['race_grouped'] = demog_df['race'].apply(map_race)  # apply your earlier grouping
demog_df['race_target'] = le.fit_transform(demog_df['race_grouped'])


In [6]:
# unique ages and races
print("Unique ages:", final_df['anchor_age'].unique())
print("Unique races:", final_df['race'].unique())

Unique ages: [57 58 60 72 59 73 75 74 41 61 65 45 71 78 24 50 77 63 69 91 44 42 76 84
 56 67 55 80 46 68 47 32 53 33 52 48 30 85 66 83 87 64 81 36 26 79 28 43
 70 27 62 25 49 54 89 21 20 82 34 51 40 29 31 86 38 23 88 39 35 22 37 18
 19]
Unique races: ['HISPANIC/LATINO - GUATEMALAN' 'WHITE' 'BLACK/AFRICAN AMERICAN' 'OTHER'
 'ASIAN - CHINESE' 'ASIAN - SOUTH EAST ASIAN' 'ASIAN' 'UNKNOWN'
 'WHITE - OTHER EUROPEAN' 'UNABLE TO OBTAIN' 'PATIENT DECLINED TO ANSWER'
 'WHITE - RUSSIAN' 'SOUTH AMERICAN' 'WHITE - BRAZILIAN'
 'HISPANIC/LATINO - DOMINICAN' 'BLACK/AFRICAN' 'PORTUGUESE'
 'HISPANIC/LATINO - PUERTO RICAN' 'BLACK/CAPE VERDEAN'
 'HISPANIC/LATINO - HONDURAN' 'HISPANIC/LATINO - CENTRAL AMERICAN'
 'BLACK/CARIBBEAN ISLAND' 'ASIAN - ASIAN INDIAN'
 'WHITE - EASTERN EUROPEAN' 'HISPANIC/LATINO - COLUMBIAN'
 'HISPANIC/LATINO - SALVADORAN' 'HISPANIC/LATINO - CUBAN'
 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' 'ASIAN - KOREAN'
 'HISPANIC/LATINO - MEXICAN' 'AMERICAN INDIAN/ALASKA NATIVE']


##### Fetching labevents data prior `7` or `14` days

In [72]:
raw_parquet = pd.read_parquet("../dataset/raw/lab_events_7_days_prior.parquet")
raw_parquet.head(5)  # Display the first 5 rows of the raw Parquet file

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime,target,gender,anchor_age,race
0,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
1,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
2,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
3,10010231,21586397,51678,2117-12-19 06:20:00,6.0,2117-12-23 16:51:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
4,10010231,21586397,51678,2117-12-19 06:20:00,6.0,2117-12-23 16:51:00,1,M,57,HISPANIC/LATINO - GUATEMALAN


In [73]:
len(raw_parquet.drop_duplicates(subset=['hadm_id']))  # Display the number of rows in the raw Parquet file


4908

In [None]:
# First get all unique patient IDs
patient_ids = pd.read_sql("SELECT DISTINCT subject_id FROM public.temp_cohort ORDER BY subject_id", engine)

lab_df = pd.DataFrame()

# Process in patient batches
batch_size = 100
for i in range(0, len(patient_ids), batch_size):
    batch = patient_ids.iloc[i:i+batch_size]
    batch_list = tuple(batch['subject_id'])
    
    query = f"""
        SELECT 
            le.subject_id, 
            le.hadm_id, 
            le.itemid, 
            le.charttime, 
            le.valuenum,
            tc.dischtime,
            tc.target
        FROM mimiciv_hosp.labevents le
        JOIN public.temp_cohort tc
          ON le.subject_id = tc.subject_id
         AND le.hadm_id = tc.hadm_id
        WHERE le.charttime BETWEEN (tc.dischtime - INTERVAL '7 days') AND tc.dischtime
        AND le.subject_id IN {batch_list}
    """
    
    chunk = pd.read_sql(query, engine)
    # Process your chunk
    lab_df = pd.concat([lab_df, chunk], ignore_index=True)
# Reset index after concat
lab_df.reset_index(drop=True, inplace=True)

# Pre-processing for tabular data

### Aggregating on an `hourly` basis

In [None]:
import polars as pl 
import numpy as np
import pandas as pd
patient_data_df = pd.read_parquet("../dataset/raw/lab_events_7_days_prior.parquet")
sup_df = pd.read_csv("../assets/ts.csv")  
sup_extracted_df = pd.read_csv("../assets/extracted.csv")
cohort_df = pd.read_csv("../assets/cohort1_target.csv")
# len(patient_data_df)

In [40]:
# Read ts.csv (daily binned data per (admid, itemid))
ts_df = sup_df.copy()

# Set MultiIndex
ts_df.set_index(["admid", "itemid"], inplace=True)

# Unstack to reshape so each row = 1 admission
reshaped = ts_df.unstack(level=1)

# Flatten MultiIndex columns: (day, itemid) → itemid_day
reshaped.columns = [f"itemid_{item}_{day}d" for day, item in reshaped.columns]

# Reset index to make admid a column
reshaped = reshaped.reset_index()

# Impute missing values (example: fillna with 0 or use KNNImputer)
# Option 1: Fill NaNs with 0
imputed = reshaped.fillna(0)

# Option 2: KNN Imputer
# from sklearn.impute import KNNImputer
# imputer = KNNImputer(n_neighbors=5)
# imputed = pd.DataFrame(imputer.fit_transform(reshaped.iloc[:, 1:]), columns=reshaped.columns[1:])
# imputed.insert(0, 'admid', reshaped['admid'].values)

# imputed is now ready for model training


In [66]:
len(patient_data_df['hadm_id'].unique()) == len(cohort_df['hadm_id'].unique())  # Number of unique patients in the patient 

False

In [68]:
cohort_df['hadm_id'].unique()

array([21586397, 29368887, 24995642, ..., 25127296, 23826564, 21364559],
      shape=(5308,))

In [64]:
# patient_data_df.drop_duplicates(subset=['hadm_id', 'subject_id', 'itemid'], inplace=True)
hadm_ids = set(patient_data_df['hadm_id'].unique())
admid_ids = set(cohort_df['hadm_id'].unique())

print("Only in patient_data_df:", hadm_ids - admid_ids)
print("Only in sup_df:", admid_ids - hadm_ids)


Only in patient_data_df: set()
Only in sup_df: {np.int64(28049409), np.int64(24512520), np.int64(29966344), np.int64(23732248), np.int64(26859546), np.int64(24348699), np.int64(25581599), np.int64(29601847), np.int64(23566395), np.int64(28069955), np.int64(24105031), np.int64(23582798), np.int64(23343183), np.int64(28848211), np.int64(27936855), np.int64(27109464), np.int64(21479513), np.int64(29130844), np.int64(24238176), np.int64(28002410), np.int64(29954155), np.int64(29978734), np.int64(21897330), np.int64(22063223), np.int64(27164792), np.int64(20949120), np.int64(27801728), np.int64(29429890), np.int64(26728577), np.int64(28745870), np.int64(27752592), np.int64(26646673), np.int64(27390097), np.int64(24320151), np.int64(21457047), np.int64(29808799), np.int64(29726889), np.int64(28928169), np.int64(21547199), np.int64(21080270), np.int64(28412117), np.int64(29712617), np.int64(29458668), np.int64(28150003), np.int64(21575924), np.int64(27189494), np.int64(29303040), np.int64(287

In [51]:
patient_data_df.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime,target,gender,anchor_age,race
0,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
1,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
2,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
3,10010231,21586397,51678,2117-12-19 06:20:00,6.0,2117-12-23 16:51:00,1,M,57,HISPANIC/LATINO - GUATEMALAN
4,10010231,21586397,51678,2117-12-19 06:20:00,6.0,2117-12-23 16:51:00,1,M,57,HISPANIC/LATINO - GUATEMALAN


In [35]:
from sklearn.impute import KNNImputer


# Create a copy and convert timestamps
new_df = patient_data_df.copy()
new_df["charttime"] = pd.to_datetime(new_df["charttime"])
new_df["dischtime"] = pd.to_datetime(new_df["dischtime"])

# Calculate hours before discharge
new_df["hours_before_discharge"] = (new_df["dischtime"] - new_df["charttime"]).dt.total_seconds() / 3600

# Filter to 12-hour window (0 to 12 hours inclusive)
new_df_filtered = new_df[
    (new_df["hours_before_discharge"] >= 0) & (new_df["hours_before_discharge"] <= 168)
].copy()    

print(f"Processing {len(new_df_filtered)} records within 12-hour window...")

# creating 7 day bins
new_df_filtered["hour_bin"] = (np.floor(new_df_filtered["hours_before_discharge"]/6) + 1).astype(int)
new_df_filtered["hour_bin"] = new_df_filtered["hour_bin"].clip(upper=27)  # Cap at 7

# Create feature_id with hour bin
new_df_filtered["feature_id"] = (
    "itemid_" + 
    new_df_filtered["itemid"].astype(str) + 
    "_last_" + 
    new_df_filtered["hour_bin"].astype(str) + 
    "h"
)

# Pivot numeric features (mean aggregation)
numeric_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="valuenum",
    aggfunc="mean",
    # fill_value=np.nan,
)
# Pivot binary features (existence indicator)
new_df_filtered["has_measurement"] = 1
binary_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="has_measurement",
    aggfunc="max",  # 1 if any measurement exists
    fill_value=0,
)
binary_pivot.columns = [col + "_measured" for col in binary_pivot.columns]

# Step 3: Impute missing values using KNN
imputer = KNNImputer(n_neighbors=5)
ts_user_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_pivot),
    columns=numeric_pivot.columns,
    index=numeric_pivot.index
).reset_index()


# Get targets
targets = new_df_filtered[["hadm_id", "target"]].drop_duplicates().set_index("hadm_id")

# Combine features with targets (NO forward/backward fill)
numeric_features = numeric_pivot.join(targets).reset_index()
binary_features = binary_pivot.join(targets).reset_index()

print(f"Created numeric features: {numeric_features.shape}")
print(f"Created binary features: {binary_features.shape}")
print(f"Filtered down to {len(new_df_filtered)} rows from {len(new_df)}")
print(f"Number of unique hadm_ids: {new_df_filtered['hadm_id'].nunique()}")

Processing 10537677 records within 12-hour window...
Created numeric features: (4907, 5976)
Created binary features: (4908, 8370)
Filtered down to 10537677 rows from 10537677
Number of unique hadm_ids: 4908


In [36]:
numeric_features.head()

Unnamed: 0,hadm_id,itemid_50801_last_12h,itemid_50801_last_27h,itemid_50802_last_10h,itemid_50802_last_11h,itemid_50802_last_12h,itemid_50802_last_13h,itemid_50802_last_14h,itemid_50802_last_15h,itemid_50802_last_16h,...,itemid_53174_last_27h,itemid_53174_last_2h,itemid_53174_last_3h,itemid_53174_last_4h,itemid_53174_last_5h,itemid_53174_last_7h,itemid_53174_last_8h,itemid_53174_last_9h,itemid_53180_last_8h,target
0,20004072,,,,,,,,,,...,,,,,,,,,,1
1,20004811,,,,,,,,,,...,,,,,,,,,,0
2,20006731,,,,,,,,,,...,,,,,,,,,,0
3,20008395,,,,,,,,,,...,,,,,,,,,,0
4,20010041,,,,,,,,,,...,,,,,,,,,,0


In [47]:
28 // 2

14

In [18]:
print("Supervisor shape:", ts_supervisor_final.shape)
print("User shape:", binary_features.shape)

# Optional: inspect overlapping features
common_columns = set(ts_supervisor_final.columns) & set(binary_features.columns)
print("Common features:", len(common_columns))

# Check hadm_id consistency
print("Same hadm_ids:", set(ts_supervisor_final['admid']) == set(binary_features['hadm_id']))


Supervisor shape: (5280, 2990)
User shape: (3099, 2036)
Common features: 0
Same hadm_ids: False


# Preprocessing for Temporal Data

In [None]:
import pandas as pd
patient_data_df = pd.read_parquet("../dataset/raw/lab_events_7_days_prior.parquet")
len(patient_data_df)

10537677

In [32]:
temp_df = patient_data_df.copy()

# Drop unnecessary columns ["race", "gender", "anchor_age", "target"]
temp_df = temp_df.drop(columns=["race", "gender", "anchor_age", "target"])

# Then drop duplicates based on ["subject_id", "hadm_id", "itemid", "charttime"]
temp_df = temp_df.drop_duplicates(subset=["subject_id", "hadm_id", "itemid", "charttime"])

temp_df.head(10)


Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime
0,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00
3,10010231,21586397,51678,2117-12-19 06:20:00,6.0,2117-12-23 16:51:00
6,10010231,21586397,50861,2117-12-18 10:12:00,22.0,2117-12-23 16:51:00
9,10010231,21586397,50862,2117-12-18 10:12:00,3.9,2117-12-23 16:51:00
12,10010231,21586397,50863,2117-12-18 10:12:00,68.0,2117-12-23 16:51:00
15,10010231,21586397,50868,2117-12-18 10:12:00,15.0,2117-12-23 16:51:00
18,10010231,21586397,50878,2117-12-18 10:12:00,18.0,2117-12-23 16:51:00
21,10010231,21586397,50882,2117-12-18 10:12:00,23.0,2117-12-23 16:51:00
24,10010231,21586397,50885,2117-12-18 10:12:00,0.2,2117-12-23 16:51:00
27,10010231,21586397,50893,2117-12-18 10:12:00,9.2,2117-12-23 16:51:00


In [36]:
temp_df.shape

(1167600, 6)

In [33]:
import numpy as np
def assign_time_bin(hours_before_discharge, window_hours=6):
    """Assign records to fixed time bins (e.g., 0-6h, 6-12h).
    Example: For a 6-hour window:
        0.5h → bin 0, 6.1h → bin 6, 23h → bin 18
    """
    return (np.floor(hours_before_discharge / window_hours) * window_hours)

In [None]:
import numpy as np
df = temp_df.copy()
max_window_days = 7
time_bin_hours = 12

# Convert charttime and dischtime to datetime
unique_items = df["itemid"].unique()
# creating a dictionary to map itemid to index 
# because the itemid can be large and sparse
inputdict = {item: idx for idx, item in enumerate(unique_items)}
n_features = len(inputdict)

# calculating hours before discharge and filter window
df["hours_before_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600
df = df[(df["hours_before_discharge"] >= 0) & 
        (df["hours_before_discharge"] <= max_window_days * 24)]

# 3. Assign time bins (aligned to discharge)
df['time_bin'] = (np.floor(df['hours_before_discharge'] / time_bin_hours) 
                    * time_bin_hours)

# Grouping by patient and time bin
grouped = df.sort_values(["subject_id", "hadm_id", "time_bin"])\
            .groupby(["subject_id", "hadm_id", "time_bin"])

# Initializing arrays to hold features, masking, timestamps, and patient IDs
n_timesteps = len(grouped)
x = np.zeros((n_features, n_timesteps))
masking = np.zeros_like(x)
timestamps = np.zeros(n_timesteps)
patient_ids = []

# Populating arrays to  hold features, masking, timestamps, and patient IDs
for i, ((subj_id, adm_id, time_bin), group) in enumerate(grouped):
    # get the time bin as a timestamp
    timestamps[i] = time_bin
    patient_ids.append(f"{subj_id}_{adm_id}")
    
    for _, row in group.iterrows():
        # get the feature index from the inputdict
        feat_idx = inputdict[row["itemid"]]
        # Fill the feature value and masking
        x[feat_idx, i] = row["valuenum"]
        # Set masking to 1 if the feature is present
        masking[feat_idx, i] = 1

# Calculating delta (time since last observation)
delta = np.zeros_like(x)
for i in range(1, n_timesteps):
    # calculate the time gap between the current and previous time bin
    time_gap = timestamps[i-1] - timestamps[i]  # Note: reversed for "before discharge"
    # if the previous time bin was missing, accumulate the time gap
    # else use the actual time gap
    delta[:, i] = np.where(
        masking[:, i-1] == 0,
        time_gap + delta[:, i-1],  # Accumulate if missing
        time_gap                   # Else use actual gap
    )

ValueError: not enough values to unpack (expected 3, got 1)

In [None]:
df.head(15)

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime,hours_before_discharge,time_bin
0,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,116.25,108.0
3,10010231,21586397,51678,2117-12-19 06:20:00,6.0,2117-12-23 16:51:00,106.516667,96.0
6,10010231,21586397,50861,2117-12-18 10:12:00,22.0,2117-12-23 16:51:00,126.65,120.0
9,10010231,21586397,50862,2117-12-18 10:12:00,3.9,2117-12-23 16:51:00,126.65,120.0
12,10010231,21586397,50863,2117-12-18 10:12:00,68.0,2117-12-23 16:51:00,126.65,120.0
15,10010231,21586397,50868,2117-12-18 10:12:00,15.0,2117-12-23 16:51:00,126.65,120.0
18,10010231,21586397,50878,2117-12-18 10:12:00,18.0,2117-12-23 16:51:00,126.65,120.0
21,10010231,21586397,50882,2117-12-18 10:12:00,23.0,2117-12-23 16:51:00,126.65,120.0
24,10010231,21586397,50885,2117-12-18 10:12:00,0.2,2117-12-23 16:51:00,126.65,120.0
27,10010231,21586397,50893,2117-12-18 10:12:00,9.2,2117-12-23 16:51:00,126.65,120.0


In [None]:
delta[0, 0:10]  

array([   0.,  -24.,  -48.,  -72.,  -84., -108.,  120.,   96.,   72.,
         48.])

In [None]:
masking[0, 0:10]  

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

In [None]:
x[3, 0:30] 

array([3.9, 3.5, 3.7, 3.5, 3.6, 3.9, 4.3, 4.1, 4.1, 4. , 4.5, 4. , 3.7,
       4. , 3.7, 4.2, 0. , 0. , 0. , 0. , 0. , 3.5, 3.9, 3.6, 3.5, 3.3,
       3.2, 3.3, 0. , 0. ])

In [None]:
x.shape, masking.shape, delta.shape, timestamps.shape

((583, 31533), (583, 31533), (583, 31533), (31533,))

In [None]:
np.unique(x)

In [4]:
import numpy as np
import pandas as pd

def df_to_x_m_d(df, max_window_days=7):
    """
    Convert DataFrame to GRU-D inputs (x, masking, delta), using raw `itemid` as indices.
    
    Args:
        df: DataFrame with columns ['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum', 'dischtime'].
        max_window_days: Maximum days before discharge to include.
    
    Returns:
        x: Feature matrix of shape (n_features, n_timesteps).
        masking: Binary mask of observed values (same shape as x).
        delta: Time gaps since last observation (same shape as x).
        timestamps: Hours since discharge for each timestep.
        ids: DataFrame with ['subject_id', 'hadm_id'] for each timestep.
    """

    # --- 1. Preprocess Timestamps ---
    df["hours_since_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600

    # Filter to keep only within the max window
    df = df[(df["hours_since_discharge"] >= 0) & (df["hours_since_discharge"] <= max_window_days * 24)].copy()

    # --- 2. Group by Patient and Time ---
    # Sort dataframe
    df = df.sort_values(by=["subject_id", "hadm_id", "charttime"])

    # Group by 'subject_id', 'hadm_id', 'charttime'
    grouped = df.groupby(["subject_id", "hadm_id", "charttime"])

    n_timesteps = len(grouped)

    # --- 3. Initialize Arrays ---
    n_features = df["itemid"].max() + 1  # Assumes itemids start at 0
    x = np.zeros((n_features, n_timesteps))
    masking = np.zeros_like(x)
    timestamps = np.zeros(n_timesteps)
    ids = []

    # --- 4. Populate x, masking, and timestamps ---
    for i, ((subj_id, adm_id, time), group) in enumerate(grouped):
        timestamps[i] = (time - group["dischtime"].iloc[0]).total_seconds() / 3600
        ids.append({"subject_id": subj_id, "hadm_id": adm_id})
        for _, row in group.iterrows():
            x[int(row["itemid"]), i] = row["valuenum"]
            masking[int(row["itemid"]), i] = 1

    # --- 5. Calculate delta ---
    delta = np.zeros_like(x)
    for i in range(1, n_timesteps):
        time_gap = timestamps[i] - timestamps[i-1]
        delta[:, i] = np.where(
            masking[:, i-1] == 0,
            time_gap + delta[:, i-1],  # Accumulate if previous value was missing
            time_gap                   # Else use actual time gap
        )

    return x, masking, delta, timestamps, pd.DataFrame(ids)


In [5]:
x, masking, delta, timestamps, ids = df_to_x_m_d(temp_df, max_window_days=7)

print("x shape:", x.shape)          # (max_itemid + 1, n_timesteps)
print("masking shape:", masking.shape)  # Same as x
print("delta shape:", delta.shape)    # Same as x
print("Timestamps (hours before discharge):", timestamps)
# print("Patient IDs:", ids)

x shape: (53174, 860)
masking shape: (53174, 860)
delta shape: (53174, 860)
Timestamps (hours before discharge): [-106.76666667 -116.25        -59.41666667 -165.91666667  -80.68333333
  -89.25        -33.06666667  -14.         -154.31666667  -16.01666667
 -113.          -83.86666667 -129.91666667 -153.93333333 -155.48333333
 -107.88333333 -120.36666667 -157.71666667 -160.26666667  -39.65
 -146.83333333 -148.83333333  -83.         -155.95       -153.01666667
  -99.76666667  -85.53333333 -136.06666667 -156.5        -167.16666667
 -106.25         -6.9        -154.75        -21.28333333 -108.5
  -82.93333333  -21.28333333   -7.48333333 -134.76666667 -155.5
 -152.66666667 -155.96666667  -88.51666667 -105.41666667  -58.06666667
 -165.38333333  -39.5        -167.83333333  -88.51666667  -47.38333333
 -161.93333333  -50.43333333 -164.21666667 -154.33333333  -12.25
 -155.53333333 -156.4        -127.26666667  -81.08333333   -5.88333333
  -40.4        -159.28333333  -66.26666667  -57.68333333  -10

In [12]:
ids.head()

Unnamed: 0,subject_id,hadm_id
0,10010231,24995642
1,10010231,29368887
2,10012768,27462906
3,10014354,26486158
4,10022373,27450651


In [10]:
timestamps # Display first 5 timestamps

array([-106.76666667, -116.25      ,  -59.41666667, -165.91666667,
        -80.68333333,  -89.25      ,  -33.06666667,  -14.        ,
       -154.31666667,  -16.01666667, -113.        ,  -83.86666667,
       -129.91666667, -153.93333333, -155.48333333, -107.88333333,
       -120.36666667, -157.71666667, -160.26666667,  -39.65      ,
       -146.83333333, -148.83333333,  -83.        , -155.95      ,
       -153.01666667,  -99.76666667,  -85.53333333, -136.06666667,
       -156.5       , -167.16666667, -106.25      ,   -6.9       ,
       -154.75      ,  -21.28333333, -108.5       ,  -82.93333333,
        -21.28333333,   -7.48333333, -134.76666667, -155.5       ,
       -152.66666667, -155.96666667,  -88.51666667, -105.41666667,
        -58.06666667, -165.38333333,  -39.5       , -167.83333333,
        -88.51666667,  -47.38333333, -161.93333333,  -50.43333333,
       -164.21666667, -154.33333333,  -12.25      , -155.53333333,
       -156.4       , -127.26666667,  -81.08333333,   -5.88333