## Show the tables in schema

In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))  # adding the parent directory of 'notebooks' to sys.path
from db_utils.db_setup import Database
from sqlalchemy import text
# Instantiate the project's Database helper (imported from db_utils.db_setup).
engine = Database()
# Schemas whose tables are listed below.
schema_list = ["mimiciv_icu", "mimiciv_hosp"]
for schema in schema_list:
    print(f"Schema: {schema}")
    # The method is looked up on the class and `engine` is passed explicitly
    # as its first argument.
    tables = Database.show_tables_in_schema(engine, schema)
    print(tables)

# Getting all `labevents` data and filtering

##### Fetching `demographic` data

In [None]:
from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine, text
from psycopg2.extras import execute_values

# Load database credentials from the environment (.env is read by python-dotenv)
load_dotenv()
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Create SQLAlchemy engine
url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)

# Column order shared by the temp table, the INSERT statement and the CSV selection
cohort_columns = ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'target']

# Loading the CSV first, so a bad file fails before any connection is opened
cohort_df = pd.read_csv('../assets/cohort1_target.csv')
cohort_df['admittime'] = pd.to_datetime(cohort_df['admittime'], errors='coerce')
cohort_df['dischtime'] = pd.to_datetime(cohort_df['dischtime'], errors='coerce')

# Select the columns by name (the CSV may hold extra columns or another order)
# and turn NaN/NaT into None so psycopg2 sends SQL NULL instead of the text 'NaT'.
values = [
    tuple(None if pd.isna(value) else value for value in row)
    for row in cohort_df[cohort_columns].itertuples(index=False, name=None)
]

conn = engine.connect()
try:
    cursor = conn.connection.cursor()
    try:
        # Creating a TEMPORARY table.
        # ON COMMIT DROP removes it at the commit below, so re-running this cell
        # on a pooled connection does not fail with "relation already exists".
        # NOTE(review): a TEMP table is private to this session; the lab-extraction
        # cell reads `public.temp_cohort`, which this statement does not create.
        cursor.execute("""
            CREATE TEMP TABLE temp_cohort (
                subject_id INT,
                hadm_id INT,
                admittime TIMESTAMP,
                dischtime TIMESTAMP,
                target  INT
            ) ON COMMIT DROP;
        """)

        execute_values(
            cursor,
            "INSERT INTO temp_cohort (subject_id, hadm_id, admittime, dischtime, target) VALUES %s",
            values,
        )

        # Fetching demographic data: admissions supplies race, patients supplies
        # gender and anchor_age.
        cursor.execute("""
            SELECT DISTINCT
                c.subject_id,
                c.hadm_id,
                c.admittime,
                c.dischtime,
                c.target,
                p.gender,
                p.anchor_age,
                a.race
            FROM temp_cohort c
            JOIN mimiciv_hosp.admissions a ON c.hadm_id = a.hadm_id
            JOIN mimiciv_hosp.patients p ON a.subject_id = p.subject_id

        """)

        rows = cursor.fetchall()

        # Finalize the transaction (this also drops the temp table)
        conn.connection.commit()
    finally:
        # Release the cursor and the connection even when a statement fails
        cursor.close()
finally:
    conn.close()

# Finally creating DataFrame
columns = ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'target', 'gender', 'anchor_age', 'race']
final_df = pd.DataFrame(rows, columns=columns)
# Normalise dtypes so the Parquet schema does not depend on driver return types
final_df['admittime'] = pd.to_datetime(final_df['admittime'], errors='coerce')
final_df['dischtime'] = pd.to_datetime(final_df['dischtime'], errors='coerce')
final_df['anchor_age'] = pd.to_numeric(final_df['anchor_age'], errors='coerce')
final_df['target'] = pd.to_numeric(final_df['target'], errors='coerce')
# Save the final DataFrame to a Parquet file
final_df.to_parquet("../dataset/raw/cohort_with_demographic_data.parquet", index=False)

In [None]:
import pandas as pd
# Read the lab/demographic parquet and the extracted cohort CSV
demog_df = pd.read_parquet("../dataset/raw/lab_event_data_with_demographics.parquet")
cohort_df = pd.read_csv('../assets/extracted.csv')

# Cast the identifier columns to int so pairs from both frames compare equal
cohort_df = cohort_df.astype({'subject_id': int, 'admid': int})
demog_df = demog_df.astype({'subject_id': int, 'hadm_id': int})

# Build the set of (subject_id, admission id) pairs present in each frame
cohort_pairs = set(cohort_df[['subject_id', 'admid']].itertuples(index=False, name=None))
demog_pairs = set(demog_df[['subject_id', 'hadm_id']].itertuples(index=False, name=None))

# Pairs listed in the cohort file that never show up in the parquet
missing_pairs = sorted(cohort_pairs.difference(demog_pairs))

print(f"✅ Total missing (subject_id, hadm_id) pairs: {len(missing_pairs)}")
print("First 10 missing:")
for pair in missing_pairs[:10]:
    print(pair)


In [None]:
demog_df.head(11)

In [None]:
sup_df = pd.read_csv("../assets/ts.csv") 
sup_df.head(11)

In [None]:
from sklearn.preprocessing import LabelEncoder
def map_race(race):
    """Collapse a raw race string into one of eight reporting groups.

    Matching is case-insensitive and substring-based. The groups are tried in
    a fixed order and the first one with a matching keyword wins, so a value
    such as 'HISPANIC/LATINO - ...' is classified before 'WHITE' is tested.

    Args:
        race: Raw race value; missing values (None/NaN) are accepted.

    Returns:
        The group label, 'Unknown or Not Reported' for missing or declined
        values, or 'Other' when no keyword matches.
    """
    if pd.isna(race):
        return 'Unknown or Not Reported'

    normalized = race.upper()

    # (keywords, group) pairs in priority order
    keyword_groups = (
        (('HISPANIC', 'LATINO', 'SOUTH AMERICAN'), 'Hispanic or Latino'),
        (('WHITE',), 'White'),
        (('BLACK', 'AFRICAN'), 'Black or African American'),
        (('ASIAN',), 'Asian'),
        (('PACIFIC ISLANDER', 'NATIVE HAWAIIAN'), 'Native Hawaiian or Other Pacific Islander'),
        (('AMERICAN INDIAN', 'ALASKA NATIVE'), 'American Indian or Alaska Native'),
        (('DECLINED', 'UNABLE', 'UNKNOWN'), 'Unknown or Not Reported'),
    )
    for keywords, group in keyword_groups:
        if any(keyword in normalized for keyword in keywords):
            return group
    return 'Other'

# Collapse the raw race strings with map_race, then integer-encode the groups
race_groups = demog_df['race'].apply(map_race)
demog_df['race_grouped'] = race_groups
le = LabelEncoder()
demog_df['race_target'] = le.fit_transform(race_groups)


##### Fetching labevents data from the `7` or `14` days prior to discharge

In [None]:
from loguru import logger
import pandas as pd

from sqlalchemy import bindparam, text

logger.info("Starting lab data extraction process.")

# Look-back window before discharge, in days (e.g. 7 or 14)
lookback_days = 7
batch_size = 100

# First get all unique patient IDs
patient_ids = pd.read_sql("SELECT DISTINCT subject_id FROM public.temp_cohort ORDER BY subject_id", engine)
logger.info(f"Fetched {len(patient_ids)} unique patient IDs from temp_cohort.")

total_batches = (len(patient_ids) + batch_size - 1) // batch_size
logger.info(f"Processing patient data in batches of {batch_size}, total batches: {total_batches}")

# One parameterised statement reused for every batch. An expanding bind
# parameter renders a valid IN (...) list for any batch length, including a
# single ID (formatting a Python tuple into the SQL gives "(id,)" there,
# which PostgreSQL rejects).
lab_query = text("""
    SELECT DISTINCT
        le.subject_id,
        le.hadm_id,
        le.itemid,
        le.charttime,
        le.valuenum,
        tc.dischtime,
        tc.target
    FROM mimiciv_hosp.labevents le
    JOIN public.temp_cohort tc
      ON le.subject_id = tc.subject_id
     AND le.hadm_id = tc.hadm_id
    WHERE le.charttime BETWEEN (tc.dischtime - (:lookback_days * INTERVAL '1 day')) AND tc.dischtime
    AND le.subject_id IN :subject_ids
""").bindparams(bindparam("subject_ids", expanding=True))

# Collect the per-batch frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
chunks = []
for batch_num, start in enumerate(range(0, len(patient_ids), batch_size), start=1):
    # .tolist() yields plain Python ints, which the DB driver can adapt
    batch_ids = patient_ids['subject_id'].iloc[start:start + batch_size].tolist()

    logger.info(f"Processing batch {batch_num}/{total_batches} with {len(batch_ids)} patient IDs.")

    chunk = pd.read_sql(
        lab_query,
        engine,
        params={"subject_ids": batch_ids, "lookback_days": lookback_days},
    )
    logger.info(f"Batch {batch_num} fetched {len(chunk)} lab event records.")

    chunks.append(chunk)

# pd.concat raises on an empty list, so fall back to an empty frame
lab_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
logger.info(f"Lab data extraction complete. Total records collected: {len(lab_df)}")
logger.info("Saving lab data to Parquet file.")
lab_df.to_parquet("../dataset/raw/lab_event_data_with_demographics.parquet", index=False)


# Pre-processing for tabular data

### Aggregating lab values into `12-hour` bins

In [1]:
import polars as pl 
import numpy as np
import pandas as pd
# Lab events read from the raw dataset folder.
patient_data_df = pd.read_parquet("../dataset/raw/lab_events_7_days_prior.parquet")
# Supplementary CSVs from the assets folder (ts.csv and extracted.csv).
sup_df = pd.read_csv("../assets/ts.csv")  
sup_extracted_df = pd.read_csv("../assets/extracted.csv")
# Cohort file; the demographic-fetch cell reads subject_id, hadm_id,
# admittime, dischtime and target from this same CSV.
cohort_df = pd.read_csv("../assets/cohort1_target.csv")
# len(patient_data_df)

In [3]:
len(patient_data_df), len(sup_df), len(sup_extracted_df), len(cohort_df)

(1171251, 239990, 1092498, 5308)

In [2]:
sup_extracted_df.drop(columns=['label'], inplace=True)
sup_extracted_df.head()

Unnamed: 0,itemid,valuenum,charttime,admid,subject_id,minute,hour,day
0,50861,22.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1
1,50862,3.9,2117-12-18 10:12:00,21586397,10010231,2481,41,1
2,50863,68.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1
3,50868,15.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1
4,50878,18.0,2117-12-18 10:12:00,21586397,10010231,2481,41,1


In [3]:
# Rows lacking either timestamp cannot be placed relative to discharge
df = patient_data_df.dropna(subset=["charttime", "dischtime"]).copy()
for time_col in ("charttime", "dischtime"):
    df[time_col] = pd.to_datetime(df[time_col])

# Time remaining until discharge, expressed in whole minutes, hours and days
delta = df["dischtime"] - df["charttime"]
seconds_to_discharge = delta.dt.total_seconds()
for unit, seconds_per_unit in (("minute", 60), ("hour", 3600), ("day", 3600 * 24)):
    df[unit] = (seconds_to_discharge // seconds_per_unit).astype(int)
df.head()


Unnamed: 0,subject_id,hadm_id,dischtime,target,gender,anchor_age,race,itemid,charttime,valuenum,minute,hour,day
0,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-16 01:00:00,22.0,6435,107,4
1,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-17 00:00:00,22.0,5055,84,3
2,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-18 00:00:00,19.0,3615,60,2
3,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-19 00:00:00,22.0,2175,36,1
4,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-20 01:30:00,26.0,645,10,0


In [4]:
# 12-hour bins counted back from discharge
df["hour_bin"] = (df["hour"] // 12).astype(int)

# Mean value per (patient, admission, item, bin), one column per bin
bin_means = df.groupby(["subject_id", "hadm_id", "itemid", "hour_bin"])["valuenum"].mean()
wide_bins = bin_means.unstack(level=-1)

# Fill gaps along the bin axis: linear interpolation first, then carry the
# nearest value forward and backward for whatever is still missing
ts = wide_bins.interpolate(method='linear', axis=1)
ts = ts.ffill(axis=1).bfill(axis=1)

In [5]:
df.head()

Unnamed: 0,subject_id,hadm_id,dischtime,target,gender,anchor_age,race,itemid,charttime,valuenum,minute,hour,day,hour_bin
0,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-16 01:00:00,22.0,6435,107,4,8
1,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-17 00:00:00,22.0,5055,84,3,7
2,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-18 00:00:00,19.0,3615,60,2,5
3,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-19 00:00:00,22.0,2175,36,1,3
4,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-20 01:30:00,26.0,645,10,0,0


In [6]:
ts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour_bin,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
subject_id,hadm_id,itemid,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10010231,21586397,50861.0,62.0,62.0,52.5,43.0,37.0,31.0,28.0,25.0,21.0,21.5,22.0,22.0,22.0,22.0,22.0
10010231,21586397,50862.0,3.9,3.9,3.7,3.5,3.6,3.7,3.6,3.5,3.6,3.75,3.9,3.9,3.9,3.9,3.9
10010231,21586397,50863.0,47.0,47.0,46.0,45.0,51.5,58.0,54.0,50.0,51.0,59.5,68.0,68.0,68.0,68.0,68.0
10010231,21586397,50868.0,15.0,15.0,15.0,15.0,15.5,16.0,15.0,14.0,16.0,15.5,15.0,15.0,15.0,15.0,15.0
10010231,21586397,50878.0,48.0,48.0,42.0,36.0,31.0,26.0,24.5,23.0,17.0,17.5,18.0,18.0,18.0,18.0,18.0


In [7]:
ts.unstack(level=-1).fillna(0)

Unnamed: 0_level_0,hour_bin,0,0,0,0,0,0,0,0,0,0,...,14,14,14,14,14,14,14,14,14,14
Unnamed: 0_level_1,itemid,50801.0,50802.0,50803.0,50804.0,50806.0,50808.0,50809.0,50810.0,50811.0,50812.0,...,53163.0,53169.0,53170.0,53171.0,53172.0,53173.0,53174.0,53178.0,53180.0,53187.0
subject_id,hadm_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
10010231,21586397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10010231,23835132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10010231,24995642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10010231,29368887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10012768,26708632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19999784,29234099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19999784,29324445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19999784,29355057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19999784,29889147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
df_reset = ts.reset_index()
# Dummy example: an admission is labelled 1 when the mean of its bin-0 column
# (averaged over all its items) exceeds the threshold of 50
bin0_mean_per_admission = df_reset.groupby('hadm_id')[0].mean()
target_df = (bin0_mean_per_admission > 50).astype(int).reset_index(name='target')


In [78]:
ts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour_bin,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
hadm_id,itemid,target,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
20004072,50861.0,1,164.0,158.0,152.0,100.0,48.0,39.0,30.0,24.5,19.0,20.0,21.0,21.0,21.0,21.0,21.0
20004072,50862.0,1,3.3,3.55,3.8,3.6,3.4,3.5,3.6,3.55,3.5,3.5,3.5,3.5,3.5,3.5,3.5
20004072,50863.0,1,56.0,62.5,69.0,62.0,55.0,61.0,67.0,66.5,66.0,69.5,73.0,76.0,76.0,76.0,76.0
20004072,50868.0,1,7.0,7.5,8.0,7.0,6.0,7.0,8.0,9.0,10.0,9.5,9.0,7.0,7.0,7.0,7.0
20004072,50878.0,1,98.0,102.0,106.0,71.5,37.0,31.5,26.0,22.5,19.0,19.5,20.0,23.0,23.0,23.0,23.0


In [83]:
from sklearn.model_selection import StratifiedKFold, train_test_split

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
ts = ts.reset_index()  # This moves all index levels to columns
ts = ts.set_index(['hadm_id', 'itemid'])  # Re-set only the intended 2-level index

# Admission id of every row of ts; identical for all folds, so computed once
row_hadm = ts.index.get_level_values('hadm_id')

# One (df_train, df_test) pair per fold. Without this list only the split of
# the final iteration would survive the loop.
fold_splits = []
for i, (train_idx, test_idx) in enumerate(skf.split(target_df, target_df['target'])):
    # skf.split yields positional indices, hence .iloc rather than .loc
    train_hadm = target_df['hadm_id'].iloc[train_idx]
    test_hadm = target_df['hadm_id'].iloc[test_idx]

    # Optional: create validation split from train
    # train_hadm, val_hadm = train_test_split(train_hadm, test_size=0.2, random_state=0, stratify=target_df.loc[train_idx, 'target'])

    # Split by admission so that every row of one hadm_id lands on the same side
    df_train = ts.loc[row_hadm.isin(train_hadm)]
    df_test = ts.loc[row_hadm.isin(test_hadm)]
    fold_splits.append((df_train, df_test))

# df_train / df_test still refer to the last fold, as the cells below expect;
# no validation frame is produced unless the optional split above is enabled.


In [88]:
# Admissions present on each side of the split; they must not intersect
train_ids = set(df_train.index.get_level_values("hadm_id"))
test_ids = set(df_test.index.get_level_values("hadm_id"))
common_ids = train_ids.intersection(test_ids)
print("Overlapping hadm_ids:", common_ids)
assert not common_ids


Overlapping hadm_ids: set()


In [90]:
df_train.unstack(-1)

hour_bin,target,target,target,target,target,target,target,target,target,target,...,14,14,14,14,14,14,14,14,14,14
itemid,50861.0,50862.0,50863.0,50868.0,50878.0,50882.0,50885.0,50893.0,50902.0,50912.0,...,53163.0,51901.0,51494.0,51429.0,53180.0,51210.0,51114.0,52195.0,51877.0,51893.0
hadm_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
20004072,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
20004811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
20008395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
20012521,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
20013201,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29981134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
29987748,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
29987780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
29990599,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [91]:
df_test.unstack(-1)

hour_bin,target,target,target,target,target,target,target,target,target,target,...,14,14,14,14,14,14,14,14,14,14
itemid,50861.0,50863.0,50868.0,50878.0,50882.0,50885.0,50893.0,50902.0,50912.0,50931.0,...,50967.0,51762.0,51763.0,51764.0,51768.0,51205.0,52176.0,51284.0,50836.0,50841.0
hadm_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
20006731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
20010041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
20012034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
20020163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
20021715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29895434,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
29914882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
29917875,0.0,,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,...,,,,,,,,,,
29980930,,,,,,,,,,,...,,,,,,,,,,


In [82]:
# print(ts.index)
print(ts.index.names)
print(type(ts.index))


['hadm_id', 'itemid', 'target']
<class 'pandas.core.indexes.multi.MultiIndex'>


In [57]:
ts.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64', name='hour_bin')

In [48]:
ts.unstack(-1)

day,0,0,0,0,0,0,0,0,0,0,...,7,7,7,7,7,7,7,7,7,7
itemid,50801.0,50802.0,50803.0,50804.0,50806.0,50808.0,50809.0,50810.0,50811.0,50812.0,...,53163.0,53169.0,53170.0,53171.0,53172.0,53173.0,53174.0,53178.0,53180.0,53187.0
hadm_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
20004072,,,,,,,,,,,...,,,,,,,,,,
20004811,,,,,,,,,,,...,,,,,,,,,,
20006731,,,,,,,,,,,...,,,,,,,,,,
20008395,,,,,,,,,,,...,,,,,,,,,,
20010041,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29987748,,,,,,,,,,,...,,,,,,,,,,
29987780,,,,,,,,,,,...,,,,,,,,,,
29990599,,,,,,,,,,,...,,,,,,,,,,
29996493,,,,,,,,,,,...,,,,,,,,,,


In [49]:
from sklearn.impute import KNNImputer

# Impute the missing cells of ts with a 5-neighbour KNN imputer, then rebuild
# a DataFrame with the original labels and move the index back into columns
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(ts)
ts_user_imputed = pd.DataFrame(imputed_values, columns=ts.columns, index=ts.index)
ts_user_imputed = ts_user_imputed.reset_index()

KeyboardInterrupt: 

In [None]:
from sklearn.impute import KNNImputer


# Create a copy and convert timestamps
# Look-back window and bin width, both in hours
window_hours = 7 * 24                 # 168 h = 7 days before discharge
bin_hours = 6
n_bins = window_hours // bin_hours    # 28 six-hour bins

# Create a copy and convert timestamps
new_df = patient_data_df.copy().dropna(subset=["charttime", "dischtime", "hadm_id", "itemid", "valuenum"])
new_df["charttime"] = pd.to_datetime(new_df["charttime"])
new_df["dischtime"] = pd.to_datetime(new_df["dischtime"])

# Calculate hours before discharge
new_df["hours_before_discharge"] = (new_df["dischtime"] - new_df["charttime"]).dt.total_seconds() / 3600

# Filter to the look-back window (0 to window_hours inclusive)
new_df_filtered = new_df[
    (new_df["hours_before_discharge"] >= 0) & (new_df["hours_before_discharge"] <= window_hours)
].copy()

print(f"Processing {len(new_df_filtered)} records within {window_hours}-hour window...")

# 1-based six-hour bins counted back from discharge:
# bin 1 = [0, 6) h, bin 2 = [6, 12) h, ..., bin 28 = [162, 168) h
new_df_filtered["hour_bin"] = (np.floor(new_df_filtered["hours_before_discharge"] / bin_hours) + 1).astype(int)
# Only a reading taken exactly window_hours before discharge falls past the
# last bin; fold it into bin n_bins. (A cap of 27 also merged bins 27 and 28.)
new_df_filtered["hour_bin"] = new_df_filtered["hour_bin"].clip(upper=n_bins)

# Create feature_id with hour bin
# NOTE(review): the number in "_last_<n>h" is the bin index, not a number of hours.
new_df_filtered["feature_id"] = (
    "itemid_" +
    new_df_filtered["itemid"].astype(str) +
    "_last_" +
    new_df_filtered["hour_bin"].astype(str) +
    "h"
)

# Pivot numeric features (mean aggregation); combinations never measured stay NaN
numeric_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="valuenum",
    aggfunc="mean",
    # fill_value=np.nan,
)
# Pivot binary features (existence indicator)
new_df_filtered["has_measurement"] = 1
binary_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="has_measurement",
    aggfunc="max",  # 1 if any measurement exists
    fill_value=0,
)
binary_pivot.columns = [col + "_measured" for col in binary_pivot.columns]

# Impute missing values using KNN
# NOTE(review): ts_user_imputed is not joined into numeric_features below,
# which are built from the un-imputed pivot.
imputer = KNNImputer(n_neighbors=5)
ts_user_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_pivot),
    columns=numeric_pivot.columns,
    index=numeric_pivot.index
).reset_index()


# Get targets
targets = new_df_filtered[["hadm_id", "target"]].drop_duplicates().set_index("hadm_id")

# Combine features with targets (NO forward/backward fill)
numeric_features = numeric_pivot.join(targets).reset_index()
binary_features = binary_pivot.join(targets).reset_index()

print(f"Created numeric features: {numeric_features.shape}")
print(f"Created binary features: {binary_features.shape}")
print(f"Filtered down to {len(new_df_filtered)} rows from {len(new_df)}")
print(f"Number of unique hadm_ids: {new_df_filtered['hadm_id'].nunique()}")

In [None]:
numeric_features.head()

In [None]:
numeric_features.columns[:15]

# Preprocessing for Temporal Data

In [None]:
import pandas as pd
patient_data_df = pd.read_parquet("../dataset/raw/lab_event_data_with_demographics.parquet")
len(patient_data_df)

In [None]:
temp_df = patient_data_df.copy()

# Drop the demographic/label columns when they are present. errors="ignore"
# keeps the cell working on a parquet that holds lab columns only (the
# extraction query in this notebook selects no race/gender/anchor_age).
temp_df = temp_df.drop(columns=["race", "gender", "anchor_age", "target"], errors="ignore")

# Then keep a single row per (patient, admission, item, chart time)
temp_df = temp_df.drop_duplicates(subset=["subject_id", "hadm_id", "itemid", "charttime"])

temp_df.head(10)


In [None]:
temp_df.shape

In [None]:
import numpy as np
def assign_time_bin(hours_before_discharge, window_hours=6):
    """Return the lower edge of the fixed-width time bin holding each value.

    With the default 6-hour width:
        0.5h → 0, 6.1h → 6, 23h → 18

    Args:
        hours_before_discharge: Scalar or array-like of hours before discharge.
        window_hours: Bin width in hours.
    """
    completed_windows = np.floor(hours_before_discharge / window_hours)
    return completed_windows * window_hours

In [None]:
import numpy as np
df = temp_df.copy()
max_window_days = 7
time_bin_hours = 12

# Convert charttime and dischtime to datetime (no-op when they already are)
df["charttime"] = pd.to_datetime(df["charttime"])
df["dischtime"] = pd.to_datetime(df["dischtime"])

# Rows without a key, an item or a numeric value cannot be placed in x; keeping
# them would mark a NaN as "observed" in the mask.
df = df.dropna(subset=["subject_id", "hadm_id", "itemid", "valuenum"])

# creating a dictionary to map itemid to index
# because the itemid can be large and sparse
unique_items = df["itemid"].unique()
inputdict = {item: idx for idx, item in enumerate(unique_items)}
n_features = len(inputdict)

# calculating hours before discharge and filter window
df["hours_before_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600
df = df[(df["hours_before_discharge"] >= 0) &
        (df["hours_before_discharge"] <= max_window_days * 24)].copy()

# Assign time bins (aligned to discharge): lower edge of each 12-hour bin
df['time_bin'] = (np.floor(df['hours_before_discharge'] / time_bin_hours)
                    * time_bin_hours)

# One timestep per (patient, admission, time bin). Within an admission the bins
# are ordered chronologically, i.e. with hours-before-discharge DESCENDING, so
# the "previous minus current" gap used for delta is positive. groupby() with
# its default sort=True would silently re-sort the bins ascending.
step_keys = ["subject_id", "hadm_id", "time_bin"]
ordered = df.sort_values(
    ["subject_id", "hadm_id", "time_bin", "charttime"],
    ascending=[True, True, False, True],
)
grouped = ordered.groupby(step_keys, sort=False)  # kept for interactive inspection

# Rows of one timestep are contiguous after the sort: flag the first row of
# each and number the timesteps 0..n-1
starts_step = (~ordered.duplicated(subset=step_keys)).to_numpy()
step_idx = np.cumsum(starts_step) - 1
steps = ordered[starts_step]

# Initializing arrays to hold features, masking, timestamps, and patient IDs
n_timesteps = len(steps)
x = np.zeros((n_features, n_timesteps))
masking = np.zeros_like(x)
timestamps = steps["time_bin"].to_numpy(dtype=float)
patient_ids = [
    f"{subj_id}_{adm_id}"
    for subj_id, adm_id in zip(steps["subject_id"], steps["hadm_id"])
]

# Populating x and masking in one vectorised assignment. When an item occurs
# several times in a bin the last row wins, which is the most recent
# measurement because rows are sorted by charttime within the bin.
feat_idx = ordered["itemid"].map(inputdict).to_numpy(dtype=int)
x[feat_idx, step_idx] = ordered["valuenum"].to_numpy(dtype=float)
masking[feat_idx, step_idx] = 1

# Calculating delta (time since last observation)
delta = np.zeros_like(x)
for i in range(1, n_timesteps):
    # The first timestep of an admission has no predecessor: delta stays 0
    # instead of inheriting a gap from the previous patient/admission.
    if patient_ids[i] != patient_ids[i-1]:
        continue
    # gap between the previous and the current bin (previous is further from discharge)
    time_gap = timestamps[i-1] - timestamps[i]
    # if the feature was missing at the previous step, accumulate the gap
    # else use the actual time gap
    delta[:, i] = np.where(
        masking[:, i-1] == 0,
        time_gap + delta[:, i-1],  # Accumulate if missing
        time_gap                   # Else use actual gap
    )

In [None]:
df.head(15)

In [None]:
delta[0, 0:10]  

In [None]:
masking[0, 0:10]  

In [None]:
x[3, 0:30] 

In [None]:
x.shape, masking.shape, delta.shape, timestamps.shape

In [None]:
np.unique(x)

In [None]:
import numpy as np
import pandas as pd

def df_to_x_m_d(df, max_window_days=7):
    """
    Convert DataFrame to GRU-D inputs (x, masking, delta), using raw `itemid` as indices.

    Every distinct (subject_id, hadm_id, charttime) becomes one timestep; the
    timesteps are ordered by patient, admission and chart time. The input
    DataFrame is not modified.

    Args:
        df: DataFrame with columns ['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum', 'dischtime'];
            'charttime' and 'dischtime' must be datetime columns. Rows with a
            missing value in any of these columns are ignored.
        max_window_days: Maximum days before discharge to include.

    Returns:
        x: Feature matrix of shape (max_itemid + 1, n_timesteps).
        masking: Binary mask of observed values (same shape as x).
        delta: Hours since the previous timestep of the same admission,
            accumulated while the feature was unobserved (same shape as x);
            0 at the first timestep of every admission.
        timestamps: charttime - dischtime in hours for each timestep (<= 0).
        ids: DataFrame with ['subject_id', 'hadm_id'] for each timestep.
        For input with no usable rows all arrays and `ids` are empty.
    """
    step_keys = ["subject_id", "hadm_id", "charttime"]
    required = step_keys + ["itemid", "valuenum", "dischtime"]

    # --- 1. Preprocess Timestamps (on a private copy, never on the caller's frame) ---
    work = df.dropna(subset=required).copy()
    work["hours_since_discharge"] = (work["dischtime"] - work["charttime"]).dt.total_seconds() / 3600

    # Keep only events inside the window (both ends inclusive)
    in_window = work["hours_since_discharge"].between(0, max_window_days * 24)
    work = work[in_window].sort_values(by=step_keys)

    if work.empty:
        empty = np.zeros((0, 0))
        return empty, empty.copy(), empty.copy(), np.zeros(0), pd.DataFrame(columns=["subject_id", "hadm_id"])

    # --- 2. Number the timesteps ---
    # Rows of one timestep are contiguous after the sort: flag the first row
    # of each and take a running count.
    starts_step = (~work.duplicated(subset=step_keys)).to_numpy()
    step_idx = np.cumsum(starts_step) - 1
    steps = work[starts_step]
    n_timesteps = len(steps)

    # --- 3. Initialize Arrays ---
    # itemid is frequently stored as float (e.g. 50861.0); array shapes and
    # indices must be integers.
    item_idx = work["itemid"].to_numpy().astype(int)
    n_features = int(item_idx.max()) + 1  # raw itemid is the row index
    x = np.zeros((n_features, n_timesteps))
    masking = np.zeros_like(x)

    # --- 4. Populate x, masking, and timestamps ---
    # For duplicate (item, timestep) pairs the last row wins.
    x[item_idx, step_idx] = work["valuenum"].to_numpy(dtype=float)
    masking[item_idx, step_idx] = 1
    timestamps = ((steps["charttime"] - steps["dischtime"]).dt.total_seconds() / 3600).to_numpy()
    ids = steps[["subject_id", "hadm_id"]].reset_index(drop=True)

    # --- 5. Calculate delta ---
    subj = ids["subject_id"].to_numpy()
    adm = ids["hadm_id"].to_numpy()
    # same_admission[i - 1] is True when timesteps i - 1 and i share an admission
    same_admission = (subj[1:] == subj[:-1]) & (adm[1:] == adm[:-1])

    delta = np.zeros_like(x)
    for i in range(1, n_timesteps):
        # First timestep of an admission: no previous observation, delta stays 0
        # rather than inheriting a gap from the preceding admission.
        if not same_admission[i - 1]:
            continue
        time_gap = timestamps[i] - timestamps[i - 1]
        delta[:, i] = np.where(
            masking[:, i - 1] == 0,
            time_gap + delta[:, i - 1],  # Accumulate if previous value was missing
            time_gap                     # Else use actual time gap
        )

    return x, masking, delta, timestamps, ids


In [None]:
# Build the GRU-D inputs from temp_df for the 7 days before discharge.
x, masking, delta, timestamps, ids = df_to_x_m_d(temp_df, max_window_days=7)

print("x shape:", x.shape)          # (max_itemid + 1, n_timesteps)
print("masking shape:", masking.shape)  # Same as x
print("delta shape:", delta.shape)    # Same as x
# df_to_x_m_d computes each timestamp as (charttime - dischtime) in hours, so
# the printed values are zero or negative.
print("Timestamps (hours before discharge):", timestamps)
# print("Patient IDs:", ids)

In [None]:
ids.head()

In [None]:
timestamps # Display the timestamps array (the whole array, not only the first 5 entries)