## Show the tables in schema

In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))  # adding the parent directory of 'notebooks' to sys.path
from db_utils.db_setup import Database
from sqlalchemy import text
engine = Database()
schema_list = ["mimiciv_icu", "mimiciv_hosp"]
for schema in schema_list:
    print(f"Schema: {schema}")
    tables = Database.show_tables_in_schema(engine, schema)
    print(tables)

# Getting all `labevents` data and filtering

##### Fetching `demographic` data

In [None]:
from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine, text
from psycopg2.extras import execute_values

# Load environment variables
load_dotenv()
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Create SQLAlchemy engine
url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)
conn = engine.connect()
cursor = conn.connection.cursor()

# Creating a TEMPORARY table
cursor.execute("""
    CREATE TEMP TABLE temp_cohort (
        subject_id INT,
        hadm_id INT,
        admittime TIMESTAMP,
        dischtime TIMESTAMP,
        target  INT
    );
""")

# Loading the CSV and insert into temp_cohort
cohort_df = pd.read_csv('../assets/cohort1_target.csv')
cohort_df['admittime'] = pd.to_datetime(cohort_df['admittime'], errors='coerce')
cohort_df['dischtime'] = pd.to_datetime(cohort_df['dischtime'], errors='coerce')

values = list(cohort_df.itertuples(index=False, name=None))
execute_values(cursor,
    "INSERT INTO temp_cohort (subject_id, hadm_id, admittime, dischtime, target) VALUES %s",
    values
)

# Fetching demographic data from admissions table
cursor.execute("""
    SELECT DISTINCT
        c.subject_id,
        c.hadm_id,
        c.admittime,
        c.dischtime,
        c.target,
        p.gender,
        p.anchor_age,
        a.race
    FROM temp_cohort c
    JOIN mimiciv_hosp.admissions a ON c.hadm_id = a.hadm_id
    JOIN mimiciv_hosp.patients p ON a.subject_id = p.subject_id

""")

rows = cursor.fetchall()

# Finally creating DataFrame 
columns = ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'target', 'gender', 'anchor_age', 'race']
final_df = pd.DataFrame(rows, columns=columns)
# Save the final DataFrame to a Parquet file
final_df['admittime'] = pd.to_datetime(final_df['admittime'], errors='coerce')
final_df['dischtime'] = pd.to_datetime(final_df['dischtime'], errors='coerce')
final_df['anchor_age'] = pd.to_numeric(final_df['anchor_age'], errors='coerce')
final_df['target'] = pd.to_numeric(final_df['target'], errors='coerce')
final_df.to_parquet("../dataset/raw/cohort_with_demographic_data.parquet", index=False)

# Finalize
conn.connection.commit()
cursor.close()
conn.close()

In [None]:
import pandas as pd
# Load files
demog_df = pd.read_parquet("../dataset/raw/lab_event_data_with_demographics.parquet")
cohort_df = pd.read_csv('../assets/extracted.csv')

# Ensure consistent dtypes
cohort_df['subject_id'] = cohort_df['subject_id'].astype(int)
cohort_df['admid'] = cohort_df['admid'].astype(int)
demog_df['subject_id'] = demog_df['subject_id'].astype(int)
demog_df['hadm_id'] = demog_df['hadm_id'].astype(int)


# Create sets of (subject_id, hadm_id) pairs
cohort_pairs = set(zip(cohort_df['subject_id'], cohort_df['admid']))
demog_pairs = set(zip(demog_df['subject_id'], demog_df['hadm_id']))

# Identify missing pairs
missing_pairs = sorted(cohort_pairs - demog_pairs)

print(f"✅ Total missing (subject_id, hadm_id) pairs: {len(missing_pairs)}")
print("First 10 missing:")
for pair in missing_pairs[:10]:
    print(pair)


In [None]:
demog_df.head(11)

In [None]:
sup_df = pd.read_csv("../assets/ts.csv") 
sup_df.head(11)

In [None]:
from sklearn.preprocessing import LabelEncoder
def map_race(race):
    if pd.isna(race):
        return 'Unknown or Not Reported'
    
    race = race.upper()
    
    if 'HISPANIC' in race or 'LATINO' in race or 'SOUTH AMERICAN' in race:
        return 'Hispanic or Latino'
    elif 'WHITE' in race:
        return 'White'
    elif 'BLACK' in race or 'AFRICAN' in race:
        return 'Black or African American'
    elif 'ASIAN' in race:
        return 'Asian'
    elif 'PACIFIC ISLANDER' in race or 'NATIVE HAWAIIAN' in race:
        return 'Native Hawaiian or Other Pacific Islander'
    elif 'AMERICAN INDIAN' in race or 'ALASKA NATIVE' in race:
        return 'American Indian or Alaska Native'
    elif 'DECLINED' in race or 'UNABLE' in race or 'UNKNOWN' in race:
        return 'Unknown or Not Reported'
    else:
        return 'Other'

le = LabelEncoder()
demog_df['race_grouped'] = demog_df['race'].apply(map_race)  # apply your earlier grouping
demog_df['race_target'] = le.fit_transform(demog_df['race_grouped'])


##### Fetching labevents data prior `7` or `14` days

In [None]:
from loguru import logger
import pandas as pd

logger.info("Starting lab data extraction process.")

# First get all unique patient IDs
patient_ids = pd.read_sql("SELECT DISTINCT subject_id FROM public.temp_cohort ORDER BY subject_id", engine)
logger.info(f"Fetched {len(patient_ids)} unique patient IDs from temp_cohort.")

lab_df = pd.DataFrame()

batch_size = 100
total_batches = (len(patient_ids) + batch_size - 1) // batch_size
logger.info(f"Processing patient data in batches of {batch_size}, total batches: {total_batches}")

for i in range(0, len(patient_ids), batch_size):
    batch_num = i // batch_size + 1
    batch = patient_ids.iloc[i:i+batch_size]
    batch_list = tuple(batch['subject_id'])
    
    logger.info(f"Processing batch {batch_num}/{total_batches} with {len(batch)} patient IDs.")
    
    query = f"""
        SELECT DISTINCT
            le.subject_id, 
            le.hadm_id, 
            le.itemid, 
            le.charttime, 
            le.valuenum,
            tc.dischtime,
            tc.target
        FROM mimiciv_hosp.labevents le
        JOIN public.temp_cohort tc
          ON le.subject_id = tc.subject_id
         AND le.hadm_id = tc.hadm_id
        WHERE le.charttime BETWEEN (tc.dischtime - INTERVAL '7 days') AND tc.dischtime
        AND le.subject_id IN {batch_list}
    """
    
    chunk = pd.read_sql(query, engine)
    logger.info(f"Batch {batch_num} fetched {len(chunk)} lab event records.")
    
    lab_df = pd.concat([lab_df, chunk], ignore_index=True)

lab_df.reset_index(drop=True, inplace=True)
logger.info(f"Lab data extraction complete. Total records collected: {len(lab_df)}")
logger.info("Saving lab data to Parquet file.")
lab_df.to_parquet("../dataset/raw/lab_event_data_with_demographics.parquet", index=False)


# Pre-processing for tabular data

### Aggregating on an `hourly` basis

In [1]:
import polars as pl 
import numpy as np
import pandas as pd
patient_data_df = pd.read_parquet("../dataset/raw/lab_events_7_days_prior.parquet")
sup_df = pd.read_csv("../assets/ts.csv")  
sup_extracted_df = pd.read_csv("../assets/extracted.csv")
cohort_df = pd.read_csv("../assets/cohort1_target.csv")
# len(patient_data_df)

In [24]:
patient_data_df.head()

Unnamed: 0,subject_id,hadm_id,dischtime,target,gender,anchor_age,race,itemid,charttime,valuenum
0,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-16 01:00:00,22.0
1,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-17 00:00:00,22.0
2,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-18 00:00:00,19.0
3,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-19 00:00:00,22.0
4,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-20 01:30:00,26.0


In [2]:
patient_df = patient_data_df.copy()
patient_df = patient_df.dropna(subset=["charttime", "dischtime"])
patient_df["charttime"] = pd.to_datetime(patient_df["charttime"])
patient_df["dischtime"] = pd.to_datetime(patient_df["dischtime"])

delta = patient_df["dischtime"] - patient_df["charttime"]

patient_df["minute"] = (delta.dt.total_seconds() // 60).astype(int)
patient_df["hour"] = (delta.dt.total_seconds() // 3600).astype(int)
patient_df["day"] = (delta.dt.total_seconds() // (3600 * 24)).astype(int)
patient_df.head()


Unnamed: 0,subject_id,hadm_id,dischtime,target,gender,anchor_age,race,itemid,charttime,valuenum,minute,hour,day
0,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-16 01:00:00,22.0,6435,107,4
1,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-17 00:00:00,22.0,5055,84,3
2,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-18 00:00:00,19.0,3615,60,2
3,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-19 00:00:00,22.0,2175,36,1
4,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-20 01:30:00,26.0,645,10,0


In [60]:
# patient_df["hour_bin"] = (patient_df["hour"] // 12).astype(int)
patient_df = patient_df[(patient_df["hour_bin"] >= 0) & (patient_df["hour_bin"] < 14)]
patient_ts = (
    patient_df
    .groupby(["hadm_id", "itemid", "day"])["valuenum"]  # "subject_id"
    .mean()
    .unstack(level=-1)
    .interpolate(method="linear", axis=1, limit_area="inside")
    .ffill(axis=1)
    .bfill(axis=1)
)

In [27]:
patient_df['hour_bin'].max()

np.int64(13)

In [28]:
patient_df.head()

Unnamed: 0,subject_id,hadm_id,dischtime,target,gender,anchor_age,race,itemid,charttime,valuenum,minute,hour,day,hour_bin
0,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-16 01:00:00,22.0,6435,107,4,8
1,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-17 00:00:00,22.0,5055,84,3,7
2,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-18 00:00:00,19.0,3615,60,2,5
3,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-19 00:00:00,22.0,2175,36,1,3
4,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-20 01:30:00,26.0,645,10,0,0


In [61]:
patient_ts.head(15)

Unnamed: 0_level_0,day,0,1,2,3,4,5,6
hadm_id,itemid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
20004072,50861.0,164.0,152.0,48.0,30.0,19.0,21.0,21.0
20004072,50862.0,3.3,3.8,3.4,3.6,3.5,3.5,3.5
20004072,50863.0,56.0,69.0,55.0,67.0,66.0,74.5,74.5
20004072,50868.0,7.0,8.0,6.0,8.0,10.0,8.0,8.0
20004072,50878.0,98.0,106.0,37.0,26.0,19.0,21.5,21.5
20004072,50882.0,28.0,27.0,29.0,27.0,25.0,26.5,26.5
20004072,50885.0,0.4,0.4,0.4,0.2,0.2,0.2,0.2
20004072,50893.0,9.0,9.1,9.2,8.9,9.0,9.4,9.4
20004072,50902.0,110.0,109.0,110.0,108.0,112.0,106.5,106.5
20004072,50912.0,0.8,0.7,0.8,0.8,0.9,0.85,0.85


In [62]:
wide = patient_ts.unstack("itemid").fillna(0)
wide.head(15)

day,0,0,0,0,0,0,0,0,0,0,...,6,6,6,6,6,6,6,6,6,6
itemid,50801.0,50802.0,50803.0,50804.0,50806.0,50808.0,50809.0,50810.0,50811.0,50812.0,...,53163.0,53169.0,53170.0,53171.0,53172.0,53173.0,53174.0,53178.0,53180.0,53187.0
hadm_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
20004072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20004811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20006731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20008395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20010041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20012034,0.0,0.0,0.0,0.0,0.0,1.05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20012521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20013201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20015507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20018024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:

wide = wide.reset_index()
wide.head()


hour_bin,hadm_id,0,0,0,0,0,0,0,0,0,...,13,13,13,13,13,13,13,13,13,13
itemid,Unnamed: 1_level_1,50801.0,50802.0,50803.0,50804.0,50806.0,50808.0,50809.0,50810.0,50811.0,...,53163.0,53169.0,53170.0,53171.0,53172.0,53173.0,53174.0,53178.0,53180.0,53187.0
0,20004072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20004811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20006731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20008395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20010041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
def prepare_ts_features_for_ml(df, bin_hours=1):
    # Calculate hours before discharge
    df["hours_before_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600

    # Bin hours into integers (0 = closest to discharge)
    df["bin"] = (df["hours_before_discharge"] // bin_hours)
    df = df[df["bin"].notna()]  # drop rows where bin is NaN
    df["bin"] = df["bin"].astype(int)
    df = df[df["bin"] >= 0]     # keep only bins before discharge

    # Pivot to wide format: each column = itemid_bin
    df["itemid_bin"] = df["itemid"].astype(int).astype(str) + "_" + df["bin"].astype(str)

    pivot_df = df.pivot_table(
        index="hadm_id",  # each row = one admission
        columns="itemid_bin",
        values="valuenum",
        aggfunc="mean"
    )

    # Optionally fill missing values (same strategy as supervisor)
    pivot_df = pivot_df.interpolate(axis=1, limit_area="inside")
    pivot_df = pivot_df.ffill(axis=1).bfill(axis=1)

    return pivot_df


In [33]:
nf_df = pd.read_csv("/home/blackphoenix/Documents/projects/BioInformatics/informative-missingness/MIMIC-IV-data/mimic_iv_labs_nf_14_days.csv.gz", compression="gzip")
nf_df.head()

Unnamed: 0,itemid,label,valuenum,charttime,admid,subject_id,minute,hour,day
0,51133,AbsoluteLymphocyteCount,0.94,2117-12-15 11:00:00,21586397,10010231,7860,131,5
1,51143,AtypicalLymphocytes,1.0,2117-12-15 11:00:00,21586397,10010231,7860,131,5
2,51144,Bands,1.0,2117-12-15 11:00:00,21586397,10010231,7860,131,5
3,51146,Basophils,1.0,2117-12-15 11:00:00,21586397,10010231,7860,131,5
4,51200,Eosinophils,0.0,2117-12-15 11:00:00,21586397,10010231,7860,131,5


In [34]:

nf_df["itemid"] = nf_df["itemid"].astype("int").astype("str")

# Compute how frequently each lab test is recorded across admissions
freq_items = (
    nf_df[["itemid", "admid"]]
    .drop_duplicates()
    .groupby("itemid")["admid"]
    .size()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={"admid": "count"})
)
freq_items["freq_ts"] = freq_items["count"] / nf_df["admid"].nunique()

# Compute the average number of measurements per admission for each lab test
ts_items = (
    nf_df.groupby(["admid", "itemid"])
    .size()
    .unstack()
    .mean(axis=0)
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={0: "num_ts"})
)

In [35]:
# Merge frequency and count information
freq_ts_items = ts_items.merge(freq_items[["itemid", "freq_ts"]], on="itemid")

# Select lab tests with at least npoints and present in minfreq of admissions (customise)
npoints = 5
minfreq = 0.75
sel_vars = freq_ts_items[
    (freq_ts_items["num_ts"] >= npoints) &
    (freq_ts_items["freq_ts"] >= minfreq)
]["itemid"].tolist()

# Keep only the selected lab tests
nf_df = nf_df[nf_df["itemid"].isin(sel_vars)]

# Compute lab-wise mean and standard deviation based only on observed values
# lab_stats = df.groupby("itemid")["valuenum"].agg(["mean", "std"]).rename(columns={"mean": "lab_mean", "std": "lab_std"})
# Merge stats into main dataframe to allow normalisation later
# df = df.merge(lab_stats, on="itemid")

# Pivot lab measurements into time series format (one row per admission-lab, one column per day)
df_ts = (
    nf_df
    #.groupby(["admid", "itemid", "lab_mean", "lab_std", "day"])["valuenum"]
    .groupby(["admid", "itemid", "day"])["valuenum"]
    .mean()
    .unstack(level=-1)
    .interpolate(method='linear', axis=1, limit_area="inside")  # interpolate only between observed values
    .ffill(axis=1)  # fill missing values forward (after last measurement)
    .bfill(axis=1)  # fill missing values backward (before first measurement)
    .reset_index()
)

In [37]:
df_ts.head()

day,admid,itemid,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,20000588,50861,25.0,25.0,25.0,25.0,25.0,25.0,25.0,27.333333,29.666667,32.0,34.333333,36.666667,39.0,39.0
1,20000588,50862,3.9,3.9,3.9,3.9,3.9,3.9,3.9,3.9,3.9,3.9,3.9,3.9,3.9,3.9
2,20000588,50863,79.0,79.0,79.0,79.0,79.0,79.0,79.0,84.0,89.0,94.0,99.0,104.0,109.0,109.0
3,20000588,50868,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
4,20000588,50878,20.0,20.0,20.0,20.0,20.0,20.0,20.0,21.166667,22.333333,23.5,24.666667,25.833333,27.0,27.0


In [38]:
df_ts = df_ts.set_index(["admid", "itemid"])

# Unstack to get one row per admission with multiple lab*time columns
df_mx = df_ts.unstack(level=-1)
df_mx.columns = df_mx.columns.swaplevel(0, 1)
df_mx = df_mx.sort_index(axis=1)
df_mx.columns = ['_'.join(map(str, col)) if isinstance(col, tuple) else str(col) for col in df_mx.columns]

In [39]:
df_mx.head()

Unnamed: 0_level_0,50861_0,50861_1,50861_2,50861_3,50861_4,50861_5,50861_6,50861_7,50861_8,50861_9,...,52172_4,52172_5,52172_6,52172_7,52172_8,52172_9,52172_10,52172_11,52172_12,52172_13
admid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20000588,25.0,25.0,25.0,25.0,25.0,25.0,25.0,27.333333,29.666667,32.0,...,53.1,53.1,53.1,53.1,53.1,53.1,53.1,53.1,53.1,53.1
20004072,20.0,20.0,20.166667,20.333333,20.5,20.666667,20.833333,21.0,21.0,19.0,...,61.55,65.3,69.05,72.8,74.9,80.7,71.7,74.7,72.3,71.8
20004811,138.0,98.0,73.0,61.0,49.0,39.0,36.0,36.0,79.0,105.0,...,55.8,54.0,55.6,56.25,56.0,59.9,59.85,58.9,57.95,56.9
20006731,23.0,23.0,23.0,23.0,23.0,23.0,23.0,23.0,23.0,23.0,...,43.8,43.8,43.8,43.8,43.8,43.8,42.5,43.2,43.1,43.1
20008395,24.0,24.0,24.0,24.0,24.0,24.0,24.0,18.0,31.0,44.0,...,51.8,51.85,51.9,51.9,51.6,51.65,51.7,49.4,51.1,50.0


In [None]:
p_df = patient_data_df.copy()
final_df = prepare_ts_features_for_ml(p_df, bin_hours=12)
final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["bin"] = df["bin"].astype(int)


itemid_bin,50801_13,50801_5,50802_0,50802_1,50802_10,50802_11,50802_12,50802_13,50802_2,50802_3,...,53174_13,53174_2,53174_3,53174_4,53174_5,53174_6,53174_7,53174_8,53174_9,53180_3
hadm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20004072,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,...,80.7,80.7,80.7,80.7,80.7,80.7,80.7,80.7,80.7,80.7
20004811,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,...,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0
20006731,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,...,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0
20008395,135.0,135.0,135.0,135.0,135.0,135.0,135.0,135.0,135.0,135.0,...,51.7,51.7,51.7,51.7,51.7,51.7,51.7,51.7,51.7,51.7
20010041,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,...,42.4,42.4,42.4,42.4,42.4,42.4,42.4,42.4,42.4,42.4


In [55]:
patient_data_df.head()

Unnamed: 0,subject_id,hadm_id,dischtime,target,gender,anchor_age,race,itemid,charttime,valuenum
0,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-16 01:00:00,22.0
1,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-17 00:00:00,22.0
2,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-18 00:00:00,19.0
3,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-19 00:00:00,22.0
4,14794992,26014971,2115-12-20 12:15:00,0,F,66,WHITE,50861.0,2115-12-20 01:30:00,26.0


In [54]:
final_df.unstack()

itemid_bin  hadm_id 
50801_13    20004072    164.0
            20004811     67.0
            20006731     75.0
            20008395    135.0
            20010041     12.0
                        ...  
53180_3     29987748     52.8
            29987780     72.4
            29990599     48.3
            29996493     66.1
            29999090    920.0
Length: 18587716, dtype: float64

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
ts_user_imputed = pd.DataFrame(
    imputer.fit_transform(ts),
    columns=ts.columns,
    index=ts.index
).reset_index()

In [None]:
from sklearn.impute import KNNImputer


# Create a copy and convert timestamps
new_df = patient_data_df.copy().dropna(subset=["charttime", "dischtime", "hadm_id", "itemid", "valuenum"])
new_df["charttime"] = pd.to_datetime(new_df["charttime"])
new_df["dischtime"] = pd.to_datetime(new_df["dischtime"])

# Calculate hours before discharge
new_df["hours_before_discharge"] = (new_df["dischtime"] - new_df["charttime"]).dt.total_seconds() / 3600

# Filter to 12-hour window (0 to 12 hours inclusive)
new_df_filtered = new_df[
    (new_df["hours_before_discharge"] >= 0) & (new_df["hours_before_discharge"] <= 168)
].copy()    

print(f"Processing {len(new_df_filtered)} records within 12-hour window...")

# creating 7 day bins
new_df_filtered["hour_bin"] = (np.floor(new_df_filtered["hours_before_discharge"]/6) + 1).astype(int)
new_df_filtered["hour_bin"] = new_df_filtered["hour_bin"].clip(upper=27)  # Cap at 7

# Create feature_id with hour bin
new_df_filtered["feature_id"] = (
    "itemid_" + 
    new_df_filtered["itemid"].astype(str) + 
    "_last_" + 
    new_df_filtered["hour_bin"].astype(str) + 
    "h"
)

# Pivot numeric features (mean aggregation)
numeric_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="valuenum",
    aggfunc="mean",
    # fill_value=np.nan,
)
# Pivot binary features (existence indicator)
new_df_filtered["has_measurement"] = 1
binary_pivot = new_df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="has_measurement",
    aggfunc="max",  # 1 if any measurement exists
    fill_value=0,
)
binary_pivot.columns = [col + "_measured" for col in binary_pivot.columns]

# Step 3: Impute missing values using KNN
imputer = KNNImputer(n_neighbors=5)
ts_user_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_pivot),
    columns=numeric_pivot.columns,
    index=numeric_pivot.index
).reset_index()


# Get targets
targets = new_df_filtered[["hadm_id", "target"]].drop_duplicates().set_index("hadm_id")

# Combine features with targets (NO forward/backward fill)
numeric_features = numeric_pivot.join(targets).reset_index()
binary_features = binary_pivot.join(targets).reset_index()

print(f"Created numeric features: {numeric_features.shape}")
print(f"Created binary features: {binary_features.shape}")
print(f"Filtered down to {len(new_df_filtered)} rows from {len(new_df)}")
print(f"Number of unique hadm_ids: {new_df_filtered['hadm_id'].nunique()}")

In [None]:
numeric_features.head()

In [None]:
numeric_features.columns[:15]

# Preprocessing for Temporal Data

In [None]:
import pandas as pd
patient_data_df = pd.read_parquet("../dataset/raw/lab_event_data_with_demographics.parquet")
len(patient_data_df)

In [None]:
temp_df = patient_data_df.copy()

# Drop unnecessary columns ["race", "gender", "anchor_age", "target"]
temp_df = temp_df.drop(columns=["race", "gender", "anchor_age", "target"])

# Then drop duplicates based on ["subject_id", "hadm_id", "itemid", "charttime"]
temp_df = temp_df.drop_duplicates(subset=["subject_id", "hadm_id", "itemid", "charttime"])

temp_df.head(10)


In [None]:
temp_df.shape

In [None]:
import numpy as np
def assign_time_bin(hours_before_discharge, window_hours=6):
    """Assign records to fixed time bins (e.g., 0-6h, 6-12h).
    Example: For a 6-hour window:
        0.5h → bin 0, 6.1h → bin 6, 23h → bin 18
    """
    return (np.floor(hours_before_discharge / window_hours) * window_hours)

In [None]:
import numpy as np
df = temp_df.copy()
max_window_days = 7
time_bin_hours = 12

# Convert charttime and dischtime to datetime
unique_items = df["itemid"].unique()
# creating a dictionary to map itemid to index 
# because the itemid can be large and sparse
inputdict = {item: idx for idx, item in enumerate(unique_items)}
n_features = len(inputdict)

# calculating hours before discharge and filter window
df["hours_before_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600
df = df[(df["hours_before_discharge"] >= 0) & 
        (df["hours_before_discharge"] <= max_window_days * 24)]

# 3. Assign time bins (aligned to discharge)
df['time_bin'] = (np.floor(df['hours_before_discharge'] / time_bin_hours) 
                    * time_bin_hours)

# Grouping by patient and time bin
grouped = df.sort_values(["subject_id", "hadm_id", "time_bin"])\
            .groupby(["subject_id", "hadm_id", "time_bin"])

# Initializing arrays to hold features, masking, timestamps, and patient IDs
n_timesteps = len(grouped)
x = np.zeros((n_features, n_timesteps))
masking = np.zeros_like(x)
timestamps = np.zeros(n_timesteps)
patient_ids = []

# Populating arrays to  hold features, masking, timestamps, and patient IDs
for i, ((subj_id, adm_id, time_bin), group) in enumerate(grouped):
    # get the time bin as a timestamp
    timestamps[i] = time_bin
    patient_ids.append(f"{subj_id}_{adm_id}")
    
    for _, row in group.iterrows():
        # get the feature index from the inputdict
        feat_idx = inputdict[row["itemid"]]
        # Fill the feature value and masking
        x[feat_idx, i] = row["valuenum"]
        # Set masking to 1 if the feature is present
        masking[feat_idx, i] = 1

# Calculating delta (time since last observation)
delta = np.zeros_like(x)
for i in range(1, n_timesteps):
    # calculate the time gap between the current and previous time bin
    time_gap = timestamps[i-1] - timestamps[i]  # Note: reversed for "before discharge"
    # if the previous time bin was missing, accumulate the time gap
    # else use the actual time gap
    delta[:, i] = np.where(
        masking[:, i-1] == 0,
        time_gap + delta[:, i-1],  # Accumulate if missing
        time_gap                   # Else use actual gap
    )

In [None]:
df.head(15)

In [None]:
delta[0, 0:10]  

In [None]:
masking[0, 0:10]  

In [None]:
x[3, 0:30] 

In [None]:
x.shape, masking.shape, delta.shape, timestamps.shape

In [None]:
np.unique(x)

In [None]:
import numpy as np
import pandas as pd

def df_to_x_m_d(df, max_window_days=7):
    """
    Convert DataFrame to GRU-D inputs (x, masking, delta), using raw `itemid` as indices.
    
    Args:
        df: DataFrame with columns ['subject_id', 'hadm_id', 'itemid', 'charttime', 'valuenum', 'dischtime'].
        max_window_days: Maximum days before discharge to include.
    
    Returns:
        x: Feature matrix of shape (n_features, n_timesteps).
        masking: Binary mask of observed values (same shape as x).
        delta: Time gaps since last observation (same shape as x).
        timestamps: Hours since discharge for each timestep.
        ids: DataFrame with ['subject_id', 'hadm_id'] for each timestep.
    """

    # --- 1. Preprocess Timestamps ---
    df["hours_since_discharge"] = (df["dischtime"] - df["charttime"]).dt.total_seconds() / 3600

    # Filter to keep only within the max window
    df = df[(df["hours_since_discharge"] >= 0) & (df["hours_since_discharge"] <= max_window_days * 24)].copy()

    # --- 2. Group by Patient and Time ---
    # Sort dataframe
    df = df.sort_values(by=["subject_id", "hadm_id", "charttime"])

    # Group by 'subject_id', 'hadm_id', 'charttime'
    grouped = df.groupby(["subject_id", "hadm_id", "charttime"])

    n_timesteps = len(grouped)

    # --- 3. Initialize Arrays ---
    n_features = df["itemid"].max() + 1  # Assumes itemids start at 0
    x = np.zeros((n_features, n_timesteps))
    masking = np.zeros_like(x)
    timestamps = np.zeros(n_timesteps)
    ids = []

    # --- 4. Populate x, masking, and timestamps ---
    for i, ((subj_id, adm_id, time), group) in enumerate(grouped):
        timestamps[i] = (time - group["dischtime"].iloc[0]).total_seconds() / 3600
        ids.append({"subject_id": subj_id, "hadm_id": adm_id})
        for _, row in group.iterrows():
            x[int(row["itemid"]), i] = row["valuenum"]
            masking[int(row["itemid"]), i] = 1

    # --- 5. Calculate delta ---
    delta = np.zeros_like(x)
    for i in range(1, n_timesteps):
        time_gap = timestamps[i] - timestamps[i-1]
        delta[:, i] = np.where(
            masking[:, i-1] == 0,
            time_gap + delta[:, i-1],  # Accumulate if previous value was missing
            time_gap                   # Else use actual time gap
        )

    return x, masking, delta, timestamps, pd.DataFrame(ids)


In [None]:
x, masking, delta, timestamps, ids = df_to_x_m_d(temp_df, max_window_days=7)

print("x shape:", x.shape)          # (max_itemid + 1, n_timesteps)
print("masking shape:", masking.shape)  # Same as x
print("delta shape:", delta.shape)    # Same as x
print("Timestamps (hours before discharge):", timestamps)
# print("Patient IDs:", ids)

In [None]:
ids.head()

In [None]:
timestamps # Display first 5 timestamps