## Show the tables in schema

In [4]:
import sys
import os
sys.path.append(os.path.abspath(".."))  # adding the parent directory of 'notebooks' to sys.path
from utils import Database
engine = Database()
schema_list = ["mimiciv_icu", "mimiciv_hosp"]
for schema in schema_list:
    print(f"Schema: {schema}")
    tables = Database.show_tables_in_schema(engine, schema)
    print(tables)

Schema: mimiciv_icu
['caregiver', 'chartevents', 'datetimeevents', 'd_items', 'icustays', 'ingredientevents', 'inputevents', 'outputevents', 'procedureevents']
Schema: mimiciv_hosp
['admissions', 'd_hcpcs', 'diagnoses_icd', 'd_icd_diagnoses', 'd_icd_procedures', 'd_labitems', 'drgcodes', 'emar_detail', 'emar', 'hcpcsevents', 'labevents', 'microbiologyevents', 'omr', 'patients', 'pharmacy', 'poe_detail', 'poe', 'prescriptions', 'procedures_icd', 'provider', 'services', 'transfers']


## Load tables into a dataframe

In [None]:
import pandas as pd
admission_db = Database()
chunk_size = 100000
admission_df = admission_db.read_table_to_df(
    table_name="admissions", 
    schema_name="mimiciv_hosp", 
    limit=chunk_size,
    # order_by="admittime",
)

labevents_df = admission_db.read_table_to_df(
    table_name="labevents", 
    schema_name="mimiciv_hosp", 
    limit=chunk_size,
)

# datetime conversions
admission_df['admittime'] = pd.to_datetime(admission_df['admittime'], errors='coerce')
admission_df['dischtime'] = pd.to_datetime(admission_df['dischtime'], errors='coerce')
labevents_df['charttime'] = pd.to_datetime(labevents_df['charttime'], errors='coerce')


In [None]:
patient_data_df = pd.merge(
    labevents_df,
    admission_df[["subject_id", "hadm_id", "admittime", "dischtime"]],
    on=["subject_id", "hadm_id"],
    how="inner"
)
# Filter labevents to only include those within the admission time frame
patient_data_df = patient_data_df[
    (patient_data_df.charttime >= patient_data_df.admittime)
    & (patient_data_df.charttime <= patient_data_df.dischtime)
]




In [None]:
patient_data_df["hours_since_admit"] = (
    patient_data_df["charttime"] - patient_data_df["admittime"]
).dt.total_seconds() / 3600
# patient_data_df['hours_since_discht'] = (
#     patient_data_df['dischtime'] - patient_data_df['charttime']
# ).dt.total_seconds() / 3600

# valuenum is the float value of the 'value' column
patient_data_df = patient_data_df[
    [
        "subject_id",
        "hadm_id",
        "admittime",
        "dischtime",
        "hours_since_admit",
        "itemid",
        "valuenum",
        "charttime",
    ]
]
patient_data_df.head()

In [None]:
patient_data_df.columns

In [None]:
pivoted_patient_data_df = patient_data_df.pivot_table(
    index=['subject_id', 'hadm_id', 'hours_since_admit'],
    columns='itemid',
    values='valuenum',
)

In [None]:
pivoted_patient_data_df.head()

In [None]:
masked_patient_data_df = patient_data_df.pivot_table(
    index=['subject_id', 'hadm_id', 'hours_since_admit'],
    columns='itemid',
    values='valuenum',
    aggfunc=lambda x: 1 if not pd.isna(x).any() else 0
)
masked_patient_data_df.head()

In [None]:
# For each itemid, compute time since last observation
deltas = {}
itemids = patient_data_df['itemid'].unique()
for item in itemids:
    item_df = patient_data_df[patient_data_df['itemid'] == item].sort_values('hours_since_admit')
    item_df['delta'] = item_df['hours_since_admit'].diff().fillna(0)
    deltas[item] = item_df.set_index('hours_since_admit')['delta']

# Getting all `labevents` data and filtering

In [2]:
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine

# Load the .env file
load_dotenv()

# Database connection parameters
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(url)
conn = engine.connect()
cursor = conn.connection.cursor()
# Create the table temporarily in the database
cursor.execute("""
    CREATE TEMP TABLE temp_cohort (
        subject_id INT,
        hadm_id INT,
        admittime TIMESTAMP,
        dischtime TIMESTAMP,
        target  INT
    );
""")


In [3]:
import pandas as pd
from psycopg2.extras import execute_values
cohort_df = pd.read_csv('../assets/cohort1_target.csv')
cohort_df['admittime'] = pd.to_datetime(cohort_df['admittime'], errors='coerce')
cohort_df['dischtime'] = pd.to_datetime(cohort_df['dischtime'], errors='coerce')

# Insert the data from the DataFrame into the temporary table
cohort_df.to_sql("temp_cohort", engine, schema="public", index=False, if_exists="replace")

values = list(cohort_df.itertuples(index=False, name=None))
execute_values(cursor,
    "INSERT INTO temp_cohort (subject_id, hadm_id, admittime, dischtime, target) VALUES %s",
    values
)

conn.commit()

In [5]:
query = """
SELECT 
    le.subject_id, 
    le.hadm_id, 
    le.itemid, 
    le.charttime, 
    le.valuenum,
    tc.dischtime,
    tc.target
FROM mimiciv_hosp.labevents le
JOIN public.temp_cohort tc
  ON le.subject_id = tc.subject_id
 AND le.hadm_id = tc.hadm_id
 WHERE le.charttime BETWEEN (tc.dischtime - INTERVAL '14 days') AND tc.dischtime;
"""
# WHERE le.charttime BETWEEN (tc.dischtime - INTERVAL '7 days') AND tc.dischtime;

# Run query in chunks
lab_chunks = pd.read_sql(query, engine, chunksize=1000)

# Combine all chunks into a single DataFrame
lab_df = pd.concat(chunk for chunk in lab_chunks)

# Reset index after concat
lab_df.reset_index(drop=True, inplace=True)



OperationalError: (psycopg2.errors.ConfigurationLimitExceeded) temporary file size exceeds temp_file_limit (2097152kB)

[SQL: 
SELECT 
    le.subject_id, 
    le.hadm_id, 
    le.itemid, 
    le.charttime, 
    le.valuenum,
    tc.dischtime,
    tc.target
FROM mimiciv_hosp.labevents le
JOIN public.temp_cohort tc
  ON le.subject_id = tc.subject_id
 AND le.hadm_id = tc.hadm_id
 WHERE le.charttime BETWEEN (tc.dischtime - INTERVAL '14 days') AND tc.dischtime;
]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [12]:
try:
    lab_df.to_parquet("../dataset/lab_events_14_days_prior.parquet", index=False)
    # lab_df.to_parquet("../dataset/lab_events_7_days_prior.parquet", index=False)
    print("Data successfully written to lab_events_14_days_prior.parquet")
except Exception as e:
    lab_df.to_parquet("lab_events_14_days_prior.parquet", index=False)
    print(f"File written to current directory: {e}")


Data successfully written to lab_events_14_days_prior.parquet


In [13]:
lab_df.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime,target
0,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1
1,10010231,21586397,51678,2117-12-19 06:20:00,6.0,2117-12-23 16:51:00,1
2,10010231,21586397,50861,2117-12-18 10:12:00,22.0,2117-12-23 16:51:00,1
3,10010231,21586397,50862,2117-12-18 10:12:00,3.9,2117-12-23 16:51:00,1
4,10010231,21586397,50863,2117-12-18 10:12:00,68.0,2117-12-23 16:51:00,1


In [14]:
len(lab_df)

4859673

# Pre-processing data

In [1]:
import pandas as pd
import numpy as np

In [3]:
patient_data_df = pd.read_parquet("../dataset/raw/lab_events_7_days_prior.parquet")
patient_data_df.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime,target
0,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1
1,10022373,27450651,50878,2150-06-03 05:49:00,38.0,2150-06-06 14:30:00,0
2,10022373,27450651,51221,2150-05-31 06:00:00,25.9,2150-06-06 14:30:00,0
3,10022373,27450651,51222,2150-05-31 06:00:00,8.3,2150-06-06 14:30:00,0
4,10022373,27450651,51248,2150-05-31 06:00:00,31.0,2150-06-06 14:30:00,0


In [39]:
import numpy as np

df = patient_data_df.copy()
df["charttime"] = pd.to_datetime(df["charttime"])
df["dischtime"] = pd.to_datetime(df["dischtime"])

# Calculate days before discharge
df["days_before_discharge"] = (df["dischtime"] - df["charttime"]).dt.days

# Filter to 7-day window
df_filtered = df[
    (df["days_before_discharge"] >= 0) & (df["days_before_discharge"] <= 6)
].copy()

print(f"Processing {len(df_filtered)} records within 7-day windows...")

# Create feature identifier combining itemid and day
df_filtered["feature_id"] = (
    "itemid_"
    + df_filtered["itemid"].astype(str)
    + "_day_"
    + df_filtered["days_before_discharge"].astype(str)
)

# NUMERIC FEATURES - Pivot table with mean aggregation
numeric_pivot = df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="valuenum",
    aggfunc="mean",  # Average multiple measurements per day
    fill_value=np.nan,
)

# BINARY FEATURES - Pivot table indicating if measurement exists
# Create binary indicator (1 if any measurement, 0 if none)
df_filtered["has_measurement"] = 1
binary_pivot = df_filtered.pivot_table(
    index="hadm_id",
    columns="feature_id",
    values="has_measurement",
    aggfunc="max",  # Max will be 1 if any measurement exists
    fill_value=0,
)

# Add suffix to distinguish binary features
binary_pivot.columns = [col + "_measured" for col in binary_pivot.columns]

# Get targets for each admission
targets = df_filtered[["hadm_id", "target"]].drop_duplicates().set_index("hadm_id")

# Combine with targets
numeric_pivot = numeric_pivot.sort_index(axis=1)
numeric_pivot = numeric_pivot.ffill(axis=1).bfill(axis=1)  # Forward and backward fill to handle NaNs

numeric_features = numeric_pivot.join(targets).reset_index()
binary_features = binary_pivot.join(targets).reset_index()

print(f"Created numeric features: {numeric_features.shape}")
print(f"Created binary features: {binary_features.shape}")

Processing 3518406 records within 7-day windows...
Created numeric features: (4907, 2199)
Created binary features: (4908, 3083)


In [40]:
patient_data_df.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,dischtime,target
0,10010231,29368887,51233,2118-01-15 17:45:00,,2118-01-20 14:00:00,1
1,10022373,27450651,50878,2150-06-03 05:49:00,38.0,2150-06-06 14:30:00,0
2,10022373,27450651,51221,2150-05-31 06:00:00,25.9,2150-06-06 14:30:00,0
3,10022373,27450651,51222,2150-05-31 06:00:00,8.3,2150-06-06 14:30:00,0
4,10022373,27450651,51248,2150-05-31 06:00:00,31.0,2150-06-06 14:30:00,0


In [41]:
numeric_features.head()

Unnamed: 0,hadm_id,itemid_50801_day_2,itemid_50801_day_6,itemid_50802_day_0,itemid_50802_day_1,itemid_50802_day_2,itemid_50802_day_3,itemid_50802_day_4,itemid_50802_day_5,itemid_50802_day_6,...,itemid_53173_day_4,itemid_53173_day_6,itemid_53174_day_0,itemid_53174_day_1,itemid_53174_day_2,itemid_53174_day_3,itemid_53174_day_4,itemid_53174_day_6,itemid_53180_day_1,target
0,20004072,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,...,73.85,73.85,73.85,73.85,73.85,73.85,73.85,73.85,73.85,1
1,20004811,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,67.0,...,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,0
2,20006731,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,...,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,0
3,20008395,135.0,135.0,135.0,135.0,135.0,135.0,135.0,135.0,135.0,...,51.9,51.9,51.9,51.9,51.9,51.9,51.9,51.9,51.9,0
4,20010041,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,...,44.6,44.6,44.6,44.6,44.6,44.6,44.6,44.6,44.6,0


# Training a Random Forest

In [45]:
from sklearn.preprocessing import StandardScaler
