## Show the tables in each schema

In [1]:
import sys
import os

# Put the parent of the current working directory on sys.path so that the
# `utils` module can be imported from this notebook.
sys.path.append(os.path.abspath(".."))
from utils import Database

engine = Database()
schema_list = ["mimiciv_icu", "mimiciv_hosp"]

# For every schema: print its name, then whatever show_tables_in_schema returns for it.
for schema_name in schema_list:
    print(f"Schema: {schema_name}")
    table_names = Database.show_tables_in_schema(engine, schema_name)
    print(table_names)

Schema: mimiciv_icu
['caregiver', 'chartevents', 'datetimeevents', 'd_items', 'icustays', 'ingredientevents', 'inputevents', 'outputevents', 'procedureevents']
Schema: mimiciv_hosp
['admissions', 'd_hcpcs', 'diagnoses_icd', 'd_icd_diagnoses', 'd_icd_procedures', 'd_labitems', 'drgcodes', 'emar_detail', 'emar', 'hcpcsevents', 'labevents', 'microbiologyevents', 'omr', 'patients', 'pharmacy', 'poe_detail', 'poe', 'prescriptions', 'procedures_icd', 'provider', 'services', 'transfers']


## Load tables into DataFrames

In [None]:
import pandas as pd

admission_db = Database()
chunk_size = 100000  # passed as `limit` to both table reads below

# Both reads use the same schema and the same limit; only the table differs.
# (An `order_by="admittime"` argument for the admissions read is currently disabled.)
shared_read_args = {"schema_name": "mimiciv_hosp", "limit": chunk_size}
admission_df = admission_db.read_table_to_df(table_name="admissions", **shared_read_args)
labevents_df = admission_db.read_table_to_df(table_name="labevents", **shared_read_args)

# Parse the timestamp columns; errors="coerce" turns unparseable values into NaT
# instead of raising.
for frame, time_columns in (
    (admission_df, ("admittime", "dischtime")),
    (labevents_df, ("charttime",)),
):
    for column in time_columns:
        frame[column] = pd.to_datetime(frame[column], errors="coerce")


In [None]:
# Attach each admission's time window to its lab events. The inner join keeps
# only lab rows whose (subject_id, hadm_id) pair is also present in admission_df.
patient_data_df = pd.merge(
    labevents_df,
    admission_df[["subject_id", "hadm_id", "admittime", "dischtime"]],
    on=["subject_id", "hadm_id"],
    how="inner",
)

# Keep only lab events charted inside the admission window (both ends inclusive).
within_admission = (
    (patient_data_df["charttime"] >= patient_data_df["admittime"])
    & (patient_data_df["charttime"] <= patient_data_df["dischtime"])
)
# `.copy()` makes the filtered result an independent frame, so adding columns to
# it afterwards does not trigger pandas' SettingWithCopyWarning.
patient_data_df = patient_data_df.loc[within_admission].copy()




In [None]:
# Add the elapsed time (in hours) between admission and each lab measurement,
# then narrow the frame to the columns used from here on.
#
# `assign` builds a new frame instead of writing a column into patient_data_df,
# which may be a boolean-filtered slice of another frame; assigning into such a
# slice is what raises SettingWithCopyWarning.
#
# valuenum is the float value of the 'value' column
patient_data_df = patient_data_df.assign(
    hours_since_admit=lambda df: (
        df["charttime"] - df["admittime"]
    ).dt.total_seconds() / 3600
)[
    [
        "subject_id",
        "hadm_id",
        "admittime",
        "dischtime",
        "hours_since_admit",
        "itemid",
        "valuenum",
        "charttime",
    ]
]
patient_data_df.head()

In [None]:
# Display the column labels of patient_data_df.
patient_data_df.columns

In [None]:
# Reshape the long lab table to wide form: one row per
# (subject_id, hadm_id, hours_since_admit) and one column per itemid, filled
# with valuenum. Several values landing in the same cell are averaged
# ("mean" is pivot_table's default aggregation, spelled out here).
pivoted_patient_data_df = pd.pivot_table(
    patient_data_df,
    values="valuenum",
    index=["subject_id", "hadm_id", "hours_since_admit"],
    columns="itemid",
    aggfunc="mean",
)

In [None]:
# Preview the first rows of the wide (pivoted) table.
pivoted_patient_data_df.head()

In [None]:
# Observation mask laid out like the value pivot: 1 where a
# (subject_id, hadm_id, hours_since_admit) x itemid cell has at least one
# non-null valuenum, 0 otherwise.
#
# - aggfunc="count" counts the non-null values per cell, so a cell is marked
#   observed exactly when the mean-based value pivot has a number for it.
# - fill_value=0 turns row/item combinations that never occur in the data into
#   0; without it they come out as NaN and the mask is not binary.
masked_patient_data_df = (
    patient_data_df.pivot_table(
        index=['subject_id', 'hadm_id', 'hours_since_admit'],
        columns='itemid',
        values='valuenum',
        aggfunc='count',
        fill_value=0,
    )
    .gt(0)
    .astype(int)
)
masked_patient_data_df.head()

In [None]:
# For each itemid, compute the time (in hours) since that item's previous
# observation within the same admission.
#
# The difference is taken per (subject_id, hadm_id): differencing over all rows
# of an item at once would subtract hours_since_admit values that belong to
# different admissions.
#
# Result: `deltas` maps itemid -> Series of gaps, indexed by hours_since_admit
# and ordered by hours_since_admit.
deltas = {}
itemids = patient_data_df['itemid'].unique()
for item in itemids:
    item_df = patient_data_df.loc[patient_data_df['itemid'] == item].sort_values('hours_since_admit')
    per_admission_gap = (
        item_df.groupby(['subject_id', 'hadm_id'])['hours_since_admit']
        .diff()
        .fillna(0)  # an item's first observation in an admission has no predecessor
    )
    # assign() returns a new frame, so no column is written into a filtered slice.
    item_df = item_df.assign(delta=per_admission_gap)
    deltas[item] = item_df.set_index('hours_since_admit')['delta']

## Getting all `labevents` data

In [9]:
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Load the .env file so the DB_* settings below are available via os.getenv.
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Fail early with a readable message instead of building a URL that contains
# the literal text "None".
missing_settings = [
    name
    for name, value in {
        "DB_USER": DB_USER,
        "DB_HOST": DB_HOST,
        "DB_PORT": DB_PORT,
        "DB_NAME": DB_NAME,
    }.items()
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing database settings in environment/.env: " + ", ".join(missing_settings)
    )

# URL.create takes each component separately, so characters such as '@', ':'
# or '/' inside the user name or password cannot corrupt the connection URL
# (they would if interpolated into an f-string).
url = URL.create(
    drivername="postgresql+psycopg2",
    username=DB_USER,
    password=DB_PASS,
    host=DB_HOST,
    port=int(DB_PORT),
    database=DB_NAME,
)
engine = create_engine(url)
conn = engine.connect()
# Raw DBAPI cursor taken from the connection underneath `conn`.
cursor = conn.connection.cursor()
# Create the table temporarily in the database
cursor.execute("""
    CREATE TEMP TABLE temp_cohort (
        subject_id INT,
        hadm_id INT,
        admittime TIMESTAMP,
        dischtime TIMESTAMP,
        target  INT
    );
""")


In [10]:
import pandas as pd
from psycopg2.extras import execute_values  # not used in this cell any more; import kept in case other cells rely on it

# Read the cohort file and parse its timestamp columns (unparseable values become NaT).
cohort_df = pd.read_csv('../assets/cohort1_target.csv')
cohort_df['admittime'] = pd.to_datetime(cohort_df['admittime'], errors='coerce')
cohort_df['dischtime'] = pd.to_datetime(cohort_df['dischtime'], errors='coerce')

# Load the cohort into public.temp_cohort, replacing any previous copy. This is
# the table the lab-events query joins against (it references
# `public.temp_cohort` and runs through `engine`).
#
# The cohort used to be inserted a second time through `cursor` with
# execute_values. That second load went over a separate raw connection and is
# not what the `public.temp_cohort` join reads, so it only duplicated work; it
# also passed raw itertuples() values (which can include NaT after the coerce
# above) straight to the driver.
# NOTE(review): the unqualified `temp_cohort` in that INSERT presumably resolved
# to the session-local TEMP table rather than to public.temp_cohort - confirm.
cohort_df.to_sql("temp_cohort", engine, schema="public", index=False, if_exists="replace")

# `cursor` was opened on the raw DBAPI connection (`conn.connection`), so the
# commit is issued there to close the transaction in which the temp table was
# created.
# NOTE(review): `conn.commit()` appears to cover only transactions SQLAlchemy
# itself began, not statements sent through the raw cursor - confirm.
conn.connection.commit()

In [13]:
# Lab events of every cohort admission whose charttime lies between 7 days
# before discharge and discharge itself (BETWEEN includes both bounds).
# A 14-day window was used previously; swap the interval literal to restore it.
query = """
SELECT 
    le.subject_id, 
    le.hadm_id, 
    le.itemid, 
    le.charttime, 
    le.valuenum,
    tc.target
FROM mimiciv_hosp.labevents le
JOIN public.temp_cohort tc
  ON le.subject_id = tc.subject_id
 AND le.hadm_id = tc.hadm_id
WHERE le.charttime BETWEEN (tc.dischtime - INTERVAL '7 days') AND tc.dischtime;
"""

# Fetch the result in chunks of 1000 rows ...
lab_chunks = pd.read_sql(query, engine, chunksize=1000)

# ... and stitch the chunks into one frame with a fresh 0..n-1 index.
lab_df = pd.concat(lab_chunks, ignore_index=True)



In [14]:
from pathlib import Path

# Save the 7-day lab extract as parquet under ../dataset.
# The target folder is created first, so a missing directory no longer forces
# the fallback. Only filesystem errors (OSError) trigger the fallback to the
# current directory; any other failure propagates unchanged instead of being
# hidden by a broad `except Exception`.
output_path = Path("../dataset/lab_events_7_days_prior.parquet")
try:
    output_path.parent.mkdir(parents=True, exist_ok=True)
    lab_df.to_parquet(output_path, index=False)
    print("Data successfully written to lab_events_7_days_prior.parquet")
except OSError as e:
    # Fall back to the same file name in the current working directory.
    lab_df.to_parquet(output_path.name, index=False)
    print(f"Could not write to {output_path} ({e}); file written to current directory instead")


Data successfully written to lab_events_7_days_prior.parquet


In [15]:
# Preview the first rows of the combined lab-events frame.
lab_df.head()

Unnamed: 0,subject_id,hadm_id,itemid,charttime,valuenum,target
0,16893819,29809400,50983,2201-03-15 05:10:00,141.0,0
1,16893819,29809400,51006,2201-03-15 05:10:00,11.0,0
2,16893819,29809400,51678,2201-03-15 05:10:00,6.0,0
3,16893819,29809400,50868,2201-03-14 06:42:00,10.0,0
4,16893819,29809400,50882,2201-03-14 06:42:00,28.0,0


In [16]:
# Number of rows in lab_df.
len(lab_df)

3518649