# Init

In [1]:
import os

# Remember the directory the kernel started in. Without this anchor, every
# re-run of the cell would climb one more level up the directory tree.
if '_NB_START_DIR' not in globals():
    _NB_START_DIR = os.getcwd()

# Work from the parent of the start directory; later cells use paths relative
# to it (e.g. 'data/...', 'config/...', 'output/...', 'code/...').
os.chdir(os.path.join(_NB_START_DIR, os.pardir))
os.getcwd()

'/Users/wliao0504/code/clif/CLIF-epi-of-sedation'

## Import

In [None]:
from clifpy import ClifOrchestrator
import pandas as pd
import duckdb
from pathlib import Path

In [3]:
# Site identifier; interpolated into the intermediate parquet file name below.
site = "mimic"

## Utils

In [None]:
def run_query_from_file(sql_file_path: str) -> pd.DataFrame:
    """
    Load a query from a .sql file and execute it with DuckDB.

    The query is run through the module-level ``duckdb.sql`` call; no explicit
    connection object is passed in.

    Args:
        sql_file_path: Path to the .sql file, either absolute or relative to
            the current working directory.

    Returns:
        A pandas DataFrame with the results of the query.

    Raises:
        OSError: If the file cannot be read (e.g. ``FileNotFoundError`` when
            the path does not exist).
    """
    print(f"--- Loading and executing query from {sql_file_path} ---")

    # Read the whole file as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    query = Path(sql_file_path).read_text(encoding="utf-8")

    # Execute the query and materialize the result as a pandas DataFrame.
    result_df = duckdb.sql(query).df()

    print("Query executed successfully.")
    return result_df

# Preprocess

In [4]:
# Read the cohort IDs as text from the start. Parsing them as numbers first
# would drop leading zeros, and the DataFrame's key column (used in SQL joins
# further down) would not match the string list derived from it.
cohort_hosp_ids_df = pd.read_csv(
    'data/cohort_hosp_ids.csv',
    dtype={'hospitalization_id': str},
)
cohort_hosp_ids = cohort_hosp_ids_df['hospitalization_id'].astype(str).tolist()

## Resp

In [5]:
# Pick the config file from `site` so the data source follows the same
# identifier as the site-prefixed intermediate files used in this notebook.
# With site == 'mimic' this resolves to "config/mimic_config.json".
co = ClifOrchestrator(config_path=f"config/{site}_config.json")
co.initialize(tables=['respiratory_support'])

📢 ClifOrchestrator initialized
📢 Initialized respiratory_support table
📢 Data directory: /Users/wliao0504/code/clif/CLIF-MIMIC/output/rclif-dev-test
📢 File type: parquet
📢 Timezone: US/Eastern
📢 Output directory: /Users/wliao0504/code/clif/CLIF-epi-of-sedation/output
📢 Loaded schema from /Users/wliao0504/code/clif/CLIF-epi-of-sedation/.venv/lib/python3.12/site-packages/clifpy/schemas/respiratory_support_schema.yaml
📢 Loaded outlier configuration


In [6]:
# processed_bf = co.respiratory_support.waterfall(bfill = True)

In [7]:
# processed_bf.df.to_parquet(f"output/intermediate/{site}_resp_processed_bf.parquet")

In [8]:
# Reuse the cached waterfall output when it exists; otherwise rebuild and
# cache it so the notebook also runs when the intermediate file is missing.
resp_cache_path = Path(f"output/intermediate/{site}_resp_processed_bf.parquet")
if resp_cache_path.exists():
    resp_p = pd.read_parquet(resp_cache_path)
else:
    processed_bf = co.respiratory_support.waterfall(bfill = True)
    resp_p = processed_bf.df
    resp_cache_path.parent.mkdir(parents=True, exist_ok=True)
    resp_p.to_parquet(resp_cache_path)

In [None]:
# Row-level view over three hard-coded hospitalizations: for each respiratory
# row, look up the previous mode_category within the hospitalization and flag
# rows where a mode containing 'control' is followed by 'pressure support/cpap'.
# `resp_p` and `cohort_hosp_ids_df` are the DataFrames from earlier cells,
# referenced by name inside the SQL.
# NOTE(review): the f-prefix is not needed here — the string has no placeholders.
q = f"""
SELECT hospitalization_id
    , recorded_dttm
    , device_category, device_name
    , mode_category, mode_name
    , fio2_set
    , peep_set
    , tracheostomy
    , _mode_prev: LAG(mode_category) OVER (PARTITION BY hospitalization_id ORDER BY recorded_dttm)
    , _switch_to_ps_cpap: CASE 
        -- last mode is a control mode
        WHEN contains(_mode_prev, 'control') 
            AND mode_category in ('pressure support/cpap') 
            THEN 1 ELSE 0 END
    , 
FROM resp_p
INNER JOIN cohort_hosp_ids_df USING (hospitalization_id)
WHERE hospitalization_id in ('20001361', '20004088', '20005024')
"""
# NOTE(review): the result is stored as `resp_view_0`; later cells read a
# DataFrame called `resp_view`, which is a different name — confirm intent.
resp_view_0 = duckdb.sql(q).df()

In [63]:
# Segment-level view of ventilator modes for three hard-coded hospitalizations.
# CTE pipeline:
#   base         - respiratory rows for the cohort, plus the previous row's
#                  mode_category and a row-level control -> PS/CPAP switch flag.
#   seg_bounds   - one row per (hospitalization_id, mode_cat_id): first and
#                  last recorded_dttm and a mode_category for the group.
#   seg_timing   - segment end = start of the next segment (falling back to
#                  the segment's own last row), plus the previous segment's mode.
#   seg_enriched - segment duration in minutes, a PS/CPAP-after-control flag,
#                  and whether such a segment lasted at least 30 minutes.
#   final        - joins the segment flags back onto rows and derives
#                  `_sbt_done` (persisting segment with peep_set <= 8 and
#                  pressure_support_set <= 8).
# The closing SELECT reads from `seg_enriched`, so `final` (and `_sbt_done`)
# is not part of what this cell returns.
# NOTE(review): assumes `mode_cat_id` in resp_p identifies a run of rows
# sharing one mode_category — confirm against the waterfall output.
q = f"""
WITH base AS (
    FROM resp_p
    INNER JOIN cohort_hosp_ids_df USING (hospitalization_id)
    SELECT hospitalization_id
         , recorded_dttm
         , device_category, device_name
         , mode_category, mode_name
         , mode_cat_id
         , fio2_set
         , peep_set
         , pressure_support_set
         , tracheostomy
         , _mode_prev: LAG(mode_category) OVER w_mode
         , _switch_to_ps_cpap: CASE
             WHEN contains(_mode_prev, 'control')
              AND mode_category IN ('pressure support/cpap')
             THEN 1 ELSE 0 END
    WHERE hospitalization_id IN ('20001361', '20004088', '20005024')
    WINDOW w_mode AS (PARTITION BY hospitalization_id ORDER BY recorded_dttm)
),
-- segment attributes keyed by existing mode_cat_id
seg_bounds AS (
    FROM base
    SELECT hospitalization_id
         , _seg_id: mode_cat_id 
         , _seg_start: MIN(recorded_dttm)
         , _seg_last_seen: MAX(recorded_dttm)
         , _seg_mode: ANY_VALUE(mode_category)
    GROUP BY hospitalization_id, _seg_id
),
-- compute segment end and previous segment mode
seg_timing AS (
    FROM seg_bounds
    SELECT hospitalization_id
         , _seg_id
         , _seg_mode
         , _seg_start
         , _seg_last_seen -- timing of the last row for this segment
         , _next_seg_start: LEAD(_seg_start) OVER w_seg -- timing of the first row of the next segment
         , _seg_end: COALESCE(_next_seg_start, _seg_last_seen)
         , _prev_seg_mode: LAG(_seg_mode) OVER w_seg
    WINDOW w_seg AS (PARTITION BY hospitalization_id ORDER BY _seg_start)
),
seg_enriched AS (
    FROM seg_timing
    SELECT hospitalization_id
         , _seg_id
         , _seg_mode
         , _seg_start
         , _seg_end
         , _prev_seg_mode
         , _seg_duration_min: date_diff('minute', _seg_start, _seg_end)
         , _is_ps_cpap_after_control: CASE
             WHEN _seg_mode = 'pressure support/cpap'
              AND contains(_prev_seg_mode, 'control')
             THEN 1 ELSE 0 END
         , _persists_30min: CASE
             WHEN _seg_mode = 'pressure support/cpap'
              AND contains(_prev_seg_mode, 'control')
              AND date_diff('minute', _seg_start, _seg_end) >= 30
             THEN 1 ELSE 0 END
),
final AS (
    FROM base AS s
    LEFT JOIN seg_enriched AS se
      ON se.hospitalization_id = s.hospitalization_id
     AND se._seg_id           = s.mode_cat_id
    SELECT s.hospitalization_id
         , s.recorded_dttm
         , s.device_category, s.device_name
         , s.mode_category, s.mode_name
         , s.fio2_set
         , s.peep_set
         , s.pressure_support_set
         , s.tracheostomy
         , s._switch_to_ps_cpap
         , se._is_ps_cpap_after_control
         , se._persists_30min
         , _sbt_done: CASE 
             WHEN se._persists_30min = 1 
              AND s.peep_set <= 8
              AND s.pressure_support_set <= 8
             THEN 1 ELSE 0 END
)
SELECT *
FROM seg_enriched
ORDER BY hospitalization_id, _seg_id
"""
# One row per segment, as produced by `seg_enriched`.
seg_view = duckdb.sql(q).df()

In [123]:
# Run the query stored in code/sbt.sql and keep its result as a DataFrame.
sbt_blocks_df = run_query_from_file(sql_file_path='code/sbt.sql')

--- Loading and executing query from code/sbt.sql ---
Query executed successfully.


In [125]:
# Run the query stored in code/sbt.sql again, under a second name.
# NOTE(review): this is the same file the `sbt_blocks_df` cell executes —
# confirm whether both variables are needed.
sbt_results_df = run_query_from_file(sql_file_path='code/sbt.sql')

--- Loading and executing query from code/sbt.sql ---
Query executed successfully.


## ADT

In [10]:
# Load the `adt` table, keeping five columns and only cohort hospitalizations.
adt = co.load_table(
    'adt',
    columns=[
        'hospitalization_id',
        'in_dttm',
        'out_dttm',
        'location_name',
        'location_category',
    ],
    filters=dict(hospitalization_id=cohort_hosp_ids),
)
# Plain DataFrame handle for use in SQL / pandas code.
adt_df = adt.df

📢 Initialized adt table
📢 Data directory: /Users/wliao0504/code/clif/CLIF-MIMIC/output/rclif-dev-test
📢 File type: parquet
📢 Timezone: US/Eastern
📢 Output directory: /Users/wliao0504/code/clif/CLIF-epi-of-sedation/output
📢 Loaded schema from /Users/wliao0504/code/clif/CLIF-epi-of-sedation/.venv/lib/python3.12/site-packages/clifpy/schemas/adt_schema.yaml
📢 Loaded outlier configuration


## Vitals

In [11]:
# Load the `vitals` table for the cohort, restricted to the spo2 and
# weight_kg categories and to the four columns used downstream.
vitals = co.load_table(
    'vitals',
    columns=[
        'hospitalization_id',
        'recorded_dttm',
        'vital_category',
        'vital_value',
    ],
    filters=dict(
        vital_category=['spo2', 'weight_kg'],
        hospitalization_id=cohort_hosp_ids,
    ),
)
# Plain DataFrame handle for use in SQL / pandas code.
vitals_df = vitals.df

📢 Initialized vitals table
📢 Data directory: /Users/wliao0504/code/clif/CLIF-MIMIC/output/rclif-dev-test
📢 File type: parquet
📢 Timezone: US/Eastern
📢 Output directory: /Users/wliao0504/code/clif/CLIF-epi-of-sedation/output
📢 Loaded schema from /Users/wliao0504/code/clif/CLIF-epi-of-sedation/.venv/lib/python3.12/site-packages/clifpy/schemas/vitals_schema.yaml
📢 Loaded outlier configuration


In [12]:
# Reshape the long `vitals_df` into one column per vital_category (the
# displayed result has `spo2` and `weight_kg` columns), taking MAX(vital_value)
# within each group of the remaining columns.
q = """
PIVOT_WIDER vitals_df
ON vital_category
USING MAX(vital_value)
"""
vitals_w = duckdb.sql(q).df()
# Show the first rows as a quick sanity check of the pivot.
vitals_w.head()

Unnamed: 0,hospitalization_id,recorded_dttm,spo2,weight_kg
0,25530842,2177-06-06 02:00:00-06:00,95.0,
1,23403978,2193-02-20 16:00:00-06:00,,76.8
2,27687784,2154-08-11 08:00:00-06:00,93.0,
3,27687784,2154-08-14 10:00:00-06:00,96.0,
4,27246915,2179-09-26 18:00:00-06:00,99.0,


## Meds

In [13]:
# Load continuous medication administrations for the cohort, restricted to
# the 'vasoactives' med_group and to the columns needed for dose conversion.
mac = co.load_table(
    'medication_admin_continuous',
    columns=[
        'hospitalization_id',
        'admin_dttm',
        'med_name',
        'med_category',
        'med_dose',
        'med_dose_unit',
    ],
    filters=dict(
        med_group=['vasoactives'],
        hospitalization_id=cohort_hosp_ids,
    ),
)

📢 Initialized medication_admin_continuous table
📢 Data directory: /Users/wliao0504/code/clif/CLIF-MIMIC/output/rclif-dev-test
📢 File type: parquet
📢 Timezone: US/Eastern
📢 Output directory: /Users/wliao0504/code/clif/CLIF-epi-of-sedation/output
📢 Loaded schema from /Users/wliao0504/code/clif/CLIF-epi-of-sedation/.venv/lib/python3.12/site-packages/clifpy/schemas/medication_admin_continuous_schema.yaml
📢 Loaded outlier configuration


In [14]:
from clifpy.utils.unit_converter import convert_dose_units_by_med_category

# Target unit per med_category: everything in mcg/kg/min, except vasopressin,
# which is overridden to u/min right after.
preferred_units = dict.fromkeys(
    [
        'dopamine',
        'dobutamine',
        'norepinephrine',
        'epinephrine',
        'phenylephrine',
        'angiotensin',
        'vasopressin',
        'milrinone',
    ],
    'mcg/kg/min',
)
preferred_units['vasopressin'] = 'u/min'

# Convert the doses to the preferred units; `vitals_df` is passed in so the
# converter can look up weights (see its "adding the most recent from vitals"
# message in the cell output).
mac_converted, mac_summary = convert_dose_units_by_med_category(
    mac.df,
    vitals_df=vitals_df,
    preferred_units=preferred_units,
)

No weight_kg column found, adding the most recent from vitals


In [15]:
# Pivot the converted medication rows to wide form.
#   t1 - builds a column label "<med_category>_<unit>" with '/' in the unit
#        replaced by '_' (e.g. a 'mcg/kg/min' unit yields a '..._mcg_kg_min' label).
#   t2 - one column per label, holding FIRST(med_dose_converted) for each
#        (hospitalization_id, admin_dttm) pair.
# NOTE(review): FIRST() has no explicit ordering here, so if several rows share
# the same timestamp and label the value kept may not be deterministic — confirm.
q = """
WITH t1 AS (
    SELECT hospitalization_id
        , admin_dttm
        , med_category_unit: med_category || '_' || REPLACE(med_dose_unit_converted, '/', '_')
        , med_dose_converted
    FROM mac_converted
)
, t2 AS (
    PIVOT_WIDER t1
    ON med_category_unit
    USING FIRST(med_dose_converted)
)
SELECT *
FROM t2
ORDER BY hospitalization_id, admin_dttm
"""
mac_w = duckdb.sql(q).df()

In [16]:
# Have the orchestrator build its wide dataset from the vitals table only;
# `co.wide_df` is read in the following cell.
co.create_wide_dataset(tables_to_load=['vitals'], batch_size=-1)

📢 🚀 WIDE DATASET CREATION STARTED
📢 Phase 1: Initialization
📢 Phase 2: Encounter Processing
📢 Phase 3: Table Loading
📢   3.1: Auto-loading base tables
📢        - Loading patient table
📢 Initialized patient table
📢 Data directory: /Users/wliao0504/code/clif/CLIF-MIMIC/output/rclif-dev-test
📢 File type: parquet
📢 Timezone: US/Eastern
📢 Output directory: /Users/wliao0504/code/clif/CLIF-epi-of-sedation/output
📢 Loaded schema from /Users/wliao0504/code/clif/CLIF-epi-of-sedation/.venv/lib/python3.12/site-packages/clifpy/schemas/patient_schema.yaml
📢 Loaded outlier configuration
📢        - Loading hospitalization table
📢 Initialized hospitalization table
📢 Data directory: /Users/wliao0504/code/clif/CLIF-MIMIC/output/rclif-dev-test
📢 File type: parquet
📢 Timezone: US/Eastern
📢 Output directory: /Users/wliao0504/code/clif/CLIF-epi-of-sedation/output
📢 Loaded schema from /Users/wliao0504/code/clif/CLIF-epi-of-sedation/.venv/lib/python3.12/site-packages/clifpy/schemas/hospitalization_schema.yaml


In [17]:
# Keep a handle on the orchestrator's wide DataFrame (presumably populated by
# the create_wide_dataset call above — confirm).
# NOTE(review): `wide` is not referenced again in the cells shown.
wide = co.wide_df

## Join

In [None]:
# Build the unified timeline: every distinct (hospitalization_id, timestamp)
# pair seen in the respiratory, vitals, or medication tables. UNION (not
# UNION ALL) removes duplicate pairs. The ADT in/out timestamps are currently
# excluded (commented out inside the query).
# NOTE(review): `resp_view` is not assigned in any cell shown above (only
# `resp_view_0` and `seg_view` are) — confirm where it is defined, otherwise
# this cell fails on a fresh kernel.
q = """
SELECT hospitalization_id, recorded_dttm AS event_dttm FROM resp_view
UNION
SELECT hospitalization_id, recorded_dttm AS event_dttm FROM vitals_w
UNION
SELECT hospitalization_id, admin_dttm AS event_dttm FROM mac_w
--UNION
--SELECT hospitalization_id, in_dttm AS event_dttm FROM adt_df
--UNION
--SELECT hospitalization_id, out_dttm AS event_dttm FROM adt_df
"""
all_timestamps = duckdb.sql(q).df()

In [19]:
# Augment the wide medication table on the unified timeline.
#   t3 - left-joins mac_w onto every timeline row and, for each column whose
#        name matches '_min', carries the last non-null value forward within
#        the hospitalization (ordered by event_dttm).
#   t4 - replaces the NULLs that remain (no earlier dose recorded) with 0.
#   t5 - computes `_nee` from the formula in the referenced paper and derives
#        the 0/1 flags: `_hemo_stable_by_nee` (_nee <= 0.2),
#        `_hemo_stable_by_abc` (dobutamine < 0.5 and milrinone = 0), and
#        `_hemo_stable` (both).
# NOTE(review): t5 names eight specific dose columns; they exist only if the
# pivot that built mac_w produced them — confirm each medication/unit
# combination appears in the data.
q = """
WITH t3 AS (
    FROM all_timestamps t
    LEFT JOIN mac_w m ON
        t.hospitalization_id = m.hospitalization_id
        AND t.event_dttm = m.admin_dttm
    SELECT t.hospitalization_id, t.event_dttm
        , LAST_VALUE(COLUMNS('_min') IGNORE NULLS) OVER (
            PARTITION BY t.hospitalization_id ORDER BY event_dttm
        )
), t4 AS (
    SELECT hospitalization_id, event_dttm
        , COALESCE(COLUMNS('_min'), 0)
    FROM t3
), t5 AS (
    SELECT *
        -- ref: https://doi.org/10.1016/j.jcrc.2020.11.002
        , _nee: norepinephrine_mcg_kg_min 
            + epinephrine_mcg_kg_min 
            + phenylephrine_mcg_kg_min / 10.0 
            + dopamine_mcg_kg_min / 100.0 
            + vasopressin_u_min * 2.5 
            + angiotensin_mcg_kg_min * 10
        , _hemo_stable_by_nee: CASE WHEN _nee <= 0.2 THEN 1 ELSE 0 END
        -- to cover the two vasos not in the formula: milrinone and dobutamine
        , _hemo_stable_by_abc: CASE WHEN dobutamine_mcg_kg_min < 0.5
            AND milrinone_mcg_kg_min = 0 THEN 1 ELSE 0 END
        , _hemo_stable: CASE WHEN _hemo_stable_by_nee AND _hemo_stable_by_abc THEN 1 ELSE 0 END
    FROM t4
)
SELECT *
FROM t5
ORDER BY hospitalization_id, event_dttm
"""
# a = augmented
mac_wa = duckdb.sql(q).df()

In [36]:
# Assemble the full per-timestamp view: respiratory settings, SpO2, and the
# augmented medication columns joined onto the unified timeline.
#   - `_spo2` carries the last non-null spo2 forward within a hospitalization.
#   - `_resp_stable` = fio2_set <= 0.5 AND peep_set <= 8 AND _spo2 >= 88.
#   - `_stable` = `_resp_stable` AND `_hemo_stable` (the latter comes from mac_wa).
#   - `sbt_eligible` = on 'imv', stable, and tracheostomy = 0.
# `m.*` also brings in mac_wa's own `hospitalization_id` and `event_dttm`
# next to the renamed `_hospitalization_id` / `_time` keys.
# NOTE(review): `resp_view` is not assigned in any cell shown above — confirm
# where it is defined.
# NOTE(review): `sbt_done` is selected unqualified; neither vitals_w nor
# mac_wa defines it in the cells shown, so it presumably comes from
# `resp_view` — confirm.
q = """
WITH t1 AS (
FROM all_timestamps t

LEFT JOIN resp_view r ON
    t.hospitalization_id = r.hospitalization_id
    AND t.event_dttm = r.recorded_dttm
LEFT JOIN vitals_w v ON
    t.hospitalization_id = v.hospitalization_id
    AND t.event_dttm = v.recorded_dttm
    AND v.spo2 IS NOT NULL
LEFT JOIN mac_wa m ON
    t.hospitalization_id = m.hospitalization_id
    AND t.event_dttm = m.event_dttm
SELECT _hospitalization_id: t.hospitalization_id
    , _time: t.event_dttm
    --, _nth_hr: ROW_NUMBER() OVER (PARTITION BY _hospitalization_id ORDER BY _time)
    , device_name, device_category
    , mode_name, mode_category
    , fio2_set
    , peep_set
    , pressure_support_set
    , tracheostomy
    , spo2
    , _spo2: LAST_VALUE(v.spo2 IGNORE NULLS) OVER (PARTITION BY _hospitalization_id ORDER BY _time)
    , _resp_stable: CASE
        WHEN fio2_set <= 0.5
            AND peep_set <= 8
            AND _spo2 >= 88
        THEN 1 ELSE 0 END
    , m.* 
    , _stable: CASE WHEN _resp_stable AND _hemo_stable THEN 1 ELSE 0 END
    , sbt_eligible: CASE WHEN device_category = 'imv' AND _stable = 1 AND tracheostomy = 0 
        THEN 1 ELSE 0 END
    , sbt_done
)
SELECT *
FROM t1
ORDER BY _hospitalization_id, _time
"""
full_view = duckdb.sql(q).df()

In [None]:
# Spot-check the eligibility / SBT flags for three hand-picked hospitalizations.
# Filter on `_hospitalization_id`, the timeline key that is also selected.
# The un-prefixed `hospitalization_id` exists in full_view only because `m.*`
# copies it from mac_wa, and it is NULL wherever that left join found no match.
q = """
SELECT _hospitalization_id
    , _time
    , sbt_eligible
    , sbt_done
    , device_category, device_name
    , mode_category, mode_name
    , fio2_set
    , peep_set
    , pressure_support_set
    , _spo2
FROM full_view
WHERE _hospitalization_id in ('20001361', '20004088', '20005024')
"""
check_view = duckdb.sql(q).df()