This notebook replaces the continuous med-only notebook and now contains the development code for building both the continuous and intermittent med admin tables -- since they are sourced from the same MIMIC-IV table, `inputevents`.

# Init

In [1]:
import os
os.chdir("..")
os.getcwd()

'/Users/wliao0504/code/clif/CLIF-MIMIC'

## Import

In [2]:
# src/tables/medication_admin.py
import numpy as np
import pandas as pd
import logging
from importlib import reload
import src.utils
reload(src.utils)
import duckdb
reload(duckdb)
from src.utils import construct_mapper_dict, fetch_mimic_events, load_mapping_csv, \
    get_relevant_item_ids, find_duplicates, rename_and_reorder_cols, save_to_rclif, \
    convert_and_sort_datetime, setup_logging, search_mimic_items
# from fuzzywuzzy import process
import src.tables.medication_admin as med
reload(med)

2025-09-11 17:44:39,685 - INFO - initialized logging at logs/etl.log
2025-09-11 17:44:39,685 - INFO - initialized logging at logs/etl.log


loaded configuration from /Users/wliao0504/code/clif/CLIF-MIMIC/src/../config/config.json
loaded configuration from /Users/wliao0504/code/clif/CLIF-MIMIC/src/../config/config.json


<module 'src.tables.medication_admin' from '/Users/wliao0504/code/clif/CLIF-MIMIC/src/tables/medication_admin.py'>

In [3]:
duckdb_version = duckdb.__version__
print(f"DuckDB version: {duckdb_version}")

DuckDB version: 1.3.2


In [4]:
setup_logging()

MAC_COLUMNS = [
    "hospitalization_id", "med_order_id", "admin_dttm", "med_name", "med_category", "med_group", 
    "med_route_name", "med_route_category", "med_dose", "med_dose_unit", "mar_action_name", "mar_action_category"
]

MAC_COL_RENAME_MAPPER = {
    "dose": "med_dose",
    "rateuom": "med_dose_unit",
    "amountuom": "med_dose_unit",
    "new_mar": "mar_action_name", 
    "linkorderid": "med_order_id",
    "recorded_dttm": "admin_dttm"
}

MAC_MCIDE_URL = "https://raw.githubusercontent.com/clif-consortium/CLIF/main/mCIDE/clif_medication_admin_continuous_med_categories.csv"

def map_name_to_category(name, categories):
    '''
    Map a medication name to a category using fuzzy matching.
    '''
    match, score = process.extractOne(name, categories)
    return match if score >= 80 else None

def are_doses_close(doses):
    return (abs(doses.iloc[0] - doses.iloc[1]) / max(doses.iloc[0], doses.iloc[1])) <= 0.1

# drop the row with the shorter mar_action_name
def drop_shorter_action_name(group):
    if len(group) == 2 and are_doses_close(group['med_dose']):
        return group.loc[[group['mar_action_name'].str.len().idxmax()]]
    return group

2025-09-11 17:44:39,696 - INFO - initialized logging at logs/etl.log


In [5]:
pt_demo = pd.read_parquet('tests/clif_patient.parquet')

# Query MIMIC items

## Intm

In [None]:
# Load medication categories from GitHub
intm_mcide_df = pd.read_csv("https://raw.githubusercontent.com/clif-consortium/CLIF/main/mCIDE/medication_admin_intermittent/clif_medication_admin_intermittent_med_categories.csv")

list_of_meds_to_query = intm_mcide_df.med_category.unique()

list_of_dfs = [search_mimic_items(med) for med in list_of_meds_to_query]

# Merge all dataframes on itemid
meds_found = pd.concat(list_of_dfs, ignore_index=True)

2025-09-11 11:42:28,658 - INFO - searching for items with keyword 'acetaminophen' in column 'label' with case sensitive = False.
2025-09-11 11:42:28,661 - INFO - identified 1 event tables to be separately queried: ['inputevents']
2025-09-11 11:42:28,807 - INFO - Found and concatenated 1 items from across 1 event table(s)
2025-09-11 11:42:28,807 - INFO - searching for items with keyword 'acyclovir' in column 'label' with case sensitive = False.
2025-09-11 11:42:28,809 - INFO - identified 1 event tables to be separately queried: ['inputevents']
2025-09-11 11:42:28,897 - INFO - Found and concatenated 1 items from across 1 event table(s)
2025-09-11 11:42:28,898 - INFO - searching for items with keyword 'alteplase' in column 'label' with case sensitive = False.
2025-09-11 11:42:28,900 - INFO - identified 1 event tables to be separately queried: ['inputevents']
2025-09-11 11:42:28,994 - INFO - Found and concatenated 1 items from across 1 event table(s)
2025-09-11 11:42:28,995 - INFO - search

In [159]:
# manually check some
df = search_mimic_items("plasma")
df

2025-09-11 17:06:47,258 - INFO - searching for items with keyword 'plasma' in column 'label' with case sensitive = False.
2025-09-11 17:06:47,264 - INFO - identified 2 event tables to be separately queried: ['inputevents', 'procedureevents']
2025-09-11 17:06:47,619 - INFO - Found and concatenated 4 items from across 2 event table(s)


Unnamed: 0,kw,itemid,label,abbreviation,linksto,category,unitname,param_type,count,value_instances,amountuom_instances,rateuom_instances,ordercategoryname_instances,secondaryordercategoryname_instances,ordercategorydescription_instances
0,plasma,220970,Fresh Frozen Plasma,FFP,inputevents,Blood Products/Colloids,mL,Solution,18828,"Rate: 0.84, 318.0, 35640.0; Amount: 0.14, 300....",mL: 18828,"mL/hour: 18797, mL/min: 31",07-Blood Products: 18828,,Continuous IV: 18828
0,plasma,227551,Plasma Pheresis.,Plasma Pheresis.,procedureevents,4-Procedures,,Processes,330,"Min: 0.05416666666666667, Mean: 700.66, Max: 1...",,,,,
1,plasma,227532,Plasma Pheresis,Plasma Pheresis,inputevents,Blood Products/Colloids,mL,Solution,146,"Rate: 20.13, 386.5, 2093.33; Amount: 19.0, 299...",mL: 146,"mL/hour: 143, mL/min: 3",07-Blood Products: 146,,Continuous IV: 146
2,plasma,220971,ESDEP (Solvent / Detergent Virus-Inactivated P...,ESDEP,inputevents,Fluids - Other (Not In Use),mL,Solution,1,"Rate: , , ; Amount: , ,",,,,,


## Cont

In [137]:
med_admin_mapping = load_mapping_csv("med_admin")
current_meds = med_admin_mapping['med_category'].unique()
latest_cont_meds = pd.read_csv("https://raw.githubusercontent.com/Common-Longitudinal-ICU-data-Format/CLIF/refs/heads/main/mCIDE/medication_admin_continuous/clif_medication_admin_continuous_med_categories.csv")
new_cont_meds = set(latest_cont_meds.med_category.unique()) - set(current_meds)
new_cont_meds

{'albumin_infusion',
 'albuterol',
 'alprostadil',
 'baclofen',
 'bivalirudin',
 'bupivacaine',
 'cangrelor',
 'cosyntropin',
 'dextrose_in_water_d5w',
 'ipratropium',
 'isoproterenol',
 'lactated_ringers_solution',
 'liothyronine',
 'nitric_oxide',
 'nitroglycerin',
 'oxytocin',
 'papaverine',
 'phentolamine',
 'pitocin',
 'plasma_lyte',
 'remifentanil',
 'repletion',
 'ropivacaine',
 'sodium chloride',
 'tacrolimus',
 'terbutaline',
 'thyroid replacement',
 'tocolyttics',
 'torsemide',
 'zidovudine'}

In [139]:
list_of_dfs_cont = [search_mimic_items(med) for med in new_cont_meds]

# Merge all dataframes on itemid
cont_meds_found = pd.concat(list_of_dfs_cont, ignore_index=True)

2025-09-11 16:02:54,107 - INFO - searching for items with keyword 'terbutaline' in column 'label' with case sensitive = False.
2025-09-11 16:02:54,112 - INFO - searching for items with keyword 'zidovudine' in column 'label' with case sensitive = False.
2025-09-11 16:02:54,114 - INFO - searching for items with keyword 'tocolyttics' in column 'label' with case sensitive = False.
2025-09-11 16:02:54,116 - INFO - searching for items with keyword 'cangrelor' in column 'label' with case sensitive = False.
2025-09-11 16:02:54,119 - INFO - searching for items with keyword 'dextrose_in_water_d5w' in column 'label' with case sensitive = False.
2025-09-11 16:02:54,121 - INFO - searching for items with keyword 'ropivacaine' in column 'label' with case sensitive = False.
2025-09-11 16:02:54,124 - INFO - searching for items with keyword 'nitroglycerin' in column 'label' with case sensitive = False.
2025-09-11 16:02:54,126 - INFO - identified 1 event tables to be separately queried: ['inputevents']
2

# ETL

## Load

In [37]:
med_admin_mapping = load_mapping_csv("med_admin")

logging.info("parsing the mapping files to identify relevant items and fetch corresponding events...")
med_item_ids = get_relevant_item_ids(
    mapping_df = med_admin_mapping, 
    decision_col = "decision", 
    excluded_labels = ["NO MAPPING", "UNSURE", "NOT AVAILABLE"]
    ) 

med_events = fetch_mimic_events(med_item_ids).pipe(convert_and_sort_datetime)

2025-09-11 18:33:19,757 - INFO - parsing the mapping files to identify relevant items and fetch corresponding events...
2025-09-11 18:33:19,760 - INFO - querying the d_items table to identify which event tables to be separately queried for 123 items
2025-09-11 18:33:19,765 - INFO - identified 1 event tables to be separately queried: ['inputevents']
2025-09-11 18:33:19,766 - INFO - fetching events from inputevents table for 123 items
2025-09-11 18:33:28,005 - INFO - fetched 7261761 events from inputevents table for 123 items
2025-09-11 18:33:28,008 - INFO - concatenated 7261761 events from 1 event table(s)


In [28]:
q = """
SELECT med_category, CAST(itemid AS INT) AS itemid, decision
    , COUNT(*) OVER (PARTITION BY itemid) AS count
FROM med_admin_mapping
WHERE decision in ('BOTH', 'CONTINUOUS', 'INTERMITTENT')
ORDER BY count DESC
"""
med_admin_mapping_c = duckdb.sql(q).df()
med_admin_mapping_c

Unnamed: 0,med_category,itemid,decision,count
0,dextrose,220949,BOTH,2
1,dextrose_in_water_d5w,220949,BOTH,2
2,narcan,222021,BOTH,2
3,naloxone,222021,BOTH,2
4,albumin,220864,CONTINUOUS,1
...,...,...,...,...
120,fluconazole,225869,INTERMITTENT,1
121,piperacillin_tazobactam,225893,INTERMITTENT,1
122,tobramycin,225902,INTERMITTENT,1
123,furosemide,228340,CONTINUOUS,1


## Relevant columns

In [12]:
print(med_events.value_counts("ordercomponenttypedescription"))
print("\n--- which does not seem relevant")

ordercomponenttypedescription
Main order parameter                                                                                    4850627
Mixed solution                                                                                          2312078
Additives                                         Ampoule                                                 99056
Name: count, dtype: int64

--- which does not seem relevant


In [13]:
med_events.value_counts('secondaryordercategoryname')
print("\n--- which does not seem relevant")


--- which does not seem relevant


In [14]:
find_intm_where_clause = """
ordercategoryname = '05-Med Bolus'
    OR ordercategorydescription = 'Drug Push'
    OR statusdescription = 'Bolus'
"""

In [None]:
query = f"""
SELECT subject_id, hadm_id
    , starttime, endtime --, storetime
    , linkorderid
    , statusdescription
    , med_category
    , rate, rateuom
    , amount, amountuom
    , patientweight
    , totalamount, totalamountuom, originalamount, originalrate
    , ordercategoryname, ordercategorydescription
    , e.itemid
    , e.label
    , m.decision AS item_class
    , CASE WHEN (endtime - starttime) = INTERVAL '1 minute'
        THEN 1 ELSE 0 END AS duration_1min
    -- flags to identify intermittents
    , CASE WHEN ordercategoryname = '05-Med Bolus'
        THEN 1 ELSE 0 END AS intm_by_ordercategoryname
    , CASE WHEN ordercategorydescription = 'Drug Push'
        THEN 1 ELSE 0 END AS intm_by_ordercategorydescription
    , CASE WHEN statusdescription = 'Bolus'
        THEN 1 ELSE 0 END AS intm_by_statusdescription
    , CASE WHEN item_class = 'INTERMITTENT'
        OR (item_class = 'BOTH' AND ({find_intm_where_clause}))
        THEN 'intm' ELSE 'cont' END AS "to_table"
FROM med_events e
LEFT JOIN med_admin_mapping_c m
    ON e.itemid = m.itemid
    AND m.decision IN ('BOTH', 'CONTINUOUS', 'INTERMITTENT')
ORDER BY hadm_id, starttime, linkorderid, med_category, endtime
"""
med_selected_and_mapped = duckdb.sql(query).df()

In [None]:
# RESUME as we have confirmed the working of left join 
# we can proceed

In [16]:
print("note that there is a 'Bolus' category under statusdescription, which is the MAR action")
med_selected_and_mapped.value_counts("statusdescription")

note that there is a 'Bolus' category under statusdescription, which is the MAR action


statusdescription
FinishedRunning    4679612
ChangeDose/Rate    2942815
Stopped             496306
Paused              374954
Bolus                 3645
Reported                 1
Name: count, dtype: int64

## Split continuous and intermittents

In [12]:
query = f"""
SELECT *
FROM med_selected_and_mapped
WHERE item_class = 'CONTINUOUS'
    OR (item_class = 'BOTH' AND NOT ({find_intm_where_clause}))
ORDER BY hadm_id, med_category, starttime, endtime, linkorderid
"""
cont_only = duckdb.sql(query).df()

## Process continous

### Check missing dose `rate`

In [None]:
#RESUME
query = """
SELECT *
    , EXTRACT(EPOCH FROM (endtime - starttime)) / 60 AS duration_in_mins
    , amount / duration_in_mins AS rate_imputed
    , CONCAT(amountuom, '/min') AS rateuom_imputed
    , ABS(originalrate - rate_imputed) AS abs_diff
    , abs_diff / ((originalrate + rate_imputed) / 2) AS rel_diff
    , CASE WHEN rel_diff <= 0.01 OR abs_diff <= 0.000000001 THEN 1 ELSE 0 END AS close_rates
    , COALESCE(rate, rate_imputed) AS _rate
    , COALESCE(rateuom, rateuom_imputed) AS _rateuom
FROM cont_only
WHERE rate IS NULL
"""
cont_rate_missing = duckdb.sql(query).df()
print(f"Overall there are {cont_rate_missing.shape[0]} ({cont_rate_missing.shape[0]/cont_only.shape[0] * 100:.1f}%) rows with missing `rate`.")

mask = cont_rate_missing['close_rates'] == 0
print(f"Only {mask.mean() * 100:.1f}% ({mask.sum()}) of the imputed rates (using amount divided by duration) deviate from the `originalrate`.")
print("It seeems safe to use the imputed rates to fill the missing `rate`.")

mask2 = (cont_rate_missing['close_rates'] == 0) & (cont_rate_missing['originalrate'] == 0)
print(f"And {mask2.sum()/mask.sum() * 100:.1f}% ({mask2.sum()}) of the {mask.sum()} deviated cases are caused by a zero `originalrate`, which is odd and does not seem valid.")
cont_rate_missing.value_counts('med_category')

Overall there are 101059 (3.3%) rows with missing `rate`.
Only 7.5% (7582) of the imputed rates (using amount divided by duration) deviate from the `originalrate`.
It seeems safe to use the imputed rates to fill the missing `rate`.
And 98.2% (7442) of the 7582 deviated cases are caused by a zero `originalrate`, which is odd and does not seem valid.


med_category
magnesium             82477
sodium bicarbonate    10203
dextrose               6047
insulin                2245
heparin                  87
Name: count, dtype: int64

### Flatten start-end timestamps to admin timestamps

In [14]:
# check monotonicity
q = """
SELECT *
    , LAG(endtime) OVER (PARTITION BY hadm_id, med_category, linkorderid ORDER BY starttime) AS endtime_prev
    , starttime < endtime_prev AS crossover
FROM cont_only
QUALIFY crossover is True
-- ORDER BY hadm_id, med_category, starttime, endtime, linkorderid
ORDER BY hadm_id, starttime, endtime, linkorderid, med_category
"""
crossovers = duckdb.sql(q).df()
print(f"given there are only {len(crossovers)} cases, we might consider dropping them all")

given there are only 20 cases, we might consider dropping them all


In [15]:
cont_lagged = med._prepare_for_timestamp_flattening(cont_only)

In [16]:
# Check pause #
# find demo data to test
query = """
SELECT *    
FROM cont_lagged
WHERE statusdescription = 'FinishedRunning'
    AND is_last_row = 0
    AND is_first_row = 0
    -- AND subject_id IN (SELECT DISTINCT CAST(patient_id AS INT) FROM pt_demo)
"""
df = duckdb.sql(query).df()

- 'Stopped' for the full sample (i.e. not just the MIMIC demo) would yield zero row, meaning it is not an issue.
- 'Paused' and 'FinishedRunning' both yield results, meaning these are cases we need to hanlde. Unfortunately there are qualifying rows for 'FinishedRunning' in the demo data.

In [47]:
cont_flattened_staging = med._flatten_timestamps_staging(cont_only, "rate")
cont_flattened = med._flatten_timestamps(cont_flattened_staging, "rate")

In [None]:
q = """
SELECT *
    , CASE WHEN med_dose IS NULL THEN 1 ELSE 0 END AS dose_missing
FROM cont_flattened
WHERE mar_action_name = 'Paused' 
    AND subject_id IN (
        SELECT DISTINCT CAST(patient_id AS INT)
        FROM pt_demo
    )
"""
df2 = duckdb.sql(q).df()

In [None]:
longer = cont_only.query("linkorderid == 294375").melt(
    id_vars = [
        "hadm_id", "linkorderid", "med_category", 'rate', 'rateuom', 'statusdescription'
    ],
    value_vars = ["starttime", "endtime"],
    var_name = "time", value_name = "admin_dttm"
).sort_values(["hadm_id", "linkorderid", "med_category", "admin_dttm", "time"])

## Intermittents (Archive)
This part needs updating as it was built to generate cont only without considering keeping intm.

First identify the intermittents and observe their pattern.

In [None]:
query = f"""
SELECT *
    -- flags to identify intermittents
    , CASE WHEN ordercategoryname = '05-Med Bolus'
        THEN 1 ELSE 0 END AS intm_by_ordercategoryname
    , CASE WHEN ordercategorydescription = 'Drug Push'
        THEN 1 ELSE 0 END AS intm_by_ordercategorydescription
    , CASE WHEN statusdescription = 'Bolus'
        THEN 1 ELSE 0 END AS intm_by_statusdescription
    -- flags to count discrepencies, i.e. if all of the ostensible intermittents satisfy the 1min duration rule
    , CASE WHEN intm_by_ordercategorydescription = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_ordercategorydescription
    , CASE WHEN intm_by_ordercategoryname = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_ordercategoryname
    , CASE WHEN intm_by_statusdescription = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_statusdescription
FROM med_selected_and_mapped
WHERE {find_intm_where_clause}

"""
intm_only = duckdb.sql(query).df()

print(f"# of intermittents identified through ordercategoryname that failed the 1-min rule: {intm_only.not_1min_ordercategoryname.sum()}")
print(f"# of intermittents identified through ordercategorydescription that failed the 1-min rule: {intm_only.not_1min_ordercategorydescription.sum()}")
print(f"# of intermittents identified through statusdescription that failed the 1-min rule: {intm_only.not_1min_statusdescription.sum()}")

# of intermittents identified through ordercategoryname that failed the 1-min rule: 0
# of intermittents identified through ordercategorydescription that failed the 1-min rule: 0
# of intermittents identified through statusdescription that failed the 1-min rule: 1249


From the result it seems `intm_by_statusdescription` is particularly messy, while the other two can be used to safety identify intermittents.

In [None]:
query = """
SELECT *
FROM intm_only
WHERE not_1min_statusdescription = 1
"""
df = duckdb.sql(query).df()

In [None]:
# check if any of continuous events has 1-min duration # FIXME: possibly outdated
query = f"""
SELECT *
FROM med_selected_and_mapped
WHERE NOT ({find_intm_where_clause})
"""
intermittent_removed = duckdb.sql(query).df()

In [None]:
query = f"""
SELECT *
FROM intermittent_removed
WHERE duration_1min = 1
"""
cont_yet_1min = duckdb.sql(query).df()

## Resolve duplicates
This is possibly also outdated as we should now consider duplicate using the linearized time.

In [None]:
query = """
SELECT *
FROM cont_only
QUALIFY COUNT(*) OVER (PARTITION BY hadm_id, med_category, starttime, statusdescription) > 1
"""
mac_dups = duckdb.sql(query).df()

In [None]:
# final check there is no dup for god's sake
# mac_ldfd.duplicated(subset=meds_keycols, keep=False).sum()