This notebook replaces the continuous med-only notebook and now contains the development code for building both the continuous and intermittent med admin tables -- since they are sourced from the same MIMIC-IV table, `inputevents`.

# Init

In [1]:
import os
os.chdir("..")
os.getcwd()

'/Users/wliao0504/code/clif/CLIF-MIMIC'

## Import

In [2]:
# src/tables/medication_admin.py
import numpy as np
import pandas as pd
import logging
from importlib import reload
import src.utils
reload(src.utils)
import duckdb
from src.utils import construct_mapper_dict, fetch_mimic_events, load_mapping_csv, \
    get_relevant_item_ids, find_duplicates, rename_and_reorder_cols, save_to_rclif, \
    convert_and_sort_datetime, setup_logging, search_mimic_items
from fuzzywuzzy import process

loaded configuration from /Users/wliao0504/code/clif/CLIF-MIMIC/src/../config/config.json
loaded configuration from /Users/wliao0504/code/clif/CLIF-MIMIC/src/../config/config.json




In [3]:
setup_logging()

MAC_COLUMNS = [
    "hospitalization_id", "med_order_id", "admin_dttm", "med_name", "med_category", "med_group", 
    "med_route_name", "med_route_category", "med_dose", "med_dose_unit", "mar_action_name", "mar_action_category"
]

MAC_COL_RENAME_MAPPER = {
    "dose": "med_dose",
    "rateuom": "med_dose_unit",
    "amountuom": "med_dose_unit",
    "new_mar": "mar_action_name", 
    "linkorderid": "med_order_id",
    "recorded_dttm": "admin_dttm"
}

MAC_MCIDE_URL = "https://raw.githubusercontent.com/clif-consortium/CLIF/main/mCIDE/clif_medication_admin_continuous_med_categories.csv"

def map_name_to_category(name, categories):
    '''
    Map a medication name to a category using fuzzy matching.
    '''
    match, score = process.extractOne(name, categories)
    return match if score >= 80 else None

def are_doses_close(doses):
    return (abs(doses.iloc[0] - doses.iloc[1]) / max(doses.iloc[0], doses.iloc[1])) <= 0.1

# drop the row with the shorter mar_action_name
def drop_shorter_action_name(group):
    if len(group) == 2 and are_doses_close(group['med_dose']):
        return group.loc[[group['mar_action_name'].str.len().idxmax()]]
    return group

2025-08-10 22:24:47,957 - INFO - initialized logging at logs/etl.log


# Dev

## Load

In [4]:
med_admin_mapping = load_mapping_csv("med_admin")

logging.info("parsing the mapping files to identify relevant items and fetch corresponding events...")
med_item_ids = get_relevant_item_ids(
    mapping_df = med_admin_mapping, 
    decision_col = "decision", 
    excluded_labels = ["NO MAPPING", "UNSURE", "NOT AVAILABLE"]
    ) 

med_events = fetch_mimic_events(med_item_ids).pipe(convert_and_sort_datetime)

med_events.head()

2025-08-10 22:24:49,987 - INFO - parsing the mapping files to identify relevant items and fetch corresponding events...
2025-08-10 22:24:49,989 - INFO - querying the d_items table to identify which event tables to be separately queried for 79 items
2025-08-10 22:24:49,993 - INFO - identified 1 event tables to be separately queried: ['inputevents']
2025-08-10 22:24:49,994 - INFO - fetching events from inputevents table for 79 items
2025-08-10 22:24:53,475 - INFO - fetched 4845165 events from inputevents table for 79 items
2025-08-10 22:24:53,475 - INFO - concatenated 4845165 events from 1 event table(s)


Unnamed: 0,index,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,...,originalamount,originalrate,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,0,14046553,20000094,35605481,19825,2150-03-02 15:25:00,2150-03-02 16:48:00,2150-03-02 15:26:00,221906,1.623472,...,8.0,0.18,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,,
1,1,14046553,20000094,35605481,19825,2150-03-02 16:34:00,2150-03-02 18:42:00,2150-03-02 16:35:00,220949,69.565216,...,250.0,32.6087,Dextrose 5%,Dextrose 5%,inputevents,Fluids/Intake,mL,Solution,,
2,2,14046553,20000094,35605481,19825,2150-03-02 16:34:00,2150-03-02 18:42:00,2150-03-02 16:35:00,221653,69.565211,...,250.0,5.0,Dobutamine,Dobutamine,inputevents,Medications,mg,Solution,,
3,3,14046553,20000094,35605481,40982,2150-03-02 16:48:00,2150-03-02 18:32:00,2150-03-02 17:00:00,221906,1.806973,...,6.376528,0.16,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,,
4,4,14046553,20000094,35605481,40982,2150-03-02 18:32:00,2150-03-02 18:45:00,2150-03-02 18:38:00,221906,0.253864,...,4.569556,0.18,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,,


## Relevant columns

In [5]:
print(med_events.value_counts("ordercomponenttypedescription"))
print("--- which does not seem relevant:\n")

ordercomponenttypedescription
Main order parameter                                                                                    3590523
Mixed solution                                                                                          1155586
Additives                                         Ampoule                                                 99056
Name: count, dtype: int64
--- which does not seem relevant:



In [6]:
med_events.value_counts('secondaryordercategoryname')
print("--- which does not seem relevant:\n")

--- which does not seem relevant:



In [7]:
find_intm_where_clause = """
ordercategoryname = '05-Med Bolus'
    OR ordercategorydescription = 'Drug Push'
    OR statusdescription = 'Bolus'
"""

In [8]:
query = f"""
SELECT subject_id, hadm_id
    , starttime, endtime --, storetime
    , linkorderid
    , statusdescription
    , med_category
    , rate, rateuom
    , amount, amountuom
    , patientweight
    , totalamount, totalamountuom, originalamount, originalrate
    , ordercategoryname, ordercategorydescription
    , itemid
    , e.label
    , m.decision AS item_class
    , CASE WHEN (endtime - starttime) = INTERVAL '1 minute'
        THEN 1 ELSE 0 END AS duration_1min
    -- flags to identify intermittents
    , CASE WHEN ordercategoryname = '05-Med Bolus'
        THEN 1 ELSE 0 END AS intm_by_ordercategoryname
    , CASE WHEN ordercategorydescription = 'Drug Push'
        THEN 1 ELSE 0 END AS intm_by_ordercategorydescription
    , CASE WHEN statusdescription = 'Bolus'
        THEN 1 ELSE 0 END AS intm_by_statusdescription
    , CASE WHEN item_class = 'INTERMITTENT'
        OR (item_class = 'BOTH' AND ({find_intm_where_clause}))
        THEN 'intm' ELSE 'cont' END AS "to_table"
FROM med_events e
LEFT JOIN med_admin_mapping m USING (itemid)
ORDER BY hadm_id, starttime, linkorderid, med_category, endtime
"""
med_selected_and_mapped = duckdb.sql(query).df()

In [9]:
print("note that there is a 'Bolus' category under statusdescription, which is the MAR action")
med_selected_and_mapped.value_counts("statusdescription")

note that there is a 'Bolus' category under statusdescription, which is the MAR action


statusdescription
FinishedRunning    2639783
ChangeDose/Rate    1722437
Stopped             265600
Paused              215504
Bolus                 2595
Name: count, dtype: int64

## Split continuous and intermittents

In [None]:
query = f"""
SELECT *
FROM med_selected_and_mapped
WHERE item_class = 'CONTINUOUS'
    OR (item_class = 'BOTH' AND NOT ({find_intm_where_clause}))
"""
cont_only = duckdb.sql(query).df()

## Remove intermittents

First identify the intermittents and observe their pattern.

In [None]:
query = f"""
SELECT *
    -- flags to identify intermittents
    , CASE WHEN ordercategoryname = '05-Med Bolus'
        THEN 1 ELSE 0 END AS intm_by_ordercategoryname
    , CASE WHEN ordercategorydescription = 'Drug Push'
        THEN 1 ELSE 0 END AS intm_by_ordercategorydescription
    , CASE WHEN statusdescription = 'Bolus'
        THEN 1 ELSE 0 END AS intm_by_statusdescription
    -- flags to count discrepencies, i.e. if all of the ostensible intermittents satisfy the 1min duration rule
    , CASE WHEN intm_by_ordercategorydescription = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_ordercategorydescription
    , CASE WHEN intm_by_ordercategoryname = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_ordercategoryname
    , CASE WHEN intm_by_statusdescription = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_statusdescription
FROM mac_selected_and_mapped
WHERE {find_intm_where_clause}

"""
mac_intm = duckdb.sql(query).df()

print(f"# of intermittents identified through ordercategoryname that failed the 1-min rule: {mac_intm.not_1min_ordercategoryname.sum()}")
print(f"# of intermittents identified through ordercategorydescription that failed the 1-min rule: {mac_intm.not_1min_ordercategorydescription.sum()}")
print(f"# of intermittents identified through statusdescription that failed the 1-min rule: {mac_intm.not_1min_statusdescription.sum()}")

# of intermittents identified through ordercategoryname that failed the 1-min rule: 0
# of intermittents identified through ordercategorydescription that failed the 1-min rule: 0
# of intermittents identified through statusdescription that failed the 1-min rule: 1262


From the result it seems `intm_by_statusdescription` is particularly messy, while the other two can be used to safety identify intermittents.

In [53]:
query = """
SELECT *
FROM mac_intm
WHERE not_1min_statusdescription = 1
"""
df = duckdb.sql(query).df()

In [54]:
# check if any of continuous events has 1-min duration
query = f"""
SELECT *
FROM mac_selected_and_mapped
WHERE NOT ({find_intm_where_clause})
"""
intermittent_removed = duckdb.sql(query).df()

In [55]:
query = f"""
SELECT *
FROM intermittent_removed
WHERE duration_1min = 1
"""
cont_yet_1min = duckdb.sql(query).df()

## Resolve duplicates

In [60]:
query = """
SELECT *
FROM intermittent_removed
QUALIFY COUNT(*) OVER (PARTITION BY hadm_id, med_category, starttime, statusdescription) > 1
"""
mac_dups = duckdb.sql(query).df()

In [None]:
# final check there is no dup for god's sake
# mac_ldfd.duplicated(subset=meds_keycols, keep=False).sum()

SyntaxError: invalid syntax (434724452.py, line 2)

## MAR

In [10]:
query = """
SELECT subject_id, hadm_id
    , linkorderid
    , starttime, endtime 
    , LEAD(starttime) OVER (PARTITION BY hadm_id, linkorderid, med_category ORDER BY starttime) AS starttime_next
    , ROW_NUMBER() OVER (PARTITION BY hadm_id, linkorderid, med_category ORDER BY starttime) AS rn
    , starttime = MIN(starttime) OVER (PARTITION BY hadm_id, linkorderid, med_category) AS is_first_row
    , starttime = MAX(starttime) OVER (PARTITION BY hadm_id, linkorderid, med_category) AS is_last_row
    , statusdescription
    , LAG(statusdescription) OVER (PARTITION BY hadm_id, linkorderid, med_category ORDER BY starttime) AS statusdescription_prev
    , med_category
    , to_table
    , rate, rateuom
    , amount, amountuom
    , patientweight
    --, totalamount, totalamountuom, originalamount, originalrate
    , ordercategoryname, ordercategorydescription
    , itemid
    , label
    , item_class
FROM med_selected_and_mapped
ORDER BY hadm_id, linkorderid, med_category, starttime, endtime
"""
med_lagged = duckdb.sql(query).df()

In [11]:
query = """
SELECT hadm_id as hospitalization_id
    , linkorderid as med_order_id
    , label as med_name
    , med_category
    , starttime AS admin_dttm
    , CASE WHEN is_first_row = 1 THEN 'start' ELSE statusdescription_prev END AS mar_action_name
    , rate AS med_dose
    , rateuom as med_dose_unit
FROM med_lagged
UNION ALL

SELECT hadm_id as hospitalization_id
    , linkorderid as med_order_id
    , label as med_name
    , med_category
    , endtime AS admin_dttm
    , statusdescription AS mar_action_name
    , 0 AS med_dose
    , rateuom as med_dose_unit
FROM med_lagged
WHERE is_last_row = 1 
ORDER BY hadm_id, linkorderid, med_category, admin_dttm
"""
admin_time_flattened = duckdb.sql(query).df()

In [12]:
# pivot longer and merge the starttime and endtime into a single column
med_l = med_selected_and_mapped.melt(
    id_vars = [
        "hadm_id", "itemid", "index", "rate", "rateuom", # "amount", "amountuom", 
        "statusdescription", "linkorderid", "label"],
    value_vars = ["starttime", "endtime"],
    var_name = "time", value_name = "recorded_dttm"
).sort_values(["hadm_id", "itemid", "index", "time"], ascending = [True, True, True, False])

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['index']"

In [None]:
med_l["diff"] = med_l.groupby(['hadm_id', 'itemid'])[['recorded_dttm']].transform("diff")
med_l['mar'] = np.where(med_l['time'] == 'starttime', 'start', med_l['statusdescription'])
med_l['dose'] = np.where(med_l['time'] == 'starttime', med_l['rate'], np.nan)
# mac_l['dose'] = np.where(mac_l['time'] == 'starttime', mac_l['amount'], np.nan)
med_l['last_mar'] = med_l['mar'].shift(1)

med_l['new_mar'] = np.where(
    med_l['diff'] == pd.Timedelta(0),
    med_l['last_mar'].apply(lambda x: f"continue after {x}"),
    med_l['mar']
)

# removing duplicates by filter out rows with NA "dose"
med_l['time_dup'] = med_l.duplicated(["hadm_id", "itemid", "recorded_dttm"], keep = False)
med_l['keep'] = (~med_l["time_dup"]) | pd.notna(med_l["dose"])
mac_ld = med_l[med_l['keep']].copy()
# mac_ld["med_name"] = mac_ld["itemid"].map(mac_id_to_name_mapper)
mac_ld["med_category"] = mac_ld["itemid"].map(mac_mapper)
mac_ld["med_group"] = mac_ld["med_category"].map(mac_category_to_group_mapper)
mac_ldf = rename_and_reorder_cols(mac_ld, MAC_COL_RENAME_MAPPER, MAC_COL_NAMES)