This notebook replaces the continuous med-only notebook and now contains the development code for building both the continuous and intermittent med admin tables -- since they are sourced from the same MIMIC-IV table, `inputevents`.

# Init

In [1]:
import os
os.chdir("..")
os.getcwd()

'/Users/wliao0504/code/clif/CLIF-MIMIC'

## Import

In [45]:
# src/tables/medication_admin.py
import numpy as np
import pandas as pd
import logging
from importlib import reload
import src.utils
reload(src.utils)
import duckdb
from src.utils import construct_mapper_dict, fetch_mimic_events, load_mapping_csv, \
    get_relevant_item_ids, find_duplicates, rename_and_reorder_cols, save_to_rclif, \
    convert_and_sort_datetime, setup_logging, search_mimic_items
from fuzzywuzzy import process
import src.tables.medication_admin as med
reload(med)

2025-08-20 17:21:53,446 - INFO - initialized logging at logs/etl.log


loaded configuration from /Users/wliao0504/code/clif/CLIF-MIMIC/src/../config/config.json


<module 'src.tables.medication_admin' from '/Users/wliao0504/code/clif/CLIF-MIMIC/src/tables/medication_admin.py'>

In [46]:
setup_logging()

MAC_COLUMNS = [
    "hospitalization_id", "med_order_id", "admin_dttm", "med_name", "med_category", "med_group", 
    "med_route_name", "med_route_category", "med_dose", "med_dose_unit", "mar_action_name", "mar_action_category"
]

MAC_COL_RENAME_MAPPER = {
    "dose": "med_dose",
    "rateuom": "med_dose_unit",
    "amountuom": "med_dose_unit",
    "new_mar": "mar_action_name", 
    "linkorderid": "med_order_id",
    "recorded_dttm": "admin_dttm"
}

MAC_MCIDE_URL = "https://raw.githubusercontent.com/clif-consortium/CLIF/main/mCIDE/clif_medication_admin_continuous_med_categories.csv"

def map_name_to_category(name, categories):
    '''
    Map a medication name to a category using fuzzy matching.
    '''
    match, score = process.extractOne(name, categories)
    return match if score >= 80 else None

def are_doses_close(doses):
    return (abs(doses.iloc[0] - doses.iloc[1]) / max(doses.iloc[0], doses.iloc[1])) <= 0.1

# drop the row with the shorter mar_action_name
def drop_shorter_action_name(group):
    if len(group) == 2 and are_doses_close(group['med_dose']):
        return group.loc[[group['mar_action_name'].str.len().idxmax()]]
    return group

2025-08-20 17:21:53,455 - INFO - initialized logging at logs/etl.log


In [47]:
pt_demo = pd.read_parquet('tests/clif_patient.parquet')

# Dev

## Load

In [5]:
med_admin_mapping = load_mapping_csv("med_admin")

logging.info("parsing the mapping files to identify relevant items and fetch corresponding events...")
med_item_ids = get_relevant_item_ids(
    mapping_df = med_admin_mapping, 
    decision_col = "decision", 
    excluded_labels = ["NO MAPPING", "UNSURE", "NOT AVAILABLE"]
    ) 

med_events = fetch_mimic_events(med_item_ids).pipe(convert_and_sort_datetime)

med_events.head()

2025-08-20 16:22:59,609 - INFO - parsing the mapping files to identify relevant items and fetch corresponding events...
2025-08-20 16:22:59,610 - INFO - querying the d_items table to identify which event tables to be separately queried for 79 items
2025-08-20 16:22:59,613 - INFO - identified 1 event tables to be separately queried: ['inputevents']
2025-08-20 16:22:59,613 - INFO - fetching events from inputevents table for 79 items
2025-08-20 16:23:03,086 - INFO - fetched 4845165 events from inputevents table for 79 items
2025-08-20 16:23:03,087 - INFO - concatenated 4845165 events from 1 event table(s)


Unnamed: 0,index,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,...,originalamount,originalrate,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,0,14046553,20000094,35605481,19825,2150-03-02 15:25:00,2150-03-02 16:48:00,2150-03-02 15:26:00,221906,1.623472,...,8.0,0.18,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,,
1,1,14046553,20000094,35605481,19825,2150-03-02 16:34:00,2150-03-02 18:42:00,2150-03-02 16:35:00,220949,69.565216,...,250.0,32.6087,Dextrose 5%,Dextrose 5%,inputevents,Fluids/Intake,mL,Solution,,
2,2,14046553,20000094,35605481,19825,2150-03-02 16:34:00,2150-03-02 18:42:00,2150-03-02 16:35:00,221653,69.565211,...,250.0,5.0,Dobutamine,Dobutamine,inputevents,Medications,mg,Solution,,
3,3,14046553,20000094,35605481,40982,2150-03-02 16:48:00,2150-03-02 18:32:00,2150-03-02 17:00:00,221906,1.806973,...,6.376528,0.16,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,,
4,4,14046553,20000094,35605481,40982,2150-03-02 18:32:00,2150-03-02 18:45:00,2150-03-02 18:38:00,221906,0.253864,...,4.569556,0.18,Norepinephrine,Norepinephrine,inputevents,Medications,mg,Solution,,


## Relevant columns

In [6]:
print(med_events.value_counts("ordercomponenttypedescription"))
print("\n--- which does not seem relevant")

ordercomponenttypedescription
Main order parameter                                                                                    3590523
Mixed solution                                                                                          1155586
Additives                                         Ampoule                                                 99056
Name: count, dtype: int64

--- which does not seem relevant


In [7]:
med_events.value_counts('secondaryordercategoryname')
print("\n--- which does not seem relevant")


--- which does not seem relevant


In [8]:
find_intm_where_clause = """
ordercategoryname = '05-Med Bolus'
    OR ordercategorydescription = 'Drug Push'
    OR statusdescription = 'Bolus'
"""

In [9]:
query = f"""
SELECT subject_id, hadm_id
    , starttime, endtime --, storetime
    , linkorderid
    , statusdescription
    , med_category
    , rate, rateuom
    , amount, amountuom
    , patientweight
    , totalamount, totalamountuom, originalamount, originalrate
    , ordercategoryname, ordercategorydescription
    , itemid
    , e.label
    , m.decision AS item_class
    , CASE WHEN (endtime - starttime) = INTERVAL '1 minute'
        THEN 1 ELSE 0 END AS duration_1min
    -- flags to identify intermittents
    , CASE WHEN ordercategoryname = '05-Med Bolus'
        THEN 1 ELSE 0 END AS intm_by_ordercategoryname
    , CASE WHEN ordercategorydescription = 'Drug Push'
        THEN 1 ELSE 0 END AS intm_by_ordercategorydescription
    , CASE WHEN statusdescription = 'Bolus'
        THEN 1 ELSE 0 END AS intm_by_statusdescription
    , CASE WHEN item_class = 'INTERMITTENT'
        OR (item_class = 'BOTH' AND ({find_intm_where_clause}))
        THEN 'intm' ELSE 'cont' END AS "to_table"
FROM med_events e
LEFT JOIN med_admin_mapping m USING (itemid)
ORDER BY hadm_id, starttime, linkorderid, med_category, endtime
"""
med_selected_and_mapped = duckdb.sql(query).df()

In [10]:
print("note that there is a 'Bolus' category under statusdescription, which is the MAR action")
med_selected_and_mapped.value_counts("statusdescription")

note that there is a 'Bolus' category under statusdescription, which is the MAR action


statusdescription
FinishedRunning    2639783
ChangeDose/Rate    1722437
Stopped             265600
Paused              215504
Bolus                 2595
Name: count, dtype: int64

## Split continuous and intermittents

In [11]:
query = f"""
SELECT *
FROM med_selected_and_mapped
WHERE item_class = 'CONTINUOUS'
    OR (item_class = 'BOTH' AND NOT ({find_intm_where_clause}))
"""
cont_only = duckdb.sql(query).df()

## Process continous

### Check missing dose `rate`

In [12]:
query = """
SELECT *
    , EXTRACT(EPOCH FROM (endtime - starttime)) / 60 AS duration_in_mins
    , amount / duration_in_mins AS rate_imputed
    , CONCAT(amountuom, '/min') AS rateuom_imputed
    , ABS(originalrate - rate_imputed) AS abs_diff
    , abs_diff / ((originalrate + rate_imputed) / 2) AS rel_diff
    , CASE WHEN rel_diff <= 0.01 OR abs_diff <= 0.000000001 THEN 1 ELSE 0 END AS close_rates
FROM cont_only
WHERE rate IS NULL
"""
cont_missing_rates = duckdb.sql(query).df()
mask = cont_missing_rates['close_rates'] == 1
print(f"% of the imputed rates (using amount divided by duration) close to the `originalrate`: {mask.mean() * 100:.1f}% ({mask.sum()})")
print("It seeems safe to use the imputed rates to fill the missing `rate`.")

cont_missing_rates.value_counts('med_category')

% of the imputed rates (using amount divided by duration) close to the `originalrate`: 92.5% (93477)
It seeems safe to use the imputed rates to fill the missing `rate`.


med_category
magnesium             82477
sodium bicarbonate    10203
dextrose               6047
insulin                2245
heparin                  87
Name: count, dtype: int64

### Flatten start-end timestamps to admin timestamps

In [13]:
cont_lagged = med._prepare_for_timestamp_flattening(cont_only)

In [14]:
# Check pause
# find demo data to test
query = """
SELECT *    
FROM cont_lagged
WHERE statusdescription = 'Paused'
    AND is_last_row = 0
    AND is_first_row = 0
    AND subject_id IN (
        SELECT DISTINCT CAST(patient_id AS INT)
        FROM pt_demo
    )
"""

df = duckdb.sql(query).df()

In [None]:
cont_flattened = med._flatten_timestamps(cont_lagged, "rate")
# RESUME: continue checking the result using the the two focal example orderid

In [None]:
# cont_flattened.value_counts('mar_action_name')

mar_action_name
ChangeDose/Rate    1722394
start              1195110
FinishedRunning     814202
Stopped             265415
Paused              215448
Bolus                  566
Name: count, dtype: int64

In [20]:
longer = cont_only.query("linkorderid == 294375").melt(
    id_vars = [
        "hadm_id", "linkorderid", "med_category", 'rate', 'rateuom', 'statusdescription'
    ],
    value_vars = ["starttime", "endtime"],
    var_name = "time", value_name = "admin_dttm"
).sort_values(["hadm_id", "linkorderid", "med_category", "admin_dttm", "time"])

## Intermittents (Archive)
This part needs updating as it was built to generate cont only without considering keeping intm.

First identify the intermittents and observe their pattern.

In [None]:
query = f"""
SELECT *
    -- flags to identify intermittents
    , CASE WHEN ordercategoryname = '05-Med Bolus'
        THEN 1 ELSE 0 END AS intm_by_ordercategoryname
    , CASE WHEN ordercategorydescription = 'Drug Push'
        THEN 1 ELSE 0 END AS intm_by_ordercategorydescription
    , CASE WHEN statusdescription = 'Bolus'
        THEN 1 ELSE 0 END AS intm_by_statusdescription
    -- flags to count discrepencies, i.e. if all of the ostensible intermittents satisfy the 1min duration rule
    , CASE WHEN intm_by_ordercategorydescription = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_ordercategorydescription
    , CASE WHEN intm_by_ordercategoryname = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_ordercategoryname
    , CASE WHEN intm_by_statusdescription = 1 AND duration_1min = 0
        THEN 1 ELSE 0 END AS not_1min_statusdescription
FROM med_selected_and_mapped
WHERE {find_intm_where_clause}

"""
intm_only = duckdb.sql(query).df()

print(f"# of intermittents identified through ordercategoryname that failed the 1-min rule: {intm_only.not_1min_ordercategoryname.sum()}")
print(f"# of intermittents identified through ordercategorydescription that failed the 1-min rule: {intm_only.not_1min_ordercategorydescription.sum()}")
print(f"# of intermittents identified through statusdescription that failed the 1-min rule: {intm_only.not_1min_statusdescription.sum()}")

# of intermittents identified through ordercategoryname that failed the 1-min rule: 0
# of intermittents identified through ordercategorydescription that failed the 1-min rule: 0
# of intermittents identified through statusdescription that failed the 1-min rule: 1249


From the result it seems `intm_by_statusdescription` is particularly messy, while the other two can be used to safety identify intermittents.

In [None]:
query = """
SELECT *
FROM intm_only
WHERE not_1min_statusdescription = 1
"""
df = duckdb.sql(query).df()

In [None]:
# check if any of continuous events has 1-min duration # FIXME: possibly outdated
query = f"""
SELECT *
FROM med_selected_and_mapped
WHERE NOT ({find_intm_where_clause})
"""
intermittent_removed = duckdb.sql(query).df()

In [None]:
query = f"""
SELECT *
FROM intermittent_removed
WHERE duration_1min = 1
"""
cont_yet_1min = duckdb.sql(query).df()

## Resolve duplicates
This is possibly also outdated as we should now consider duplicate using the linearized time.

In [None]:
query = """
SELECT *
FROM cont_only
QUALIFY COUNT(*) OVER (PARTITION BY hadm_id, med_category, starttime, statusdescription) > 1
"""
mac_dups = duckdb.sql(query).df()

In [None]:
# final check there is no dup for god's sake
# mac_ldfd.duplicated(subset=meds_keycols, keep=False).sum()