# ncDMARDS

In [1]:
import os
import glob
import sys
import time
import json

import pandas as pd
import numpy as np
import regex as re
import seaborn as sns

from utils import medswitch
from utils import ncdmard

import logging

# Only show ERROR and above from the dask.distributed scheduler/core loggers.
# `logger` is left bound to the last logger configured ('distributed.core').
for _quiet_logger_name in ('distributed.scheduler', 'distributed.core'):
    logger = logging.getLogger(_quiet_logger_name)
    logger.setLevel(logging.ERROR)


# Load cluster
from utils.dask_cluster import *
#load_cluster(cores=8, queue="long.q", memory="48GiB", walltime='04:00:00', scale=250)

# options
%matplotlib inline
# Cap how much of a DataFrame pandas renders in notebook output.
for _display_option, _display_limit in (
    ('display.max_rows', 50),
    ('display.max_columns', 20),
    ('display.max_colwidth', 30),
):
    pd.set_option(_display_option, _display_limit)

# Global seaborn plot appearance for this notebook: "white" style, "talk" context.
sns.set_style("white")
sns.set_context("talk")



ic-app.wynton.ucsf.edu


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Export the number of orders per distinct medication description so the
# list can be reviewed by hand.
orders_table = load_register_table("DEID_CDW", "medicationorderfact")

# Each distinct combination of these descriptor columns becomes one row.
med_descriptor_cols = [
    "medicationname",
    "medicationgenericname",
    "medicationtherapeuticclass",
    "medicationpharmaceuticalclass",
    "medicationpharmaceuticalsubclass",
]

all_med_order_counts = (
    orders_table.groupby(med_descriptor_cols)["medicationorderkey"]
    .count()
    .compute()  # materialise the result (presumably a lazy dask collection — confirm in utils.dask_cluster)
    .reset_index()
    .rename(columns={"medicationorderkey": "order_count"})
    .sort_values(by="order_count")
)
all_med_order_counts.to_csv("./data/all_med_order_counts.csv")


# Cohort extraction

In [None]:
# Build the product-name -> generic-name lookup for anti-TNF drugs, then run
# the ncdmard extraction steps with it.
all_drugs = pd.read_csv("./data/ncdmard/raw/med_mapping.csv")

# Keep only rows whose drug_class is present and mentions "anti-TNF".
all_drugs = all_drugs.dropna(subset=["drug_class"])
all_drugs = all_drugs[all_drugs["drug_class"].str.contains("anti-TNF")]

# Reference brand name -> generic name (both lower-cased).
med_mapping = dict(zip(all_drugs["reference_brand_name"].str.lower(), all_drugs["generic_name"].str.lower()))

# Biosimilar brand name -> "<generic>-<suffix>" (all lower-cased). Rows with
# no biosimilar brand name (a non-string value such as NaN) are skipped.
generic_dict = {
    b.lower(): g.lower() + "-" + s.lower()
    for b, g, s in zip(all_drugs["biosimilar_brand_name"],
                       all_drugs["generic_name"],
                       all_drugs["biosimilar_suffix"])
    if isinstance(b, str)
}
med_mapping.update(generic_dict)

# Get medication values
# NOTE(review): the four steps below appear to build on each other's files
# under ./data/ncdmard/raw — confirm the required order in utils.ncdmard.
ncdmard.getMedications(med_mapping, output="./data/ncdmard/raw")
ncdmard.getDemographics(filepath="./data/ncdmard/raw")
ncdmard.addNotes(filepath="./data/ncdmard/raw", meds_min_months_fu=6)
ncdmard.finalWeakAnnotations(med_mapping, filepath="./data/ncdmard/raw")


# GPT4 prompt development dataset

In [None]:
### Prompt development: query GPT-4 on the validation notes
filepath= "./data/ncdmard/gpt4"
# Split into prompt_dev and test datasets (left commented out by the author).
#medswitch.split_prompt_test(med_class_name="ncdmard", pt_frac=0.05, random_state=0)

### Run GPT4 on prompt_dev datasets
valid_df = pd.read_parquet(f"{filepath}/validation.parquet.gzip")
note_keys = list(valid_df["note_deid_note_key"])
note_texts = list(valid_df["note_text"])
print(len(note_keys))

# Query
model = "gpt-4-turbo-128k"
date = "2024-02-21"

with open(f"{filepath}/prompt_configs.json", "r") as file:
    prompt_configs = json.load(file)

for task_name, task in prompt_configs.items():

    # Only the "reasons-provided" prompt variant is run here.
    if task_name != "reasons-provided":
        continue

    outfile_path = f"{filepath}/prompt_dev/{date}_{model}_{task_name}_prompt_dev.csv"

    # Query only when no output file exists yet. This must be `not`, not `~`:
    # `~` on a bool is integer bitwise inversion (~True == -2, ~False == -1),
    # both truthy, so the old check re-ran the query every time.
    if not os.path.exists(outfile_path):
        medswitch.openai_query(note_keys=note_keys,
                               note_texts=note_texts,
                               task=task,
                               outfile=outfile_path,
                               save_every=15,
                               functions=None,
                               **{"model": model,
                                  "max_tokens": 1024,
                                  "frequency_penalty": 0,
                                  "presence_penalty": 0,
                                  "temperature": 0,
                                  "top_p": 1})

    # For determining number of reasons provided
    query_df = pd.read_csv(outfile_path, index_col=0)
    if "reason_last_TNFi_stopped" in query_df.columns:
        print(task_name, query_df["reason_last_TNFi_stopped"].dropna().shape)
    else:
        print(task_name, query_df["full_reason_last_TNFi_stopped"].dropna().shape)

    # Pasted by the author, in the format of the print above (task name,
    # shape of the non-null reason column):
    #   default-task (45,)
    #   drugs-provided (47,)
    #   reasons-provided (49,)
    #   all-values-provided (56,)



# GPT4 test set

In [2]:
### Load the test notes
filepath = "./data/ncdmard/gpt4"

# The frame keeps the name `valid_df` although it holds test.parquet.gzip.
valid_df = pd.read_parquet(f"{filepath}/test.parquet.gzip")
note_keys, note_texts = (
    list(valid_df[column]) for column in ("note_deid_note_key", "note_text")
)
print(len(note_keys))


2958


In [4]:
### Run GPT4 on the test notes with the "reasons-provided" prompt
filepath = "./data/ncdmard/gpt4"

# The frame keeps the name `valid_df` although it holds test.parquet.gzip.
valid_df = pd.read_parquet(f"{filepath}/test.parquet.gzip")
note_keys, note_texts = (
    list(valid_df[column]) for column in ("note_deid_note_key", "note_text")
)
print(len(note_keys))

# Query
model = "gpt-4-turbo-128k"
date = "2024-03-30"
task_name = "reasons-provided"

with open(f"{filepath}/prompt_configs.json", "r") as file:
    prompt_configs = json.load(file)

task = prompt_configs[task_name]
outfile_path = f"{filepath}/{date}_{model}_{task_name}_test.csv"

# Skip the query when the output file for this date/model/task already exists.
if not os.path.exists(outfile_path):
    medswitch.openai_query(
        note_keys=note_keys,
        note_texts=note_texts,
        task=task,
        outfile=outfile_path,
        save_every=15,
        functions=None,
        model=model,
        max_tokens=1024,
        frequency_penalty=0,
        presence_penalty=0,
        temperature=0,
        top_p=1,
    )
    


2958
SYS MESSAGE: None
PROMPT: Task: Tumor necrosis factor inhibitors (TNFis) describe biologic drugs targeting TNF proteins. Using the clinical note provided, extract the following information into this JSON format: {"new_TNFi":"What new TNFi was prescribed or started? If the patient is not starting a new TNFi, write "NA"","last_TNFi":"What was the last TNFi the patient used? If none, write "NA"","reason_type_last_TNFi_stopped":"Which best describes why the last TNFi was stopped or planned to be stopped? "Adverse event", "Drug resistance", "Insurance/Cost","Lack of efficacy","Patient preference","Other", "NA"","full_reason_last_TNFi_stopped":"Provide a description for why the last TNFi was stopped or planned to be stopped?"}
Answer:
Saving up to note key: D56E4F1683EAB0
Saving up to note key: DA3BAC613F33CA
Saving up to note key: DF204CC13884C9
Error converting response to json
Saving up to note key: DBA4E6BE502424
Saving up to note key: D17025874F2515
Saving up to note key: D50000D1F

# Notes for annotation

In [3]:
import csv

# Create dataset for annotation: validation notes that also appear in the
# GPT-4 prompt_dev output, with empty columns for the annotators to fill in.
annot_med_df = pd.read_parquet("./data/ncdmard/gpt4/validation.parquet.gzip").reset_index(drop=True)
annot_med_df = annot_med_df[["note_deid_note_key", "note_text"]]

gpt4_df = pd.read_csv("./data/ncdmard/gpt4/prompt_dev/2024-02-21_gpt-4-turbo-128k_reasons-provided_prompt_dev.csv")
reasons = gpt4_df[["Unnamed: 0", "reason_type_last_TNFi_stopped", "full_reason_last_TNFi_stopped"]]

# Inner join: only notes with a row in the GPT-4 output are kept. The GPT-4
# reason columns themselves are dropped by the column selection below.
annot_med_df = annot_med_df.merge(
    reasons,
    how="inner",
    left_on="note_deid_note_key",
    right_on="Unnamed: 0",
)

# Blank (NaN) columns for the annotators.
annot_med_df = annot_med_df.assign(
    **{
        blank_column: np.nan
        for blank_column in (
            "TNFi started",
            "TNFi stopped",
            "Reason type for stopping",
            "Additional comments",
        )
    }
)

annotation_columns = [
    'note_deid_note_key',
    'note_text',
    'TNFi started',
    'TNFi stopped',
    "Reason type for stopping",
    "Additional comments",
]
annot_med_df = annot_med_df[annotation_columns].set_index("note_deid_note_key")
annot_med_df.to_csv("./data/ncdmard/annotation/annotation_set.csv", encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)
#annot_med_df.to_excel("./data/ncdmard/annotation/annotation_set.xlsx")

# Provide annotators with dictionary of brand name -> generic mapping
all_drugs = pd.read_csv("./data/ncdmard/raw/med_mapping.csv")

# Keep only rows whose drug_class is present and mentions "anti-TNF".
all_drugs = all_drugs.dropna(subset=["drug_class"])
all_drugs = all_drugs[all_drugs["drug_class"].str.contains("anti-TNF")]

# Reference brand name -> generic name (both lower-cased).
med_mapping = dict(zip(all_drugs["reference_brand_name"].str.lower(), all_drugs["generic_name"].str.lower()))

# Biosimilar brand name -> "<generic>-<suffix>" (all lower-cased). Rows with
# no biosimilar brand name (a non-string value such as NaN) are skipped.
generic_dict = {
    b.lower(): g.lower() + "-" + s.lower()
    for b, g, s in zip(all_drugs["biosimilar_brand_name"],
                       all_drugs["generic_name"],
                       all_drugs["biosimilar_suffix"])
    if isinstance(b, str)
}
med_mapping.update(generic_dict)

# One row per product name (the index), with the mapped name in "TNFi class".
full_map_df = pd.DataFrame.from_dict(med_mapping, orient="index")
full_map_df.columns = ["TNFi class"]
full_map_df.to_csv("./data/ncdmard/annotation/med_mapping_dictionary.csv")


In [None]:
# Scratch evaluation: compare GPT-extracted medication start/stop values
# against the annotated values using exact-match accuracy and macro F1.
# NOTE(review): `annot_med`, `gpt_by_encounter` and `benchmark` are not defined
# or imported in the cells visible here — confirm where they come from.
# NOTE(review): the "contraceptive_*" column names look carried over from a
# different analysis — confirm they exist in `gpt_by_encounter`.

## Cleaned comparison values
start_annot = list(annot_med["medicationname_generic_clean"])
stop_annot = list(annot_med["medication_stopped_clean"])

start_gpt = list(gpt_by_encounter["contraceptive_started_clean"])
stop_gpt = list(gpt_by_encounter["contraceptive_stopped_clean"])

## Evaluate
#[(gpt, annot) for gpt, annot in zip(start_gpt, start_annot) if len(set(gpt).difference(set(annot)))!=0]

print("Medication starting")
benchmark.em_accuracy(start_gpt, start_annot)
# The return value of this call is discarded; the same call is repeated inside
# the print on the next line.
benchmark.f1_scores(preds=start_gpt, references=start_annot, average="macro")
print("F1: %.2f"%benchmark.f1_scores(preds=start_gpt, references=start_annot, average="macro"))
print()

print("Medication stopping")
benchmark.em_accuracy(stop_gpt, stop_annot)
print("F1: %.2f"%benchmark.f1_scores(preds=stop_gpt, references=stop_annot, average="macro"))
print()

# Other 
#https://www.nature.com/articles/s41573-020-00092-2
#lupus_treatments = "belimumab|obexelimab|ianalumab"
#ms_treatments = "fingolimod|ozanimod|natalizumab|ocrelizumab|atumumab|ublituximab|ofatumumab|inebilizumab"

"""
Non-RA but biologic meds
"ilaris":"canakinumab": antiIL1
"rituxan":"rituximab": CD20
"sylvant":"siltuximab": antiIL6
"arcalyst":"rilonacept": antiIL1 
"stelara":"ustekinumab": antiIL12/23
"cosentyx":"secukinumab":antiIL17
"nulojix":"belatacept":"CTLA4"
"""

# Procedure events whose name contains any key or value of `tnf_mapping`
# (case-insensitive), followed by a count per procedure name.
# NOTE(review): `tnf_mapping` is not defined in the cells visible here (the
# mapping built elsewhere in this notebook is `med_mapping`) — confirm.
proc_rdd = load_register_table("DEID_CDW", "procedureeventfact")

tnf_name_pattern = "|".join([*tnf_mapping.keys(), *tnf_mapping.values()])
mentions_tnf_name = proc_rdd["procedurename"].str.contains(tnf_name_pattern, case=False)
tnf_proc = proc_rdd[mentions_tnf_name].compute()

tnf_proc["procedurename"].value_counts()

"""### Can GPT perform clustering on a toy example? Not really. Not even GPT4
t0 = "Patient has continued on methotrexate and 4-5 mg prednisone daily. Patient decided they did not want to add humira. "
t1 = "The Remicade has been helpful, but had worse joint pain in the hands, so frequency of Remicade infusions was increased to every 6 weeks.  After two infusions at every 6 weeks, the arthritis is doing much better."
t2 = "Patient tried 300 mg of Anakinra per day for 3 weeks, but had fever, nausea, and hot flashes, so is back on 200 mg of Anakinra per day."
df = pd.DataFrame([(t0, 0)] * 8 + [(t1, 1)] * 7 + [(t2, 2)] * 5, columns=["text", "value"])

shuffled = df.sample(frac = 1).reset_index(drop=True)
shuffled["value"].value_counts(normalize=True)

for i, t in enumerate(shuffled["text"]):
    print(i, "\t", t)

"""
"""# Load patients with valve replacement
surg_cols = ["encounterkey", "patientdurablekey", "primarysurgeontype", "primarysurgeonprimaryspecialty",
             "primarysurgeonkey", "primaryprocedurename", "surgerydatekeyvalue"]
surg_rdd = load_register_table("DEID_CDW", "surgicalcasefact", **{"columns":surg_cols})

avr = ["TRANSCATHETER AORTIC VALVE REPLACEMENT (TAVR)", 
       "AORTIC VALVE REPAIR/ REPLACEMENT (AVR) WITH  CARDIO PULMONARY BYPASS (CPB)"]
surg_rdd = surg_rdd[surg_rdd['primaryprocedurename'].isin(avr)]

# Get patients with severe or critical aortic stenosis & valve replacements
diag_cols = ["patientdurablekey", "diagnosiseventkey", "diagnosisname", "encounterkey", "documentedbyproviderprimaryspecialty"]
diag_rdd = load_register_table("DEID_CDW", "diagnosiseventfact", **{"columns":diag_cols})
stenosis_rdd = diag_rdd[diag_rdd["diagnosisname"].str.lower().str.contains("aort") &
                        diag_rdd["diagnosisname"].str.lower().str.contains("sten") &
                       diag_rdd["diagnosisname"].str.lower().str.contains("severe|critical")&
                       ~diag_rdd["diagnosisname"].str.lower().str.contains("moderate")]

stenosis_rdd = stenosis_rdd[stenosis_rdd["patientdurablekey"].isin(list(surg_rdd["patientdurablekey"]))]

# Merge to notes by diagnosis date
stenosis_rdd.compute()


"""
# Exploratory: pull the number that follows "pain level" out of notes that
# also mention "iud", and summarise its distribution.
notes_rdd = load_register_table("DEID_CDW", "note_text")
notes_meta_rdd = load_register_table("DEID_CDW", "note_metadata")
# Raw string so the regex is never subject to Python escape processing.
notes_rdd = notes_rdd[notes_rdd["note_text"].str.contains(r"[pP]ain level [0-9]", na=False)]
notes_rdd = notes_rdd.merge(notes_meta_rdd, right_on="deid_note_key", left_on="deid_note_key", how="inner")
notes_rdd = notes_rdd.compute()

# .copy() so the "pain" column is added to an independent frame rather than
# to a slice of notes_rdd (avoids pandas' SettingWithCopy hazard).
testing = notes_rdd[notes_rdd["note_text"].str.contains("iud", case=False)][["note_text"]].copy()

# First whitespace-delimited token after the LAST "ain level" in each note.
pain_tokens = [
    note.split("ain level")[-1].strip().split(" ")[0]
    for note in testing["note_text"]
]

# Trim the token with the first matching rule, in this order: keep the part
# before "/" (e.g. "4/10"), strip "." / ";" / "," from the ends, keep the part
# before "-" (e.g. "3-4").
testing["pain"] = [
    token.split("/")[0] if "/" in token
    else token.strip(".") if "." in token
    else token.strip(";") if ";" in token
    else token.strip(",") if "," in token
    else token.split("-")[0] if "-" in token
    else token
    for token in pain_tokens
]

# Tokens that still are not numeric become NaN. This one explicit coercion
# replaces the earlier best-effort astype(int, errors="ignore") pass, whose
# result was overwritten by this conversion anyway.
testing["pain"] = pd.to_numeric(testing["pain"], errors="coerce")

testing["pain"].describe()

'''
count    59.000000
mean      4.542373
std       3.385171
min       0.000000
25%       2.000000
50%       4.000000
75%       7.000000
max      12.000000
'''




# Add diagnosis

In [3]:
# Add diagnosis to medication information
pts = pd.read_parquet("./data/ncdmard/raw/ncdmards_all_meds.parquet.gzip").reset_index(drop=True)
diagnosis = pd.read_parquet('/wynton/protected/project/ic/data/parquet/DEID_CDW/diagnosisbridge')

# Restrict the bridge table to the combo keys referenced by the medication
# rows, then inner-join so every kept row carries its diagnosis columns.
linked_combo_keys = pts["associateddiagnosiscombokey"].unique()
diagnosis = diagnosis[diagnosis["diagnosiscombokey"].isin(linked_combo_keys)]
pts = pts.merge(diagnosis, left_on="associateddiagnosiscombokey", right_on="diagnosiscombokey", how="inner")

# Ordered (label, lower-case keywords) rules; the first rule with a keyword
# found in the lower-cased diagnosis name wins, so order matters.
_DIAGNOSIS_KEYWORD_RULES = (
    ("IBD", ("inflammatory bowel", "crohn", "ulcerative", "colitis", "ileitis")),
    ("Psoriatic arthritis", ("psoriatic arthritis", "psoriatic arthropathy")),
    ("Psoriasis", ("psoriasis",)),
    ("JIA", ("juvenile",)),
    ("RA", ("rheumatoid arthritis",)),
    ("SA", ("spondyloarthritis", "spondylarthritis", "spondyloarthropathy",
            "sacroiliitis", "ankylosing spondylitis")),
    ("Hidradenitis", ("hidradenitis",)),
    ("Uveitis", ("uveitis",)),
    ("Sarcoidosis", ("sarcoid",)),
    ("Vasculitis", ("behcet", "takayasu", "vasculitis", "arteritis")),
)


def _clean_diagnosis_name(disease):
    """Map a raw diagnosis name to its coarse disease category.

    Checks, in order: the case-sensitive substring "IBD"; the lower-case
    keyword rules in _DIAGNOSIS_KEYWORD_RULES; the case-sensitive substring
    "*Unspecified". Anything unmatched is "Other".
    """
    if "IBD" in disease:
        return "IBD"
    lowered = disease.lower()
    for label, keywords in _DIAGNOSIS_KEYWORD_RULES:
        if any(keyword in lowered for keyword in keywords):
            return label
    if "*Unspecified" in disease:
        return "Unspecified"
    return "Other"


pts["diagnosisname_clean"] = [_clean_diagnosis_name(name) for name in pts["diagnosisname"]]


diagnosis_export_cols = ["patientdurablekey", "medicationorderkey", "encounterkey", "diagnosiscombokey",
                         "diagnosiskey", "diagnosisname", "diagnosisname_clean"]
pts[diagnosis_export_cols].to_csv('./data/ncdmard/medications_with_diagnosis.csv')

