In [1]:
import os
import pandas as pd

from tqdm import tqdm
from transformers import pipeline, set_seed
from transformers import BioGptTokenizer, BioGptForCausalLM
from aug.gpt import *

import warnings
warnings.filterwarnings("ignore")

# Filesystem root handed to `get_prompt` as its first argument (presumably the local
# MIMIC-EYE dataset directory — confirm). Set the MIMIC_EYE_PATH environment variable to
# point at another location; without it the original hard-coded path is used.
MIMIC_EYE_PATH = os.environ.get("MIMIC_EYE_PATH", "F:\\mimic-eye")

# Lesion label names passed to `get_prompt` when building each prompt.
# Only the uncommented entries are active; the commented-out names are kept so a label
# can be switched back on by removing its leading "#".
# NOTE(review): the "######" rulers appear to fence off a second group of disabled
# labels — the reason for the grouping is not visible here; confirm before relying on it.
REFLACX_LESION_LABEL_COLS = [
    # "Fibrosis",
    # "Quality issue",
    # "Wide mediastinum",
    # "Fracture",
    # "Airway wall thickening",

    ######################
    # "Hiatal hernia",
    # "Acute fracture",
    # "Interstitial lung disease",
    # "Enlarged hilum",
    # "Abnormal mediastinal contour",
    # "High lung volume / emphysema",
    # "Pneumothorax",
    # "Lung nodule or mass",
    # "Groundglass opacity",
    ######################
    "Pulmonary edema",
    "Enlarged cardiac silhouette",
    "Consolidation",
    "Atelectasis",
    "Pleural abnormality",
    # "Support devices",
]


# Column names of the CheXpert labels: every finding name carries the same
# "_chexpert" suffix, so the list is generated from the bare finding names.
CHEXPERT_LABEL_COLS = [
    f"{finding}_chexpert"
    for finding in (
        "Atelectasis",
        "Cardiomegaly",
        "Consolidation",
        "Edema",
        "Enlarged Cardiomediastinum",
        "Fracture",
        "Lung Lesion",
        "Lung Opacity",
        "No Finding",
        "Pleural Effusion",
        "Pleural Other",
        "Pneumonia",
        "Pneumothorax",
        "Support Devices",
    )
]


In [2]:
import openai
# NOTE(review): the star import hides which names are pulled in; OPENAI_API_KEY used
# below is presumably defined in the local `secret` module — confirm, and make sure
# that module is never committed or shared together with the notebook.
from secret import *
# Set the key on the openai module so later calls made through it are authenticated.
openai.api_key = OPENAI_API_KEY

In [3]:
# Clinical feature column -> the wording used for that feature when it is handed to
# `get_prompt`.
feature_to_name_map = {
    "temperature_c": "body temperature in degrees Celsius",
    "heartrate": "heart rate in beats per minute",
    "resprate": "respiratory rate in breaths per minute",
    "o2sat": "peripheral oxygen saturation (%)",
    "sbp": "systolic blood pressure (mmHg)",
    "dbp": "diastolic blood pressure (mmHg)",
}

# Columns to augment, in processing order: exactly the keys of the map above
# (dicts preserve insertion order).
features_to_aug = list(feature_to_name_map)

In [4]:
def get_next_word_value(outputs, min_max_v):
    """Pick the first usable numeric answer out of a list of model completions.

    Only the first space-delimited word of each completion is inspected. Every
    character other than digits and "." is removed from that word, a doubled dot is
    collapsed, trailing dots (e.g. a sentence-ending period) are dropped, and the rest
    is parsed as a float.

    Args:
        outputs: iterable of completion strings, checked in order.
        min_max_v: ``(min, max)`` pair; a value is accepted only when it lies strictly
            between the two bounds.

    Returns:
        ``(True, value)`` for the first completion that yields an in-range float,
        otherwise ``(False, word)`` where ``word`` is the first word of the last
        completion inspected, or ``None`` when ``outputs`` is empty.
    """
    # Imported locally so the function does not depend on `re` leaking in through a
    # star import elsewhere in the notebook.
    import re

    min_v, max_v = min_max_v
    # Defined up front so an empty `outputs` cannot hit an unbound name at the return.
    next_word = None
    for o in outputs:
        # Leading whitespace/newlines would otherwise make the first "word" empty.
        next_word = o.strip().split(" ")[0]
        next_num_str = re.sub("[^0-9.]", "", next_word).replace("..", ".").rstrip(".")
        try:
            v = float(next_num_str)
        except ValueError:
            # Not a number (e.g. "There", "I'm"): try the next completion.
            continue
        if min_v < v < max_v:
            return True, v
    return False, next_word

In [5]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    retry_if_exception_type
)  # for exponential backoff

# OpenAI error types for which the request is attempted again.
_RETRYABLE_OPENAI_ERRORS = (
    openai.error.APIError,
    openai.error.APIConnectionError,
    openai.error.RateLimitError,
    openai.error.ServiceUnavailableError,
    openai.error.Timeout,
)


@retry(
    retry=retry_if_exception_type(_RETRYABLE_OPENAI_ERRORS),
    wait=wait_random_exponential(multiplier=1, max=60),
    stop=stop_after_attempt(10),
)
def chat_completion_with_backoff(**kwargs):
    """Call ``openai.ChatCompletion.create`` with retry on the listed OpenAI errors.

    All keyword arguments are forwarded unchanged. The tenacity decorator retries only
    the error types in ``_RETRYABLE_OPENAI_ERRORS``, waits a randomized exponential
    interval (capped at 60) between attempts and gives up after 10 attempts; any other
    exception propagates immediately.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns.
    """
    completion = openai.ChatCompletion.create(**kwargs)
    return completion

In [6]:
# Couldn't find value for [5] prompt: EXAMINATION:  CHEST PA AND LAT  INDICATION:  History: M with chest pain  TECHNIQUE:  Chest PA and lateral  COMPARISON:    FINDINGS:   The patient is status post median sternotomy and CABG.  Cardiac mediastinal and hilar contours are unchanged with the heart size within normal limits. Minimal atherosclerotic calcifications are noted at the aortic knob. Pulmonary vasculature is normal. Calcified granuloma is seen within the right apex.  Lungs are clear. Pulmonary vasculature is normal. No pleural effusion or pneumothorax is present. Minimal degenerative changes are seen within the thoracic spine.  IMPRESSION:   No acute cardiopulmonary abnormality. LESIONS: Atelectasi. AGE: 73. GENDER: Male. body temperature in degrees Celsius:, the next word is There
# failed in sbp.

In [7]:
# What can be the next word for the following context (please only return one numerical word): EXAMINATION:  CHEST PA AND LAT  INDICATION:  History: M with chest pain  TECHNIQUE:  Chest PA and lateral  COMPARISON:    FINDINGS:   The patient is status post median sternotomy and CABG.  Cardiac mediastinal and hilar contours are unchanged with the heart size within normal limits. Minimal atherosclerotic calcifications are noted at the aortic knob. Pulmonary vasculature is normal. Calcified granuloma is seen within the right apex.  Lungs are clear. Pulmonary vasculature is normal. No pleural effusion or pneumothorax is present. Minimal degenerative changes are seen within the thoracic spine.  IMPRESSION:   No acute cardiopulmonary abnormality. LESIONS: Atelectasi. AGE: 73. GENDER: Male. body temperature in degrees Celsius:

In [8]:
# Show the list of features that the augmentation loop will iterate over.
features_to_aug

['temperature_c', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']

In [22]:
# res = chat_completion_with_backoff(
#                 model="gpt-4",
#                 messages=[
#                     {"role": "system", "content": "You are a helpful assistant providing information and answering questions related to Taiwan."},
#                     {"role": "system", "content": "Where is NTU?"},
#                 ],
#                 temperature=0.1,
#             )
# res['choices'][0]['message']['content']

'NTU, or National Taiwan University, is located in Taipei, Taiwan. Specifically, it is in the Da’an District of Taipei City.'

In [9]:
# For each prompt format: ask GPT-4 for a value of every clinical feature of every row,
# store accepted answers in new `aug_<feature>` columns, then write the table to CSV.
for rf in [
    True,
    # False,
]:
    report_format = rf

    # Source table; a Celsius copy of the Fahrenheit `temperature` column is added
    # because `temperature_c` is one of the features being augmented.
    df = pd.read_csv('./spreadsheets/reflacx_clinical.csv')
    df['temperature_c'] = df['temperature'].apply(lambda fahrenheit: (fahrenheit - 32) * 5 / 9)

    # The observed (min, max) of each feature bounds what counts as a valid answer;
    # the matching output column starts out as None for every row.
    aug_feature_range = {}
    for f in features_to_aug:
        aug_feature_range[f] = (df[f].min(), df[f].max())
        df[f"aug_{f}"] = None

    for f in features_to_aug:
        print(f"Resolving {f}")
        # One request per row: 50 samples are drawn and the first parseable, in-range
        # answer (see get_next_word_value) is kept.
        for idx, data in tqdm(df.iterrows(), total=len(df)):
            prompt = get_prompt(
                MIMIC_EYE_PATH,
                data,
                REFLACX_LESION_LABEL_COLS,
                feature_to_name_map,
                f,
                report_format=report_format,
            )
            res = chat_completion_with_backoff(
                model="gpt-4",  # alternative tried earlier: "gpt-3.5-turbo"
                messages=[
                    {
                        "role": "system",
                        "content": "You are a medical expert predicting the possible value of clinical features.",
                    },
                    {
                        "role": "user",
                        "content": f"What can be the next word for the following context (please only return one numerical word):\n\n {prompt}",
                    },
                ],
                temperature=0.2,
                n=50,
            )
            res = [choice['message']['content'] for choice in res['choices']]

            success, v = get_next_word_value(res, aug_feature_range[f])
            if success:
                # Rows without a usable answer silently keep their initial None.
                df.at[idx, f"aug_{f}"] = v

    # Convert the augmented Celsius temperature back to Fahrenheit; gaps stay None.
    df["aug_temperature"] = df["aug_temperature_c"].apply(
        lambda celsius: None if celsius is None else (celsius * 1.8) + 32
    )
    df.to_csv(
        './spreadsheets/gpt4_aug_report.csv' if report_format else './spreadsheets/gpt4_aug_text.csv'
    )

Resolving temperature_c


  0%|          | 0/799 [00:00<?, ?it/s]

100%|██████████| 799/799 [1:01:07<00:00,  4.59s/it]  


Resolving heartrate


100%|██████████| 799/799 [58:47<00:00,  4.41s/it]    


Resolving resprate


100%|██████████| 799/799 [18:13<00:00,  1.37s/it]  


Resolving o2sat


100%|██████████| 799/799 [31:37<00:00,  2.37s/it]    


Resolving sbp


100%|██████████| 799/799 [47:46<00:00,  3.59s/it]    


Resolving dbp


100%|██████████| 799/799 [46:38<00:00,  3.50s/it]    


In [10]:
# ["I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no information provided in the given context about body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context to predict the body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context to predict the body temperature in degrees Celsius.',
#  'Unfortunately, the given context does not provide any information about the body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context to predict the body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context, so I cannot predict the body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context.',
#  'There is no numerical value provided in the given context, so I cannot predict the body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot predict the body temperature in degrees Celsius based on the given context.",
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context to predict the body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'Unfortunately, the provided context does not mention the body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  '37.5',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context to predict the body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  'Unfortunately, the given context does not provide any information about body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context, so it is not possible to predict a specific body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context to predict the body temperature in degrees Celsius.',
#  'There is no numerical value provided in the given context for body temperature.',
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context.",
#  "I'm sorry, but I cannot provide a numerical value for body temperature based on the given context."]

In [11]:
# for rf in [
#     True,
#     False,
# ]:
#     report_format = rf
#     df = pd.read_csv("./spreadsheets/reflacx_clinical.csv")
#     df["temperature_c"] = df["temperature"].apply(lambda f: (f - 32) * 5 / 9)
#     df = aug_df(
#         MIMIC_EYE_PATH,
#         REFLACX_LESION_LABEL_COLS,
#         features_to_aug,
#         feature_to_name_map,
#         df,
#         generator,
#         progress=[15],
#         report_format=report_format,
#     )
#     df["aug_temperature"] = df["aug_temperature_c"].apply(lambda c: (c*1.8)+32 if not c is None else None)
#     if report_format:
#         df.to_csv("./spreadsheets/zephyr_aug_report.csv")
#     else:
#         df.to_csv("./spreadsheets/zephyr_aug_text.csv")

In [12]:
# df['temperature_c'] = df['temperature'].apply(lambda f :(f-32) * 5/9 )
# df = aug_df(MIMIC_EYE_PATH, REFLACX_LESION_LABEL_COLS, features_to_aug, feature_to_name_map, df, generator, progress=[1, 5, 25, 50], report_format=report_format)
# df["aug_temperature"] = df["aug_temperature_c"].apply(lambda c: (c*1.8)+32)

In [13]:
# df["aug_temperature"] = df["aug_temperature_c"].apply(lambda c: (c*1.8)+32)
# if report_format:
#     df.to_csv('./spreadsheets/llama2_aug_report.csv')
# else:
#     df.to_csv('./spreadsheets/llama2_aug_text.csv')

In [14]:
# # df = aug_df(MIMIC_EYE_PATH, REFLACX_LESION_LABEL_COLS, features_to_aug, feature_to_name_map, df, generator, progress=[1, 5, 25, 50], report_format=report_format)
# aug_feature_range = {f: (df[f].min(), df[f].max()) for f in features_to_aug}

# for f in features_to_aug:
#     df[f"aug_{f}"] = None

# for f in features_to_aug:
#     print(f"Resolving {f}")
#     # aug the instance one by one
#     for idx, data in tqdm(df.iterrows(), total=df.shape[0]):
#         prompt = get_prompt_for_mask(
#             MIMIC_EYE_PATH,
#             data,
#             REFLACX_LESION_LABEL_COLS,
#             feature_to_name_map,
#             f,
#             report_format=report_format,
#         )


#         v = get_generated_value(
#             mask_filler, prompt, aug_feature_range[f], top_k=100,
#         )
#         if v is None:
#             print(
#                 f"Couldn't find value for [{idx}] prompt: {prompt}"
#             )


#         df.at[idx, f"aug_{f}"] = v

In [15]:
# df["aug_temperature"] = df["aug_temperature_c"].apply(lambda c: (c*1.8)+32)
# if report_format:
#     df.to_csv('./spreadsheets/bcb_aug_report.csv')
# else:
#     df.to_csv('./spreadsheets/bcb_aug_text.csv')