In [1]:
import os
import pandas as pd

from tqdm import tqdm
from transformers import pipeline, set_seed
from transformers import BioGptTokenizer, BioGptForCausalLM
from aug.gpt import *
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Location handed to `aug_df` as its first argument (presumably the root of the
# local MIMIC-EYE dataset — confirm against `aug.gpt`).
# The value can be overridden with the MIMIC_EYE_PATH environment variable so the
# notebook is not tied to one machine's drive layout; when the variable is not
# set, the original hard-coded location is used.
MIMIC_EYE_PATH = os.environ.get("MIMIC_EYE_PATH", "F:\\mimic-eye")

# REFLACX lesion label columns passed to `aug_df`.
#
# Only the five labels in the list are active. The labels below are switched
# off (the reason for the grouping is not recorded in this notebook):
#   group A: "Fibrosis", "Quality issue", "Wide mediastinum", "Fracture",
#            "Airway wall thickening"
#   group B: "Hiatal hernia", "Acute fracture", "Interstitial lung disease",
#            "Enlarged hilum", "Abnormal mediastinal contour",
#            "High lung volume / emphysema", "Pneumothorax",
#            "Lung nodule or mass", "Groundglass opacity"
#   other:   "Support devices"
REFLACX_LESION_LABEL_COLS = [
    "Pulmonary edema",
    "Enlarged cardiac silhouette",
    "Consolidation",
    "Atelectasis",
    "Pleural abnormality",
]


# CheXpert label columns: every finding name carries the same "_chexpert"
# suffix, so the suffix is attached once here instead of being repeated.
CHEXPERT_LABEL_COLS = [
    f"{finding}_chexpert"
    for finding in (
        "Atelectasis",
        "Cardiomegaly",
        "Consolidation",
        "Edema",
        "Enlarged Cardiomediastinum",
        "Fracture",
        "Lung Lesion",
        "Lung Opacity",
        "No Finding",
        "Pleural Effusion",
        "Pleural Other",
        "Pneumonia",
        "Pneumothorax",
        "Support Devices",
    )
]


In [2]:
from transformers import AutoTokenizer, AutoModel, DistilBertForMaskedLM, BertForMaskedLM, LlamaForCausalLM, LlamaTokenizer, LlamaForCausalLM
import torch
from secret import *
# Load the Llama-2 7B tokenizer and causal-LM weights, authenticating with
# HUGGING_FACE_TOKEN (presumably supplied by the `secret` star import — verify).
tokenizer = LlamaTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    token=HUGGING_FACE_TOKEN,
)
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    token=HUGGING_FACE_TOKEN,
    device_map='auto',
    torch_dtype=torch.float16,
)
# Disabled alternative checkpoint:
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Wrap model + tokenizer in a text-generation pipeline, then fix the seed
# before any sampling happens.
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
)
set_seed(0)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# model_id="meta-llama/Llama-2-7b-hf"
    
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# import torch
# model =AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map='auto', torch_dtype=torch.float16)

In [4]:
# Quick check of the pipeline: sample one short continuation of a fixed prompt.
generator(
    "The average blood pressure for human is ",
    do_sample=True,
    num_return_sequences=1,
    max_new_tokens=10,
)

[{'generated_text': 'The average blood pressure for human is 120/80 mmHg.'}]

In [5]:
# Read the REFLACX clinical spreadsheet and add `temperature_c`, computed from
# `temperature` with the Fahrenheit-to-Celsius formula (x - 32) * 5 / 9.
df = pd.read_csv('./spreadsheets/reflacx_clinical.csv')
df['temperature_c'] = (df['temperature'] - 32) * 5 / 9

In [6]:
# Text description of each clinical feature, keyed by its column name.
# Passed to `aug_df` together with `features_to_aug`.
feature_to_name_map = dict(
    temperature_c="body temperature in degrees Celsius",
    heartrate="heart rate in beats per minute",
    resprate="respiratory rate in breaths per minute",
    o2sat="peripheral oxygen saturation (%)",
    sbp="systolic blood pressure (mmHg)",
    dbp="diastolic blood pressure (mmHg)",
)

# Columns to augment: exactly the keys of the map above, in the same order.
features_to_aug = list(feature_to_name_map)

In [7]:
# Run the augmentation once per `report_format` setting and write one CSV each.
for rf in (True, False):
    report_format = rf

    # Reload the spreadsheet on every pass so each run starts from the raw data.
    df = pd.read_csv('./spreadsheets/reflacx_clinical.csv')
    df['temperature_c'] = (df['temperature'] - 32) * 5 / 9

    df = aug_df(
        MIMIC_EYE_PATH,
        REFLACX_LESION_LABEL_COLS,
        features_to_aug,
        feature_to_name_map,
        df,
        generator,
        progress=[1, 25],
        report_format=report_format,
    )

    # Convert the augmented Celsius values back to Fahrenheit. This stays
    # element-wise because entries may be None, which must pass through as None.
    df["aug_temperature"] = df["aug_temperature_c"].apply(
        lambda c: None if c is None else (c * 1.8) + 32
    )

    df.to_csv(
        './spreadsheets/llama2_aug_report.csv'
        if report_format
        else './spreadsheets/llama2_aug_text.csv'
    )

Resolving temperature_c


  0%|          | 0/799 [00:00<?, ?it/s]

100%|██████████| 799/799 [02:08<00:00,  6.23it/s]


Resolving heartrate


100%|██████████| 799/799 [02:19<00:00,  5.74it/s]


Resolving resprate


100%|██████████| 799/799 [05:41<00:00,  2.34it/s]  


Resolving o2sat


100%|██████████| 799/799 [05:26<00:00,  2.45it/s]  


Resolving sbp


100%|██████████| 799/799 [09:59<00:00,  1.33it/s]  


Resolving dbp


100%|██████████| 799/799 [21:29<00:00,  1.61s/it]  


Resolving temperature_c


100%|██████████| 799/799 [03:19<00:00,  4.00it/s]


Resolving heartrate


100%|██████████| 799/799 [03:20<00:00,  3.99it/s]


Resolving resprate


100%|██████████| 799/799 [13:44<00:00,  1.03s/it]  


Resolving o2sat


100%|██████████| 799/799 [20:17<00:00,  1.52s/it]  


Resolving sbp


100%|██████████| 799/799 [03:18<00:00,  4.03it/s]


Resolving dbp


100%|██████████| 799/799 [03:20<00:00,  3.99it/s]


In [8]:
# df['temperature_c'] = df['temperature'].apply(lambda f :(f-32) * 5/9 )
# df = aug_df(MIMIC_EYE_PATH, REFLACX_LESION_LABEL_COLS, features_to_aug, feature_to_name_map, df, generator, progress=[1, 5, 25, 50], report_format=report_format)
# df["aug_temperature"] = df["aug_temperature_c"].apply(lambda c: (c*1.8)+32)

In [9]:
# df["aug_temperature"] = df["aug_temperature_c"].apply(lambda c: (c*1.8)+32)
# if report_format:
#     df.to_csv('./spreadsheets/llama2_aug_report.csv')
# else:
#     df.to_csv('./spreadsheets/llama2_aug_text.csv')

In [10]:
# # df = aug_df(MIMIC_EYE_PATH, REFLACX_LESION_LABEL_COLS, features_to_aug, feature_to_name_map, df, generator, progress=[1, 5, 25, 50], report_format=report_format)
# aug_feature_range = {f: (df[f].min(), df[f].max()) for f in features_to_aug}

# for f in features_to_aug:
#     df[f"aug_{f}"] = None

# for f in features_to_aug:
#     print(f"Resolving {f}")
#     # aug the instance one by one
#     for idx, data in tqdm(df.iterrows(), total=df.shape[0]):
#         prompt = get_prompt_for_mask(
#             MIMIC_EYE_PATH,
#             data,
#             REFLACX_LESION_LABEL_COLS,
#             feature_to_name_map,
#             f,
#             report_format=report_format,
#         )


#         v = get_generated_value(
#             mask_filler, prompt, aug_feature_range[f], top_k=100,
#         )
#         if v is None:
#             print(    
#                 f"Couldn't find value for [{idx}] prompt: {prompt}"
#             )

            
#         df.at[idx, f"aug_{f}"] = v

In [11]:
# df["aug_temperature"] = df["aug_temperature_c"].apply(lambda c: (c*1.8)+32)
# if report_format:
#     df.to_csv('./spreadsheets/bcb_aug_report.csv')
# else:
#     df.to_csv('./spreadsheets/bcb_aug_text.csv')