In [1]:
import os
import pandas as pd

from tqdm import tqdm
from transformers import pipeline, set_seed
from transformers import BioGptTokenizer, BioGptForCausalLM
from aug.gpt import *

import warnings
warnings.filterwarnings("ignore")

# Filesystem location of the MIMIC-Eye data (going by the name); it is passed
# as the first argument to `aug_df` further down.
# The value can be overridden with the MIMIC_EYE_PATH environment variable so
# the notebook is not tied to one machine's drive layout. The original path is
# kept as the fallback, so existing setups behave exactly as before.
MIMIC_EYE_PATH = os.environ.get("MIMIC_EYE_PATH", "F:\\mimic-eye")

# REFLACX lesion label columns that are currently enabled. Only these five
# names are in the list; it is passed to `aug_df` further down.
REFLACX_LESION_LABEL_COLS = [
    "Pulmonary edema",
    "Enlarged cardiac silhouette",
    "Consolidation",
    "Atelectasis",
    "Pleural abnormality",
]

# Labels that are switched off, kept here for reference in their original
# grouping. Add a name to the list above to enable it.
#   group 1: "Fibrosis", "Quality issue", "Wide mediastinum", "Fracture",
#            "Airway wall thickening"
#   group 2: "Hiatal hernia", "Acute fracture", "Interstitial lung disease",
#            "Enlarged hilum", "Abnormal mediastinal contour",
#            "High lung volume / emphysema", "Pneumothorax",
#            "Lung nodule or mass", "Groundglass opacity"
#   trailing: "Support devices"


# CheXpert label column names. Every column is a finding name followed by the
# shared "_chexpert" suffix, so the list is built from the bare finding names.
CHEXPERT_LABEL_COLS = [
    f"{finding}_chexpert"
    for finding in (
        "Atelectasis",
        "Cardiomegaly",
        "Consolidation",
        "Edema",
        "Enlarged Cardiomediastinum",
        "Fracture",
        "Lung Lesion",
        "Lung Opacity",
        "No Finding",
        "Pleural Effusion",
        "Pleural Other",
        "Pneumonia",
        "Pneumothorax",
        "Support Devices",
    )
]


In [2]:
from transformers import AutoTokenizer, AutoModel, DistilBertForMaskedLM, BertForMaskedLM, LlamaForCausalLM, LlamaTokenizer, LlamaForCausalLM
import torch
from secret import *
# Text-generation pipeline used for every augmentation call in this notebook.
# The checkpoint is loaded in bfloat16 and `device_map="auto"` lets the
# library decide device placement.
# NOTE(review): pad and EOS are both pinned to token id 2 — presumably the
# checkpoint's end-of-sequence token; confirm against the tokenizer config.
generator = pipeline(
    task="text-generation",
    model="mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    pad_token_id=2,
    eos_token_id=2,
)

# Seed once after the model is loaded. Sampling in later cells continues from
# this RNG state, so results depend on the order in which cells are run.
set_seed(0)
# tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", token=HUGGING_FACE_TOKEN)
# model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", token=HUGGING_FACE_TOKEN) #, device_map='auto', torch_dtype=torch.float16)
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT") 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# model_id="meta-llama/Llama-2-7b-hf"
    
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# import torch
# model =AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map='auto', torch_dtype=torch.float16)

In [4]:
# Quick smoke test of the pipeline: sample one short completion for a simple
# clinical prompt. The call is the cell's last expression so the result is
# displayed as the cell output.
generator(
    "The average blood pressure for human is ",
    max_new_tokens=10,
    num_return_sequences=1,
    do_sample=True,
)

[{'generated_text': 'The average blood pressure for human is 100-140mm Hg'}]

In [5]:
# Description attached to each clinical feature column. Both objects defined
# in this cell are passed to `aug_df` below.
feature_to_name_map = {
    "temperature_c": "body temperature in degrees Celsius",
    "heartrate": "heart rate in beats per minute",
    "resprate": "respiratory rate in breaths per minute",
    "o2sat": "peripheral oxygen saturation (%)",
    "sbp": "systolic blood pressure (mmHg)",
    "dbp": "diastolic blood pressure (mmHg)",
}

# Columns to augment, in processing order. Dicts preserve insertion order, so
# this is exactly the key order of the map above.
features_to_aug = list(feature_to_name_map)

In [6]:
# Run the augmentation twice — first with report_format=True, then with
# report_format=False — and write each result to its own spreadsheet.
# NOTE(review): the seed is set only once, when the model is loaded, so the
# second pass continues from whatever RNG state the first pass left behind.
for report_format, out_csv in (
    (True, './spreadsheets/mistral_aug_report.csv'),
    (False, './spreadsheets/mistral_aug_text.csv'),
):
    # Re-read the source table on every pass so each run starts from the
    # unmodified clinical data.
    df = pd.read_csv('./spreadsheets/reflacx_clinical.csv')

    # Derive a Celsius column from `temperature` (Fahrenheit -> Celsius).
    df['temperature_c'] = df['temperature'].apply(
        lambda fahrenheit: (fahrenheit - 32) * 5 / 9
    )

    # `aug_df` comes from `aug.gpt`. The returned frame is expected to carry an
    # `aug_temperature_c` column, which is read just below.
    df = aug_df(
        MIMIC_EYE_PATH,
        REFLACX_LESION_LABEL_COLS,
        features_to_aug,
        feature_to_name_map,
        df,
        generator,
        progress=[1, 5, 25, 50],
        report_format=report_format,
    )

    # Convert the augmented Celsius values back to Fahrenheit; None is passed
    # through unchanged.
    df["aug_temperature"] = df["aug_temperature_c"].apply(
        lambda celsius: None if celsius is None else (celsius * 1.8) + 32
    )

    df.to_csv(out_csv)

Resolving temperature_c


100%|██████████| 799/799 [03:20<00:00,  3.99it/s]


Resolving heartrate


100%|██████████| 799/799 [02:48<00:00,  4.73it/s]


Resolving resprate


100%|██████████| 799/799 [03:41<00:00,  3.61it/s]


Resolving o2sat


100%|██████████| 799/799 [03:43<00:00,  3.58it/s]


Resolving sbp


 23%|██▎       | 186/799 [01:00<03:42,  2.76it/s]