<a href="https://colab.research.google.com/github/BS-Atlas/BS-MedX-MedChat/blob/main/BS%7CMedX_MedChat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade transformers trl datasets
!pip install --upgrade unsloth




In [None]:
from google.colab import userdata

# Read the Hugging Face access token from Colab's secret store.
HF_TOKEN = userdata.get('HF_TOKEN')

# Never print the token itself: cell outputs are saved with the notebook and
# would expose the secret to anyone who can read the file.
print("HF_TOKEN loaded:", bool(HF_TOKEN))

[output removed: it exposed a Hugging Face access token; that token should be revoked and regenerated]


In [None]:
import torch
from unsloth import FastVisionModel
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from transformers import TextStreamer

In [None]:
# Fetch the base vision-language checkpoint and its tokenizer in one call.
# 4-bit loading is requested and gradient checkpointing is set to "unsloth".
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    use_gradient_checkpointing="unsloth",
    load_in_4bit=True,
)


==((====))==  Unsloth 2024.11.11: Fast Mllama vision patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Wrap the loaded model for parameter-efficient fine-tuning.
model = FastVisionModel.get_peft_model(
    model,
    # Parts of the network selected for fine-tuning (all three enabled).
    finetune_vision_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    # Adapter settings.
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Fixed value passed as random_state.
    random_state=3407,
)

In [None]:
# Load the "train" split of the radiology dataset; no other split is
# requested here.
dataset = load_dataset("unsloth/Radiology_mini", split="train")

# Prompt text placed in the user turn of every training conversation.
instruction = (
    "You are an expert radiographer.\n"
    "Carefully examine the provided medical image.\n"
    "Describe your observations accurately and comprehensively, including any abnormalities or significant features.\n"
    "Based on your expertise, suggest the next steps or potential solutions, if applicable.\n"
    "Ensure your response is clear, concise, and professional."
)

Generating train split:   0%|          | 0/1978 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/327 [00:00<?, ? examples/s]

In [None]:
def convert_to_conversation(sample, prompt=None):
    """Wrap one dataset record in a two-turn chat conversation.

    Args:
        sample: Mapping that provides an "image" entry and a "caption" entry.
        prompt: Text for the user turn. When omitted, the notebook-level
            ``instruction`` string is read at call time, which is what the
            function always did before this parameter existed. Passing it
            explicitly avoids depending on whichever value the global holds
            when the cell happens to be run.

    Returns:
        A dict with a single "messages" key. Its value is a list holding a
        user turn (the prompt text followed by the image) and an assistant
        turn (the caption text).
    """
    if prompt is None:
        # Fall back to the global so existing one-argument calls keep working.
        prompt = instruction

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["caption"]}],
    }
    return {"messages": [user_turn, assistant_turn]}

In [None]:
# Build the in-memory training set: one conversation dict per dataset record.
converted_dataset = list(map(convert_to_conversation, dataset))

In [None]:
# Baseline check: switch the model to inference mode and generate a
# description for one sample image before the training cell is run.
FastVisionModel.for_inference(model)

instruction = (
    "You are an expert radiographer.\n"
    "Carefully examine the provided medical image.\n"
    "Describe your observations accurately and comprehensively, including any abnormalities or significant features.\n"
    "Based on your expertise, suggest the next steps or potential solutions, if applicable.\n"
    "Ensure your response is clear, concise, and professional."
)
image = dataset[5]["image"]

# A single user turn: an image placeholder followed by the prompt text.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

print("\nBT-Answer:\n")

# The streamer prints text as it is produced; the return value of generate()
# is therefore discarded.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    use_cache=True,
    max_new_tokens=125,
    min_p=0.1,
    temperature=1.5,
)


BT-Answer:

The provided medical image appears to be an angiogram, showcasing the subclavian artery and its branching into the subclavian and brachial arteries. A notable abnormality is present.

*   A stenotic or narrowed section of the left internal mammary artery (LIMA) is evident. This could be indicative of atherosclerosis, which is a process involving the buildup of plaque in the artery walls.
*   In addition to this, there may be an indication of atherosclerotic plaques or calcification on the vessel walls in this region.

Based on these observations, the next steps in this


In [None]:
# Switch the model to training mode, then assemble the supervised
# fine-tuning trainer over the converted conversations.
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=converted_dataset,
    data_collator=UnslothVisionDataCollator(model, tokenizer),
    args=SFTConfig(
        # Run length and batching.
        max_steps=30,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimiser and schedule.
        optim="adamw_8bit",
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        # Exactly one of fp16 / bf16 is enabled, chosen by is_bf16_supported().
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        # Logging, seeding and output location.
        logging_steps=1,
        seed=3407,
        output_dir="outputs",
        report_to="none",
        # Dataset handling: the trainer's own dataset preparation is skipped
        # and the extra columns are kept for the data collator.
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:

def _reserved_gb():
    """Return torch's peak reserved CUDA memory so far, in GB rounded to 3 places."""
    return round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)


# Snapshot of the device and of the memory already reserved before training.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _reserved_gb()
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

# Run the fine-tuning loop; the returned object carries the run metrics.
trainer_stats = trainer.train()

# Compare the peak reservation after training with the earlier snapshot.
used_memory = _reserved_gb()
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
for line in (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(line)

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
8.48 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,978 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
1,3.3712
2,3.351
3,3.2597
4,3.1507
5,2.66
6,2.3239
7,1.8898
8,1.4862
9,1.1745
10,0.9201


223.7693 seconds used for training.
3.73 minutes used for training.
Peak reserved memory = 10.16 GB.
Peak reserved memory for training = 1.68 GB.
Peak reserved memory % of max memory = 25.68 %.
Peak reserved memory for training % of max memory = 4.246 %.


In [None]:




# Repeat the single-image generation now that training has run.
print("\nAfter training:\n")
FastVisionModel.for_inference(model)

# NOTE: this prompt is not identical to the one used in the earlier cells;
# its second line carries an extra clause.
instruction = (
    "You are an expert radiographer.\n"
    "Carefully examine the provided medical image, but more precised please.\n"
    "Describe your observations accurately and comprehensively, including any abnormalities or significant features.\n"
    "Based on your expertise, suggest the next steps or potential solutions, if applicable.\n"
    "Ensure your response is clear, concise, and professional."
)
image = dataset[5]["image"]

# A single user turn: an image placeholder followed by the prompt text.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, return_tensors="pt", add_special_tokens=False).to("cuda")

# Text is streamed to the cell output while it is generated.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    use_cache=True,
    max_new_tokens=600,
    min_p=0.1,
    temperature=1.5,
)



After training:

Left subclavian artery stenosis (arrow). LIMA: left internal mammary artery.<|eot_id|>


In [None]:
# Step 6: write the results to local folders.
# The model and the tokenizer are both saved into "lora_model".
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model")

# Also write a merged copy of the model into "BS_MedX_MedChat".
model.save_pretrained_merged("BS_MedX_MedChat", tokenizer)


Unsloth: Merging QLoRA weights directly to the 16bit version of unsloth/llama-3.2-11b-vision-instruct.


Unsloth: Merging weights into 16bit: 100%|██████████| 5/5 [01:12<00:00, 14.59s/it]


In [None]:
# Upload the merged model and tokenizer to the Hub repository
# "BSAtlas/BS_MedX_MedChat" using the "merged_16bit" save method,
# authenticating with HF_TOKEN.
model.push_to_hub_merged(
    "BSAtlas/BS_MedX_MedChat",
    token=HF_TOKEN,
    save_method="merged_16bit",
    tokenizer=tokenizer,
)

Unsloth: Merging QLoRA weights directly to the 16bit version of unsloth/llama-3.2-11b-vision-instruct.


Unsloth: Merging weights into 16bit: 100%|██████████| 5/5 [06:46<00:00, 81.38s/it]


In [None]:
# Inspect what the earlier cells wrote to disk: change into /content and list
# it, then list each of the three output folders in turn.
%cd /content/
print("list of content folder")
%ls
# Folder written by the save_pretrained("lora_model") calls.
print("list of lora_model folder")
%ls /content/lora_model/
# Folder written by save_pretrained_merged("BS_MedX_MedChat", ...).
print("list of BS_MedX_MedChat folder")
%ls /content/BS_MedX_MedChat/
# Folder named by output_dir="outputs" in the training configuration.
print("list of outputs folder")
%ls /content/outputs/

/content
list of content folder
[0m[01;34mBSAtlas[0m/  [01;34mBS_MedX_MedChat[0m/  [01;34mlora_model[0m/  [01;34moutputs[0m/  [01;34msample_data[0m/  [01;34munsloth_compiled_cache[0m/
list of lora_model folder
adapter_config.json        chat_template.json        README.md                tokenizer_config.json
adapter_model.safetensors  preprocessor_config.json  special_tokens_map.json  tokenizer.json
list of BS_MedX_MedChat folder
chat_template.json                model-00002-of-00005.safetensors  model.safetensors.index.json
config.json                       model-00003-of-00005.safetensors  preprocessor_config.json
generation_config.json            model-00004-of-00005.safetensors
model-00001-of-00005.safetensors  model-00005-of-00005.safetensors
list of outputs folder
[0m[01;34mcheckpoint-30[0m/


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch
from PIL import Image
import requests

class Model:
    """Inference wrapper around the published BS_MedX_MedChat checkpoint.

    The checkpoint is an image+text (Mllama) model, so it is loaded with the
    conditional-generation class and a processor that handles both images and
    text, rather than with the text-only causal-LM / tokenizer classes.
    """

    MODEL_ID = "BSAtlas/BS_MedX_MedChat"

    def __init__(self, **kwargs):
        """Load the model weights and the matching processor.

        Args:
            **kwargs: Optional settings. ``model_id`` overrides the repository
                (or local path) to load from; any other key is ignored, as
                all keys were before.
        """
        # Imported here because the cell's import line does not bring these
        # two names into scope.
        from transformers import AutoProcessor, MllamaForConditionalGeneration

        model_id = kwargs.get("model_id", self.MODEL_ID)
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # bfloat16 halves the memory needed compared with the float32 default.
        self.model = MllamaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
        ).to(device)
        self.model.eval()

        # Kept under the name ``tokenizer`` for backward compatibility; the
        # object is a processor that accepts an image together with text.
        self.tokenizer = AutoProcessor.from_pretrained(model_id)

    def predict(self, request):
        """Generate a text answer for one image and one instruction.

        Args:
            request: Mapping with two string entries: ``image_url`` (where to
                download the image from) and ``instruction`` (the prompt).

        Returns:
            ``{'prediction': <generated text>}`` where the text contains only
            the newly generated tokens, with special tokens removed.

        Raises:
            ValueError: If ``image_url`` or ``instruction`` is missing, empty,
                or not a string.
            requests.RequestException: If the image download fails or the
                server answers with an error status.
        """
        image_url = request.get('image_url')
        instruction = request.get('instruction')
        if not isinstance(image_url, str) or not image_url:
            raise ValueError("request['image_url'] must be a non-empty string")
        if not isinstance(instruction, str) or not instruction:
            raise ValueError("request['instruction'] must be a non-empty string")

        from io import BytesIO

        # NOTE(review): image_url comes from the caller; if requests can be
        # sent by untrusted users, restrict the allowed hosts before fetching.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": instruction}
            ]}
        ]
        input_text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.tokenizer(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.model.device)  # same device as the weights

        # Still stream to stdout as before, but keep the returned ids so the
        # answer can also be handed back to the caller.
        text_streamer = TextStreamer(self.tokenizer, skip_prompt=True)
        with torch.inference_mode():
            output_ids = self.model.generate(
                **inputs,
                streamer=text_streamer,
                max_new_tokens=600,
                use_cache=True,
                temperature=1.5,
                min_p=0.1,
            )

        # generate() returns prompt + continuation; drop the prompt tokens.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_text = self.tokenizer.decode(
            output_ids[0][prompt_length:],
            skip_special_tokens=True,
        )
        return {'prediction': generated_text}