# Dependency installations

To start, we install necessary libraries including `unsloth`, `xformers`, and other dependencies.

In [3]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

# Load model

We load a pre-trained language model using the unsloth library. We choose 4-bit quantization to optimize memory usage.

In [4]:
from unsloth import FastLanguageModel
import torch

# Settings reused by the loading, training and reloading cells.
max_seq_length = 2048  # Any length works; RoPE scaling is supported internally.
dtype = None           # None = auto-detect (float16 on Tesla T4/V100, bfloat16 on Ampere+).
load_in_4bit = True    # 4-bit quantization reduces memory usage; may be set to False.

# Pre-quantized 4-bit checkpoints (faster download, no OOMs).
# More models are listed at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",  # New Mistral v3
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",  # Llama-3, 15 trillion tokens
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",  # Phi-3
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # Gemma
]

# Download the base checkpoint and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # required for gated models such as meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

# Add LoRA Adapters

We add LoRA adapters to reduce the number of parameters that need to be updated during training.

In [5]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank; any value > 0 (suggested: 8, 16, 32, 64, 128)
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized path
    bias="none",     # any value is supported, but "none" is the optimized path
    # "unsloth" checkpointing uses less VRAM; True or "unsloth" for very long context
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but disabled here
    loftq_config=None,  # LoftQ is available but disabled here
)

Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Data Preparation

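We render each question/answer pair from the dataset into a single prompt string (terminated by the tokenizer's end-of-sequence token) and store it in a `text` column, which is the field the trainer reads.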

In [6]:
# Prompt template with exactly two positional `{}` slots:
# the first receives the question, the second the answer.
# Note: str.format() silently ignores surplus positional arguments, so only two should be passed.
alpaca_prompt = """The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
{}

### Answer:
{}"""

# End-of-sequence token from the tokenizer; it is appended to every training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs into training prompts.

    Args:
        examples: Batch mapping with parallel "question" and "answer" sequences.

    Returns:
        A dict with a single "text" key holding one rendered prompt per pair,
        each terminated by EOS_TOKEN.
    """
    questions = examples["question"]
    answers = examples["answer"]
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    rendered = [
        alpaca_prompt.format(question, answer) + EOS_TOKEN
        for question, answer in zip(questions, answers)
    ]
    return {"text": rendered}

from datasets import load_dataset

# Load the training split, then add the rendered "text" column batch by batch.
raw_dataset = load_dataset("tkxwaweru/medical_QnA", split="train")
dataset = raw_dataset.map(formatting_prompts_func, batched=True)

Downloading data:   0%|          | 0.00/22.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We run only 60 training steps to keep the demo fast.

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Train in bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can make training faster for short sequences
    args=training_args,
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/16407 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
#@title Show current memory stats
# Baseline captured before training; the post-training cell subtracts it from the peak.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)  # bytes -> GB
max_memory = round(gpu_stats.total_memory / 1024**3, 3)                  # bytes -> GB
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.594 GB of memory reserved.


# Training the Model

We train the model and print memory and time statistics post-training.

In [9]:
# Run fine-tuning; the returned stats (metrics['train_runtime']) are reported by the next stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,407 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.6073
2,1.5377
3,1.5631
4,1.2797
5,1.3029
6,1.6349
7,1.3838
8,1.2959
9,1.2978
10,1.1165


In [10]:
#@title Show final memory and time stats
# Peak reserved GPU memory after training, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
# Portion of that peak attributable to training (peak minus the pre-training baseline).
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

676.4683 seconds used for training.
11.27 minutes used for training.
Peak reserved memory = 9.654 GB.
Peak reserved memory for training = 4.06 GB.
Peak reserved memory % of max memory = 65.46 %.
Peak reserved memory for training % of max memory = 27.529 %.


<a name="Inference"></a>
### Inference
We run the model to generate responses for given questions.

In [11]:
# alpaca_prompt is defined in the data-preparation cell above.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# The template has exactly two slots (question, answer). A third positional argument
# would be silently ignored by str.format(), so only two are passed here.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # question
        "1, 1, 2, 3, 5, 8", # answer prefix - pre-fills the answer slot; the model continues from here
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: displays the decoded sequences (prompt + continuation).
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.\n\n### Question:\nContinue the fibonnaci sequence.\n\n### Answer:\n1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, ']

# Continuous Inference

We use a TextStreamer for continuous token-by-token inference.

In [12]:
from transformers import TextStreamer

# alpaca_prompt is defined in the data-preparation cell above.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# The template has exactly two slots (question, answer). A third positional argument
# would be silently ignored by str.format(), so only two are passed here.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # question
        "1, 1, 2, 3, 5, 8", # answer prefix - pre-fills the answer slot; the model continues from here
    )
], return_tensors = "pt").to("cuda")

# The streamer prints tokens as they are generated, so the return value is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
Continue the fibonnaci sequence.

### Answer:
1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465, 14930352, 24157817, 39088169, 


# Save and Load Finetuned Models

In [13]:
# Save the model and tokenizer to the local "lora_model" directory.
# The cell below can reload them from that same directory.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

Change `if False:` to `if True:` in the cell below to reload the LoRA adapters we just saved before running inference.

In [14]:
# Flip this condition to True to reload the model saved in "lora_model"
# instead of reusing the in-memory model from training.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # directory written by the local save cell
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt must already be defined (data-preparation cell above).

# The template has exactly two slots (question, answer). A third positional argument
# would be silently ignored by str.format(), so only two are passed here.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # question
        "", # answer - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: displays the decoded sequences (prompt + continuation).
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|>The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.\n\n### Question:\nWhat is a famous tall tower in Paris?\n\n### Answer:\nThe Eiffel Tower is a wrought iron tower located on the Champ de Mars in Paris, named after the engineer Gustave Eiffel, who supervised its design and construction. The tower is 324 metres (1,063 ft) tall, about the same height as the Statue of Liberty, located in New York']

In [15]:
# Optional export, left disabled (commented out) in this notebook:
# merge to 16-bit and save the model and tokenizer under "model_16bit".
#model.save_pretrained_merged("model_16bit", save_method="merged_16bit")
#tokenizer.save_pretrained("model_16bit")


# Testing model with prompts

In [16]:
def generate_response(question, input_text=""):
    """Generate a reply for a question and return the full decoded text.

    Args:
        question: Text placed in the template's question slot.
        input_text: Optional text placed in the template's answer slot, i.e. a
            prefix the model continues from. Defaults to "" (empty answer slot).

    Returns:
        The first decoded sequence with special tokens removed. As the recorded
        outputs show, this contains the prompt followed by the generated text.
    """
    # alpaca_prompt has exactly two placeholders; a third format() argument
    # would be silently ignored, so only two are passed.
    prompt = alpaca_prompt.format(question, input_text)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Sample health questions used to spot-check the fine-tuned model
questions = [
    "What are the symptoms of diabetes?",
    "How can I treat a common cold?",
    "What are the side effects of taking aspirin?",
]

# Ask each question in turn and print it next to the model's reply
for question in questions:
    response = generate_response(question)
    print(f"Question: {question}", f"Response: {response}\n", sep="\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are the symptoms of diabetes?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
What are the symptoms of diabetes?

### Answer:
What are the signs and symptoms of Diabetes? The Human Phenotype Ontology provides the following list of signs and symptoms for Diabetes. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How can I treat a common cold?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
How can I treat a common cold?

### Answer:
To treat a common cold, there are several steps you can take to ease symptoms and feel better. These include:
    - Getting plenty of rest.
    - Drinking fluids, especially water, to prevent dehydration.
    - Taking over-the-counter medicines, such as decongestants, pain relievers, and cough suppress

Question: What are the side effects of taking aspirin?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
What are the side effects of taking aspirin?

### Answer:
These side effects may go away during treatment as your body adjusts to the medicine. However, check with your doctor if any of the following side effects cont

In [17]:
# Second batch of sample health questions for the fine-tuned model
questions = [
    "What are some signs of pregnancy?",
    "How can I treat a stomache ache?",
    "What are the side effects of taking antibiotics?",
]

# Ask each question in turn and print it next to the model's reply
for question in questions:
    response = generate_response(question)
    print(f"Question: {question}", f"Response: {response}\n", sep="\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are some signs of pregnancy?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
What are some signs of pregnancy?

### Answer:
Many women have no signs of pregnancy. Others may have some signs as soon as a week after conception. Signs of pregnancy can include: - Missed period. You may not have your period for a month or more after conception. But if you do have a period, it may be lighter or shorter than usual. -



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How can I treat a stomache ache?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
How can I treat a stomache ache?

### Answer:
What are the treatments for Stomach Ache? The treatment of stomach ache depends on its cause. If the cause is unknown, it is called functional abdominal pain. For functional abdominal pain, the following may help:  Avoiding eating or drinking before bed.  Taking antacids or acid blockers.  Taking

Question: What are the side effects of taking antibiotics?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
What are the side effects of taking antibiotics?

### Answer:
Antibiotics can cause side effects. They can also interact with other drugs. Antibiotics can cause side effects such as allergic reactions, diarrhea, and

In [18]:
# Third batch of sample health questions for the fine-tuned model
questions = [
    "What are the causes of high blood pressure?",
    "How can I improve my mental health?",
    "What are the benefits of regular exercise?",
]

# Ask each question in turn and print it next to the model's reply
for question in questions:
    response = generate_response(question)
    print(f"Question: {question}", f"Response: {response}\n", sep="\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What are the causes of high blood pressure?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
What are the causes of high blood pressure?

### Answer:
There are many causes of high blood pressure, and in some people, the cause is never found. The most common cause of high blood pressure is a lack of blood flow through the arteries. This is called atherosclerosis, or hardening of the arteries. It is caused by the buildup of plaque inside the arteries.



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: How can I improve my mental health?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
How can I improve my mental health?

### Answer:
There are many things you can do to improve your mental health.  The following are some suggestions.  - Get enough sleep.  - Eat healthy foods.  - Exercise.  - Take time to relax.  - Connect with others.  - Get help if you need it.

Question: What are the benefits of regular exercise?
Response: The following is a conversation with a medical chatbot. The medical chatbot provides clear and informative answers to health-related questions.

### Question:
What are the benefits of regular exercise?

### Answer:
Regular exercise has many benefits for your health and well-being. It can: 
    - reduce your risk of heart disease, stroke, type 2 diabetes, and some types of cancer
    - strengthen your bones and muscles
    - improve your

In [19]:
# Mount Google Drive at /content/drive; the next cell saves the model under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# Save the model and tokenizer files into the mounted Google Drive folder.
drive_model_dir = "/content/drive/MyDrive/medical_chatbot/model/lora_model"
model.save_pretrained(drive_model_dir)
tokenizer.save_pretrained(drive_model_dir)

('/content/drive/MyDrive/medical_chatbot/model/lora_model/tokenizer_config.json',
 '/content/drive/MyDrive/medical_chatbot/model/lora_model/special_tokens_map.json',
 '/content/drive/MyDrive/medical_chatbot/model/lora_model/tokenizer.json')

# Using Gradio for a simple interface to showcase the fine-tuned model

In [21]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.37.2-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.2 (from gradio)
  Downloading gradio_client-1.0.2-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.2/318.2 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [22]:
import gradio as gr

def generate_response(question, input_text=""):
    """Generate a reply for the Gradio UI and return only the answer text.

    This redefines the earlier notebook helper of the same name: it allows a
    longer generation (300 new tokens) and strips the prompt from the result.

    Args:
        question: Text placed in the template's question slot.
        input_text: Optional text placed in the template's answer slot, i.e. a
            prefix the model continues from. Defaults to "" (empty answer slot).

    Returns:
        Everything after the first "### Answer:\\n" marker in the decoded text,
        stripped of surrounding whitespace; the whole stripped text if the
        marker is absent.
    """
    # alpaca_prompt has exactly two placeholders; a third format() argument
    # would be silently ignored, so only two are passed.
    prompt = alpaca_prompt.format(question, input_text)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=300, use_cache=True)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Cut at the FIRST marker only. split(marker)[1] would drop the rest of the
    # answer whenever the model emits the marker again in its own output.
    answer_marker = "### Answer:\n"
    _, marker_found, answer = response.partition(answer_marker)
    return answer.strip() if marker_found else response.strip()

def clear_fields():
    """Return a pair of empty strings, used to blank the question and response boxes."""
    return ("", "")

# Build the Gradio UI: a question box, a response box, and Submit / Clear buttons
with gr.Blocks() as demo:
    gr.Markdown("# Medical Chatbot")
    gr.Markdown("Ask health-related questions and get clear and informative answers from the medical chatbot.")

    with gr.Row():
        question = gr.Textbox(
            label="Enter your question here...",
            lines=2,
            placeholder="Type your question...",
            interactive=True,
        )

    with gr.Row():
        # Ten lines so that longer answers are visible
        output = gr.Textbox(
            label="Response",
            lines=10,
            interactive=True,
        )

    with gr.Row():
        submit_btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")

    # Submit sends the question to the model; Clear blanks both text boxes
    submit_btn.click(fn=generate_response, inputs=question, outputs=output)
    clear_btn.click(fn=clear_fields, inputs=None, outputs=[question, output])

# Start the app; with share=True the recorded run reports a public gradio.live URL
demo.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://200b1bc78da5555de9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


