<a href="https://colab.research.google.com/github/DataBytes-Organisation/Fine-Tuning-LLMs-for-Enterprise-Applications/blob/ed_branch/Ed_LLM_FineTuning_Chatbot_UI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Personalized Healthcare QA System (Chatbot) - Chatbot UI
#### Ed:215279167
---


---
## 1. Import libraries and model for Chatbot UI

In [None]:
#!pip install datasets requests bitsandbytes accelerate peft trl sentencepiece wandb transformers evaluate rouge_score bert-score gradio langchain-huggingface
!pip install requests bitsandbytes wandb transformers gradio langchain-huggingface

### IMPORTANT: Restart the Colab runtime after the pip install completes, before running the cells below.

## 2. Implement base model and imports
The first step is to import all necessary libraries and load the base model, then validate that it can generate a response from a prompt.

In [None]:
from google.colab import userdata
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
    pipeline
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, TaskType
import torch
import wandb
import pandas as pd
import os
import random
import re

# for chatbot UI
import gradio as gr

In [None]:
# Base checkpoint and Hugging Face Hub identifiers
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
project_name = "ed_medical"
hf_username = "digitalblue"

# PEFT adapters saved to the Hugging Face Hub.
# Every repo id has the form "<hf_username>/<project_name>-<run timestamp>".
_adapter_prefix = f"{hf_username}/{project_name}-"
MODEL_A = _adapter_prefix + "2025-04-03_09.11.29"  # trained on 200 rows
MODEL_B = _adapter_prefix + "2025-04-05_10.59.54"  # trained on 800 rows
MODEL_C = _adapter_prefix + "2025-04-05_11.36.32"  # trained on 1600 rows
MODEL_D = _adapter_prefix + "2025-04-06_10.40.29"  # trained on 1600 rows with NEFTune
MODEL_E = _adapter_prefix + "2025-04-15_12.43.48"  # trained on 8000 rows of pubmedqa only, with NEFTune
MODEL_F = _adapter_prefix + "2025-05-05_23.51.25"  # trained on 9000 rows, lower learning rate, 3 epochs
MODEL_G = _adapter_prefix + "2025-05-08_01.23.37"  # trained on 9000 rows, NEFTune disabled
MODEL_H = _adapter_prefix + "2025-05-11_04.05.52"  # trained on 900 rows, with context, 3 epochs
MODEL_I = _adapter_prefix + "2025-05-12_02.31.47"  # trained on 900 rows of MedDialog

In [None]:
# Log into Hugging Face; the token is read from the Colab secret named HF_TOKEN
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

# Log into Weights & Biases; the key is read from the Colab secret named WANDB_API_KEY
wandb_api_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_api_key
wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = project_name
# Previously written as `"checkpoint" if True else "end"`, which always evaluates to "checkpoint"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"] = "gradients"

In [None]:
# quantisation config to use less memory when loading model
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Load the base checkpoint with the quantisation settings defined above;
# device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# Initialise the tokenizer that matches the base model (no pipeline is created here)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token as the padding token

In [None]:
# Attach the MODEL_I adapter to the base model.
# Swap MODEL_I for another MODEL_* id to serve a different adapter.
model_i = PeftModel.from_pretrained(model=model, model_id=MODEL_I)

In [None]:
# System message prepended to every conversation sent to the model
system_prompt = (
    "You are a helpful personalised medical assistant. "
    "In your response do not include any personal names and ensure it is detailed. "
    "Do not say I have gone through your report or query. "
    "Indicate that you understand the questions with medical information."
)

def get_model_response(model_x, prompt, chat_history):
  """Generate the assistant's reply to `prompt`, given the earlier conversation.

  Args:
    model_x: Causal language model exposing `.generate()` and `.device`
      (for example a PEFT-wrapped model).
    prompt: The latest user message.
    chat_history: Earlier turns as a list of dicts with "role" and "content"
      keys; may be empty or None.

  Returns:
    The newly generated answer as a stripped string, without the prompt text
    and without special tokens.
  """
  # Conversation = system prompt + prior turns + the new user message.
  # Only "role" and "content" are forwarded, so any extra per-message keys in
  # the history never reach the chat template.
  messages = [{"role": "system", "content": system_prompt}]
  messages.extend(
    {"role": turn["role"], "content": turn["content"]}
    for turn in (chat_history or [])
  )
  messages.append({"role": "user", "content": prompt})

  # return_dict=True also returns the attention mask, which is passed to
  # generate() below. Tensors go to the model's own device rather than a
  # hard-coded "cuda".
  inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
  ).to(model_x.device)

  outputs = model_x.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,  # explicit, so the temperature setting is actually applied
    temperature=0.1,
    pad_token_id=tokenizer.eos_token_id,
  )

  # generate() returns the prompt followed by the continuation. Decoding only
  # the continuation avoids parsing on the "[/INST]" marker, which would break
  # if that text ever appeared inside the answer.
  prompt_length = inputs["input_ids"].shape[-1]
  answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
  return answer.strip()

In [None]:
# Smoke test: a single question with no prior chat history
sample_question = "Hi, does utility of preliminary bronchoalveolar lavage result in suspected ventilator-associated pneumonia?"
get_model_response(model_i, sample_question, chat_history=[])

In [None]:

def respond(message, chat_history):
  """Chat callback: return the model's reply to `message`.

  Args:
    message: The latest user message from the chat box.
    chat_history: The prior turns supplied by the chat UI.

  Returns:
    The reply text. If generation fails, the placeholder string
    "Error occurred" is returned instead. Both paths return a single string,
    so the caller always receives the same shape.
  """
  try:
    print(f"chat_history={chat_history}")
    print(f"message={message}")

    # Get model output
    model_output = get_model_response(model_i, message, chat_history)
    print(f"model_output = {model_output}")
    print("--------")

    return model_output
  except Exception as e:
    # Report the failure in the cell output and hand the UI a plain string.
    # The earlier version returned a (text, history) tuple here, which did not
    # match the string returned on success.
    print(f"Error in respond(): {str(e)}")
    return "Error occurred"

In [None]:
# JavaScript handed to Gradio through `js=`: if the page URL does not already
# carry `__theme=dark`, set that query parameter and reload the page.
force_dark_mode = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Chat interface wired to respond(); history uses the "messages" format
chat_interface_options = {
    "fn": respond,
    "type": "messages",
    "flagging_mode": "never",
    "js": force_dark_mode,
}
chatbot_ui = gr.ChatInterface(**chat_interface_options)

chatbot_ui.launch(debug=True)

In [None]:
# clear gpu memory (skipped when no CUDA device is available)
if torch.cuda.is_available():
  torch.cuda.empty_cache()
  # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats() is its replacement
  torch.cuda.reset_peak_memory_stats()