In [None]:
pip install datasets peft trl transformers pandas torch spacy nltk rouge_score bert_score sentence_transformers bitsandbytes accelerate

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Colle

In [None]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
import json
import os

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, PeftConfig

def create_prompt(instruction, input_text):
    """Format the instruction and input into a prompt"""
    if input_text:
        return f"{instruction}\n\n{input_text}"
    return instruction

import os
import json
import pandas as pd
from datasets import load_dataset

def load_and_format_dataset(file_path, train_split=0.8, output_dir="data"):
    """Prepare BioGPT-compatible dataset from plain text prompts."""
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(file_path)

    if "text" not in df.columns:
        raise ValueError("Dataset must contain a 'text' column.")

    formatted_data = [{"text": row["text"]} for _, row in df.iterrows()]

    # Split into train/validation
    train_size = int(len(formatted_data) * train_split)
    splits = {
        "train": formatted_data[:train_size],
        "validation": formatted_data[train_size:]
    }

    # Save as JSONL files
    for split, data in splits.items():
        with open(os.path.join(output_dir, f"{split}.jsonl"), "w") as f:
            for item in data:
                json.dump(item, f)
                f.write("\n")

    print(f"Saved {train_size} training and {len(formatted_data) - train_size} validation examples.")

    return load_dataset("json", data_files={
        "train": os.path.join(output_dir, "train.jsonl"),
        "validation": os.path.join(output_dir, "validation.jsonl")
    })


In [None]:
from peft import PeftModel, PeftModelForCausalLM, get_peft_model, prepare_model_for_kbit_training, LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

def configure_qlora_model(model_name="microsoft/biogpt"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

    model.config.pad_token_id = tokenizer.eos_token_id
    model = prepare_model_for_kbit_training(model)

    for name, module in model.named_modules():
      if "attn" in name or "proj" in name:
          print(name)


    lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],  # ✅ Matches BioGPT exactly
    bias="none",
    task_type="CAUSAL_LM"
)



    print("Applying PEFT adapters to the model...")
    peft_model = get_peft_model(model, lora_config)

    if not isinstance(peft_model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not a PEFT model instance!")
    else:
        print("[OK] Model wrapped with PEFT successfully.")

    peft_model.print_trainable_parameters()

    return peft_model, tokenizer


In [None]:
pip install -U sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
def setup_trainer(model, dataset, tokenizer, output_dir="biogpt_finetuned"):
    if not isinstance(model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not a PEFT-wrapped instance! Cannot continue with training.")

    print("Configuring TrainingArguments...")

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,  # Use >1 if memory allows
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        learning_rate=2e-5,
        fp16=True,
        save_strategy="epoch",
        #evaluation_strategy="epoch",  # ✅ FIXED: previously misspelled
        #load_best_model_at_end=True,
        #metric_for_best_model="eval_loss",
        logging_steps=10,
        save_total_limit=2,
        push_to_hub=False,
        gradient_checkpointing=True,
        optim="adamw_torch_fused",
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
    )

    from transformers import DataCollatorForLanguageModeling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    return Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator
    )

from transformers import AutoTokenizer
from peft import PeftModel, PeftModelForCausalLM


In [None]:
print("🔹 Loading and formatting dataset...")
dataset = load_and_format_dataset("bio_gpt_formatted.csv")

print("🔹 Configuring QLoRA BioGPT model...")
model, tokenizer = configure_qlora_model("microsoft/biogpt")

print("🔹 Tokenizing dataset...")
def tokenize(example):
    output = tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)
    output["labels"] = output["input_ids"].copy()
    return output

tokenized_dataset = dataset.map(tokenize, batched=True)

🔹 Loading and formatting dataset...
Saved 89530 training and 22383 validation examples.


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

🔹 Configuring QLoRA BioGPT model...
biogpt.layers.0.self_attn
biogpt.layers.0.self_attn.k_proj
biogpt.layers.0.self_attn.v_proj
biogpt.layers.0.self_attn.q_proj
biogpt.layers.0.self_attn.out_proj
biogpt.layers.0.self_attn_layer_norm
biogpt.layers.1.self_attn
biogpt.layers.1.self_attn.k_proj
biogpt.layers.1.self_attn.v_proj
biogpt.layers.1.self_attn.q_proj
biogpt.layers.1.self_attn.out_proj
biogpt.layers.1.self_attn_layer_norm
biogpt.layers.2.self_attn
biogpt.layers.2.self_attn.k_proj
biogpt.layers.2.self_attn.v_proj
biogpt.layers.2.self_attn.q_proj
biogpt.layers.2.self_attn.out_proj
biogpt.layers.2.self_attn_layer_norm
biogpt.layers.3.self_attn
biogpt.layers.3.self_attn.k_proj
biogpt.layers.3.self_attn.v_proj
biogpt.layers.3.self_attn.q_proj
biogpt.layers.3.self_attn.out_proj
biogpt.layers.3.self_attn_layer_norm
biogpt.layers.4.self_attn
biogpt.layers.4.self_attn.k_proj
biogpt.layers.4.self_attn.v_proj
biogpt.layers.4.self_attn.q_proj
biogpt.layers.4.self_attn.out_proj
biogpt.layers.4.

Map:   0%|          | 0/89530 [00:00<?, ? examples/s]

Map:   0%|          | 0/22383 [00:00<?, ? examples/s]

🔹 Setting up trainer...
Configuring TrainingArguments...


NameError: name 'Trainer' is not defined

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

print("🔹 Setting up trainer...")
trainer = setup_trainer(model, tokenized_dataset, tokenizer)

print("🔹 Starting training...")
trainer.train()

print("✅ Training complete. Saving model and tokenizer...")
model.save_pretrained(trainer.args.output_dir)
tokenizer.save_pretrained(trainer.args.output_dir)

🔹 Setting up trainer...
Configuring TrainingArguments...


  return Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🔹 Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maparnabharathi-suresh[0m ([33maparnabharathi-suresh-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,4.2834
20,4.2054
30,4.2437
40,4.3045
50,4.2516
60,4.2511
70,4.2055
80,4.1917
90,4.2147
100,4.2261


✅ Training complete. Saving model and tokenizer...


('biogpt_finetuned/tokenizer_config.json',
 'biogpt_finetuned/special_tokens_map.json',
 'biogpt_finetuned/vocab.json',
 'biogpt_finetuned/merges.txt',
 'biogpt_finetuned/added_tokens.json')

# Evaluation of Finetuned model

In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import torch
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
import spacy

# Load spaCy biomedical model (replace with clinical/bio model if available)
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def create_prompt(instruction, input_text):
    return f"{instruction}\n\n{input_text}" if input_text else instruction

def validate_answer(answer):
    return answer.strip().replace("\n", " ").replace("  ", " ")

class QADataset(Dataset):
    def __init__(self, questions, inputs, references, tokenizer, max_length=256):
        self.questions = questions
        self.inputs = inputs
        self.references = references
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        input_text = self.inputs[idx]
        prompt = create_prompt(question, input_text)
        encoding = self.tokenizer(
            prompt,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'prompt_length': encoding['input_ids'].shape[1],
            'question': question,
            'input_text': input_text,
            'reference': self.references[idx]
        }

def generate_responses(model, tokenizer, questions, inputs, references):
    generation_kwargs = {
        'max_new_tokens': 150,
        'do_sample': False,
        'temperature': 0.3,
        'repetition_penalty': 1.5,
        'no_repeat_ngram_size': 4,
        'eos_token_id': tokenizer.eos_token_id,
        'pad_token_id': tokenizer.pad_token_id
    }

    qa_dataset = QADataset(questions, inputs, references, tokenizer)
    dataloader = DataLoader(qa_dataset, batch_size=1, shuffle=False)
    generated_outputs = []

    for batch in dataloader:
        input_ids = batch['input_ids'].to("cuda")
        attention_mask = batch['attention_mask'].to("cuda")
        question = batch['question'][0]
        input_text = batch['input_text'][0]
        reference = batch['reference'][0]
        prompt_length = batch['prompt_length'][0]

        with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.float16):
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_kwargs
            )

        new_tokens = outputs[0, prompt_length:] if outputs.shape[1] > prompt_length else outputs[0]
        generated_answer = tokenizer.decode(new_tokens, skip_special_tokens=True)
        generated_outputs.append(validate_answer(generated_answer))

    return generated_outputs

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    return ' '.join(tokens)

def compute_bleu_score(generated, reference):
    gen_tokens = word_tokenize(generated)
    ref_tokens = word_tokenize(reference)
    smoothie = SmoothingFunction().method4
    return sentence_bleu([ref_tokens], gen_tokens, smoothing_function=smoothie)

def compute_hybrid_score(bert_f1, bleu, bert_weight=0.7):
    return bert_weight * bert_f1 + (1 - bert_weight) * bleu

def compute_metrics_per_query(generated_outputs, references, questions, nlp, embedder):
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    for i, (gen, ref, question) in enumerate(zip(generated_outputs, references, questions), 1):
        gen_norm = preprocess_text(gen)
        ref_norm = preprocess_text(ref)

        p, r, f1 = bert_score([gen_norm], [ref_norm], lang="en", model_type="roberta-large")
        bert_f1 = f1.item()
        rouge_l = rouge_scorer_obj.score(ref_norm, gen_norm)['rougeL'].fmeasure

        gen_entities = set(ent.text.lower() for ent in nlp(gen).ents)
        ref_entities = set(ent.text.lower() for ent in nlp(ref).ents)
        if ref_entities:
            precision = len(gen_entities & ref_entities) / len(gen_entities) if gen_entities else 0
            recall = len(gen_entities & ref_entities) / len(ref_entities)
            entity_f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        else:
            entity_f1 = 1.0 if not gen_entities else 0.0

        exact_match = 1.0 if gen_norm == ref_norm else 0.0

        gen_embedding = embedder.encode(gen, convert_to_tensor=True, device="cuda")
        ref_embedding = embedder.encode(ref, convert_to_tensor=True, device="cuda")
        fcs = util.cos_sim(gen_embedding, ref_embedding)[0][0].item()

        mcr = len(gen_entities & ref_entities) / len(ref_entities) if ref_entities else 1.0 if not gen_entities else 0.0
        bleu = compute_bleu_score(gen_norm, ref_norm)
        hybrid_score = compute_hybrid_score(bert_f1, bleu)

        print(f"\nSample {i}: {question}")
        print(f"Generated: {gen}")
        print(f"Reference: {ref}")
        print(f"BERT F1: {bert_f1:.4f}, ROUGE-L: {rouge_l:.4f}, FCS: {fcs:.4f}, MCR: {mcr:.4f}")
        print(f"Entity F1: {entity_f1:.4f}, Exact Match: {exact_match:.4f}, BLEU: {bleu:.4f}, Hybrid: {hybrid_score:.4f}")

# === MAIN EXECUTION ===
if __name__ == "__main__":
    from datasets import load_dataset
    import pandas as pd

    dataset = load_dataset("json", data_files={"validation": "/content/data/validation_Combined.jsonl"})
    df = pd.DataFrame(dataset["validation"])
    df = df.sample(n=5, random_state=42).reset_index(drop=True)
    questions = df["instruction"].tolist()
    inputs = df["input"].tolist()
    references = df["output"].tolist()


    questions, inputs, references = df["instruction"].tolist(), df["input"].tolist(), df["output"].tolist()

    tokenizer = AutoTokenizer.from_pretrained("biogpt_finetuned")
    #model = AutoModelForCausalLM.from_pretrained("biogpt_finetuned").to("cuda").eval()
    from peft import PeftModel

    base_model = AutoModelForCausalLM.from_pretrained("microsoft/BioGPT", device_map="auto").eval()
    model = PeftModel.from_pretrained(base_model, "biogpt_finetuned").to("cuda").eval()

    tokenizer.pad_token = tokenizer.eos_token


    outputs = generate_responses(model, tokenizer, questions, inputs, references)
    compute_metrics_per_query(outputs, references, questions, nlp, embedder)
    torch.cuda.empty_cache()



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Sample 1: Hello doctor, My father has diabetes, and the doctor said he loses the fat of the body through urine. Like no muscle inside the body. He takes electro homeopathy NaCl drip two to three times a week. Please suggest some thing how to be better?
Generated: What is it for you when your blood sugar level rises above 100 mg / dl or if there are any signs that one cannot control his condition in time? Is this problem due to insulin resistance (IR); does I have an increased risk from obesity-related diseases such as hypertension - another cause which can lead to hyperglycemia with its complications like heart attack etc.. If yes then help yourself manage my disease by taking regular medication regularly at home without having too much food intake while on treatment? You should take medicine every day instead of just once daily unless they feel ill enough anyway during their illness period. Do not worry about getting sick again after coming back into office because many doctors do no

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Sample 2: What is (are) Leukoplakia ?
Generated: The term leukoplakia refers to a condition in which the epithelium of an organ or tissue has become abnormal. It can be found anywhere on skin, mucosa and / or oral cavity; it may also occur at other sites such as tongue base, lips, buccal vestibule etc... This disease affects people who have no obvious cause for their lesions but do not know how they develop them: some cases appear spontaneously without any apparent reason while others require treatment with topical agents like retinoids - usually applied topically twice daily over several months-called "retinoid therapy '-until now considered only effective against precancerous conditions that will progress into cancer if left untreated. In this paper we provide information about what types of leukoplakias exist worldwide so far and discuss whether there might
Reference: Leukoplakia is a condition in which thickened, white patches form on the tongue, gums, inside of the cheek, or some

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Sample 3: Hello doctor,I am a 19-year-old male. I got a whole body checkup done a couple of days ago and all the results came out well. However, the ECG reports revealed left ventricular hypertrophy. I consulted the cardiologist and she said that it was an error and I did not have to worry about anything. I sometimes feel a little heaviness in my chest area but I feel that it is because of gases or because I do not work out at all. How do I know my heart is fine?Kindly help.
Generated: What are some causes for this condition? Can we prevent its occurrence by taking proper medication if possible? Are there any other conditions which can cause cardiac enlargement like hypertension, diabetes mellitus etc.. If you suspect such as these things then your medical history should be taken into consideration before making decisions on treatment options - The patient's blood pressure must always remain normal while he / her weight needs no special attention during his visit with him / herself. H

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Sample 4: What are the patient's activity limitations or restrictions after discharge?
Generated: The provided context does not include information about how to manage these patients. However, they do have some activities that require assistance with their mobility and / or transfers (e.g. walking) as well as other daily tasks such as eating a meal without food for several days before returning home from hospital. They also need help managing symptoms of nausea during hospitalization due to dehydration caused by vomiting while on medications prescribed prior to surgery; however, there may be no specific instructions regarding when it should occur if an emergency room visit occurs because of worsening abdominal discomfort related to bowel obstruction requiring surgical intervention within 48 hours following admission. Also included here were two questions asked upon leaving our facility concerning whether any changes had occurred since being discharged back into hospice care facilities

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Sample 5: What are the discharge conditions?
Generated: Discharge Condition: Transfer to home. The patient is ambulatory with a walker and an orthosis for ambulation in bed or chair without assistance at night as well as ambulating independently on their own during daytime hours (e.g., 0800 - 2200) after surgery if they have no other medical problems that would preclude them from using these devices safely while traveling through hospital facilities such as intensive care units / rehabilitation centers. They also can use bracing when needed due to pain associated with swelling of soft tissues around joints which may interfere with mobility; however, this condition does not require any specific treatment options including analgesics, physical therapy modalities like massage, stretching exercises, manual lymphatic drainage techniques, splints, tapers, bandaging systems etc.. If there was
Reference: The patient's discharge condition is stable.
BERT F1: 0.8120, ROUGE-L: 0.0458, FCS: 0.544