In [None]:
pip install datasets peft trl transformers pandas torch spacy nltk rouge_score bert_score sentence_transformers bitsandbytes accelerate sentencepiece

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import os
import json
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, PeftModelForCausalLM
from trl import SFTTrainer

from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
import json
import os

# Note: MedAlpaca-7B is publicly available on Hugging Face
# No special authentication required

# Custom callback to monitor loss and stop training when loss < 0.1
class EarlyStoppingOnLossCallback(TrainerCallback):
    """
    Trainer callback that watches the logged training loss.

    The first logged loss that falls below ``target_loss_threshold`` requests a
    checkpoint save. Every consecutive log event below the threshold bumps a
    counter; once the counter reaches ``patience`` (and ``auto_stop`` is set)
    the trainer is asked to stop. A log event at or above the threshold resets
    the counter. Note that the counter advances once per *log event*, not once
    per optimizer step.
    """
    def __init__(self, target_loss_threshold=0.1, patience=50, auto_stop=True):
        # Mutable bookkeeping first, then the user-supplied configuration.
        self.steps_below_threshold = 0
        self.best_checkpoint_saved = False
        self.auto_stop = auto_stop
        self.patience = patience
        self.target_loss_threshold = target_loss_threshold

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Inspect one log record and update ``control`` flags accordingly."""
        # Log records without a training loss are ignored.
        if logs is None or "loss" not in logs:
            return control

        loss_value = logs["loss"]
        step = state.global_step

        print(f"\n[Step {step}] Current Loss: {loss_value:.4f}")

        # At or above the threshold: the streak is broken, start counting again.
        if not loss_value < self.target_loss_threshold:
            self.steps_below_threshold = 0
            return control

        self.steps_below_threshold += 1
        print(f"‚úÖ Loss below {self.target_loss_threshold}! ({self.steps_below_threshold}/{self.patience} steps)")

        # Request a checkpoint only the first time the threshold is crossed.
        if not self.best_checkpoint_saved:
            print(f"üíæ Saving checkpoint at loss {loss_value:.4f}")
            control.should_save = True
            self.best_checkpoint_saved = True

        # Ask the trainer to stop once the streak is long enough.
        if self.auto_stop and self.steps_below_threshold >= self.patience:
            print(f"\nüõë Stopping training! Loss has been below {self.target_loss_threshold} for {self.patience} steps.")
            print(f"Final loss: {loss_value:.4f}")
            control.should_training_stop = True

        return control

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [None]:
def create_prompt(instruction, input_text):
    """Format the instruction and optional input into a single prompt string.

    Args:
        instruction: The task/question text.
        input_text: Optional supporting context. ``None``, an empty string and
            float NaN (what pandas yields for an empty CSV cell) all count as
            "no input".

    Returns:
        ``instruction`` alone when there is no input, otherwise the instruction
        and the input separated by a blank line.
    """
    # NaN is truthy, so a plain truthiness test would emit "...\n\nnan".
    # NaN is also the only float that is not equal to itself.
    input_is_nan = isinstance(input_text, float) and input_text != input_text
    if input_text is None or input_is_nan or not input_text:
        return instruction
    return f"{instruction}\n\n{input_text}"

def load_and_format_dataset(file_path, train_split=0.8, output_dir="data", max_samples=None):
    """Read the QA CSV, convert it to chat/text records and write JSONL splits.

    Args:
        file_path: CSV file with ``instruction``, ``input`` and ``output`` columns.
        train_split: Fraction of the records written to ``train.jsonl``; the
            remainder goes to ``validation.jsonl``.
        output_dir: Directory that receives the two JSONL files (created if needed).
        max_samples: Optional cap on the number of rows; when set and smaller
            than the dataset, a fixed-seed random sample of that size is used.

    Returns:
        The result of ``load_dataset("json", ...)`` over the two written files,
        keyed ``"train"`` and ``"validation"``.

    Raises:
        ValueError: If a required column is missing from the CSV.
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(file_path)

    # Validate and filter
    required_columns = ["instruction", "input", "output"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Dataset must contain {required_columns} columns")
    df = df[df["input"] != "No structured clinical data available."]

    # Empty CSV cells arrive as NaN. A NaN instruction/output cannot be trained
    # on and would be serialized as the non-JSON token `NaN`, so drop those
    # rows; a missing input simply means "no extra context".
    df = df.dropna(subset=["instruction", "output"])
    df = df.assign(input=df["input"].fillna(""))

    # Optional: limit dataset size for faster training/testing
    if max_samples and len(df) > max_samples:
        df = df.sample(n=max_samples, random_state=42)
        print(f"Using {max_samples} samples for faster training")

    # Column-wise zip avoids the per-row Series construction of iterrows().
    formatted_data = []
    for instruction, input_text, output in zip(df["instruction"], df["input"], df["output"]):
        user_msg = create_prompt(instruction, input_text)
        # str() keeps the JSON schema uniform if a cell was parsed as a number.
        assistant_msg = str(output)

        # Each record carries both the chat-style and the flat-text form.
        formatted_data.append({
            "messages": [
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": assistant_msg}
            ],
            "text": f"### User: {user_msg} ###\n### Assistant: {assistant_msg} ###"
        })

    # NOTE(review): the split is positional (first N rows are train); when
    # max_samples is not used the rows keep the CSV order — confirm that the
    # file is not sorted in a way that biases the validation split.
    train_size = int(len(formatted_data) * train_split)
    splits = {
        "train": formatted_data[:train_size],
        "validation": formatted_data[train_size:],
    }
    for split, data in splits.items():
        with open(os.path.join(output_dir, f"{split}.jsonl"), "w", encoding="utf-8") as f:
            for item in data:
                json.dump(item, f)
                f.write("\n")

    print(f"Saved {train_size} training and {len(formatted_data)-train_size} validation examples")
    return load_dataset("json", data_files={
        "train": os.path.join(output_dir, "train.jsonl"),
        "validation": os.path.join(output_dir, "validation.jsonl")
    })

In [None]:
def preprocess_and_save_dataset(dataset, tokenizer, output_dir="preprocessed_data", max_length=512):
    """Tokenize the ``text`` column of ``dataset`` and persist the result.

    Args:
        dataset: Object exposing ``map(...)``; the mapped result must expose
            ``save_to_disk(path)``. Must contain ``messages`` and ``text``
            columns, both of which are removed from the output.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, max_length=max_length)``.
        output_dir: Directory the tokenized dataset is saved to (created if needed).
        max_length: Truncation length in tokens. Defaults to 512, the value
            that was previously hard-coded.

    Returns:
        The tokenized dataset returned by ``dataset.map``.
    """
    os.makedirs(output_dir, exist_ok=True)

    def tokenize_function(example):
        # Called with batched=True, so example["text"] is a list of strings.
        # NOTE(review): no EOS token is appended explicitly here — confirm the
        # tokenizer adds one if the model is expected to learn when to stop.
        return tokenizer(example["text"], truncation=True, max_length=max_length)

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["messages", "text"])
    tokenized_dataset.save_to_disk(output_dir)
    return tokenized_dataset

def configure_qlora_model(model_name="medalpaca/medalpaca-7b"):
    """Load ``model_name`` in 4-bit and wrap it with LoRA adapters.

    Args:
        model_name: Hugging Face repo id (or local path) of the base causal LM.

    Returns:
        ``(peft_model, tokenizer)`` — the PEFT-wrapped model and its tokenizer.

    Raises:
        ValueError: If ``get_peft_model`` did not return a PEFT model instance.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False
    )

    # NOTE(review): trust_remote_code=True lets the hub repo execute arbitrary
    # Python at load time; confirm it is actually required for this model.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        dtype=torch.float16,
        trust_remote_code=True
    )
    # Use the tokenizer's pad id (which equals the EOS id only when the
    # tokenizer had no pad token of its own) so model and tokenizer agree.
    model.config.pad_token_id = tokenizer.pad_token_id
    model = prepare_model_for_kbit_training(model)

    # NOTE(review): modules_to_save makes embed_tokens and lm_head fully
    # trainable in addition to the LoRA matrices — confirm that is intended.
    lora_config = LoraConfig(
        r=4,  # Reduced from 8
        lora_alpha=8,  # Reduced from 16
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False,
        fan_in_fan_out=False,
        modules_to_save=["embed_tokens", "lm_head"]
    )

    print("Applying PEFT adapters to the model...")
    peft_model = get_peft_model(model, lora_config)
    print(f"[DEBUG] Type after get_peft_model: {type(peft_model)}")

    if not isinstance(peft_model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not a PEFT model instance!")
    print("[OK] Model wrapped with PEFT successfully.")

    # print_trainable_parameters() prints its own summary and returns None,
    # so it must not be wrapped in print().
    peft_model.print_trainable_parameters()

    # Make sure every LoRA parameter takes part in the backward pass.
    for name, param in peft_model.named_parameters():
        if 'lora' in name:
            param.requires_grad = True

    return peft_model, tokenizer

In [None]:
import os
from huggingface_hub import login, HfApi, whoami

from getpass import getpass

# 1) Never paste the token into the notebook: take it from the environment,
#    or prompt for it interactively (the prompt does not echo the input).
hf_token = (
    os.environ.get("HUGGINGFACE_HUB_TOKEN")
    or os.environ.get("HF_TOKEN")
    or getpass("Hugging Face token: ")
)
os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token

# 2) login so the credential is cached
login(hf_token)

# 3) sanity checks
# Print only the account name; the full whoami() payload also contains the
# token's scopes and account metadata, which would be stored in the cell output.
print("whoami:", whoami()["name"])
api = HfApi()
# This will raise 401 (not authed) or 403 (no access) if something's off
# NOTE(review): the training cells use medalpaca/medalpaca-7b, not this gated
# Llama-3 repo — confirm this access check is still needed.
print(api.model_info("meta-llama/Meta-Llama-3-8B", token=hf_token).sha)

whoami: {'type': 'user', 'id': '67a65f47497698c82022ada1', 'name': 'AparnaSuresh', 'fullname': 'AparnaSuresh', 'isPro': False, 'avatarUrl': '/avatars/6fcc5210516443950f310cdd623057d8.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'Project', 'role': 'fineGrained', 'createdAt': '2025-10-27T23:01:22.527Z', 'fineGrained': {'canReadGatedRepos': True, 'global': ['discussion.write', 'post.write'], 'scoped': [{'entity': {'_id': '661f97d48e7f3438386f755d', 'type': 'model', 'name': 'meta-llama/Meta-Llama-3-8B'}, 'permissions': ['repo.content.read', 'discussion.write']}, {'entity': {'_id': '67a65f47497698c82022ada1', 'type': 'user', 'name': 'AparnaSuresh'}, 'permissions': ['repo.content.read', 'repo.write', 'inference.serverless.write', 'inference.endpoints.infer.write', 'inference.endpoints.write', 'user.webhooks.read', 'user.webhooks.write', 'collection.read', 'collection.write', 'discussion.write', 'job.write']}]}}}}
8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920


In [None]:
def setup_trainer(model, tokenizer, dataset, output_dir="medalpaca_finetuned"):
    """Build the ``SFTTrainer`` used to fine-tune the PEFT-wrapped model.

    Args:
        model: A ``PeftModel`` / ``PeftModelForCausalLM`` instance.
        tokenizer: Tokenizer passed to the trainer as ``processing_class``.
        dataset: Mapping with ``"train"`` and ``"validation"`` splits. The
            splits may be raw (with a ``messages`` column) or already tokenized
            (with an ``input_ids`` column).
        output_dir: Directory for checkpoints and the final model.

    Returns:
        A configured ``SFTTrainer`` with the loss-based early-stopping callback.

    Raises:
        ValueError: If ``model`` is not PEFT-wrapped.
    """
    if not isinstance(model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not a PEFT-wrapped instance! Cannot continue with training.")

    print(f"Model is a PEFT model: {isinstance(model, (PeftModel, PeftModelForCausalLM))}")

    # Standard GPU settings (no A100-specific optimization)
    cuda_available = torch.cuda.is_available()
    device_name = torch.cuda.get_device_name(0) if cuda_available else "CPU"
    print(f"üìä Using {device_name}. Standard GPU settings applied.")

    # Pick the mixed-precision mode the hardware supports instead of assuming
    # bf16: bf16 when available, fp16 on older CUDA GPUs, full precision on CPU.
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    use_fp16 = cuda_available and not use_bf16

    batch_size = 8  # Standard batch size for most GPUs
    gradient_accum = 2  # Effective batch size = 16
    workers = 4
    logging_steps = 50  # Log every 50 steps
    loss_threshold = 0.1  # Early-stop threshold on the logged training loss
    patience = 50  # Consecutive log events below the threshold before stopping

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accum,
        num_train_epochs=2,  # Increased to 2 for better training
        learning_rate=1e-4,  # Conservative LR to prevent overfitting
        bf16=use_bf16,
        bf16_full_eval=use_bf16,
        fp16=use_fp16,
        save_strategy="steps",  # Save periodically during training
        save_steps=1000,  # Save every 1000 steps
        eval_strategy="no",  # Disable eval during training for speed
        load_best_model_at_end=False,
        logging_steps=logging_steps,
        save_total_limit=2,  # Keep ONLY last 2 checkpoints (saves Drive space!)
        push_to_hub=False,
        gradient_checkpointing=False,  # Disabled for speed
        optim="adamw_torch_fused",
        max_grad_norm=0.3,
        warmup_steps=100,  # Warmup for stability
        lr_scheduler_type="cosine",  # Cosine decay for smooth convergence
        dataloader_num_workers=workers,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=2,  # Prefetch for efficiency
        group_by_length=True,  # Group similar lengths for efficiency
        max_steps=-1,  # No limit - early stopping will handle this
    )

    print(f"‚úÖ Training config: Batch size={batch_size}, Gradient accum={gradient_accum}, Workers={workers}")

    def formatting_func(example):
        # Render a chat-style record into the flat "### Role: text ###" format.
        return "\n".join([
            f"### {msg['role'].capitalize()}: {msg['content']} ###"
            for msg in example["messages"]
        ])

    # A dataset that already has input_ids makes SFTTrainer ignore (and warn
    # about) formatting_func, and such a dataset has no "messages" column for
    # it to read anyway — only pass it for raw datasets.
    train_columns = getattr(dataset["train"], "column_names", None) or []
    pretokenized = "input_ids" in train_columns

    # The callback counts *log events*, so the streak it waits for spans
    # patience * logging_steps optimizer steps.
    early_stop_callback = EarlyStoppingOnLossCallback(
        target_loss_threshold=loss_threshold,
        patience=patience,
        auto_stop=True
    )

    print("Creating SFTTrainer with EarlyStoppingOnLoss callback...")
    print(f"üìä Will monitor loss and save checkpoint when it drops below {loss_threshold}")
    print(f"üõë Auto-stop enabled: Training will stop after {patience} consecutive logged losses below threshold (~{patience * logging_steps} steps)")

    return SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"] if training_args.eval_strategy != "no" else None,
        args=training_args,
        formatting_func=None if pretokenized else formatting_func,
        peft_config=None,  # The model is already PEFT-wrapped by the caller.
        callbacks=[early_stop_callback],
    )

def main(data_path="bio_mistral_qa_combined.csv", model_name="medalpaca/medalpaca-7b", max_samples=None):
    """Run the full fine-tuning pipeline: data prep, tokenization, training, save.

    Args:
        data_path: CSV file handed to ``load_and_format_dataset``.
        model_name: Base model id, used for BOTH the preprocessing tokenizer and
            the QLoRA model so the two cannot drift apart.
        max_samples: Optional row cap forwarded to ``load_and_format_dataset``;
            ``None`` uses the full dataset.

    Returns:
        ``(model, tokenizer, trainer)`` after training has finished and the
        model has been saved to ``trainer.args.output_dir``.

    Raises:
        ValueError: If the configured model is not a PEFT model.
    """
    torch.cuda.empty_cache()  # Clear GPU memory
    print("Loading and preparing dataset...")
    dataset = load_and_format_dataset(data_path, max_samples=max_samples)
    print(f"Dataset loaded: {dataset}")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Preprocessing and tokenizing dataset...")
    dataset = preprocess_and_save_dataset(dataset, tokenizer)
    print(f"Preprocessed dataset: {dataset}")

    # Drop this tokenizer; configure_qlora_model() returns its own instance
    # together with the model.
    del tokenizer
    torch.cuda.empty_cache()

    print("Configuring QLoRA model...")
    model, tokenizer = configure_qlora_model(model_name)  # Load model only once
    print("QLoRA model configured successfully!")

    if not isinstance(model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not properly wrapped as a PEFT model!")

    print("Setting up trainer...")
    trainer = setup_trainer(model, tokenizer, dataset)
    print("Trainer configured successfully!")

    print("Starting training...")
    trainer.train()

    trainer.save_model()
    print(f"Model trained and saved to {trainer.args.output_dir}")

    return model, tokenizer, trainer

# Run training
model, tokenizer, trainer = main()

Loading and preparing dataset...
Saved 89530 training and 22383 validation examples


Generating train split: 89530 examples [00:00, 132003.32 examples/s]
Generating validation split: 22383 examples [00:00, 85046.32 examples/s]


Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['messages', 'text'],
        num_rows: 89530
    })
    validation: Dataset({
        features: ['messages', 'text'],
        num_rows: 22383
    })
})
Loading tokenizer...
Preprocessing and tokenizing dataset...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89530/89530 [01:00<00:00, 1486.94 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 22383/22383 [00:23<00:00, 939.72 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89530/89530 [00:00<00:00, 181804.64 examples/s]
Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 22383/22383 [00:00<00:00, 127663.74 examples/s]


Preprocessed dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 89530
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 22383
    })
})
Configuring QLoRA model...


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:10<00:00,  3.57s/it]


Applying PEFT adapters to the model...


You passed a dataset that is already processed (contains an `input_ids` field) together with a formatting function. Therefore `formatting_func` will be ignored. Either remove the `formatting_func` or pass a dataset that is not already processed.


[DEBUG] Type after get_peft_model: <class 'peft.peft_model.PeftModelForCausalLM'>
[OK] Model wrapped with PEFT successfully.
trainable params: 272,146,432 || all params: 7,010,570,240 || trainable%: 3.8819
None
QLoRA model configured successfully!
Setting up trainer...
Model is a PEFT model: True
üìä Using NVIDIA GeForce RTX 5090. Standard GPU settings applied.
‚úÖ Training config: Batch size=8, Gradient accum=2, Workers=4
Creating SFTTrainer with EarlyStoppingOnLoss callback...
üìä Will monitor loss and save checkpoint when it drops below 0.1
üõë Auto-stop enabled: Training will stop after 50 steps below threshold


Truncating train dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 89530/89530 [00:00<00:00, 823483.83 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 2, 'pad_token_id': 32000}.
The model is already on multiple devices. Skipping the move to device specified in `args`.


Trainer configured successfully!
Starting training...


  return fn(*args, **kwargs)


Step,Training Loss
50,2.0851
100,1.3
150,1.1272
200,1.104
250,1.0423
300,1.0317
350,1.0111
400,0.9849
450,0.9916
500,0.9762



[Step 50] Current Loss: 2.0851

[Step 100] Current Loss: 1.3000

[Step 150] Current Loss: 1.1272

[Step 200] Current Loss: 1.1040

[Step 250] Current Loss: 1.0423

[Step 300] Current Loss: 1.0317

[Step 350] Current Loss: 1.0111

[Step 400] Current Loss: 0.9849

[Step 450] Current Loss: 0.9916

[Step 500] Current Loss: 0.9762

[Step 550] Current Loss: 0.9661

[Step 600] Current Loss: 0.9429

[Step 650] Current Loss: 0.9233

[Step 700] Current Loss: 0.9065

[Step 750] Current Loss: 0.9494

[Step 800] Current Loss: 0.9361

[Step 850] Current Loss: 0.9318

[Step 900] Current Loss: 0.9347

[Step 950] Current Loss: 0.8991

[Step 1000] Current Loss: 0.9220


  return fn(*args, **kwargs)



[Step 1050] Current Loss: 0.9322

[Step 1100] Current Loss: 0.8976

[Step 1150] Current Loss: 0.9199

[Step 1200] Current Loss: 0.9283

[Step 1250] Current Loss: 0.8897

[Step 1300] Current Loss: 0.8586

[Step 1350] Current Loss: 0.8889

[Step 1400] Current Loss: 0.8728

[Step 1450] Current Loss: 0.8951

[Step 1500] Current Loss: 0.8809

[Step 1550] Current Loss: 0.8805

[Step 1600] Current Loss: 0.8998

[Step 1650] Current Loss: 0.8665

[Step 1700] Current Loss: 0.8528

[Step 1750] Current Loss: 0.8755

[Step 1800] Current Loss: 0.8641

[Step 1850] Current Loss: 0.8768

[Step 1900] Current Loss: 0.8853

[Step 1950] Current Loss: 0.8282

[Step 2000] Current Loss: 0.8810


  return fn(*args, **kwargs)



[Step 2050] Current Loss: 0.8396

[Step 2100] Current Loss: 0.8601

[Step 2150] Current Loss: 0.8817

[Step 2200] Current Loss: 0.8405

[Step 2250] Current Loss: 0.8656

[Step 2300] Current Loss: 0.8477

[Step 2350] Current Loss: 0.8412

[Step 2400] Current Loss: 0.8289

[Step 2450] Current Loss: 0.8192

[Step 2500] Current Loss: 0.8249

[Step 2550] Current Loss: 0.8444

[Step 2600] Current Loss: 0.8306

[Step 2650] Current Loss: 0.8501

[Step 2700] Current Loss: 0.8641

[Step 2750] Current Loss: 0.8641

[Step 2800] Current Loss: 0.8216

[Step 2850] Current Loss: 0.8420

[Step 2900] Current Loss: 0.8254

[Step 2950] Current Loss: 0.8339

[Step 3000] Current Loss: 0.8111


  return fn(*args, **kwargs)



[Step 3050] Current Loss: 0.8008

[Step 3100] Current Loss: 0.8478

[Step 3150] Current Loss: 0.8217

[Step 3200] Current Loss: 0.8259

[Step 3250] Current Loss: 0.8300

[Step 3300] Current Loss: 0.8261

[Step 3350] Current Loss: 0.8579

[Step 3400] Current Loss: 0.8496

[Step 3450] Current Loss: 0.7911

[Step 3500] Current Loss: 0.8141

[Step 3550] Current Loss: 0.7968

[Step 3600] Current Loss: 0.8370

[Step 3650] Current Loss: 0.8283

[Step 3700] Current Loss: 0.8004

[Step 3750] Current Loss: 0.8025

[Step 3800] Current Loss: 0.7850

[Step 3850] Current Loss: 0.7681

[Step 3900] Current Loss: 0.8080

[Step 3950] Current Loss: 0.8159

[Step 4000] Current Loss: 0.8251


  return fn(*args, **kwargs)



[Step 4050] Current Loss: 0.7837

[Step 4100] Current Loss: 0.8024

[Step 4150] Current Loss: 0.8020

[Step 4200] Current Loss: 0.7619

[Step 4250] Current Loss: 0.7952

[Step 4300] Current Loss: 0.7808

[Step 4350] Current Loss: 0.7838

[Step 4400] Current Loss: 0.7940

[Step 4450] Current Loss: 0.8332

[Step 4500] Current Loss: 0.7974

[Step 4550] Current Loss: 0.7863

[Step 4600] Current Loss: 0.8056

[Step 4650] Current Loss: 0.8058

[Step 4700] Current Loss: 0.7867

[Step 4750] Current Loss: 0.7467

[Step 4800] Current Loss: 0.8067

[Step 4850] Current Loss: 0.7739

[Step 4900] Current Loss: 0.7762

[Step 4950] Current Loss: 0.7787

[Step 5000] Current Loss: 0.7666


  return fn(*args, **kwargs)



[Step 5050] Current Loss: 0.7713

[Step 5100] Current Loss: 0.7873

[Step 5150] Current Loss: 0.7982

[Step 5200] Current Loss: 0.7457

[Step 5250] Current Loss: 0.7806

[Step 5300] Current Loss: 0.7570

[Step 5350] Current Loss: 0.7933

[Step 5400] Current Loss: 0.7568

[Step 5450] Current Loss: 0.7756

[Step 5500] Current Loss: 0.7865

[Step 5550] Current Loss: 0.7922

[Step 5600] Current Loss: 0.8039

[Step 5650] Current Loss: 0.7495

[Step 5700] Current Loss: 0.7109

[Step 5750] Current Loss: 0.7043

[Step 5800] Current Loss: 0.7211

[Step 5850] Current Loss: 0.7026

[Step 5900] Current Loss: 0.7228

[Step 5950] Current Loss: 0.7019

[Step 6000] Current Loss: 0.7374


  return fn(*args, **kwargs)



[Step 6050] Current Loss: 0.7267

[Step 6100] Current Loss: 0.7022

[Step 6150] Current Loss: 0.7464

[Step 6200] Current Loss: 0.6975

[Step 6250] Current Loss: 0.7173

[Step 6300] Current Loss: 0.7157

[Step 6350] Current Loss: 0.6936

[Step 6400] Current Loss: 0.7360

[Step 6450] Current Loss: 0.7151

[Step 6500] Current Loss: 0.7311

[Step 6550] Current Loss: 0.6954

[Step 6600] Current Loss: 0.6973

[Step 6650] Current Loss: 0.7154

[Step 6700] Current Loss: 0.7177

[Step 6750] Current Loss: 0.6782

[Step 6800] Current Loss: 0.7075

[Step 6850] Current Loss: 0.6924

[Step 6900] Current Loss: 0.7313

[Step 6950] Current Loss: 0.7193

[Step 7000] Current Loss: 0.6989


  return fn(*args, **kwargs)



[Step 7050] Current Loss: 0.7323

[Step 7100] Current Loss: 0.7250

[Step 7150] Current Loss: 0.6919

[Step 7200] Current Loss: 0.7159

[Step 7250] Current Loss: 0.6888

[Step 7300] Current Loss: 0.7187

[Step 7350] Current Loss: 0.6959

[Step 7400] Current Loss: 0.6827

[Step 7450] Current Loss: 0.6898

[Step 7500] Current Loss: 0.7227

[Step 7550] Current Loss: 0.7073

[Step 7600] Current Loss: 0.7025

[Step 7650] Current Loss: 0.7068

[Step 7700] Current Loss: 0.7035

[Step 7750] Current Loss: 0.7032

[Step 7800] Current Loss: 0.6831

[Step 7850] Current Loss: 0.7005

[Step 7900] Current Loss: 0.7070

[Step 7950] Current Loss: 0.7138

[Step 8000] Current Loss: 0.6985


  return fn(*args, **kwargs)



[Step 8050] Current Loss: 0.7261

[Step 8100] Current Loss: 0.7101

[Step 8150] Current Loss: 0.7003

[Step 8200] Current Loss: 0.7027

[Step 8250] Current Loss: 0.6897

[Step 8300] Current Loss: 0.7221

[Step 8350] Current Loss: 0.7086

[Step 8400] Current Loss: 0.6779

[Step 8450] Current Loss: 0.6870

[Step 8500] Current Loss: 0.6879

[Step 8550] Current Loss: 0.6802

[Step 8600] Current Loss: 0.6800

[Step 8650] Current Loss: 0.6825

[Step 8700] Current Loss: 0.6735

[Step 8750] Current Loss: 0.6993

[Step 8800] Current Loss: 0.7450

[Step 8850] Current Loss: 0.6870

[Step 8900] Current Loss: 0.7032

[Step 8950] Current Loss: 0.6889

[Step 9000] Current Loss: 0.6944


  return fn(*args, **kwargs)



[Step 9050] Current Loss: 0.6859

[Step 9100] Current Loss: 0.6829

[Step 9150] Current Loss: 0.6814

[Step 9200] Current Loss: 0.6589

[Step 9250] Current Loss: 0.6604

[Step 9300] Current Loss: 0.7213

[Step 9350] Current Loss: 0.6919

[Step 9400] Current Loss: 0.6881

[Step 9450] Current Loss: 0.7056

[Step 9500] Current Loss: 0.6823

[Step 9550] Current Loss: 0.6817

[Step 9600] Current Loss: 0.7015

[Step 9650] Current Loss: 0.7208

[Step 9700] Current Loss: 0.6910

[Step 9750] Current Loss: 0.7161

[Step 9800] Current Loss: 0.6859

[Step 9850] Current Loss: 0.6993

[Step 9900] Current Loss: 0.6817

[Step 9950] Current Loss: 0.7082

[Step 10000] Current Loss: 0.6659


  return fn(*args, **kwargs)



[Step 10050] Current Loss: 0.6815

[Step 10100] Current Loss: 0.6643

[Step 10150] Current Loss: 0.6818

[Step 10200] Current Loss: 0.6523

[Step 10250] Current Loss: 0.7208

[Step 10300] Current Loss: 0.6856

[Step 10350] Current Loss: 0.6856

[Step 10400] Current Loss: 0.7270

[Step 10450] Current Loss: 0.6932

[Step 10500] Current Loss: 0.6843

[Step 10550] Current Loss: 0.6949

[Step 10600] Current Loss: 0.6795

[Step 10650] Current Loss: 0.6666

[Step 10700] Current Loss: 0.6975

[Step 10750] Current Loss: 0.6524

[Step 10800] Current Loss: 0.6910

[Step 10850] Current Loss: 0.7136

[Step 10900] Current Loss: 0.7012

[Step 10950] Current Loss: 0.6764

[Step 11000] Current Loss: 0.6785


  return fn(*args, **kwargs)



[Step 11050] Current Loss: 0.7093

[Step 11100] Current Loss: 0.7063

[Step 11150] Current Loss: 0.6716
Model trained and saved to medalpaca_finetuned


## Evaluation

In [None]:
!pip install transformers datasets torch pandas numpy scikit-learn rouge-score nltk scispacy

!pip install sentence-transformers

Collecting scispacy
  Downloading scispacy-0.6.2-py3-none-any.whl.metadata (20 kB)
Collecting conllu (from scispacy)
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nmslib-metabrainz==2.1.3 (from scispacy)
  Downloading nmslib_metabrainz-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (956 bytes)
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pybind11>=2.2.3 (from nmslib-metabrainz==2.1.3->scispacy)
  Downloading pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
INFO: pip is looking at multiple versions of thinc to determine which version is compatible with o

In [None]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz (120.2 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m120.2/120.2 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting spacy<3.5.0,>=3.4.1 (from en-ner-bc5cdr-md==0.5.1)
  Downloading spacy-3.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting thinc<8.2.0,>=8.1.0 (from spacy<3.5.0,>=3.4.1->en-ner-bc5cdr-md==0.5.1)
  Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting wasabi<1.1.0,>=0.9.1 (from spacy<3.5.0,>=3.4.1->en-ner-bc5cdr-md==0.5.1)
  Downloading wasabi-0.10.1-py3-none-any.whl.metadata (28 kB)


In [None]:
!pip uninstall -y numpy thinc spacy scispacy
!pip install numpy==1.26.4
!pip install spacy==3.7.2
!pip install scispacy==0.5.1


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: thinc 8.1.12
Uninstalling thinc-8.1.12:
  Successfully uninstalled thinc-8.1.12
Found existing installation: spacy 3.4.4
Uninstalling spacy-3.4.4:
  Successfully uninstalled spacy-3.4.4
Found existing installation: scispacy 0.6.2
Uninstalling scispacy-0.6.2:
  Successfully uninstalled scispacy-0.6.2
[0mCollecting numpy==1.26.4
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4
[0mCollecting spacy==3.7.2
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting thinc<8.3.0,>=8.1.8 (from spacy==3.7.2)
  Downloading thinc-8.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_6

In [None]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!pip install transformers torch sentence-transformers spacy nltk pandas rouge-score

[0m

In [None]:
pip install accelerate bitsandbytes transformers bert-score peft

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import logging
import re
import spacy
from nltk.tokenize import word_tokenize
import nltk
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

# Suppress transformers warnings: only records at ERROR level or above are
# emitted by the "transformers" logger after this call.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Download NLTK data used by the evaluation code.
# NOTE(review): "punkt" is also downloaded by an earlier cell; presumably
# "punkt"/"punkt_tab" back word_tokenize and "wordnet" backs a later metric —
# confirm against the cells that use them.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Load spaCy and SentenceTransformer models
def load_spacy_model():
    """Return the ``en_ner_bc5cdr_md`` spaCy pipeline, or ``None`` on failure.

    Any exception raised while loading is reported on stdout and swallowed,
    so callers must be prepared to receive ``None``.
    """
    ner_pipeline = None
    try:
        ner_pipeline = spacy.load("en_ner_bc5cdr_md")
    except Exception as load_error:
        print(f"Error loading spaCy model: {load_error}")
    return ner_pipeline

def load_sentence_transformer():
    """Return the ``all-MiniLM-L6-v2`` SentenceTransformer, or ``None`` on failure.

    The model is placed on CUDA when available, otherwise on the CPU. Any
    exception raised while loading is reported on stdout and swallowed.
    """
    sentence_model = None
    try:
        sentence_model = SentenceTransformer(
            "all-MiniLM-L6-v2",
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
    except Exception as load_error:
        print(f"Error loading SentenceTransformer: {load_error}")
    return sentence_model

# Load both models once when this cell runs. Either name may be bound to None,
# because both loaders print the error and return None when loading fails.
nlp = load_spacy_model()
embedder = load_sentence_transformer()

In [None]:
# Load fine-tuned model and tokenizer
def _resolve_adapter_path(checkpoint_dir):
    """Return the directory inside ``checkpoint_dir`` that holds adapter weights.

    Prefers a final adapter saved directly in ``checkpoint_dir`` (either the
    safetensors or the legacy ``.bin`` file), otherwise falls back to the
    highest-numbered ``checkpoint-<step>`` sub-directory.

    Raises:
        ValueError: If the directory is missing or contains neither form.
    """
    if not os.path.exists(checkpoint_dir):
        raise ValueError(f"Checkpoint directory {checkpoint_dir} does not exist")

    adapter_files = ("adapter_model.safetensors", "adapter_model.bin")
    if any(os.path.exists(os.path.join(checkpoint_dir, name)) for name in adapter_files):
        return checkpoint_dir

    # Ignore entries such as "checkpoint-tmp" whose suffix is not a step number.
    checkpoints = [
        d for d in os.listdir(checkpoint_dir)
        if d.startswith("checkpoint-") and d.split("-", 1)[1].isdigit()
    ]
    if not checkpoints:
        raise ValueError(f"No checkpoints or final model found in {checkpoint_dir}")
    latest_checkpoint = max(checkpoints, key=lambda d: int(d.split("-", 1)[1]))
    return os.path.join(checkpoint_dir, latest_checkpoint)


def load_fine_tuned_model(model_name="medalpaca/medalpaca-7b", checkpoint_dir="medalpaca_finetuned"):
    """Load the fine-tuned QLoRA model and tokenizer.

    Args:
        model_name: Base model repo id; defaults to the same
            ``medalpaca/medalpaca-7b`` base the training cells use.
        checkpoint_dir: Training output directory containing either the final
            adapter or ``checkpoint-<step>`` sub-directories.

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` if anything fails
        (the error is printed).
    """
    try:
        # Validate the checkpoint location first so a bad path fails before
        # the base model is loaded.
        adapter_path = _resolve_adapter_path(checkpoint_dir)

        # use_fast=False matches the tokenizer used during training.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False,
            llm_int8_enable_fp32_cpu_offload=True,
            llm_int8_skip_modules=["lm_head"]
        )

        # NOTE(review): trust_remote_code=True lets the hub repo execute
        # arbitrary Python at load time; confirm it is actually required.
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            dtype=torch.float16,
            trust_remote_code=True,
            offload_folder="offload",
            low_cpu_mem_usage=True,
            offload_state_dict=True
        )
        # Use the tokenizer's pad id (equal to EOS only when it had no pad token).
        base_model.config.pad_token_id = tokenizer.pad_token_id

        if adapter_path == checkpoint_dir:
            print(f"Loading fine-tuned model from {checkpoint_dir}")
        else:
            print(f"Loading fine-tuned model from checkpoint: {adapter_path}")
        model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=False)

        return model, tokenizer
    except Exception as e:
        # Best-effort contract kept from the original: report and signal
        # failure with (None, None) instead of propagating.
        print(f"Error loading fine-tuned model: {e}")
        return None, None

# Rebinds the notebook-level `model` and `tokenizer` names (the training cell
# assigns the same names). The loader reports failure by returning
# (None, None), so stop here rather than continue without a model.
model, tokenizer = load_fine_tuned_model()
if model is None or tokenizer is None:
    raise ValueError("Failed to load fine-tuned model or tokenizer")

In [None]:
# Manually define 5 QA pairs (3 curated + 2 additional realistic).
# The three lists are parallel: index i of each list forms one example.

# Question asked about each patient record.
questions = [
    "Are there any further procedures planned for the patient?",
    "Does the patient require long term monitoring?",
    "What precautions does the patient need to take post-discharge?",
    "What medications is the patient currently taking?",
    "What is the patient's primary diagnosis?"
]

# Clinical context for each question, as newline-separated "Field: value" lines.
inputs = [
    "Gender: F\nChief Complaint: Abdominal distention, nausea, and vomiting\nHistory: Cirrhosis, multiple paracenteses for ascites\nPlan: Schedule regular paracentesis every 2 weeks",
    "Gender: F\nChief Complaint: Abdominal distention, nausea, and vomiting\nPlan: Monitor weight and abdominal girth daily; assess for signs of fluid overload",
    "Gender: M\nChief Complaint: Abd pain, Hypotension\nDischarge Plan: Follow low sodium diet, take prescribed meds, and avoid strenuous activity",
    "Gender: F\nCurrent Medications: Lisinopril 10mg daily, Furosemide 40mg daily\nAllergies: None known\nAssessment: Hypertension, fluid retention",
    "Gender: M\nChief Complaint: Fever, Cough\nFindings: CXR shows consolidation in the right lower lobe\nAssessment: Community-acquired pneumonia"
]

# Reference answer for each question.
references = [
    "Yes, the patient requires regular paracentesis due to fluid accumulation.",
    "Yes, the patient requires close monitoring for fluid accumulation and symptoms.",
    "Follow up with the doctor or nurse practitioner. Avoid heavy lifting and follow dietary guidelines.",
    "The patient is currently taking Lisinopril and Furosemide.",
    "The patient's primary diagnosis is community-acquired pneumonia."
]

In [None]:
# Prompt and validation functions
def create_prompt(question, context):
    """Build the instruction prompt sent to the model for one question.

    The context is inserted verbatim. When it is empty or whitespace-only,
    a fixed placeholder sentence is inserted instead.
    """
    if context.strip():
        context_block = context
    else:
        context_block = "No specific clinical data provided"

    prompt_lines = [
        "You are a clinical assistant. Provide concise, factual answers based ONLY on the available information.",
        "",
        f"Question: {question}",
        f"Available Context: {context_block}",
        "",
        "Answer (just the factual medical response, no references to tables/figures):",
    ]
    return "\n".join(prompt_lines)

def validate_answer(answer):
    """Reject answers that cite tables/figures; otherwise return the stripped answer.

    The banned phrases ("table(s)", "figure(s)", "as shown in", "refer to") are
    matched case-insensitively as whole words. A plain substring test would also
    reject ordinary clinical wording such as "stable", "tablet" or "prefer to".

    Args:
        answer: Generated answer text.

    Returns:
        The fixed fallback sentence when a banned phrase is present, else
        ``answer`` with surrounding whitespace removed.
    """
    # Imported locally because the visible `import re` sits in a later notebook cell.
    import re

    # Letter look-arounds (rather than \b) still catch forms like "Table1".
    banned_phrase = r"(?<![a-z])(?:tables?|figures?|as shown in|refer to)(?![a-z])"
    if re.search(banned_phrase, answer, flags=re.IGNORECASE):
        return "Unable to generate proper response from available data"
    return answer.strip()

# Dataset class for generation
class QADataset(Dataset):
    """Dataset wrapper that turns each (question, context) pair into a tokenized prompt."""

    def __init__(self, questions, inputs, references, tokenizer, max_length=256):
        # Tokenization settings.
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Parallel sequences; all three are indexed with the same position.
        self.questions = questions
        self.inputs = inputs
        self.references = references

    def __len__(self):
        # The question list defines how many samples exist.
        return len(self.questions)

    def __getitem__(self, idx):
        question, input_text = self.questions[idx], self.inputs[idx]
        encoding = self.tokenizer(
            create_prompt(question, input_text),
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        token_ids = encoding['input_ids']
        item = {
            'input_ids': token_ids.squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Width of the encoded tensor before the leading dimension is squeezed away.
            # NOTE(review): with padding="max_length" this is presumably the padded
            # width rather than the number of real prompt tokens - confirm.
            'prompt_length': token_ids.shape[1],
            'question': question,
            'input_text': input_text,
            'reference': self.references[idx],
        }
        return item

# Generate responses
def generate_responses(model, tokenizer, questions, inputs, references):
    """Generate one answer per (question, context) pair and print it beside its reference.

    Args:
        model: Object exposing ``generate(input_ids=..., attention_mask=..., **kwargs)``.
        tokenizer: Tokenizer used for prompt encoding, bad-word ids and decoding.
        questions: Sequence of question strings.
        inputs: Sequence of clinical-context strings, parallel to ``questions``.
        references: Sequence of reference answers, parallel to ``questions``.

    Returns:
        List of validated answer strings, one per processed sample. If an
        exception is raised while iterating, the error is printed and the
        answers collected so far are returned.
    """
    # Token-id sequences the sampler must not emit; words that encode to nothing are dropped.
    bad_words = ["Table", "Figure"]
    encoded_bad_words = (tokenizer.encode(word, add_special_tokens=False) for word in bad_words)
    bad_words_ids = [token_ids for token_ids in encoded_bad_words if token_ids]

    generation_kwargs = {
        'max_new_tokens': 150,
        'do_sample': True,
        'temperature': 0.3,
        'repetition_penalty': 1.5,
        'no_repeat_ngram_size': 4,
        'bad_words_ids': bad_words_ids if bad_words_ids else None,
        'eos_token_id': tokenizer.eos_token_id,
        'pad_token_id': tokenizer.pad_token_id
    }

    # Resolve the device once instead of re-evaluating it for every tensor.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    qa_dataset = QADataset(questions, inputs, references, tokenizer)
    dataloader = DataLoader(qa_dataset, batch_size=1, shuffle=False)
    generated_outputs = []
    sample_number = 0
    # Pre-bind the names used by the error report so that a failure on the very
    # first batch cannot raise UnboundLocalError inside the except handler.
    question = input_text = None

    try:
        for batch in dataloader:
            sample_number += 1
            # Read the text fields first so the error report refers to this sample.
            question = batch['question'][0]
            input_text = batch['input_text'][0]
            reference = batch['reference'][0]
            prompt_length = batch['prompt_length'][0]
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            print(f"\n=== Sample {sample_number} ===")
            print(f"Instruction: {question}")
            print(f"Input: {input_text}")

            with torch.amp.autocast(device_type=device, dtype=torch.float16):
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    **generation_kwargs
                )

            # Decode only the positions after the first prompt_length tokens;
            # if there are none, fall back to decoding the whole sequence.
            if outputs.shape[1] > prompt_length:
                new_tokens = outputs[0, prompt_length:]
            else:
                print(f"Warning: No new tokens generated for sample {sample_number}")
                new_tokens = outputs[0]

            generated_answer = tokenizer.decode(new_tokens, skip_special_tokens=True)
            generated_answer = validate_answer(generated_answer)

            print(f"Generated Answer: {generated_answer}")
            print(f"Ground Truth Answer: {reference}")

            generated_outputs.append(generated_answer)

        print(f"\nProcessed {sample_number} samples")
        return generated_outputs

    except Exception as e:
        # Best effort: report the failure and hand back whatever was generated.
        print(f"Error during generation: {str(e)}")
        print(f"Stopped at sample {sample_number}")
        print(f"Problematic sample details: {question}, {input_text}")
        return generated_outputs

# Run generation over the evaluation samples; the answers are scored in the cells below.
generated_outputs = generate_responses(
    model=model,
    tokenizer=tokenizer,
    questions=questions,
    inputs=inputs,
    references=references,
)

In [None]:
import os
# Keep a key that is already exported in the environment. The placeholder is only
# a fallback when the variable is unset - replace it with your actual key
# (or export OPENAI_API_KEY before starting the kernel).
os.environ.setdefault("OPENAI_API_KEY", "OPENAI_API_KEY_FROM_ENV")

In [None]:
import re
import numpy as np
import openai
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util
from nltk.translate.meteor_score import meteor_score
import torch
from torch import cuda
from openai import OpenAI
import os

# Module-level OpenAI key, read from the environment (None when the variable is unset).
openai.api_key = os.environ.get("OPENAI_API_KEY")

def preprocess_text(text):
    """Normalize text for the lexical metrics.

    Lower-cases the input, replaces every character that is neither a word
    character nor whitespace with a space, collapses whitespace runs, then
    re-joins the ``word_tokenize`` tokens with single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text.lower())
    collapsed = re.sub(r'\s+', ' ', without_punctuation).strip()
    return ' '.join(word_tokenize(collapsed))

def compute_bleu_score(generated, reference):
    """Sentence-level BLEU of ``generated`` against a single ``reference``.

    Both strings are tokenized with ``word_tokenize``; smoothing uses
    ``SmoothingFunction().method4``.
    """
    hypothesis_tokens = word_tokenize(generated)
    reference_tokens = word_tokenize(reference)
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

def compute_hybrid_score(bert_f1, bleu, bert_weight=0.7):
    """Weighted sum of BERTScore F1 and BLEU.

    ``bert_weight`` is applied to ``bert_f1``; BLEU receives ``1 - bert_weight``.
    """
    bleu_weight = 1 - bert_weight
    return bert_weight * bert_f1 + bleu_weight * bleu

# Shared client for the GPT-4o scoring calls below; the key is read from the environment.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def call_gpt4o(prompt, max_tokens=10):
    """Send ``prompt`` to GPT-4o and parse the first number in the reply as a score.

    Args:
        prompt: Full user message asking for a score.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The parsed score as a float clamped to [0.0, 1.0] (the prompts in this
        notebook ask for a 0-1 score). Returns the neutral value 0.5 when the
        reply contains no number or when the request raises.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=max_tokens,
        )
        # Treat a missing message body like an empty reply instead of failing on .strip().
        reply = (response.choices[0].message.content or "").strip()
        match = re.search(r"\d*\.\d+|\d+", reply)
        if match is None:
            return 0.5

        raw_score = float(match.group(0))
        # Keep an out-of-scale reply (e.g. "8") from skewing the averaged metrics.
        score = min(1.0, max(0.0, raw_score))
        if score != raw_score:
            print(f"GPT-4o returned out-of-range score {raw_score}; clamped to {score}")
        return score
    except Exception as e:
        print(f"GPT-4o error: {e}")
        return 0.5

def compute_llm_judge_score(generated, reference, question):
    """Ask GPT-4o, as a medical evaluator, to score ``generated`` against ``reference``.

    Returns whatever ``call_gpt4o`` parses from the reply.
    """
    prompt_parts = [
        f"You are an expert medical evaluator. Score the following generated answer from 0 to 1 ",
        f"based on factual accuracy and clinical relevance to the reference.\n\n",
        f"Question: {question}\n",
        f"Generated Answer: {generated}\n",
        f"Reference Answer: {reference}\n\n",
        f"Just reply with a score (e.g., 0.73).",
    ]
    judge_prompt = "".join(prompt_parts)
    return call_gpt4o(judge_prompt)

def compute_geval_score(generated, reference, question):
    """Ask GPT-4o to rate the clinical quality of ``generated`` (correctness, completeness, clarity).

    Returns whatever ``call_gpt4o`` parses from the reply.
    """
    prompt_parts = [
        f"You are evaluating the clinical quality of a generated answer. Consider factual correctness, completeness, and clarity. ",
        f"Score it from 0 to 1.\n\n",
        f"Question: {question}\n",
        f"Generated Answer: {generated}\n",
        f"Reference Answer: {reference}\n\n",
        f"Reply with a score (e.g., 0.82).",
    ]
    geval_prompt = "".join(prompt_parts)
    return call_gpt4o(geval_prompt)

# (result key, printed label) for every metric, in the order they are reported.
_METRIC_LABELS = (
    ("bert_precision", "BERTScore Precision"),
    ("bert_recall", "BERTScore Recall"),
    ("bert_f1", "BERTScore F1"),
    ("rouge_l", "ROUGE-L"),
    ("fcs", "FCS"),
    ("entity_f1", "Entity F1"),
    ("mcr", "MCR"),
    ("bleu", "BLEU"),
    ("hybrid", "Hybrid BERT-BLEU"),
    ("meteor", "METEOR"),
    ("llm_judge", "LLM Judge Score (GPT-4o)"),
    ("geval", "GEval Score (GPT-4o)"),
)


def _medical_entities(text, nlp):
    """Lower-cased texts of the DISEASE/CHEMICAL entities ``nlp`` finds in ``text``."""
    return {ent.text.lower() for ent in nlp(text).ents if ent.label_ in ("DISEASE", "CHEMICAL")}


def _entity_f1_and_recall(gen_entities, ref_entities):
    """Return (entity F1, medical concept recall) for two entity sets."""
    if not ref_entities:
        # Nothing to find: perfect only if the answer also mentions no entities.
        score = 1.0 if not gen_entities else 0.0
        return score, score
    overlap = len(gen_entities & ref_entities)
    precision = overlap / len(gen_entities) if gen_entities else 0
    recall = overlap / len(ref_entities)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1, recall


def compute_metrics_per_query(generated_outputs, references, questions, nlp, embedder):
    """Score every generated answer against its reference and print the results.

    Prints one block of metrics per query followed by the averages over all
    queries. The three input sequences are zipped, so extra items in the
    longer ones are ignored.

    Args:
        generated_outputs: Generated answer strings.
        references: Reference answer strings.
        questions: Question strings (used in the printout and the GPT-4o prompts).
        nlp: Callable returning an object with ``.ents``; entities labelled
            DISEASE or CHEMICAL are used for Entity F1 and MCR.
        embedder: Object with ``encode(text, convert_to_tensor=True, device=...)``
            used for the cosine-similarity (FCS) score.

    Returns:
        Dict mapping each metric key in ``_METRIC_LABELS`` to the list of
        per-query scores. The lists are empty when there is nothing to evaluate.
    """
    samples = list(zip(generated_outputs, references, questions))
    scores = {key: [] for key, _ in _METRIC_LABELS}

    print(f"\n=== Per-Query Evaluation Metrics ===")
    if not samples:
        # Avoid calling the scorers and averaging empty lists.
        print("No samples to evaluate.")
        return scores

    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    device = "cuda" if cuda.is_available() else "cpu"

    gen_norms = [preprocess_text(gen) for gen, _, _ in samples]
    ref_norms = [preprocess_text(ref) for _, ref, _ in samples]

    # One batched BERTScore call for all pairs instead of one call per sample.
    bert_p_all, bert_r_all, bert_f1_all = bert_score(
        gen_norms, ref_norms, lang="en", model_type="roberta-large"
    )

    for i, (gen, ref, question) in enumerate(samples, 1):
        gen_norm = gen_norms[i - 1]
        ref_norm = ref_norms[i - 1]
        bert_f1 = bert_f1_all[i - 1].item()

        # Entity F1 and MCR (Medical Concept Recall) share the same entity sets.
        gen_entities = _medical_entities(gen, nlp)
        ref_entities = _medical_entities(ref, nlp)
        entity_f1, mcr = _entity_f1_and_recall(gen_entities, ref_entities)

        # FCS (Fact Checking Score): cosine similarity of the two sentence embeddings.
        gen_embedding = embedder.encode(gen, convert_to_tensor=True, device=device)
        ref_embedding = embedder.encode(ref, convert_to_tensor=True, device=device)
        fcs = util.cos_sim(gen_embedding, ref_embedding)[0][0].item()

        bleu = compute_bleu_score(gen_norm, ref_norm)

        sample_scores = {
            "bert_precision": bert_p_all[i - 1].item(),
            "bert_recall": bert_r_all[i - 1].item(),
            "bert_f1": bert_f1,
            "rouge_l": rouge_scorer_obj.score(ref_norm, gen_norm)['rougeL'].fmeasure,
            "fcs": fcs,
            "entity_f1": entity_f1,
            "mcr": mcr,
            "bleu": bleu,
            "hybrid": compute_hybrid_score(bert_f1, bleu),
            "meteor": meteor_score([word_tokenize(ref_norm)], word_tokenize(gen_norm)),
            "llm_judge": compute_llm_judge_score(gen, ref, question),
            "geval": compute_geval_score(gen, ref, question),
        }
        for key, value in sample_scores.items():
            scores[key].append(value)

        print(f"\nSample {i}: {question}")
        print(f"Generated Answer: {gen}")
        print(f"Reference Answer: {ref}")
        for key, label in _METRIC_LABELS:
            print(f"{label}: {sample_scores[key]:.4f}")

    print("\n=== Average Metrics Across All Queries ===")
    for key, label in _METRIC_LABELS:
        print(f"Average {label}: {np.mean(scores[key]):.4f}")

    return scores


In [None]:

# Score the generated answers, unless generation produced nothing.
if not generated_outputs:
    print("No outputs generated due to error.")
else:
    compute_metrics_per_query(generated_outputs, references, questions, nlp, embedder)

# Release cached GPU memory now that evaluation has finished.
torch.cuda.empty_cache()