# Hallucination detection progress using Flan-T5-Large
### Student Name:
Samin Adhikari

### Student ID:
223606554

### Traget Score:
Credit

### What's in this file.
- Progress upto benchmarking of normal vs parameterized model.
- Uses Hugging Face model downloaded using token and run locally.

In [13]:
# Install deps
!pip install transformers torch
!pip install evaluate
!pip install bert-score
!pip install absl-py
!pip install rouge-score



In [41]:
# All the modules used for the project
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import evaluate
from huggingface_hub import login
from tqdm import tqdm

# Suppress logging
transformers.logging.set_verbosity_error()

In [43]:
# Hugging face token config
token = ""
login(token)

In [59]:
# Load model - default is google's flan-t5-large
def load_model_and_tokenizer(model_name="google/flan-t5-large"):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Default model used throuhout
def get_default_model_tokenizer():
    model_name = "google/flan-t5-large"
    model, tokenizer = load_model_and_tokenizer(model_name)
    return model, tokenizer

# Model with no specific hyperparameter
def generate_response(model, tokenizer, input_text, max_length=50):   
    # prompt = f"Please explain this: {input_text}"
    # inputs = tokenizer(prompt, return_tensors="pt")
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(
      inputs["input_ids"],
      max_length=max_length
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Some parameter tuning along with prompt addition
def generate_parameterized_response(model, tokenizer, input_text, max_length=100, temperature=0.3):

    prompt = f"""
      Answer the following medical question using evidence-based clinical guidelines from trusted sources like PubMed, FDA, ACOG, WHO, etc. Provide relevant recommendations from these sources and explain the reasoning behind the answer. Include any conditions, risks, or precautions, and clearly state if further consultation is needed. If no evidence is available, explain the limitations of the available data.

      Question: {input_text}
    """
    
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
      inputs["input_ids"],
      max_length=max_length,
      temperature=temperature,
      do_sample=True, # Disables randomness for more reliable outputs
      num_beams=5, # Increases search space for high-quality answers
      repetition_penalty=1.2,  # Reduces hallucination by discouraging word overuse
      length_penalty=1.0,  # Maintains natural sentence flow
      early_stopping=True,  # Stops generation when an answer is complete
      num_return_sequences=1  # Returns only the best response
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [5]:
# Benchmarching
# Load all required Hugging Face metrics
bleu_metric = evaluate.load("bleu")
meteor_metric = evaluate.load("meteor")
rouge_metric = evaluate.load("rouge")
bertscore_metric = evaluate.load("bertscore")

def evaluate_metrics(predictions, references):
    # BLEU expects references as list of list of tokens
    bleu_result = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])
    meteor_result = meteor_metric.compute(predictions=predictions, references=references)
    rouge_result = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
    bert_result = bertscore_metric.compute(predictions=predictions, references=references, lang="en")

    return {
        "BLEU": bleu_result["bleu"],
        "METEOR": meteor_result["meteor"],
        "ROUGE-1": rouge_result["rouge1"],
        "ROUGE-L": rouge_result["rougeL"],
        "BERTScore-F1": sum(bert_result["f1"]) / len(bert_result["f1"]),
    }

def benchmark_model(generate_func, model, tokenizer, data):
    predictions, references = [], []

    for qa in tqdm(data):
        question = qa["question"]
        reference = qa["answer"]
        generated = generate_func(model, tokenizer, question)
        predictions.append(generated)
        references.append(reference)

    return evaluate_metrics(predictions, references)

def load_benchmark_data(path="data/question.json", limit=50):
    with open(path, "r") as f:
        data = json.load(f)
    return data[:limit]

[nltk_data] Downloading package wordnet to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [79]:
# Main execution step for the application

# Starts default unparameterized model
def start_default_qa_system():
    model, tokenizer = get_default_model_tokenizer()
    print("Welcome to the Medical QA system. Ask your medical question (or type 'exit' to quit).")
    
    while True:
        # Ask the user for a question
        input_text = input("Please enter your medical question: ")
        
        # Exit condition
        if input_text.lower() == "exit":
            print("Exiting the Medical QA system. Goodbye!")
            break
        
        # Generate and display the answer
        response = generate_response(model, tokenizer, input_text)
        print("\nAnswer:", response)
        print("\n---\n")

# Starts benchmarking of default vs parameterized model
def start_benchmark_system():
    print("Loading model...")
    model, tokenizer = get_default_model_tokenizer()

    print("Loading data...")
    data = load_benchmark_data()

    print("\n[1] Benchmarking default model...")
    default_results = benchmark_model(generate_response, model, tokenizer, data)
    print("\nDefault Results:")
    for k, v in default_results.items():
        print(f"{k}: {v:.4f}")

    print("\n[2] Benchmarking parameterized model...")
    param_results = benchmark_model(generate_parameterized_response, model, tokenizer, data)
    print("\nParameterized Results:")
    for k, v in param_results.items():
        print(f"{k}: {v:.4f}")

    print("\n--- COMPARISON ---")
    for metric in default_results:
        diff = param_results[metric] - default_results[metric]
        print(f"{metric}: Δ {diff:.4f} (Improvement: {'Yes' if diff > 0 else 'No'})")

# Main Call
print("Choose an option:")
print("1: Load default model")
print("2: Benchmark default and parameterized model")
choice = input("Enter the number corresponding to your choice: ")

if choice == '1':
    start_default_qa_system()
elif choice == '2':
    start_benchmark_system()
else:
    print("Invalid choice! Please enter a valid number.")

Choose an option:
1: Load default model
2: Benchmark default and parameterized model


Enter the number corresponding to your choice:  2


Loading model...
Loading data...

[1] Benchmarking default model...


100%|███████████████████████████████████████████| 50/50 [00:48<00:00,  1.02it/s]



Default Results:
BLEU: 0.0038
METEOR: 0.0577
ROUGE-1: 0.1205
ROUGE-L: 0.1086
BERTScore-F1: 0.8448

[2] Benchmarking parameterized model...


100%|███████████████████████████████████████████| 50/50 [08:45<00:00, 10.50s/it]



Parameterized Results:
BLEU: 0.0512
METEOR: 0.2239
ROUGE-1: 0.2532
ROUGE-L: 0.1998
BERTScore-F1: 0.8727

--- COMPARISON ---
BLEU: Δ 0.0475 (Improvement: Yes)
METEOR: Δ 0.1662 (Improvement: Yes)
ROUGE-1: Δ 0.1326 (Improvement: Yes)
ROUGE-L: Δ 0.0912 (Improvement: Yes)
BERTScore-F1: Δ 0.0279 (Improvement: Yes)


In [15]:
# Additional package install for next training
!pip install transformers torch datasets accelerate peft bitsandbytes



In [42]:
# Fix issue with tensorflow module import
!pip uninstall keras -y
!pip install tensorflow-cpu==2.16.1
!pip install tf-keras==2.16.0 --no-dependencies

Found existing installation: keras 3.9.2
Uninstalling keras-3.9.2:
  Successfully uninstalled keras-3.9.2
[31mERROR: Could not find a version that satisfies the requirement tensorflow-cpu==2.16.1 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow-cpu==2.16.1[0m[31m
[0mCollecting tf-keras==2.16.0
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: tf-keras
Successfully installed tf-keras-2.16.0


In [73]:
import os

# Disable Tensorflow completely
# os.environ["TRANSFORMERS_NO_TF"] = "1"
# os.environ["USE_TF"] = "0"

In [1]:
# Lora and SFT training imports
import json
import random

import torch
from torch.utils.data import DataLoader

from datasets import Dataset
from tqdm import tqdm

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

In [97]:
# Get random data by default for initial testing with 20% training set
def load_random_benchmark_data(path="data/question.json", limit=50):
    with open(path, "r") as f:
        data = json.load(f)
    
    random.shuffle(data)
    limited_data = data[:limit]

    split_point = int(0.2 * limit)
    test_data = limited_data[:split_point]
    train_data = limited_data[split_point:]

    return train_data, test_data

# Load 5 random QnA pairs
train_data, test_data = load_random_benchmark_data(limit=50)
print("Train size:", len(train_data))  # 20
print("Test size:", len(test_data)) # 80

Train size: 40
Test size: 10


In [25]:
# Load FLAN-T5-Large
model_name = "google/flan-t5-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Configure LoRA
lora_config = LoraConfig(
    r=32,  # Increased rank for richer adaptations
    lora_alpha=32,
    target_modules=["q", "v", "k", "dense_h_to_4h"],  # Additional attention heads and FFN layers
    lora_dropout=0.1,  # Slightly higher dropout to prevent overfitting
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 14,155,776 || all params: 797,305,856 || trainable%: 1.7755


In [27]:
# Wrap into HuggingFace Dataset
dataset = Dataset.from_list(train_data)

# Tokenization function
def tokenize_function(example):
    prompt = example['question']
    target = example['answer']

    inputs = tokenizer(prompt, padding="max_length", truncation=True, max_length=100)
    targets = tokenizer(target, padding="max_length", truncation=True, max_length=100)

    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=False)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [35]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=20,
    learning_rate=1e-4,
    logging_dir="./logs",
    report_to="none",
    # fp16=False,  # Disable fp16 to reduce calculation time or due to computation limit
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Start fine-tuning
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


TrainOutput(global_step=400, training_loss=5.913037719726563, metrics={'train_runtime': 218.2729, 'train_samples_per_second': 3.665, 'train_steps_per_second': 1.833, 'total_flos': 366914764800000.0, 'train_loss': 5.913037719726563, 'epoch': 20.0})

In [37]:
# Save trained and finetuned data
model.save_pretrained("./lora_finetuned")
tokenizer.save_pretrained("./lora_finetuned")

('./lora_finetuned/tokenizer_config.json',
 './lora_finetuned/special_tokens_map.json',
 './lora_finetuned/tokenizer.json')

In [113]:
import json
import random
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# 1. Load your fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./lora_finetuned")
tokenizer = AutoTokenizer.from_pretrained("./lora_finetuned")

# 2. Create a text2text-generation pipeline
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=100)
# pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

recheck_train = train_data[:10]

# 5. Also check answer with default uuparameterized model
default_model, default_tokenizer = get_default_model_tokenizer()

# 6. Test and print each question, true answer, and model answer
for i, example in enumerate(recheck_train):
    question = example["question"]
    true_answer = example["answer"]
    
    retrained_model_output = pipe(question)[0]['generated_text']
    # model_output = pipe(question, max_new_tokens=50)[0]['generated_text']
    default_model_output = generate_response(default_model, default_tokenizer, question, device)
    
    print(f"\n=== Example {i+1} ===")
    print(f"Question      : {question}")
    print(f"True Answer   : {true_answer}")
    print(f"Default Model Answer : {default_model_output}")
    print(f"Model Answer  : {retrained_model_output}")
    print("="*40)


Loading default model: google/flan-t5-large

=== Example 1 ===
Question      : What are good dietary sources of iron?
True Answer   : Good sources include red meat, poultry, lentils, beans, leafy green vegetables, and iron-fortified cereals.
Default Model Answer : iron-rich foods
Model Answer  : Nutritional iron supplements include iron supplements, iron supplements, and iron supplements.

=== Example 2 ===
Question      : What are signs that you should see a doctor for stomach pain?
True Answer   : Signs include severe pain, pain that lasts several days, accompanying fever, difficulty breathing, vomiting, and signs of dehydration.
Default Model Answer : If you have a sour taste in your mouth, a burning sensation in your stomach, or a burning sensation in your intestines
Model Answer  : symptoms of GERD include abdominal pain, abdominal tenderness, and abdominal tenderness.

=== Example 3 ===
Question      : How do you safely dispose of expired medications in Australia?
True Answer   :

In [103]:
import json
import random
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import evaluate
import torch

import warnings
warnings.filterwarnings('ignore')

# --- Added: Device detection ---
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
# --- End Added ---

# 1. Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./lora_finetuned").to(device)
tokenizer = AutoTokenizer.from_pretrained("./lora_finetuned")

# 2. Create a text2text-generation pipeline for retrained model (Optional, not used in benchmarking loop below)
# If you wanted to use the pipeline, you'd specify the device here too:
# pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=100, device=device)

# 3. Benchmarking function using evaluate
def benchmark_model(model_to_bench, tokenizer_to_use, dataset): # Renamed args for clarity
    # Initialize the evaluate library metrics
    bleu = evaluate.load("bleu")
    meteor = evaluate.load("meteor")
    rouge = evaluate.load("rouge")
    # Ensure bertscore uses the correct device if available
    bertscore = evaluate.load("bertscore") # Removed device=device, let bertscore handle internal device logic or defaults

    predictions = []
    true_answers = []

    # Ensure model is in evaluation mode and on the correct device
    model_to_bench.eval()
    model_to_bench.to(device) # Ensure it's on the correct device within the function scope too

    # Disable gradient calculation for efficiency during inference
    with torch.no_grad():
        for example in dataset:
            question = example["question"]
            true_answer = example["answer"]

            # Generate output from the model
            # Pass the device to the generation function
            model_output = generate_response(model_to_bench, tokenizer_to_use, question, device)

            predictions.append(model_output)
            true_answers.append(true_answer)

    # Calculate the metrics
    # Note: Some metrics might be slow on MPS, run on CPU if needed by moving tensors back
    # However, let's try default first. If bertscore specifically fails/is slow, adjust it.
    print("Calculating BLEU...")
    bleu_result = bleu.compute(predictions=predictions, references=[[ans] for ans in true_answers])
    print("Calculating METEOR...")
    meteor_result = meteor.compute(predictions=predictions, references=true_answers)
    print("Calculating ROUGE...")
    rouge_result = rouge.compute(predictions=predictions, references=[[ans] for ans in true_answers])
    print("Calculating BERTScore...")
    # BERTScore might need explicit CPU usage if MPS causes issues
    try:
        bertscore_result = bertscore.compute(predictions=predictions, references=true_answers, lang='en', device=device)
        bertscore_f1_mean = torch.mean(torch.tensor(bertscore_result["f1"])).item()
    except Exception as e:
        print(f"BERTScore failed on {device}, attempting CPU: {e}")
        # If bertscore fails on MPS/GPU, try running it on CPU
        bertscore_result = bertscore.compute(predictions=predictions, references=true_answers, lang='en', device='cpu') # Force CPU
        bertscore_f1_mean = torch.mean(torch.tensor(bertscore_result["f1"])).item()


    # Collect all results
    results = {
        "BLEU": bleu_result["bleu"],
        "METEOR": meteor_result["meteor"],
        "ROUGE-1": rouge_result["rouge1"],
        "ROUGE-L": rouge_result["rougeL"],
        "BERTScore-F1": bertscore_f1_mean
    }

    return results

# 4. Function to generate the response (for both models)
def generate_response(model_gen, tokenizer_gen, question, device_gen):
    inputs = tokenizer_gen(question, return_tensors="pt").to(device_gen) # Move inputs to the specified device
    outputs = model_gen.generate(**inputs, max_length=100) # Use **inputs to pass both input_ids and attention_mask if present
    return tokenizer_gen.decode(outputs[0], skip_special_tokens=True)

# Helper function to load the default model - recreated to avoid running old cell which didn't consider device type
def get_default_model_tokenizer(model_name="google/flan-t5-large"):
    print(f"Loading default model: {model_name}")
    default_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    default_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return default_model, default_tokenizer

# 5. Compare default model vs retrained model on recheck_train and test_data
def compare_models_on_data(recheck_train, test_data): # Pass data as arguments
    # Load default model
    # IMPORTANT: Make sure 't5-small' here matches the base model used for your LoRA fine-tuning
    default_model, default_tokenizer = get_default_model_tokenizer("google/flan-t5-large") # <<< CHANGE IF NEEDED

    # Benchmark on recheck_train
    print("\nBenchmarking default model on recheck_train...")
    default_results = benchmark_model(default_model, default_tokenizer, recheck_train)
    print("\nBenchmarking retrained model on recheck_train...")
    retrained_results = benchmark_model(model, tokenizer, recheck_train) # model is already loaded and on device

    print("\n--- RECHECK_TRAIN COMPARISON ---")
    for metric in default_results:
        diff = retrained_results[metric] - default_results[metric]
        print(f"{metric}: Default Model - {default_results[metric]:.4f}, Retrained Model - {retrained_results[metric]:.4f}, Δ {diff:+.4f} (Improvement: {'Yes' if diff > 0 else 'No'})")

    # Benchmark on test_data
    print("\nBenchmarking default model on test_data...")
    default_results_test = benchmark_model(default_model, default_tokenizer, test_data)
    print("\nBenchmarking retrained model on test_data...")
    retrained_results_test = benchmark_model(model, tokenizer, test_data)

    print("\n--- TEST_DATA COMPARISON ---")
    for metric in default_results_test:
        diff = retrained_results_test[metric] - default_results_test[metric]
        print(f"{metric}: Default Model - {default_results_test[metric]:.4f}, Retrained Model - {retrained_results_test[metric]:.4f}, Δ {diff:+.4f} (Improvement: {'Yes' if diff > 0 else 'No'})")

# 6. Main execution for comparing the models
compare_models_on_data(recheck_train, test_data) # Pass loaded data

print("\nBenchmarking finished.")

Using device: mps
Loading default model: google/flan-t5-large

Benchmarking default model on recheck_train...


[nltk_data] Downloading package wordnet to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Calculating BLEU...
Calculating METEOR...
Calculating ROUGE...
Calculating BERTScore...

Benchmarking retrained model on recheck_train...


[nltk_data] Downloading package wordnet to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Calculating BLEU...
Calculating METEOR...
Calculating ROUGE...
Calculating BERTScore...

--- RECHECK_TRAIN COMPARISON ---
BLEU: Default Model - 0.0000, Retrained Model - 0.0631, Δ +0.0631 (Improvement: Yes)
METEOR: Default Model - 0.0618, Retrained Model - 0.2043, Δ +0.1425 (Improvement: Yes)
ROUGE-1: Default Model - 0.1113, Retrained Model - 0.2174, Δ +0.1061 (Improvement: Yes)
ROUGE-L: Default Model - 0.0929, Retrained Model - 0.1689, Δ +0.0760 (Improvement: Yes)
BERTScore-F1: Default Model - 0.8509, Retrained Model - 0.8664, Δ +0.0155 (Improvement: Yes)

Benchmarking default model on test_data...


[nltk_data] Downloading package wordnet to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Calculating BLEU...
Calculating METEOR...
Calculating ROUGE...
Calculating BERTScore...

Benchmarking retrained model on test_data...


[nltk_data] Downloading package wordnet to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/rvlife-
[nltk_data]     samin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Calculating BLEU...
Calculating METEOR...
Calculating ROUGE...
Calculating BERTScore...

--- TEST_DATA COMPARISON ---
BLEU: Default Model - 0.0000, Retrained Model - 0.0524, Δ +0.0524 (Improvement: Yes)
METEOR: Default Model - 0.0579, Retrained Model - 0.2235, Δ +0.1656 (Improvement: Yes)
ROUGE-1: Default Model - 0.1100, Retrained Model - 0.2104, Δ +0.1004 (Improvement: Yes)
ROUGE-L: Default Model - 0.0924, Retrained Model - 0.1695, Δ +0.0771 (Improvement: Yes)
BERTScore-F1: Default Model - 0.8489, Retrained Model - 0.8612, Δ +0.0123 (Improvement: Yes)

Benchmarking finished.
