In [None]:
# ======= Install & Setup =======
!pip install -q transformers

import os
import gc
import re
import torch
import torch.nn as nn
import numpy as np
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    get_scheduler
)
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
import multiprocessing
import random

# ======= Environment Setup =======
# Allocator option read by PyTorch through this environment variable.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

# One seed value shared by every random number generator used in the notebook.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Permit reduced-precision reductions in bf16 matmuls (if supported).
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True

# ======= Config =======
# Data locations and base model.
train_path = "/kaggle/input/cleaned/trained_dataset.json"
valid_path = "/kaggle/input/cleaned/valid_dataset.json"
model_name = "deepset/roberta-base-squad2"

# Batching and tokenization window.
train_batch_size = 16
eval_batch_size = 80
max_length = 512
stride = 300

# Optimization schedule.
accumulation_steps = 10
epochs = 1
early_stopping_patience = 1
lr = 5e-5

# Worker processes used for dataset preprocessing.
num_proc = min(10, multiprocessing.cpu_count())

# ======= Output dirs =======
output_dir = "/kaggle/working/"
model_save_dir = os.path.join(output_dir, "best_model")
checkpoint_dir = os.path.join(output_dir, "checkpoints")
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

# Checkpoint directory to resume from; a falsy value disables resuming.
resume_checkpoint = "/kaggle/working/checkpoints/checkpoint-epoch1"

# ======= Load dataset =======
# Both splits are JSON files; map each split name to its path and load them together.
data_files = {"train": train_path, "validation": valid_path}
raw_datasets = load_dataset("json", data_files=data_files)
train_dataset, valid_dataset = raw_datasets["train"], raw_datasets["validation"]

# ======= Tokenizer & Model =======
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ======= Resume checkpoint =======
# Decide where the weights come from BEFORE loading anything, so that only one
# model is ever instantiated and moved to the device.
load_path = model_name
if resume_checkpoint:
    # os.path.join returns `resume_checkpoint` unchanged when it is an absolute path.
    ckpt_path = os.path.join(checkpoint_dir, resume_checkpoint)
    if os.path.isdir(ckpt_path):
        print(f"🔁 Resuming from checkpoint: {ckpt_path}")
        load_path = ckpt_path
    else:
        # A fresh run has no checkpoint yet: fall back to the base model instead
        # of crashing, and clear the flag so later code starts at epoch 0.
        print(f"⚠️ Checkpoint not found, starting from '{model_name}': {ckpt_path}")
        resume_checkpoint = None

tokenizer = AutoTokenizer.from_pretrained(load_path)
model = AutoModelForQuestionAnswering.from_pretrained(load_path).to(device)

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    print(f"Using {torch.cuda.device_count()} GPUs")
# ======= Preprocessing =======
def preprocess_function(examples):
    """Tokenize a batch of QA examples into fixed-size windows with span labels.

    Each question/context pair is tokenized with the module-level ``tokenizer``
    using ``max_length`` and ``stride``; contexts that do not fit are split into
    several overlapping windows. For every window the token indices of the
    answer's first and last token are stored in ``start_positions`` and
    ``end_positions``. Windows that do not fully contain the answer, and
    examples without an answer, are labelled with the CLS token index.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` columns; every ``answers`` entry has ``"text"`` and
            ``"answer_start"`` lists, of which only the first element is used.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added and ``overflow_to_sample_mapping`` / ``offset_mapping`` removed.
    """
    tok = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True
    )
    sample_map = tok.pop("overflow_to_sample_mapping")
    offset_map = tok.pop("offset_mapping")
    starts, ends = [], []

    for i, offsets in enumerate(offset_map):
        ids = tok["input_ids"][i]
        cls_index = ids.index(tokenizer.cls_token_id)
        seq_ids = tok.sequence_ids(i)
        # Several windows can originate from the same source example.
        answers = examples["answers"][sample_map[i]]

        # No answer for this example: point both labels at CLS.
        if not answers["answer_start"]:
            starts.append(cls_index)
            ends.append(cls_index)
            continue

        s_char = answers["answer_start"][0]
        e_char = s_char + len(answers["text"][0])

        # First and last token index belonging to the context (sequence id 1).
        ctx_start = next(j for j, sid in enumerate(seq_ids) if sid == 1)
        ctx_end = len(ids) - 1
        while seq_ids[ctx_end] != 1:
            ctx_end -= 1

        # The answer is not fully inside this window: label it as "no answer".
        if not (offsets[ctx_start][0] <= s_char and offsets[ctx_end][1] >= e_char):
            starts.append(cls_index)
            ends.append(cls_index)
            continue

        # Both scans are confined to the context tokens. Scanning past ctx_end
        # would run into separator/padding tokens whose offsets are (0, 0) and
        # always satisfy `<= s_char`, pushing the label onto the last pad token.
        ts = ctx_start
        while ts <= ctx_end and offsets[ts][0] <= s_char:
            ts += 1
        starts.append(ts - 1)

        te = ctx_end
        while te >= ctx_start and offsets[te][1] >= e_char:
            te -= 1
        ends.append(te + 1)

    tok["start_positions"] = starts
    tok["end_positions"] = ends
    return tok

# Tokenize both splits with the same settings; the raw columns are dropped so
# that only the tokenizer outputs and the span labels remain.
map_kwargs = {"batched": True, "num_proc": num_proc}

tokenized_train = train_dataset.map(
    preprocess_function,
    remove_columns=train_dataset.column_names,
    **map_kwargs,
)
tokenized_valid = valid_dataset.map(
    preprocess_function,
    remove_columns=valid_dataset.column_names,
    **map_kwargs,
)

def collate_fn(batch):
    """Stack a list of feature dicts into a dict of tensors.

    The keys are taken from the first item; every item is expected to provide
    the same keys.
    """
    collated = {}
    for key in batch[0]:
        column = [sample[key] for sample in batch]
        collated[key] = torch.tensor(column)
    return collated

def save_checkpoint(model, tokenizer, epoch):
    """Write model and tokenizer to ``checkpoint-epoch{epoch+1}`` under ``checkpoint_dir``.

    ``epoch`` is zero-based, so the directory name carries the one-based number.
    When ``model`` exposes a ``module`` attribute (e.g. a DataParallel wrapper),
    the wrapped model is the one that gets saved.
    """
    target_dir = os.path.join(checkpoint_dir, f"checkpoint-epoch{epoch+1}")
    os.makedirs(target_dir, exist_ok=True)

    unwrapped = getattr(model, "module", model)
    unwrapped.save_pretrained(target_dir)
    tokenizer.save_pretrained(target_dir)

    print(f"💾 Saved checkpoint → {target_dir}")

# Settings shared by both loaders.
loader_kwargs = {"collate_fn": collate_fn, "num_workers": 2}

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(
    tokenized_train, batch_size=train_batch_size, shuffle=True, **loader_kwargs
)
valid_loader = DataLoader(
    tokenized_valid, batch_size=eval_batch_size, shuffle=False, **loader_kwargs
)

# ======= Optimizer & Scheduler =======
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    betas=(0.9, 0.999),
    eps=1e-8
)

# When resuming, the number of already finished epochs is parsed from the
# checkpoint name ("...epoch<N>"). Optimizer state is not restored.
start_epoch = 0
if resume_checkpoint:
    match = re.search(r"epoch(\d+)", resume_checkpoint)
    if match:
        start_epoch = int(match.group(1))

effective_epochs = max(epochs - start_epoch, 0)
if effective_epochs == 0:
    print(f"⚠️ Nothing to train: start epoch {start_epoch} >= epochs {epochs}.")

# total_steps counts batches (progress bar); the scheduler must instead count
# optimizer updates, which happen once per accumulation group.
total_steps = len(train_loader) * effective_epochs
updates_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
num_update_steps = updates_per_epoch * effective_epochs

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_update_steps
)

scaler = GradScaler()
train_losses, val_losses = [], []
best_val = float("inf")
no_improve = 0
pbar = tqdm(total=total_steps, desc="Training")

# ======= Training Loop =======
for epoch in range(start_epoch, epochs):
    model.train()
    running_loss = 0.0
    num_batches = len(train_loader)
    optimizer.zero_grad()

    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with autocast():
            out = model(**batch)
            loss = out.loss
            # DataParallel returns one loss per replica; reduce to a scalar.
            if loss.dim() > 0:
                loss = loss.mean()

        # Log the true per-batch loss; only the backward pass uses the value
        # scaled down for gradient accumulation.
        running_loss += loss.item()
        scaler.scale(loss / accumulation_steps).backward()

        if (step + 1) % accumulation_steps == 0 or (step + 1) == num_batches:
            scaler.step(optimizer)
            scaler.update()
            # Advance the learning-rate schedule once per optimizer update.
            lr_scheduler.step()
            optimizer.zero_grad()
        pbar.update(1)

    avg_tr = running_loss / max(num_batches, 1)
    train_losses.append(avg_tr)
    save_checkpoint(model, tokenizer, epoch)

    # ======= Validation =======
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in valid_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with autocast():
                tmp = model(**batch).loss
                if tmp.dim() > 0:
                    tmp = tmp.mean()
                val_loss += tmp.item()

    avg_val = val_loss / max(len(valid_loader), 1)
    val_losses.append(avg_val)
    print(f"Epoch {epoch+1}/{epochs} → Train Loss: {avg_tr:.4f} | Val Loss: {avg_val:.4f}")

    # Save the unwrapped model when the validation loss improves; otherwise
    # count towards early stopping.
    mdl = model.module if hasattr(model, "module") else model
    if avg_val < best_val:
        best_val = avg_val
        no_improve = 0
        mdl.save_pretrained(model_save_dir)
        tokenizer.save_pretrained(model_save_dir)
        print("✅ Saved best model so far.")
    else:
        no_improve += 1
        print(f"📉 No improvement for {no_improve} epoch(s).")
        if no_improve >= early_stopping_patience:
            print("🛑 Early stopping.")
            break

pbar.close()

print("✅ Done. Artifacts saved:")
print(f"  • Model/tokenizer → {model_save_dir}")


In [None]:
# ======= Evaluation (EM / P / R / F1 / BLEU) =======
# VERSION 1:
!pip install nltk
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# ======= Evaluation (EM / P / R / F1 / BLEU) =======
def compute_exact(p, t):
    """Return 1 if prediction and truth match after strip + lowercase, else 0."""
    normalized_pred = p.strip().lower()
    normalized_truth = t.strip().lower()
    return 1 if normalized_pred == normalized_truth else 0

def compute_f1_precision_recall(p, t):
    """Token-overlap scores between prediction ``p`` and truth ``t``.

    Both strings are lowercased and split on whitespace. Returns the tuple
    ``(f1, precision, recall)``; all three are 0.0 when no token is shared.
    """
    pred_tokens = p.lower().split()
    true_tokens = t.lower().split()

    # Multiset intersection: each shared token counts min(pred count, truth count).
    overlap = sum((Counter(pred_tokens) & Counter(true_tokens)).values())
    if not overlap:
        return 0.0, 0.0, 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(true_tokens)
    return 2 * precision * recall / (precision + recall), precision, recall
    
# Evaluation function
def evaluate(model, tokenizer, dataloader, dataset, max_samples=1000):
    """Print EM / precision / recall / F1 / BLEU (in %) for up to ``max_samples`` items.

    The predicted span of each item is the argmax start/end token pair,
    decoded back to text and compared with the first reference answer of the
    dataset row at the same running index.

    Items without a reference answer are not scored. An empty prediction for
    an item that has a reference is scored as a miss (all metrics 0) rather
    than being dropped, so wrong "no answer" predictions lower the averages.

    NOTE(review): references are indexed per ``dataset`` row while
    ``dataloader`` yields tokenized items; the two stay aligned only if every
    row produced exactly one item — confirm against how the loader was built.

    Args:
        model: QA model returning ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer used to decode the predicted token span.
        dataloader: Yields dicts of tensors accepted by ``model(**batch)``.
        dataset: Rows with an ``answers`` dict holding a ``text`` list.
        max_samples: Upper bound on the number of items visited.
    """
    model.eval()
    truths = [itm["answers"]["text"][0] if itm["answers"]["text"] else "" for itm in dataset]
    # Never read past the reference list, even if the loader yields more items.
    limit = min(max_samples, len(truths))
    smoothing = SmoothingFunction().method1
    EMs, Ps, Rs, F1s, BLEUs = [], [], [], [], []
    count = 0

    for batch in dataloader:
        if count >= limit:
            break
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            out = model(**batch)
        starts = torch.argmax(out.start_logits, dim=-1).cpu()
        ends = torch.argmax(out.end_logits, dim=-1).cpu()

        for ids, s, e in zip(batch["input_ids"].cpu(), starts, ends):
            if count >= limit:
                break
            true = truths[count].strip()
            count += 1
            if not true:
                continue

            # An inverted span (start > end) yields an empty slice, i.e. an
            # empty prediction, which scores 0 on every metric below.
            pred = tokenizer.decode(ids[s:e+1], skip_special_tokens=True).strip()
            EMs.append(compute_exact(pred, true))
            f1, p, r = compute_f1_precision_recall(pred, true)
            F1s.append(f1); Ps.append(p); Rs.append(r)
            if pred:
                BLEUs.append(sentence_bleu([true.split()], pred.split(), smoothing_function=smoothing))
            else:
                BLEUs.append(0.0)

    em   = np.mean(EMs) * 100 if EMs else 0.0
    prec = np.mean(Ps)  * 100 if Ps  else 0.0
    rec  = np.mean(Rs)  * 100 if Rs  else 0.0
    f1   = np.mean(F1s) * 100 if F1s else 0.0
    bleu = np.mean(BLEUs) * 100 if BLEUs else 0.0

    print("\n📊 Evaluation Results (in %):")
    print(f"Exact Match: {em:.2f}%")
    print(f"Precision:   {prec:.2f}%")
    print(f"Recall:      {rec:.2f}%")
    print(f"F1 Score:    {f1:.2f}%")
    print(f"BLEU Score:  {bleu:.2f}%")


# Reload the best saved weights, wrap them for multi-GPU when more than one
# device is present, and score them on the validation loader.
best_model = AutoModelForQuestionAnswering.from_pretrained(model_save_dir)
best_model = best_model.to(device)
best_model = nn.DataParallel(best_model) if torch.cuda.device_count() > 1 else best_model

evaluate(best_model, tokenizer, valid_loader, valid_dataset)

In [None]:
# ======= Evaluation (EM / P / R / F1 / BLEU) =======
# VERSION 2: adds a few normalization helpers
# Gives better results than version 1
def normalize_and_tokenize(text):
    """Lowercase ``text`` and split it into word runs and single punctuation marks."""
    lowered = text.lower()
    token_pattern = re.compile(r"\w+|[^\w\s]")
    return token_pattern.findall(lowered)

def compute_exact(p, t):
    """Return 1 on a strip + lowercase match of two non-empty strings, else 0."""
    if p and t:
        return int(p.strip().lower() == t.strip().lower())
    # An empty prediction or an empty truth never counts as a match.
    return 0

def compute_f1_precision_recall(p, t):
    """Token-overlap ``(f1, precision, recall)`` using ``normalize_and_tokenize``.

    Returns three zeros when either string is empty or no token is shared.
    """
    zeros = (0.0, 0.0, 0.0)
    if not (p and t):
        return zeros

    pred_tokens = normalize_and_tokenize(p)
    true_tokens = normalize_and_tokenize(t)

    # Multiset intersection of the two token bags.
    shared = Counter(pred_tokens) & Counter(true_tokens)
    overlap = sum(shared.values())
    if not overlap:
        return zeros

    precision = overlap / len(pred_tokens)
    recall = overlap / len(true_tokens)
    return 2 * precision * recall / (precision + recall), precision, recall
def evaluate(model, tokenizer, dataloader, dataset, max_samples=1000):
    """Print EM / precision / recall / F1 / BLEU (as fractions) for up to ``max_samples`` items.

    The predicted span of each item is the argmax start/end token pair,
    decoded to text and compared with the first reference answer of the
    dataset row at the same running index. Items without a reference answer
    are not scored. When nothing was scored, every metric is reported as 0
    instead of ``nan``.

    NOTE(review): references are indexed per ``dataset`` row while
    ``dataloader`` yields tokenized items; the two stay aligned only if every
    row produced exactly one item — confirm against how the loader was built.

    Args:
        model: QA model returning ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer used to decode the predicted token span.
        dataloader: Yields dicts of tensors accepted by ``model(**batch)``.
        dataset: Rows with an ``answers`` dict holding a ``text`` list.
        max_samples: Upper bound on the number of items visited.
    """
    def _mean(values):
        # np.mean([]) returns nan and emits a RuntimeWarning; report 0 instead.
        return float(np.mean(values)) if values else 0.0

    model.eval()

    truths = [itm["answers"]["text"][0] if itm["answers"]["text"] else "" for itm in dataset]
    # Never read past the reference list, even if the loader yields more items.
    limit = min(max_samples, len(truths))
    smoothing = SmoothingFunction().method1
    EMs, Ps, Rs, F1s, BLEUs = [], [], [], [], []
    count = 0

    for batch in dataloader:
        if count >= limit:
            break
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            out = model(**batch)
        starts = torch.argmax(out.start_logits, dim=-1).cpu()
        ends   = torch.argmax(out.end_logits,   dim=-1).cpu()

        for ids, s, e in zip(batch["input_ids"].cpu(), starts, ends):
            if count >= limit:
                break
            true = truths[count].strip()
            count += 1
            if not true:
                continue

            pred = tokenizer.decode(ids[s:e+1], skip_special_tokens=True).strip()
            EMs.append(compute_exact(pred, true))
            f1, p, r = compute_f1_precision_recall(pred, true)
            F1s.append(f1); Ps.append(p); Rs.append(r)
            BLEUs.append(sentence_bleu(
                [true.split()], pred.split(),
                smoothing_function=smoothing
            ))

    print(f"Exact Match: {_mean(EMs):.4f}")
    print(f"Precision:   {_mean(Ps):.4f}")
    print(f"Recall:      {_mean(Rs):.4f}")
    print(f"F1 Score:    {_mean(F1s):.4f}")
    print(f"BLEU Score:  {_mean(BLEUs):.4f}")
# Restore the best model from disk and evaluate it on the validation data;
# DataParallel is applied only when several GPUs are visible.
best_model = AutoModelForQuestionAnswering.from_pretrained(model_save_dir)
best_model = best_model.to(device)
if torch.cuda.device_count() > 1:
    best_model = nn.DataParallel(best_model)

evaluate(
    best_model,
    tokenizer,
    valid_loader,
    valid_dataset,
)

In [None]:
# ======= Evaluation (EM / P / R / F1 / BLEU) =======
# VERSION 2
# Gives better results than version 1
from collections import Counter
import numpy as np
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def normalize_and_tokenize(text):
    """Return the lowercase tokens of ``text``: word runs and individual punctuation marks."""
    matches = re.finditer(r"\w+|[^\w\s]", text.lower())
    return [match.group(0) for match in matches]

def compute_metrics_single(pred, truths):
    """Score one prediction against one or more reference answers.

    Args:
        pred: Predicted answer text.
        truths: Reference answer strings.

    Returns:
        ``(EM, F1, P, R, BLEU)``. EM is 1 if the prediction equals any
        reference after strip + lowercase. F1/P/R come from the reference
        with the highest token-level F1. BLEU is sentence-level BLEU-4
        against all references. An empty prediction with only empty (or no)
        references scores 1 everywhere; a non-empty prediction with no
        references scores 0 everywhere.
    """
    if not pred and all(not t for t in truths):
        return 1, 1.0, 1.0, 1.0, 1.0
    if not truths:
        # Nothing to compare against; max() over an empty list would raise.
        return 0, 0.0, 0.0, 0.0, 0.0

    normalized_pred = pred.strip().lower()
    EM = int(any(normalized_pred == t.strip().lower() for t in truths))

    # Tokenize the prediction and every reference once; both F1 and BLEU
    # reuse these token lists.
    ptoks = normalize_and_tokenize(pred)
    pred_counts = Counter(ptoks)
    list_of_references = [normalize_and_tokenize(t) for t in truths]

    best = (0.0, 0.0, 0.0)
    for ttoks in list_of_references:
        n_common = sum((pred_counts & Counter(ttoks)).values())
        if n_common == 0:
            continue
        prec = n_common / len(ptoks)
        rec = n_common / len(ttoks)
        f1 = 2 * prec * rec / (prec + rec)
        if f1 > best[0]:
            best = (f1, prec, rec)
    F1, P, R = best

    # BLEU-4 with smoothing, so short answers lacking higher-order n-gram
    # matches do not collapse to zero.
    BLEU = sentence_bleu(
        list_of_references,
        ptoks,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=SmoothingFunction().method2
    )

    return EM, F1, P, R, BLEU

def evaluate(model, tokenizer, dataloader, dataset, max_samples=1000):
    """Print EM / precision / recall / F1 / BLEU-4 (as fractions) for up to ``max_samples`` items.

    The predicted span of each item is the argmax start/end token pair,
    decoded to text and scored with ``compute_metrics_single`` against all
    reference answers of the dataset row at the same running index. Items
    whose reference list is empty are not scored. When nothing was scored,
    every metric is reported as 0 instead of ``nan``.

    NOTE(review): references are indexed per ``dataset`` row while
    ``dataloader`` yields tokenized items; the two stay aligned only if every
    row produced exactly one item — confirm against how the loader was built.

    Args:
        model: QA model returning ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer used to decode the predicted token span.
        dataloader: Yields dicts of tensors accepted by ``model(**batch)``.
        dataset: Rows with an ``answers`` dict holding a ``text`` list.
        max_samples: Upper bound on the number of items visited.
    """
    def _mean(values):
        # np.mean([]) returns nan and emits a RuntimeWarning; report 0 instead.
        return float(np.mean(values)) if values else 0.0

    model.eval()
    truths_all = [itm["answers"]["text"] for itm in dataset]
    # Never read past the reference list, even if the loader yields more items.
    limit = min(max_samples, len(truths_all))
    EMs, F1s, Ps, Rs, BLEUs = [], [], [], [], []
    count = 0

    for batch in dataloader:
        if count >= limit:
            break
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            out = model(**batch)
        starts = torch.argmax(out.start_logits, dim=-1).cpu()
        ends   = torch.argmax(out.end_logits,   dim=-1).cpu()

        for ids, s, e in zip(batch["input_ids"].cpu(), starts, ends):
            if count >= limit:
                break

            truths = truths_all[count]
            count += 1
            # Skip entirely when there are no reference answers.
            if not truths:
                continue

            pred = tokenizer.decode(ids[s:e+1], skip_special_tokens=True).strip()
            EM, f1, p, r, bleu = compute_metrics_single(pred, truths)
            EMs.append(EM)
            F1s.append(f1); Ps.append(p); Rs.append(r); BLEUs.append(bleu)

    print(f"Exact Match: {_mean(EMs):.4f}")
    print(f"Precision:   {_mean(Ps):.4f}")
    print(f"Recall:      {_mean(Rs):.4f}")
    print(f"F1 Score:    {_mean(F1s):.4f}")
    print(f"BLEU-4:      {_mean(BLEUs):.4f}")


# ————————————————————————————————————————————————
# Example call: reload the best model together with its tokenizer from disk
# and evaluate on the validation data.
tokenizer = AutoTokenizer.from_pretrained(model_save_dir)
best_model = AutoModelForQuestionAnswering.from_pretrained(model_save_dir)
best_model = best_model.to(device)
if torch.cuda.device_count() > 1:
    best_model = torch.nn.DataParallel(best_model)
evaluate(best_model, tokenizer, valid_loader, valid_dataset)

In [None]:
# Try answering 10 sample questions
from transformers import AutoTokenizer, RobertaForQuestionAnswering

# Load the fine-tuned model and its tokenizer from the saved directory, move
# the model to the available device and switch it to inference mode.
model_dir = "/kaggle/working/best_model"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = RobertaForQuestionAnswering.from_pretrained(model_dir)
model = model.to(device)
model = model.eval()

# 🧠 Advanced QA function
def answer_question_advanced(question, context, max_length=384, stride=128):
    """Answer ``question`` from ``context`` using the module-level model and tokenizer.

    The context is split into overlapping windows of at most ``max_length``
    tokens. Each window proposes its argmax start/end token pair, and the
    proposal with the highest summed logit wins. A winning proposal that
    touches a token outside the context (question, special or padding token)
    is treated as "no answer", because the character offsets of such tokens
    do not index into ``context``.

    Args:
        question: Question text.
        context: Passage to extract the answer from.
        max_length: Maximum number of tokens per window.
        stride: Number of overlapping tokens between consecutive windows.

    Returns:
        Dict with ``start_token`` / ``end_token`` (token indices inside the
        winning window, or None), ``start_word`` / ``end_word`` (the raw
        tokens at those indices, or "") and ``answer`` (the extracted text,
        or a fallback message when no answer was found).
    """
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    ).to(device)

    input_ids = inputs["input_ids"]
    offset_mapping = inputs["offset_mapping"].cpu().tolist()

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=inputs["attention_mask"])

    best_score = -float("inf")
    best_answer = ""
    best_start, best_end = None, None
    best_start_word, best_end_word = "", ""

    windows = zip(outputs.start_logits, outputs.end_logits)
    for i, (start_logit, end_logit) in enumerate(windows):
        start_idx = torch.argmax(start_logit).item()
        end_idx = torch.argmax(end_logit).item()

        # An inverted span cannot be an answer.
        if start_idx > end_idx:
            continue

        score = (start_logit[start_idx] + end_logit[end_idx]).item()
        if score <= best_score:
            continue
        best_score = score

        # Only context tokens (sequence id 1) carry offsets relative to `context`.
        seq_ids = inputs.sequence_ids(i)
        if seq_ids[start_idx] == 1 and seq_ids[end_idx] == 1:
            offsets = offset_mapping[i]
            best_answer = context[offsets[start_idx][0]:offsets[end_idx][1]]
        else:
            best_answer = ""

        window_ids = input_ids[i].cpu().tolist()
        best_start, best_end = start_idx, end_idx
        best_start_word = tokenizer.convert_ids_to_tokens([window_ids[start_idx]])[0]
        best_end_word = tokenizer.convert_ids_to_tokens([window_ids[end_idx]])[0]

    if best_answer.strip() == "":
        return {
            "start_token": None,
            "end_token": None,
            "start_word": "",
            "end_word": "",
            "answer": "🤔 No suitable answer found."
        }

    return {
        "start_token": best_start,
        "end_token": best_end,
        "start_word": best_start_word,
        "end_word": best_end_word,
        "answer": best_answer.strip()
    }

# 🧪 Test cases with extended context
# Ten hand-written question/context pairs used for a manual smoke test of the
# QA function: five about BERT followed by five about health topics.
# Each entry is a dict with a "question" string and a "context" string; the
# context is built from adjacent string literals that Python concatenates.
test_cases = [
    {
        "question": "Who developed BERT?",
        "context": (
            "BERT, which stands for Bidirectional Encoder Representations from Transformers, is a natural language "
            "processing model introduced by Google in 2018. The model was developed by researchers Jacob Devlin, "
            "Ming-Wei Chang, Kenton Lee, and Kristina Toutanova at Google AI Language. BERT marked a major advancement "
            "in the use of transformers for NLP tasks and inspired a wide range of follow-up research and variants such as RoBERTa, DistilBERT, and ALBERT."
        )
    },
    {
        "question": "When was BERT introduced?",
        "context": (
            "The BERT model was officially introduced to the machine learning community in 2018 through a research paper "
            "titled 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'. It was presented "
            "by researchers at Google AI. The paper detailed a new method for pre-training language representations that "
            "significantly improved the performance of a wide range of NLP tasks."
        )
    },
    {
        "question": "What does BERT stand for?",
        "context": (
            "BERT is an acronym for Bidirectional Encoder Representations from Transformers. It represents a new approach "
            "in natural language processing where deep bidirectional representations are pre-trained by jointly conditioning "
            "on both left and right context in all layers. This contrasts with earlier models that used unidirectional training, "
            "which limited their ability to understand full context."
        )
    },
    {
        "question": "What tasks is BERT used for?",
        "context": (
            "BERT has been widely adopted for a variety of natural language processing tasks. These include question answering, "
            "sentence classification, sentiment analysis, named entity recognition, and language inference. One of BERT’s key strengths "
            "is its ability to be fine-tuned on small task-specific datasets while still achieving state-of-the-art performance. "
            "This flexibility has led to its widespread use in both academic and commercial applications."
        )
    },
    {
        "question": "What is the innovation behind BERT?",
        "context": (
            "The core innovation behind BERT lies in its use of a masked language model (MLM) objective and next sentence prediction (NSP) "
            "during pre-training, allowing it to deeply understand the context of words. Unlike previous models that trained language models "
            "in a left-to-right or right-to-left fashion, BERT is trained bidirectionally. This means it considers both left and right context "
            "simultaneously when learning word representations. This approach significantly improves comprehension and has advanced the state of the art "
            "on multiple NLP benchmarks such as SQuAD and GLUE."
        )
    },
    {
        "question": "What is the function of red blood cells?",
        "context": (
            "Red blood cells, also known as erythrocytes, are a crucial component of human blood. Their primary function is to "
            "transport oxygen from the lungs to the rest of the body and return carbon dioxide from the tissues back to the lungs. "
            "This is made possible by hemoglobin, a protein within red blood cells that binds to oxygen molecules. An adequate number "
            "of red blood cells is essential for maintaining normal bodily functions and preventing conditions such as anemia."
        )
    },
    {
        "question": "What causes high blood pressure?",
        "context": (
            "High blood pressure, or hypertension, occurs when the force of blood pushing against the walls of the arteries is consistently too high. "
            "It can be caused by a variety of factors including poor diet (high in salt and saturated fats), lack of physical activity, obesity, stress, "
            "genetics, and certain chronic conditions such as kidney disease. If left uncontrolled, high blood pressure can lead to serious complications "
            "like heart disease, stroke, and kidney failure."
        )
    },
    {
        "question": "How do vaccines work?",
        "context": (
            "Vaccines work by training the immune system to recognize and fight specific pathogens, such as viruses or bacteria. When a person receives a vaccine, "
            "their body is exposed to a harmless form of the pathogen—often a weakened or inactivated version, or just a piece of it like a protein. "
            "This exposure triggers an immune response, allowing the body to create memory cells that will recognize and combat the real pathogen if it is encountered "
            "in the future. This mechanism is the foundation of immunization and helps prevent the spread of infectious diseases."
        )
    },
    {
        "question": "What is diabetes?",
        "context": (
            "Diabetes is a chronic medical condition that occurs when the body is unable to properly regulate blood sugar (glucose) levels. "
            "There are two main types of diabetes: Type 1, where the body does not produce insulin, and Type 2, where the body does not use insulin effectively. "
            "Insulin is a hormone that helps glucose enter the cells to be used for energy. Without proper insulin function, glucose builds up in the bloodstream, "
            "leading to various health problems such as cardiovascular disease, nerve damage, and kidney failure if left unmanaged."
        )
    },
    {
        "question": "What are the symptoms of COVID-19?",
        "context": (
            "COVID-19, caused by the SARS-CoV-2 virus, can present with a wide range of symptoms. Common symptoms include fever, cough, fatigue, loss of taste or smell, "
            "and difficulty breathing. Some individuals may also experience muscle aches, sore throat, diarrhea, and headaches. In severe cases, COVID-19 can lead to pneumonia, "
            "acute respiratory distress, and death, particularly among older adults and those with underlying health conditions. Asymptomatic cases are also possible, "
            "contributing to the rapid spread of the virus."
        )
    }
]


# 🔍 Run every test case through the QA function and print the outcome.
for idx, case in enumerate(test_cases, 1):
    question, context = case["question"], case["context"]

    print(f"\n🔹 Test Case {idx}")
    print(f"❓ Question: {question}")
    print(f"📘 Context: {context}")

    result = answer_question_advanced(question, context)

    # Drop the 'Ġ' marker from the raw boundary tokens before displaying them.
    start_word = result["start_word"].replace("Ġ", "")
    end_word = result["end_word"].replace("Ġ", "")

    print(f"✅ Answer: {result['answer']}")
    print(f"🔢 Start Token Index: {result['start_token']} ({start_word})")
    print(f"🔢 End Token Index: {result['end_token']} ({end_word})")
