# Inference

In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from torch.cuda.amp import autocast
from collections import Counter
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ─── Config ───────────────────────────────────────────────────────────────────
valid_path = '/kaggle/input/cleaned/valid_dataset.json'
model_name = "deepset/roberta-large-squad2"
batch_size = 16
max_length = 150
stride     = 15
device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ─── Load dataset & tokenizer/model ───────────────────────────────────────────
raw_valid = load_dataset("json", data_files={"validation": valid_path})["validation"]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)

# Multi-GPU
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

model.eval()  # gọi sau khi bọc DataParallel

# ─── Preprocessing ────────────────────────────────────────────────────────────
def preprocess_function(examples):
    tok = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True
    )
    sample_map = tok.pop("overflow_to_sample_mapping")
    offset_map = tok.pop("offset_mapping")

    starts, ends = [], []
    for i, offsets in enumerate(offset_map):
        ids = tok["input_ids"][i]
        cls_index = ids.index(tokenizer.cls_token_id)
        seq_ids = tok.sequence_ids(i)
        sample_idx = sample_map[i]
        answers = examples["answers"][sample_idx]

        if not answers["answer_start"]:
            starts.append(cls_index); ends.append(cls_index); continue

        s_char = answers["answer_start"][0]
        text   = answers["text"][0]
        e_char = s_char + len(text)

        ts = next(j for j, sid in enumerate(seq_ids) if sid == 1)
        te = len(ids) - 1
        while seq_ids[te] != 1: te -= 1

        if not (offsets[ts][0] <= s_char and offsets[te][1] >= e_char):
            starts.append(cls_index); ends.append(cls_index)
        else:
            while ts < len(offsets) and offsets[ts][0] <= s_char: ts += 1
            starts.append(ts - 1)
            while te >= 0 and offsets[te][1] >= e_char: te -= 1
            ends.append(te + 1)

    tok["start_positions"] = starts
    tok["end_positions"]   = ends
    tok["offset_mapping"]                = offset_map
    tok["overflow_to_sample_mapping"]   = sample_map
    return tok

tokenized_valid = raw_valid.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_valid.column_names,
    num_proc=4
)

# ─── DataLoader ────────────────────────────────────────────────────────────────
def collate_fn(batch):
    tensor_keys = ["input_ids","attention_mask","start_positions","end_positions"]
    collated = {k: torch.tensor([d[k] for d in batch]) for k in tensor_keys}
    collated["offset_mapping"]             = [d["offset_mapping"] for d in batch]
    collated["overflow_to_sample_mapping"] = [d["overflow_to_sample_mapping"] for d in batch]
    return collated

valid_loader = DataLoader(
    tokenized_valid, batch_size=batch_size,
    shuffle=False, collate_fn=collate_fn, num_workers=0
)

# ─── Metric helpers ───────────────────────────────────────────────────────────
def normalize_and_tokenize(text):
    return re.findall(r"\w+|[^\w\s]", text.lower())

def compute_metrics_single(pred, truths):
    ems = [int(pred.strip().lower() == t.strip().lower()) for t in truths]
    EM = max(ems) if truths else 1
    ptoks = normalize_and_tokenize(pred)
    best_f1 = best_prec = best_rec = 0.0
    for t in truths:
        ttoks = normalize_and_tokenize(t)
        common = Counter(ptoks) & Counter(ttoks)
        n_common = sum(common.values())
        if n_common == 0:
            continue
        prec = n_common / len(ptoks) if ptoks else 0
        rec  = n_common / len(ttoks) if ttoks else 0
        f1   = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0
        if f1 > best_f1:
            best_f1, best_prec, best_rec = f1, prec, rec
    refs = [normalize_and_tokenize(t) for t in truths]
    BLEU = sentence_bleu(refs, ptoks, weights=(0.25,0.25,0.25,0.25),
                         smoothing_function=SmoothingFunction().method2)
    return EM, best_f1, best_prec, best_rec, BLEU

# ─── Inference & Evaluation ────────────────────────────────────────────────────
all_EM, all_F1, all_P, all_R, all_B = [], [], [], [], []

with torch.no_grad():
    for batch in tqdm(valid_loader):
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        start_logits = outputs.start_logits.cpu().numpy()
        end_logits   = outputs.end_logits.cpu().numpy()

        for i in range(input_ids.size(0)):
            sample_idx = batch["overflow_to_sample_mapping"][i]
            raw = raw_valid[sample_idx]
            context = raw["context"]
            truths = raw["answers"]["text"]

            s_idx = np.argmax(start_logits[i])
            e_idx = np.argmax(end_logits[i])
            offsets = batch["offset_mapping"][i]

            if s_idx >= len(offsets) or e_idx >= len(offsets) or s_idx > e_idx:
                pred = ""
            else:
                s_char, e_char = offsets[s_idx][0], offsets[e_idx][1]
                pred = context[s_char:e_char].strip()

            EM, F1, P, R, BLEU = compute_metrics_single(pred, truths)
            all_EM.append(EM); all_F1.append(F1)
            all_P.append(P); all_R.append(R); all_B.append(BLEU)

# ─── Report ────────────────────────────────────────────────────────────────────
print("\n📊 QA Metrics:")
print(f"Exact Match : {np.mean(all_EM)*100:5.2f}%")
print(f"F1 Score    : {np.mean(all_F1)*100:5.2f}%")
print(f"Precision   : {np.mean(all_P)*100:5.2f}%")
print(f"Recall      : {np.mean(all_R)*100:5.2f}%")
print(f"BLEU-4      : {np.mean(all_B)*100:5.2f}%")


Generating validation split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

2025-05-11 07:31:08.087372: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746948668.473425      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746948668.588768      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/32739 [00:00<?, ? examples/s]

  0%|          | 0/11734 [00:00<?, ?it/s]

  with autocast():



📊 QA Metrics:
Exact Match :  0.00%
F1 Score    :  0.26%
Precision   :  0.38%
Recall      :  0.45%
BLEU-4      :  0.13%


In [3]:
print("\n📊 QA Metrics:")
print(f"Exact Match : {np.mean(all_EM):5.4f}%")
print(f"F1 Score    : {np.mean(all_F1):5.4f}%")
print(f"Precision   : {np.mean(all_P):5.4f}%")
print(f"Recall      : {np.mean(all_R):5.4f}%")
print(f"BLEU-4      : {np.mean(all_B):5.4f}%")


📊 QA Metrics:
Exact Match : 0.0000%
F1 Score    : 0.0026%
Precision   : 0.0038%
Recall      : 0.0045%
BLEU-4      : 0.0013%
