In [None]:
# ======= Install & Setup =======
!pip install -q transformers

import os
import gc
import re
import torch
import torch.nn as nn
import numpy as np
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    get_scheduler
)
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
import multiprocessing
import random

# ======= Environment Setup =======
# Exported before any model or tensor is created further down in this script.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ======= Config =======
train_path               = "/kaggle/input/cleaned/trained_dataset.json"
valid_path               = "/kaggle/input/cleaned/valid_dataset.json"
model_name               = "deepset/roberta-base-squad2"
train_batch_size         = 16
eval_batch_size          = 80
max_length               = 512   # tokens per (question, context-window) feature
stride                   = 300   # overlap passed to the tokenizer for overflowing windows
accumulation_steps       = 10    # micro-batches accumulated per optimizer update
epochs                   = 1
early_stopping_patience  = 1     # epochs without val-loss improvement before stopping
lr                       = 5e-5
num_proc                 = min(4, multiprocessing.cpu_count())
seed                     = 42

# ======= Reproducibility =======
# Seed every RNG from the single `seed` setting so changing the config value
# actually changes the run (previously 42 was hard-coded three times).
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# ======= Output dirs =======
# Everything the run produces lives under output_dir: the best model so far
# and one checkpoint folder per finished epoch.
output_dir = "/kaggle/working/"
model_save_dir, checkpoint_dir = (
    os.path.join(output_dir, sub) for sub in ("best_model", "checkpoints")
)
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

# Leave as None on the first run (no checkpoint exists yet); point it at a
# checkpoint folder to resume, for example:
# resume_checkpoint = "/kaggle/working/checkpoints/checkpoint-epoch1"
resume_checkpoint = None

# ======= Load dataset =======
# Both splits are read from JSON files and exposed under fixed split names.
data_files = {"train": train_path, "validation": valid_path}
raw_datasets = load_dataset("json", data_files=data_files)
train_dataset, valid_dataset = raw_datasets["train"], raw_datasets["validation"]

# ======= Tokenizer & Model =======
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Decide where the weights come from *before* loading anything, so a resumed
# run no longer loads the base model onto the device only to discard it.
if resume_checkpoint:
    # os.path.join returns resume_checkpoint unchanged when it is already an
    # absolute path, so both a folder name and a full path are accepted.
    ckpt_path = os.path.join(checkpoint_dir, resume_checkpoint)
    print(f"🔁 Resuming from checkpoint: {ckpt_path}")
    pretrained_source = ckpt_path
else:
    pretrained_source = model_name

tokenizer = AutoTokenizer.from_pretrained(pretrained_source)
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_source).to(device)

# Wrap once, after the final weights are in place.
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    print(f"Using {torch.cuda.device_count()} GPUs")
# ======= Preprocessing =======
def preprocess_function(examples):
    """Tokenize a batch of QA examples into fixed-length training features.

    Each (question, context) pair is tokenized with truncation applied to the
    context only; contexts that do not fit produce several overlapping
    windows. For every window the answer's character span is converted into
    start/end token indices. Windows that do not fully contain the answer, and
    examples without an answer, are labelled with the CLS token index.

    Args:
        examples: batch dict with "question", "context" and "answers" columns,
            where each answers entry holds "answer_start" and "text" lists
            (only the first answer is used).

    Returns:
        The tokenizer output with "start_positions" and "end_positions" added
        and the overflow/offset bookkeeping columns removed.
    """
    tok = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True
    )
    sample_map = tok.pop("overflow_to_sample_mapping")
    offset_map = tok.pop("offset_mapping")
    starts, ends = [], []

    for i, offsets in enumerate(offset_map):
        ids = tok["input_ids"][i]
        cls_index = ids.index(tokenizer.cls_token_id)
        seq_ids = tok.sequence_ids(i)
        # Several windows can originate from the same source example.
        answers = examples["answers"][sample_map[i]]

        if not answers["answer_start"]:
            starts.append(cls_index)
            ends.append(cls_index)
            continue

        s_char = answers["answer_start"][0]
        e_char = s_char + len(answers["text"][0])

        # First and last token of the context (sequence id 1) in this window.
        ctx_start = next(j for j, sid in enumerate(seq_ids) if sid == 1)
        ctx_end = len(ids) - 1
        while seq_ids[ctx_end] != 1:
            ctx_end -= 1

        if not (offsets[ctx_start][0] <= s_char and offsets[ctx_end][1] >= e_char):
            # The answer is not fully inside this window.
            starts.append(cls_index)
            ends.append(cls_index)
            continue

        # Both scans are confined to the context span. Without the bounds the
        # start scan keeps advancing through the trailing special/padding
        # tokens when the answer begins in the last context token, and the
        # end scan can slide back into the question tokens.
        ts = ctx_start
        while ts <= ctx_end and offsets[ts][0] <= s_char:
            ts += 1
        starts.append(ts - 1)

        te = ctx_end
        while te >= ctx_start and offsets[te][1] >= e_char:
            te -= 1
        ends.append(te + 1)

    tok["start_positions"] = starts
    tok["end_positions"] = ends
    return tok

# Turn both splits into model-ready features. The raw columns are dropped
# because one example may expand into several features.
tokenized_train = train_dataset.map(
    function=preprocess_function,
    remove_columns=train_dataset.column_names,
    num_proc=num_proc,
    batched=True,
)

tokenized_valid = valid_dataset.map(
    function=preprocess_function,
    remove_columns=valid_dataset.column_names,
    num_proc=num_proc,
    batched=True,
)

def collate_fn(batch):
    """Stack a list of feature dicts into one dict of tensors.

    The keys are taken from the first item; every item is expected to provide
    the same keys with equally shaped values.
    """
    collated = {}
    for key in batch[0]:
        column = [sample[key] for sample in batch]
        collated[key] = torch.tensor(column)
    return collated

def save_checkpoint(model, tokenizer, epoch):
    """Write model and tokenizer to `checkpoint-epoch<epoch+1>` under checkpoint_dir.

    `epoch` is the zero-based loop index, so the folder name carries the
    number of completed epochs.
    """
    ckpt_path = os.path.join(checkpoint_dir, f"checkpoint-epoch{epoch+1}")
    os.makedirs(ckpt_path, exist_ok=True)
    # A wrapped model exposes the real one as `.module`; save that instead.
    unwrapped = getattr(model, "module", model)
    for artifact in (unwrapped, tokenizer):
        artifact.save_pretrained(ckpt_path)
    print(f"💾 Saved checkpoint → {ckpt_path}")

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(
    dataset=tokenized_train,
    collate_fn=collate_fn,
    batch_size=train_batch_size,
    num_workers=2,
    shuffle=True,
)
valid_loader = DataLoader(
    dataset=tokenized_valid,
    collate_fn=collate_fn,
    batch_size=eval_batch_size,
    num_workers=2,
    shuffle=False,
)

# ======= Optimizer & Scheduler =======
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    betas=(0.9, 0.999),
    eps=1e-8
)

# A checkpoint named "...epochN" means N epochs are already finished.
start_epoch = 0
if resume_checkpoint:
    match = re.search(r"epoch(\d+)", resume_checkpoint)
    if match:
        start_epoch = int(match.group(1))

effective_epochs = max(epochs - start_epoch, 0)
# Micro-batches still to process; drives the progress bar.
total_steps = len(train_loader) * effective_epochs
# Optimizer updates still to perform. With gradient accumulation there is one
# update per `accumulation_steps` micro-batches plus one for a trailing partial
# group, and the LR schedule has to be sized in updates, not micro-batches.
updates_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
total_updates = updates_per_epoch * effective_epochs

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_updates
)

scaler = GradScaler()
train_losses, val_losses = [], []
best_val = float("inf")
no_improve = 0
pbar = tqdm(total=total_steps, desc="Training")

# ======= Training Loop =======
for epoch in range(start_epoch, epochs):
    model.train()
    running_loss = 0.0
    optimizer.zero_grad()

    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with autocast():
            out = model(**batch)
            loss = out.loss
            # DataParallel returns one loss per replica.
            if loss.dim() > 0:
                loss = loss.mean()

        # Track the undivided loss so the reported training loss is on the
        # same scale as the validation loss.
        running_loss += loss.item()
        # Only the gradient contribution is divided by the accumulation factor.
        scaler.scale(loss / accumulation_steps).backward()

        if (step + 1) % accumulation_steps == 0 or (step + 1) == len(train_loader):
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # The scaler lowers its scale when it skipped the update because of
            # inf/NaN gradients; advance the schedule only after a real update.
            if scaler.get_scale() >= scale_before:
                lr_scheduler.step()
            optimizer.zero_grad()
        pbar.update(1)

    avg_tr = running_loss / max(len(train_loader), 1)
    train_losses.append(avg_tr)
    save_checkpoint(model, tokenizer, epoch)

    # ======= Validation =======
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in valid_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with autocast():
                tmp = model(**batch).loss
                if tmp.dim() > 0:
                    tmp = tmp.mean()
                val_loss += tmp.item()

    avg_val = val_loss / max(len(valid_loader), 1)
    val_losses.append(avg_val)
    print(f"Epoch {epoch+1}/{epochs} → Train Loss: {avg_tr:.4f} | Val Loss: {avg_val:.4f}")

    # Save the underlying model, not the DataParallel wrapper.
    mdl = model.module if hasattr(model, "module") else model
    if avg_val < best_val:
        best_val = avg_val
        no_improve = 0
        mdl.save_pretrained(model_save_dir)
        tokenizer.save_pretrained(model_save_dir)
        print("✅ Saved best model so far.")
    else:
        no_improve += 1
        print(f"📉 No improvement for {no_improve} epoch(s).")
        if no_improve >= early_stopping_patience:
            print("🛑 Early stopping.")
            break

pbar.close()

print("✅ Done. Artifacts saved:")
print(f"  • Model/tokenizer → {model_save_dir}")


In [None]:
# ======= Evaluation (EM / P / R / F1 / BLEU) =======
!pip install nltk
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def normalize_and_tokenize(text):
    """Lower-case `text` and split it into word tokens and punctuation marks.

    Runs of word characters form one token; every other non-space character
    becomes a token of its own. Whitespace is discarded.
    """
    lowered = text.lower()
    token_pattern = re.compile(r"\w+|[^\w\s]")
    return token_pattern.findall(lowered)

def compute_exact(p, t):
    """Return 1 when prediction `p` equals reference `t`, else 0.

    The comparison ignores case and surrounding whitespace. An empty or
    missing string on either side always scores 0.
    """
    if p and t:
        return 1 if p.strip().lower() == t.strip().lower() else 0
    return 0

def compute_f1_precision_recall(p, t):
    """Token-overlap scores between prediction `p` and reference `t`.

    Both strings are tokenized with normalize_and_tokenize and compared as
    multisets.

    Returns:
        A tuple (f1, precision, recall); all zeros when either string is
        empty or the two share no token.
    """
    no_score = (0.0, 0.0, 0.0)
    if not (p and t):
        return no_score

    pred_tokens = normalize_and_tokenize(p)
    true_tokens = normalize_and_tokenize(t)

    # Multiset intersection keeps each shared token at its minimum count.
    overlap = sum((Counter(pred_tokens) & Counter(true_tokens)).values())
    if overlap == 0:
        return no_score

    precision = overlap / len(pred_tokens)
    recall = overlap / len(true_tokens)
    return 2 * precision * recall / (precision + recall), precision, recall
def evaluate(model, tokenizer, dataloader, dataset, max_samples=1000):
    """Score predicted answer spans with EM, precision, recall, F1 and BLEU.

    The dataloader yields tokenized features, and a long context can produce
    several features for one raw example. The reference for each feature is
    therefore decoded from that feature's own gold span ("start_positions" /
    "end_positions" in the batch). Indexing `dataset` by feature position
    would pair predictions with the wrong answers as soon as any example
    overflows into more than one window.

    Args:
        model: QA model returning `start_logits` and `end_logits`.
        tokenizer: tokenizer used to decode token spans back to text.
        dataloader: iterable of feature batches (dicts of tensors).
        dataset: raw examples with an "answers" -> "text" list. Only used as a
            fallback when batches carry no gold positions; that fallback
            assumes exactly one feature per example.
        max_samples: maximum number of features to visit.

    Returns:
        Dict with the mean "exact_match", "precision", "recall", "f1" and
        "bleu" over the scored features (zeros when nothing was scored).
        Features whose reference is empty are visited but not scored.
    """
    model.eval()

    smoothing = SmoothingFunction().method1  # loop-invariant
    reference_texts = None  # built lazily, only if the fallback is needed
    EMs, Ps, Rs, F1s, BLEUs = [], [], [], [], []
    count = 0

    for batch in dataloader:
        if count >= max_samples:
            break
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            out = model(**batch)
        starts = torch.argmax(out.start_logits, dim=-1).cpu()
        ends = torch.argmax(out.end_logits, dim=-1).cpu()
        input_ids = batch["input_ids"].cpu()

        has_gold = "start_positions" in batch and "end_positions" in batch
        if has_gold:
            gold_starts = batch["start_positions"].cpu()
            gold_ends = batch["end_positions"].cpu()

        for row, (ids, s, e) in enumerate(zip(input_ids, starts, ends)):
            if count >= max_samples:
                break

            if has_gold:
                gs, ge = int(gold_starts[row]), int(gold_ends[row])
                # A window without the answer is labelled with a special
                # token, which decodes to an empty string and is skipped.
                true = tokenizer.decode(ids[gs:ge + 1], skip_special_tokens=True).strip()
            else:
                if reference_texts is None:
                    reference_texts = [
                        itm["answers"]["text"][0] if itm["answers"]["text"] else ""
                        for itm in dataset
                    ]
                true = reference_texts[count].strip() if count < len(reference_texts) else ""

            count += 1
            if not true:
                continue

            pred = tokenizer.decode(ids[s:e+1], skip_special_tokens=True).strip()

            EMs.append(compute_exact(pred, true))
            f1, p, r = compute_f1_precision_recall(pred, true)
            F1s.append(f1)
            Ps.append(p)
            Rs.append(r)
            BLEUs.append(sentence_bleu(
                [true.split()], pred.split(),
                smoothing_function=smoothing
            ))

    if not EMs:
        print("No answerable samples were scored; reporting zeros.")

    def _mean(values):
        # Avoid NaN (and a NumPy warning) when nothing was scored.
        return float(np.mean(values)) if values else 0.0

    metrics = {
        "exact_match": _mean(EMs),
        "precision": _mean(Ps),
        "recall": _mean(Rs),
        "f1": _mean(F1s),
        "bleu": _mean(BLEUs),
    }

    print(f"Exact Match: {metrics['exact_match']:.4f}")
    print(f"Precision:   {metrics['precision']:.4f}")
    print(f"Recall:      {metrics['recall']:.4f}")
    print(f"F1 Score:    {metrics['f1']:.4f}")
    print(f"BLEU Score:  {metrics['bleu']:.4f}")
    return metrics
# Reload the best weights saved during training and score them on the
# validation split.
best_model = AutoModelForQuestionAnswering.from_pretrained(model_save_dir)
best_model = best_model.to(device)
if torch.cuda.device_count() > 1:
    best_model = nn.DataParallel(best_model)

evaluate(
    model=best_model,
    tokenizer=tokenizer,
    dataloader=valid_loader,
    dataset=valid_dataset,
)