In [1]:
!pip install -q -U rouge_score evaluate transformers datasets
!pip install -U -q tqdm

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
import os
import json
from dataclasses import dataclass
from typing import Dict, List, Any

import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
if torch.cuda.is_available():
    try:
        print('CUDA device name:', torch.cuda.get_device_name(0))
    except Exception:
        pass

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using: ", device)

CUDA device name: Tesla T4
Using:  cuda


# Configuration

In [4]:
@dataclass
class QAConfig:
    # Model & tokenizer
    model_name: str = "VietAI/vit5-base"

    # Data
    data_path: str = "/content/drive/MyDrive/qa_dataset.json"
    output_dir: str = "/content/drive/MyDrive/vit5-qa-model"

    train_size: float = 0.6
    val_size: float = 0.2
    test_size: float = 0.2

    # Tokenization
    max_input_length: int = 256
    max_target_length: int = 128

    # Training hyper-parameters
    num_epochs: int = 5
    batch_size: int = 4
    gradient_accumulation_steps: int = 1

    learning_rate: float = 5e-4
    weight_decay: float = 0.01
    warmup_steps: int = 0

    logging_steps: int = 50
    eval_strategy: str = "epoch"   # đánh giá mỗi epoch
    save_strategy: str = "epoch"   # lưu checkpoint mỗi epoch
    metric_for_best_model: str = "eval_rougeL"
    greater_is_better: bool = True

    generation_max_length: int = 128
    generation_num_beams: int = 4

    seed: int = 42

# Training Pipeline

In [7]:
class QASeq2SeqPipeline:
    def __init__(self, config: QAConfig):
        self.config = config

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Components that will be initialized later
        self.tokenizer = None
        self.model = None
        self.dataset_dict: DatasetDict = None
        self.tokenized_datasets: DatasetDict = None

        # Metrics
        self.rouge = evaluate.load("rouge")
        self.bleu = evaluate.load("bleu")

    # ------------------------------
    # Data loading / splitting
    # ------------------------------
    def load_and_split_data(self):
        print(f"Loading data from: {self.config.data_path}")
        if not os.path.exists(self.config.data_path):
            raise FileNotFoundError(f"Không tìm thấy file: {self.config.data_path}")

        # Cho phép cả JSON list và JSON Lines
        try:
            with open(self.config.data_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError:
            # JSONL
            print("Detected JSON Lines format, reading line by line...")
            with open(self.config.data_path, "r", encoding="utf-8") as f:
                data = [json.loads(line) for line in f]

        # Convert to HF Dataset
        dataset = Dataset.from_list(data)
        print(f"Full dataset size: {len(dataset)}")

        # Split: train / (val+test)
        train_val_test = dataset.train_test_split(
            test_size=self.config.val_size + self.config.test_size,
            seed=self.config.seed
        )
        temp = train_val_test["test"].train_test_split(
            test_size=self.config.test_size / (self.config.val_size + self.config.test_size),
            seed=self.config.seed
        )

        self.dataset_dict = DatasetDict(
            {
                "train": train_val_test["train"],
                "validation": temp["train"],
                "test": temp["test"],
            }
        )

        print(
            "Dataset split -> "
            f"Train: {len(self.dataset_dict['train'])}, "
            f"Val: {len(self.dataset_dict['validation'])}, "
            f"Test: {len(self.dataset_dict['test'])}"
        )

    # ------------------------------
    # Tokenizer & model
    # ------------------------------
    def init_tokenizer_and_model(self):
        print(f"Loading tokenizer and model from: {self.config.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_name)
        self.model.to(self.device)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("Tokenizer & model loaded successfully.")

    # ------------------------------
    # Preprocessing
    # ------------------------------
    def preprocess_function(self, examples: Dict[str, List[str]]) -> Dict[str, Any]:
        # Format input: "question: ... context: ..."
        inputs = [
            f"question: {q} context: {c}"
            for q, c in zip(examples["question"], examples["context"])
        ]
        targets = examples["answer"]

        # Encode inputs
        model_inputs = self.tokenizer(
            inputs,
            max_length=self.config.max_input_length,
            truncation=True,
            padding="max_length",
        )

        # Encode targets
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(
                targets,
                max_length=self.config.max_target_length,
                truncation=True,
                padding="max_length",
            )

        labels_ids = labels["input_ids"]
        # Replace padding token id by -100 for loss ignoring
        labels_ids = [
            [
                (token_id if token_id != self.tokenizer.pad_token_id else -100)
                for token_id in label
            ]
            for label in labels_ids
        ]
        model_inputs["labels"] = labels_ids

        return model_inputs

    def tokenize_datasets(self):
        if self.dataset_dict is None:
            raise ValueError("Dataset chưa được load. Gọi load_and_split_data() trước.")

        print("Tokenizing datasets...")
        self.tokenized_datasets = self.dataset_dict.map(
            self.preprocess_function,
            batched=True,
            remove_columns=["context", "question", "answer"],
        )
        print("Tokenization completed.")

    # ------------------------------
    # Metrics
    # ------------------------------
    def compute_metrics(self, eval_pred):
        pred_ids, label_ids = eval_pred

        # Một số version Trainer trả về tuple
        if isinstance(pred_ids, tuple):
            pred_ids = pred_ids[0]

        # Ép về numpy int64 cho chắc
        pred_ids = np.asarray(pred_ids, dtype=np.int64)
        label_ids = np.asarray(label_ids, dtype=np.int64)

        # Thay -100 trong labels bằng pad_token_id để decode
        label_ids[label_ids == -100] = self.tokenizer.pad_token_id

        # Dọn dẹp prediction: mọi giá trị < 0 hoặc >= vocab_size coi như pad
        vocab_size = self.model.config.vocab_size
        pred_ids[(pred_ids < 0) | (pred_ids >= vocab_size)] = self.tokenizer.pad_token_id

        # Decode
        decoded_preds = self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        decoded_preds = [p.strip() for p in decoded_preds]
        decoded_labels = [l.strip() for l in decoded_labels]

        # Tính ROUGE
        rouge_result = self.rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=True,
        )

        # Tính BLEU (thư viện evaluate)
        bleu_result = self.bleu.compute(
            predictions=decoded_preds,
            references=[[r] for r in decoded_labels],
        )

        return {
            "rouge1": float(rouge_result.get("rouge1", 0.0)),
            "rouge2": float(rouge_result.get("rouge2", 0.0)),
            "rougeL": float(rouge_result.get("rougeL", 0.0)),
            "bleu": float(bleu_result.get("bleu", 0.0)),
        }

    # ------------------------------
    # Build trainer
    # ------------------------------
    def build_trainer(self) -> Seq2SeqTrainer:
        if self.tokenizer is None or self.model is None:
            raise ValueError("Tokenizer/Model chưa được khởi tạo.")

        if self.tokenized_datasets is None:
            raise ValueError("Dataset chưa được tokenize.")

        data_collator = DataCollatorForSeq2Seq(
            self.tokenizer,
            model=self.model,
        )

        # TrainingArguments
        training_args = Seq2SeqTrainingArguments(
            output_dir=self.config.output_dir,
            eval_strategy=self.config.eval_strategy,
            save_strategy=self.config.save_strategy,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            num_train_epochs=self.config.num_epochs,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            logging_steps=self.config.logging_steps,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model=self.config.metric_for_best_model,
            greater_is_better=self.config.greater_is_better,
            generation_max_length=self.config.generation_max_length,
            generation_num_beams=self.config.generation_num_beams,
            seed=self.config.seed,
            save_safetensors=True,  # lưu file model.safetensors
        )

        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_datasets["train"],
            eval_dataset=self.tokenized_datasets["validation"],
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
        )
        return trainer

    # ------------------------------
    # Train & evaluate
    # ------------------------------
    def train_and_evaluate(self):
        # 1) Data
        self.load_and_split_data()

        # 2) Tokenizer & model
        self.init_tokenizer_and_model()

        # 3) Tokenize datasets
        self.tokenize_datasets()

        # 4) Build trainer
        trainer = self.build_trainer()

        # 5) Train
        print("Starting supervised fine-tuning...")
        print(
            f"Training for {self.config.num_epochs} epochs"
        )

        import time
        start_time = time.time()

        train_result = trainer.train()
        training_time = time.time() - start_time
        print(f"Training completed in {training_time:.2f} seconds")

        # 6) Evaluate on validation & test
        print("Evaluating on validation set...")
        val_metrics = trainer.evaluate(self.tokenized_datasets["validation"])
        print("[VAL METRICS]", val_metrics)

        print("Evaluating on test set...")
        test_metrics = trainer.evaluate(self.tokenized_datasets["test"])
        print("[TEST METRICS]", test_metrics)

        # 7) Lưu best model ra output_dir
        os.makedirs(self.config.output_dir, exist_ok=True)
        print(f"Saving best model & tokenizer to: {self.config.output_dir}")

        trainer.save_model(self.config.output_dir)
        self.tokenizer.save_pretrained(self.config.output_dir)

        # Sau khi save_model, trong output_dir sẽ có:
        # - config.json
        # - tokenizer files
        # - pytorch_model.bin hoặc model.safetensors (do save_safetensors=True)

        # 8) Lưu thêm config & metrics
        summary = {
            "config": self.config.__dict__,
            "training_metrics": train_result.metrics,
            "val_metrics": val_metrics,
            "test_metrics": test_metrics,
        }
        summary_path = os.path.join(self.config.output_dir, "training_summary.json")
        with open(summary_path, "w", encoding="utf-8") as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)

        print(f"Training summary saved to: {summary_path}")

# Main chạy thực nghiệm

In [8]:
# Truyền tham số epoch
EPOCHS = 3

config = QAConfig(num_epochs=EPOCHS)
pipeline = QASeq2SeqPipeline(config)
pipeline.train_and_evaluate()

Using device: cuda
Loading data from: /content/drive/MyDrive/qa_dataset.json
Full dataset size: 25061
Dataset split -> Train: 15036, Val: 5012, Test: 5013
Loading tokenizer and model from: VietAI/vit5-base
Tokenizer & model loaded successfully.
Tokenizing datasets...


Map:   0%|          | 0/15036 [00:00<?, ? examples/s]



Map:   0%|          | 0/5012 [00:00<?, ? examples/s]

Map:   0%|          | 0/5013 [00:00<?, ? examples/s]

Tokenization completed.
Starting supervised fine-tuning...
Training for 3 epochs


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu
1,1.0928,0.911832,0.530599,0.438785,0.500257,0.293517
2,0.8015,0.717887,0.595429,0.515559,0.570248,0.381296
3,0.505,0.632861,0.626525,0.554695,0.604317,0.41941


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed in 9001.80 seconds
Evaluating on validation set...


[VAL METRICS] {'eval_loss': 0.6328614950180054, 'eval_rouge1': 0.6265246376752363, 'eval_rouge2': 0.5546954508863142, 'eval_rougeL': 0.6043173686529222, 'eval_bleu': 0.4194102292236637, 'eval_runtime': 2109.6005, 'eval_samples_per_second': 2.376, 'eval_steps_per_second': 0.594, 'epoch': 3.0}
Evaluating on test set...


[TEST METRICS] {'eval_loss': 0.5971229076385498, 'eval_rouge1': 0.6234157841149565, 'eval_rouge2': 0.5515792547075642, 'eval_rougeL': 0.6016187553373664, 'eval_bleu': 0.41691615559452244, 'eval_runtime': 2148.0718, 'eval_samples_per_second': 2.334, 'eval_steps_per_second': 0.584, 'epoch': 3.0}
Saving best model & tokenizer to: /content/drive/MyDrive/vit5-qa-model
Training summary saved to: /content/drive/MyDrive/vit5-qa-model/training_summary.json
