In [None]:
!pip install -q -U rouge_score evaluate transformers datasets
!pip install -U -q tqdm

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m511.6/511.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m47.7/47.7 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import inspect
import json
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List

import evaluate
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Mount Google Drive at /content/drive (Colab only); the dataset split files and
# the model output directory referenced in this notebook live under that mount.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Pick the compute device once and report it.
cuda_available = torch.cuda.is_available()
if cuda_available:
    try:
        print('CUDA device name:', torch.cuda.get_device_name(0))
    except Exception as exc:
        # The device name is a best-effort diagnostic only: report the failure
        # instead of hiding it, but never abort the notebook because of it.
        print('CUDA device name unavailable:', exc)

device = torch.device("cuda" if cuda_available else "cpu")
print("Using: ", device)

CUDA device name: Tesla T4
Using:  cuda


# Configuration

In [None]:
@dataclass
class QAConfig:
    """Settings for one question-answering fine-tuning run.

    Every field has a default, so ``QAConfig()`` is a complete configuration and
    individual values can be overridden by keyword, e.g. ``QAConfig(num_epochs=4)``.
    """

    # --- Model & tokenizer -------------------------------------------------
    model_name: str = "VietAI/vit5-base"  # checkpoint id passed to from_pretrained()

    # --- Data locations and split ratios ------------------------------------
    data_path: str = "/content/drive/MyDrive/qa_dataset.json"
    output_dir: str = "/content/drive/MyDrive/vit5-qa-model"
    train_size: float = 0.6
    val_size: float = 0.2
    test_size: float = 0.2

    # --- Tokenization limits (in tokens) ------------------------------------
    max_input_length: int = 256
    max_target_length: int = 128

    # --- Training hyper-parameters ------------------------------------------
    num_epochs: int = 5
    batch_size: int = 4
    gradient_accumulation_steps: int = 1
    learning_rate: float = 5e-4
    weight_decay: float = 0.01
    warmup_steps: int = 0

    # --- Logging, evaluation and checkpointing ------------------------------
    logging_steps: int = 50
    eval_strategy: str = "epoch"   # evaluate once per epoch
    save_strategy: str = "epoch"   # save a checkpoint once per epoch
    metric_for_best_model: str = "eval_rougeL"
    greater_is_better: bool = True

    # --- Generation settings used during evaluation -------------------------
    generation_max_length: int = 128
    generation_num_beams: int = 4

    # --- Reproducibility ----------------------------------------------------
    seed: int = 42

# Training Pipeline

In [None]:
class QASeq2SeqPipeline:
    """Fine-tune and evaluate a seq2seq model for question answering.

    The pipeline loads pre-built train/validation/test JSONL splits, formats
    each example as ``"question: ... context: ..."`` -> ``answer``, fine-tunes
    the checkpoint named in the config with ``Seq2SeqTrainer``, reports
    ROUGE/BLEU on the validation and test splits, and saves the model, the
    tokenizer and a JSON summary to ``config.output_dir``.
    """

    # Default locations of the fixed dataset splits.
    DEFAULT_TRAIN_PATH = "/content/drive/MyDrive/train_full.jsonl"
    DEFAULT_VAL_PATH = "/content/drive/MyDrive/val.jsonl"
    DEFAULT_TEST_PATH = "/content/drive/MyDrive/test.jsonl"

    def __init__(self, config: "QAConfig"):
        """Store the configuration and load the ROUGE and BLEU metrics.

        Args:
            config: Run settings (model name, lengths, hyper-parameters, paths).
        """
        self.config = config

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Components that will be initialized later
        self.tokenizer = None
        self.model = None
        self.dataset_dict: DatasetDict = None
        self.tokenized_datasets: DatasetDict = None

        # Metrics
        self.rouge = evaluate.load("rouge")
        self.bleu = evaluate.load("bleu")

    # ------------------------------
    # Data loading / splitting
    # ------------------------------
    @staticmethod
    def _load_jsonl(path: str) -> List[Dict[str, Any]]:
        """Read a JSON Lines file into a list of records, skipping blank lines.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON; the
                message is prefixed with the file path and line number.
        """
        rows = []
        with open(path, "r", encoding="utf-8") as f:
            for line_no, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    # A trailing newline or an empty line is not a record.
                    continue
                try:
                    rows.append(json.loads(line))
                except json.JSONDecodeError as exc:
                    raise json.JSONDecodeError(
                        f"{path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                    ) from exc
        return rows

    def load_and_split_data(self, train_path=None, val_path=None, test_path=None):
        """Load the fixed train/validation/test splits instead of splitting randomly.

        Args:
            train_path: JSONL file with the training split
                (defaults to ``DEFAULT_TRAIN_PATH``).
            val_path: JSONL file with the validation split
                (defaults to ``DEFAULT_VAL_PATH``).
            test_path: JSONL file with the test split
                (defaults to ``DEFAULT_TEST_PATH``).

        Raises:
            FileNotFoundError: If any of the split files does not exist.
            ValueError: If any split contains no records.
        """
        train_path = train_path or self.DEFAULT_TRAIN_PATH
        val_path = val_path or self.DEFAULT_VAL_PATH
        test_path = test_path or self.DEFAULT_TEST_PATH

        print("üì• Loading fixed dataset splits...")

        missing = [p for p in (train_path, val_path, test_path) if not os.path.exists(p)]
        if missing:
            raise FileNotFoundError(
                "Kh√¥ng t√¨m th·∫•y b·ªô split c·ªë ƒë·ªãnh. H√£y t·∫°o tr∆∞·ªõc b·∫±ng baseline splitting."
                f" Missing: {missing}"
            )

        train_data = self._load_jsonl(train_path)
        val_data = self._load_jsonl(val_path)
        test_data = self._load_jsonl(test_path)

        print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

        empty = [
            name
            for name, rows in (("train", train_data), ("validation", val_data), ("test", test_data))
            if not rows
        ]
        if empty:
            # An empty split has no columns, which would only fail later during tokenization.
            raise ValueError(f"Empty dataset split(s): {empty}")

        # Convert to Hugging Face datasets
        self.dataset_dict = DatasetDict({
            "train": Dataset.from_list(train_data),
            "validation": Dataset.from_list(val_data),
            "test": Dataset.from_list(test_data)
        })

        print("‚úÖ Loaded fixed dataset splits successfully.")

    # ------------------------------
    # Tokenizer & model
    # ------------------------------
    def init_tokenizer_and_model(self):
        """Load the tokenizer and seq2seq model named in the config onto ``self.device``."""
        print(f"Loading tokenizer and model from: {self.config.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_name)
        self.model.to(self.device)

        # Padding needs a pad token; fall back to EOS when the tokenizer has none.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("Tokenizer & model loaded successfully.")

    # ------------------------------
    # Preprocessing
    # ------------------------------
    def preprocess_function(self, examples: Dict[str, List[str]]) -> Dict[str, Any]:
        """Tokenize a batch of examples into model inputs and labels.

        Args:
            examples: Batch with ``"question"``, ``"context"`` and ``"answer"``
                columns (lists of equal length).

        Returns:
            The tokenizer output for the inputs with an added ``"labels"`` entry
            holding the token ids of the answers. Sequences are truncated but
            NOT padded here: padding is done per batch by
            ``DataCollatorForSeq2Seq`` (see ``build_trainer``), which pads labels
            with -100 so padded positions are ignored by the loss.
        """
        # Format input: "question: ... context: ..."
        inputs = [
            f"question: {q} context: {c}"
            for q, c in zip(examples["question"], examples["context"])
        ]
        targets = examples["answer"]

        # Encode inputs
        model_inputs = self.tokenizer(
            inputs,
            max_length=self.config.max_input_length,
            truncation=True,
        )

        # Encode targets; `text_target` replaces the deprecated
        # `as_target_tokenizer()` context manager.
        labels = self.tokenizer(
            text_target=targets,
            max_length=self.config.max_target_length,
            truncation=True,
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def tokenize_datasets(self):
        """Tokenize every split of ``self.dataset_dict`` into ``self.tokenized_datasets``.

        Raises:
            ValueError: If the dataset has not been loaded yet.
        """
        if self.dataset_dict is None:
            raise ValueError("Dataset ch∆∞a ƒë∆∞·ª£c load. G·ªçi load_and_split_data() tr∆∞·ªõc.")

        print("Tokenizing datasets...")
        self.tokenized_datasets = self.dataset_dict.map(
            self.preprocess_function,
            batched=True,
            remove_columns=["context", "question", "answer"],
        )
        print("Tokenization completed.")

    # ------------------------------
    # Metrics
    # ------------------------------
    @staticmethod
    def _whitespace_tokenize(text: str) -> List[str]:
        """Lower-case and split on whitespace, keeping accented characters intact."""
        return text.lower().split()

    def compute_metrics(self, eval_pred):
        """Compute ROUGE-1/2/L and BLEU from generated token ids.

        Args:
            eval_pred: ``(predictions, label_ids)`` pair; ``predictions`` may
                itself be a tuple whose first element holds the token ids.

        Returns:
            Dict with float values for ``rouge1``, ``rouge2``, ``rougeL`` and ``bleu``.
        """
        pred_ids, label_ids = eval_pred

        # Some Trainer versions return a tuple
        if isinstance(pred_ids, tuple):
            pred_ids = pred_ids[0]

        pad_id = self.tokenizer.pad_token_id
        pred_ids = np.asarray(pred_ids, dtype=np.int64)
        label_ids = np.asarray(label_ids, dtype=np.int64)

        # Replace ids that cannot be decoded with the pad id. np.where builds new
        # arrays, so the arrays owned by the caller are left untouched.
        label_ids = np.where(label_ids == -100, pad_id, label_ids)
        vocab_size = self.model.config.vocab_size
        pred_ids = np.where((pred_ids < 0) | (pred_ids >= vocab_size), pad_id, pred_ids)

        # Decode
        decoded_preds = self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        decoded_preds = [p.strip() for p in decoded_preds]
        decoded_labels = [l.strip() for l in decoded_labels]

        # ROUGE: the default rouge_score tokenizer keeps only [a-z0-9] and the
        # stemmer is English-only, which mangles Vietnamese text. Score on
        # whitespace tokens instead.
        rouge_result = self.rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            use_stemmer=False,
            tokenizer=self._whitespace_tokenize,
        )

        # BLEU (evaluate library); degenerate input (e.g. all references empty)
        # divides by zero inside the metric, which is reported as a score of 0.
        try:
            bleu_result = self.bleu.compute(
                predictions=decoded_preds,
                references=[[r] for r in decoded_labels],
            )
            bleu_score = float(bleu_result.get("bleu", 0.0))
        except ZeroDivisionError:
            bleu_score = 0.0

        return {
            "rouge1": float(rouge_result.get("rouge1", 0.0)),
            "rouge2": float(rouge_result.get("rouge2", 0.0)),
            "rougeL": float(rouge_result.get("rougeL", 0.0)),
            "bleu": bleu_score,
        }

    # ------------------------------
    # Build trainer
    # ------------------------------
    def build_trainer(self) -> "Seq2SeqTrainer":
        """Create the ``Seq2SeqTrainer`` for the tokenized train/validation splits.

        Raises:
            ValueError: If the tokenizer/model or the tokenized datasets are missing.
        """
        if self.tokenizer is None or self.model is None:
            raise ValueError("Tokenizer/Model ch∆∞a ƒë∆∞·ª£c kh·ªüi t·∫°o.")

        if self.tokenized_datasets is None:
            raise ValueError("Dataset ch∆∞a ƒë∆∞·ª£c tokenize.")

        # Pads inputs and labels per batch; labels are padded with -100 so the
        # loss ignores the padded positions.
        data_collator = DataCollatorForSeq2Seq(
            self.tokenizer,
            model=self.model,
            label_pad_token_id=-100,
        )

        # TrainingArguments
        training_args = Seq2SeqTrainingArguments(
            output_dir=self.config.output_dir,
            eval_strategy=self.config.eval_strategy,
            save_strategy=self.config.save_strategy,
            per_device_train_batch_size=self.config.batch_size,
            per_device_eval_batch_size=self.config.batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            num_train_epochs=self.config.num_epochs,
            weight_decay=self.config.weight_decay,
            warmup_steps=self.config.warmup_steps,
            logging_steps=self.config.logging_steps,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model=self.config.metric_for_best_model,
            greater_is_better=self.config.greater_is_better,
            generation_max_length=self.config.generation_max_length,
            generation_num_beams=self.config.generation_num_beams,
            seed=self.config.seed,
            save_safetensors=True,  # save weights as model.safetensors
        )

        trainer_kwargs = dict(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_datasets["train"],
            eval_dataset=self.tokenized_datasets["validation"],
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
        )
        # Newer transformers releases renamed the `tokenizer` argument to
        # `processing_class`; use whichever name the installed version accepts.
        accepted = inspect.signature(Seq2SeqTrainer.__init__).parameters
        tokenizer_kwarg = "processing_class" if "processing_class" in accepted else "tokenizer"
        trainer_kwargs[tokenizer_kwarg] = self.tokenizer

        return Seq2SeqTrainer(**trainer_kwargs)

    # ------------------------------
    # Train & evaluate
    # ------------------------------
    def train_and_evaluate(self):
        """Run the full pipeline: load, tokenize, train, evaluate and save.

        Validation metrics keep the default ``eval_`` key prefix; test metrics
        are reported with a ``test_`` prefix so the two sets cannot be confused
        in logs or in the saved summary. Writes the model, the tokenizer and
        ``training_summary.json`` to ``config.output_dir``.
        """
        # 1) Data
        self.load_and_split_data()

        # 2) Tokenizer & model
        self.init_tokenizer_and_model()

        # 3) Tokenize datasets
        self.tokenize_datasets()

        # 4) Build trainer
        trainer = self.build_trainer()

        # 5) Train
        print("Starting supervised fine-tuning...")
        print(
            f"Training for {self.config.num_epochs} epochs"
        )

        start_time = time.perf_counter()
        train_result = trainer.train()
        training_time = time.perf_counter() - start_time
        print(f"Training completed in {training_time:.2f} seconds")

        # 6) Evaluate on validation & test
        print("Evaluating on validation set...")
        val_metrics = trainer.evaluate(self.tokenized_datasets["validation"])
        print("[VAL METRICS]", val_metrics)

        print("Evaluating on test set...")
        test_metrics = trainer.evaluate(
            self.tokenized_datasets["test"],
            metric_key_prefix="test",
        )
        print("[TEST METRICS]", test_metrics)

        # 7) Save the best model to output_dir
        os.makedirs(self.config.output_dir, exist_ok=True)
        print(f"Saving best model & tokenizer to: {self.config.output_dir}")

        trainer.save_model(self.config.output_dir)
        self.tokenizer.save_pretrained(self.config.output_dir)

        # After save_model, output_dir contains:
        # - config.json
        # - tokenizer files
        # - pytorch_model.bin or model.safetensors (because save_safetensors=True)

        # 8) Also save the config & metrics
        summary = {
            "config": dict(vars(self.config)),
            "training_metrics": train_result.metrics,
            "val_metrics": val_metrics,
            "test_metrics": test_metrics,
        }
        summary_path = os.path.join(self.config.output_dir, "training_summary.json")
        with open(summary_path, "w", encoding="utf-8") as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)

        print(f"Training summary saved to: {summary_path}")

# Main: chạy thực nghiệm (run the experiment)

In [None]:
# Number of training epochs for this run; passed to the config below.
EPOCHS = 4

config = QAConfig(num_epochs=EPOCHS)
pipeline = QASeq2SeqPipeline(config)
# Runs the whole pipeline: load splits, tokenize, fine-tune, evaluate and save.
pipeline.train_and_evaluate()

Using device: cuda
üì• Loading fixed dataset splits...
Train: 38167, Val: 2256, Test: 2507
‚úÖ Loaded fixed dataset splits successfully.
Loading tokenizer and model from: VietAI/vit5-base


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/904M [00:00<?, ?B/s]

Tokenizer & model loaded successfully.
Tokenizing datasets...


Map:   0%|          | 0/38167 [00:00<?, ? examples/s]



Map:   0%|          | 0/2256 [00:00<?, ? examples/s]

Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

Tokenization completed.


  trainer = Seq2SeqTrainer(


Starting supervised fine-tuning...
Training for 4 epochs


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhmy207204[0m ([33mhmy207204-ton-duc-thang-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu
1,1.0031,0.792009,0.553356,0.477965,0.529846,0.313177
2,0.8799,0.630733,0.604908,0.535625,0.584259,0.375418
3,0.7078,0.553311,0.644976,0.57703,0.6251,0.441274
4,0.5013,0.498241,0.672107,0.605884,0.65352,0.483636


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training completed in 12722.27 seconds
Evaluating on validation set...


[VAL METRICS] {'eval_loss': 0.49824124574661255, 'eval_rouge1': 0.6721066100470252, 'eval_rouge2': 0.605883869457333, 'eval_rougeL': 0.6535203064680593, 'eval_bleu': 0.48363597251964235, 'eval_runtime': 847.6534, 'eval_samples_per_second': 2.661, 'eval_steps_per_second': 0.665, 'epoch': 4.0}
Evaluating on test set...
[TEST METRICS] {'eval_loss': 0.519686222076416, 'eval_rouge1': 0.677169841055747, 'eval_rouge2': 0.6051672484225699, 'eval_rougeL': 0.6573929100838423, 'eval_bleu': 0.4939461509247145, 'eval_runtime': 965.6671, 'eval_samples_per_second': 2.596, 'eval_steps_per_second': 0.649, 'epoch': 4.0}
Saving best model & tokenizer to: /content/drive/MyDrive/vit5-qa-model
Training summary saved to: /content/drive/MyDrive/vit5-qa-model/training_summary.json
