In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    T5ForConditionalGeneration, 
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import os
from tqdm import tqdm
import numpy as np
import sacrebleu

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PreTokenizedDataset(Dataset):
    """Map-style dataset over already-tokenized, index-aligned sequences.

    The three containers are stored exactly as given. Sample ``idx`` is built
    by indexing each container at ``idx``.
    """

    # Attribute names double as the keys of every returned sample.
    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = (
            input_ids,
            attention_mask,
            labels,
        )

    def __len__(self):
        # The dataset size is taken from the encoder inputs alone.
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict keyed by field name."""
        return {field: getattr(self, field)[idx] for field in self._FIELDS}

# Data preparation helper (tokenizes the whole corpus up front)
def prepare_data(vi_file, en_file, tokenizer, max_length=256):
    """Read a parallel vi/en corpus and wrap it in a ``PreTokenizedDataset``.

    Both files are read as UTF-8, one sentence per line, with surrounding
    whitespace stripped. Each Vietnamese sentence gets the "vi-en: " prefix
    before tokenization. Source and target sides are each tokenized in a
    single call, padded to ``max_length`` and truncated.

    Args:
        vi_file: Path to the Vietnamese (source) text file.
        en_file: Path to the English (target) text file.
        tokenizer: Callable tokenizer that also exposes ``pad_token_id``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A ``PreTokenizedDataset`` holding the source ``input_ids``, the source
        ``attention_mask`` and the target ids as ``labels``.

    Raises:
        AssertionError: If the two files do not have the same number of lines.
    """
    def _read_stripped(path):
        # One sentence per line; strip newline and surrounding spaces.
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle]

    vi_texts = _read_stripped(vi_file)
    en_texts = _read_stripped(en_file)
    assert len(vi_texts) == len(en_texts), "Số lượng câu không khớp!"

    # Identical tokenizer settings for both sides of the corpus.
    encode_kwargs = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    src_enc = tokenizer(["vi-en: " + sentence for sentence in vi_texts], **encode_kwargs)
    tgt_enc = tokenizer(en_texts, **encode_kwargs)

    # Replace padding ids in the targets with -100, the label value the loss
    # ignores. This edits the tokenizer's output tensor in place.
    labels = tgt_enc["input_ids"]
    pad_positions = labels == tokenizer.pad_token_id
    labels[pad_positions] = -100

    return PreTokenizedDataset(
        src_enc["input_ids"],
        src_enc["attention_mask"],
        labels,
    )

In [3]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    For every batch: forward pass with labels, backward pass, gradient-norm
    clipping at 1.0, then one optimizer step followed by one scheduler step.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Yields dict batches with those three keys.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    batch_keys = ('input_ids', 'attention_mask', 'labels')

    batches = tqdm(dataloader, desc="Training")
    for batch in batches:
        # Move the batch tensors onto the target device
        model_inputs = {key: batch[key].to(device) for key in batch_keys}

        # Forward pass
        loss = model(**model_inputs).loss
        step_loss = loss.item()
        running_loss += step_loss

        # Backward pass: clear stale gradients, backpropagate, clip, update
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batches.set_postfix({'loss': step_loss})

    return running_loss / len(dataloader)

In [4]:
def evaluate_bleu(model, dataloader, tokenizer, device, max_samples=None,
                  max_gen_length=512, num_beams=4):
    """Compute the average loss and corpus-level SacreBLEU of ``model``.

    Args:
        model: Seq2seq model supporting a forward pass with ``labels`` (output
            exposes ``.loss``) and ``generate``.
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``
            and ``labels`` (padding positions in ``labels`` set to -100).
        tokenizer: Tokenizer used to decode predictions and references.
        device: Device the batch tensors are moved to.
        max_samples: Upper bound on the number of *batches* to evaluate.
            NOTE: despite its name this has always counted batches, not
            sentences; the name is kept for backward compatibility.
            ``None`` or ``0`` evaluates the whole dataloader.
        max_gen_length: ``max_length`` passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        Tuple ``(avg_loss, bleu_score, predictions[:5], references[:5])``.
        ``avg_loss`` is the mean loss over the batches actually evaluated and
        ``bleu_score`` is ``sacrebleu``'s corpus score (0-100 scale, do not
        multiply by 100). If no batch was evaluated the result is
        ``(nan, 0.0, [], [])``.

    Note:
        References are decoded from the label ids, so if the labels were
        truncated at tokenization time (as ``prepare_data`` does) the
        references are truncated too. Scores can therefore differ from an
        evaluation against the raw reference text.
    """
    model.eval()
    predictions = []
    references = []
    total_loss = 0.0
    num_batches = 0  # batches actually processed; denominator of avg_loss

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating BLEU"):
            if max_samples and num_batches >= max_samples:
                break

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Teacher-forced loss
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_loss += outputs.loss.item()
            num_batches += 1

            # Beam-search predictions
            generated = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_gen_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False
            )
            predictions.extend(
                tokenizer.batch_decode(generated, skip_special_tokens=True)
            )

            # Restore pad ids where the labels hold -100 so they can be
            # decoded. masked_fill is out-of-place, so the batch is untouched.
            reference_ids = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            references.extend(
                tokenizer.batch_decode(reference_ids, skip_special_tokens=True)
            )

    if num_batches == 0:
        # Nothing to score: avoid a ZeroDivisionError / scoring empty corpora.
        return float('nan'), 0.0, [], []

    # Corpus-level BLEU via SacreBLEU (single reference stream)
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    bleu_score = bleu.score

    avg_loss = total_loss / num_batches

    return avg_loss, bleu_score, predictions[:5], references[:5]  # Return 5 examples

In [None]:
# --- Configuration ---------------------------------------------------------
MODEL_NAME = "VietAI/vit5-base"
BATCH_SIZE = 8
LEARNING_RATE = 1e-4
NUM_EPOCHS = 1
MAX_LENGTH = 128
OUTPUT_DIR = "./vit5_finetuned_vi_to_en"
SEED = 42
# evaluate_bleu's `max_samples` argument stops after that many *batches*, so
# each epoch is validated on at most EVAL_MAX_BATCHES * BATCH_SIZE sentences.
EVAL_MAX_BATCHES = 100

# Seed PyTorch's global RNG so the shuffled batch order is repeatable.
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device)

# Build datasets (the whole corpus is tokenized up front)
train_dataset = prepare_data("Released Corpus/train.vi.txt", "Released Corpus/train.en.txt", tokenizer, MAX_LENGTH)
test_dataset = prepare_data("Released Corpus/test.vi.txt", "Released Corpus/test.en.txt", tokenizer, MAX_LENGTH)

# Build dataloaders.
# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
use_pinned_memory = device.type == 'cuda'

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=use_pinned_memory
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=use_pinned_memory
)

# Optimizer and scheduler (linear warmup over the first 10% of steps)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

print(f"Total training steps: {total_steps}")
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Training loop
best_val_loss = float('inf')
best_bleu = 0
best_epoch = None  # stays None until a checkpoint has actually been written

print("Starting training...")

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")

    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    print(f"Training Loss: {train_loss:.4f}")

    # NOTE(review): the test split doubles as the validation set here, so the
    # checkpoint is selected on test data. Use a separate dev split if one
    # is available.
    val_loss, bleu_score, sample_preds, sample_refs = evaluate_bleu(
        model, test_dataloader, tokenizer, device, max_samples=EVAL_MAX_BATCHES
    )

    print(f"Validation Loss: {val_loss:.4f}")
    # SacreBLEU already reports on a 0-100 scale; it must not be rescaled.
    print(f"BLEU Score: {bleu_score:.4f}")

    print("\nSample translations:")
    for i in range(min(3, len(sample_preds))):
        print(f"Pred: {sample_preds[i]}")
        print(f"Ref:  {sample_refs[i]}")
        print()

    if bleu_score > best_bleu:
        best_bleu = bleu_score
        best_val_loss = val_loss
        best_epoch = epoch + 1
        print("New best BLEU score! Saving model...")

        os.makedirs(OUTPUT_DIR, exist_ok=True)

        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)

        with open(os.path.join(OUTPUT_DIR, "training_info.txt"), "w", encoding="utf-8") as f:
            f.write(f"Best BLEU Score: {best_bleu:.4f}\n")
            f.write(f"Best Epoch: {best_epoch}\n")

print("\nTraining completed!")
print(f"Best BLEU Score: {best_bleu:.4f}")
if best_epoch is not None:
    print(f"Model saved to: {OUTPUT_DIR}")
else:
    # A checkpoint is only written when BLEU exceeds the initial best of 0.
    print("No checkpoint was saved (BLEU never exceeded 0).")

Using device: cuda
Loading tokenizer and model...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Total training steps: 62500
Training samples: 500000
Test samples: 3000
Starting training...

Epoch 1/1


Training: 100%|██████████| 62500/62500 [4:37:42<00:00,  3.75it/s, loss=0.978]  


Training Loss: 1.4559


Evaluating BLEU:  27%|██▋       | 100/375 [02:24<06:36,  1.44s/it]


Validation Loss: 0.8961
BLEU Score: 37.4292 (3742.9)

Sample translations:
Pred: Knowledge and practice of people with health insurance cards in using medical examination and treatment services in public health facilities and some influencing factors in Viêng Chăn, Lao, 2017
Ref:  Knowledge, practices in public health service utilization among health insurance cards holders and influencing factors in Vientiane, Lao

Pred: To describe the current situation of knowledge and practice of people with health insurance cards in the use of medical examination and treatment services in public health facilities and some related factors in Viêng Chăn Province, Lao People's Democratic Republic of Lao in 2017.
Ref:  Describe knowledge, practices in public health service utilization among health insurance card's holders and influencing factors in Vientiane, Lao PDR, 2017.

Pred: Methods: A cross-sectional descriptive study was conducted on 928 adults with health insurance cards in Phone Hong and Keo

In [6]:
def final_test_evaluation(model_path, test_vi_file, test_en_file,
                          max_length=512, num_beams=4, batch_size=1):
    """Translate the full test set with a saved model and report SacreBLEU.

    Loads the tokenizer and model from ``model_path``, translates every line
    of ``test_vi_file`` (prefixed with "vi-en: "), scores the translations
    against the raw lines of ``test_en_file`` and writes a summary plus the
    first 10 examples to ``<model_path>/final_test_results.txt``.

    Args:
        model_path: Directory holding the saved model and tokenizer; the
            results file is written into it as well.
        test_vi_file: UTF-8 file with one Vietnamese source sentence per line.
        test_en_file: UTF-8 file with one English reference per line.
        max_length: Used both to truncate the tokenized input and as the
            ``max_length`` passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.
        batch_size: Number of sentences translated per ``generate`` call. The
            default of 1 reproduces the original sentence-by-sentence
            behaviour; larger values are padded to the longest sentence in
            the batch and run considerably faster.

    Returns:
        The corpus BLEU score (SacreBLEU, 0-100 scale).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if the two test
            files do not contain the same number of lines.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("FINAL EVALUATION ON FULL TEST SET")

    # Load the test data first so a misaligned corpus fails before the model
    # is loaded and any translation time is spent.
    with open(test_en_file, 'r', encoding='utf-8') as f:
        test_en_texts = [line.strip() for line in f]

    with open(test_vi_file, 'r', encoding='utf-8') as f:
        test_vi_texts = [line.strip() for line in f]

    if len(test_vi_texts) != len(test_en_texts):
        raise ValueError(
            f"Test files are not line-aligned: {len(test_vi_texts)} source "
            f"lines vs {len(test_en_texts)} reference lines"
        )

    # Load model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    model.to(device)
    model.eval()

    predictions = []
    references = test_en_texts

    print(f"Translating {len(test_vi_texts)} test sentences...")

    with torch.no_grad():
        for start in tqdm(range(0, len(test_vi_texts), batch_size), desc="Translating"):
            # Add the task prefix
            batch_texts = [
                f"vi-en: {vi_text}"
                for vi_text in test_vi_texts[start:start + batch_size]
            ]

            # Tokenize; padding=True pads to the longest sentence in the batch
            # (a no-op when batch_size is 1).
            inputs = tokenizer(
                batch_texts,
                return_tensors='pt',
                max_length=max_length,
                truncation=True,
                padding=True
            ).to(device)

            # Generate
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False
            )

            # Decode
            predictions.extend(
                tokenizer.batch_decode(outputs, skip_special_tokens=True)
            )

    # Corpus-level BLEU via SacreBLEU (single reference stream)
    bleu = sacrebleu.corpus_bleu(predictions, [references])

    print("FINAL TEST RESULTS:")
    print(f"Total test samples: {len(test_vi_texts)}")
    print(f"BLEU Score: {bleu.score:.2f}")
    print(f"BLEU Details: {bleu}")

    # Save the results next to the model
    results_file = os.path.join(model_path, "final_test_results.txt")
    with open(results_file, "w", encoding='utf-8') as f:
        f.write("Final Test Results\n")
        f.write(f"Total test samples: {len(test_en_texts)}\n")
        f.write(f"BLEU Score: {bleu.score:.2f}\n")
        f.write(f"BLEU Details: {bleu}\n")
        for i in range(min(10, len(predictions))):
            f.write(f"\nExample {i+1}:\n")
            f.write(f"VI: {test_vi_texts[i]}\n")
            f.write(f"Pred: {predictions[i]}\n")
            f.write(f"Ref: {references[i]}\n")

    print(f"Detailed results saved to: {results_file}")

    return bleu.score

In [7]:
# Final evaluation on the full test set.
# The paths are literals, so this cell does not depend on variables defined
# in the training cell.
if os.path.exists("./vit5_finetuned_vi_to_en"):
    final_bleu = final_test_evaluation("./vit5_finetuned_vi_to_en", "Released Corpus/test.vi.txt", "Released Corpus/test.en.txt")
    print(f"\nFinal BLEU-4 score on full test set: {final_bleu:.4f}")
else:
    # Report the skip instead of finishing the cell with no output at all.
    print("Model directory ./vit5_finetuned_vi_to_en not found; skipping final evaluation.")

FINAL EVALUATION ON FULL TEST SET
Translating 3000 test sentences...


Translating: 100%|██████████| 3000/3000 [30:57<00:00,  1.61it/s]


FINAL TEST RESULTS:
Total test samples: 3000
BLEU Score: 34.17
BLEU Details: BLEU = 34.17 65.8/41.5/28.7/20.8 (BP = 0.957 ratio = 0.958 hyp_len = 73367 ref_len = 76604)
Detailed results saved to: ./vit5_finetuned_vi_to_en\final_test_results.txt

Final BLEU-4 score on full test set: 34.1713
