In [21]:
!pip install transformers datasets torch sentencepiece




In [22]:
#from google.colab import files
#uploaded = files.upload()


In [28]:
from datasets import load_dataset, DatasetDict

# Load your dataset from CSV with utf-8 encoding
dataset = load_dataset(
    'csv',
    data_files='/kaggle/input/t5-training-data-full/t5_training_data_full.csv',
    split='train',
    encoding='utf-8'
)

# First, split off 10% as test set
train_val, test = dataset.train_test_split(test_size=0.1, seed=42).values()

# Then split remaining 90% into train (81%) and validation (9%)
train, validation = train_val.train_test_split(test_size=0.1, seed=42).values()

# Bundle everything into a DatasetDict
final_dataset = DatasetDict({
    'train': train,
    'validation': validation,
    'test': test
})

print(final_dataset)



DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 36699
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 4078
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 4531
    })
})


In [29]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
model_name = "NlpHUST/t5-en-vi-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move model to CUDA
model.to(device)

print(f"Model loaded on {device}")



Model loaded on cuda


In [25]:
# text = "Tôi thích đọc sách."

# # Tokenize
# tokenized = tokenizer(text)

# # Print tokenized output
# print("Input Text:", text)
# print("Token IDs:", tokenized["input_ids"])
# print("Tokens:", [tokenizer.convert_ids_to_tokens(id) for id in tokenized["input_ids"]])

# # Decode back to text
# decoded_text = tokenizer.decode(tokenized["input_ids"], skip_special_tokens=True)
# print("Decoded Text:", decoded_text)

In [30]:
max_source_length = 512
max_target_length = 128

# Preprocessing function
def preprocess_function(examples):
    # Tokenize source (English)
    model_inputs = tokenizer(
        examples['source'],
        max_length=max_source_length,
        padding='max_length',
        truncation=True
    )

    # Tokenize target (Vietnamese)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target'],
            max_length=max_target_length,
            padding='max_length',
            truncation=True
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs


# Apply tokenization to all splits
tokenized_datasets = final_dataset.map(preprocess_function, batched=True)


In [40]:
from transformers import TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./t5-finetuned-en-vi",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    fp16=True if torch.cuda.is_available() else False,  # Enable mixed precision training if using CUDA
    push_to_hub=False
)



In [42]:
from transformers import TrainerCallback

class PrintLossCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"\n✅ Epoch {state.epoch:.0f} Finished — Training Loss: {state.log_history[-1]['loss']}")


In [43]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer
)




  trainer = Trainer(


In [45]:
import wandb

wandb.login(key="c1b0248db039145f0457d72f94404dd6c3002ef3")  # Replace with your actual API key

wandb.init(project="t5_evbc_translation", name="t5-small-finetune-run")





In [46]:
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.4026,0.336794
2,0.3914,0.332376
3,0.3791,0.331211
4,0.3752,0.329649
5,0.375,0.329582




TrainOutput(global_step=11470, training_loss=0.42889901837163524, metrics={'train_runtime': 14489.3975, 'train_samples_per_second': 12.664, 'train_steps_per_second': 0.792, 'total_flos': 9.70229245280256e+16, 'train_loss': 0.42889901837163524, 'epoch': 5.0})

In [47]:
trainer.save_model('./t5_evb_translation_model')
tokenizer.save_pretrained('./t5_evb_translation_model')


('./t5_evb_translation_model/tokenizer_config.json',
 './t5_evb_translation_model/special_tokens_map.json',
 './t5_evb_translation_model/spiece.model',
 './t5_evb_translation_model/added_tokens.json')

In [55]:
test_sentences = [
    "Translate English to Vietnamese: gay",
    "Translate English to Vietnamese: I love reading books.",
    "Translate English to Vietnamese: Hưng is a fat guy ."
]

for sentence in test_sentences:
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_length=50)
    print(f"Input: {sentence}")
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(output_text.encode('utf-8').decode())



Input: Translate English to Vietnamese: gay
đồng tính
Input: Translate English to Vietnamese: I love reading books.
Tôi thích đọc sách.
Input: Translate English to Vietnamese: Hưng is a fat guy .
Hưng là một chàng trai béo .


In [49]:
from datasets import load_metric
import numpy as np

# Load evaluation metrics
rouge = load_metric("rouge")
bleu = load_metric("bleu")

# Function to generate translations and compute metrics
def evaluate_model(model, tokenizer, test_dataset):
    predictions = []
    references = []

    for example in test_dataset:
        # Encode input text
        input_ids = tokenizer(example['source'], return_tensors="pt", padding=True, truncation=True, max_length=max_source_length).input_ids.to(device)
        
        # Generate translation
        output_ids = model.generate(input_ids, max_length=max_target_length)
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Append results
        predictions.append(translated_text)
        references.append([example['target']])  # BLEU expects list of references

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=[p.split() for p in predictions], references=[r[0].split() for r in references])

    # Compute ROUGE score
    rouge_score = rouge.compute(predictions=predictions, references=references)

    print("\n🔹 **Evaluation Results** 🔹")
    print(f"BLEU Score: {bleu_score['bleu']:.4f}")
    print(f"ROUGE Score: {rouge_score}")

# Run evaluation on test set
evaluate_model(model, tokenizer, tokenized_datasets["test"])


ImportError: cannot import name 'load_metric' from 'datasets' (/usr/local/lib/python3.10/dist-packages/datasets/__init__.py)