In [1]:
pip install transformers datasets torch sentencepiece


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [2]:
from google.colab import files
# Prompt for a local file via Colab's upload helper. The recorded output shows
# the CSV being saved into the working directory, where the next cell loads it
# by file name.
uploaded = files.upload()


Saving t5_training_data_full.csv to t5_training_data_full.csv


In [3]:
from datasets import load_dataset, DatasetDict

# Read the uploaded CSV (UTF-8) as a single Dataset.
dataset = load_dataset(
    'csv',
    data_files='t5_training_data_full.csv',
    split='train',
    encoding='utf-8',
)

# Stage 1: hold out 10% of all rows as the test set.
outer_split = dataset.train_test_split(test_size=0.1, seed=42)
train_val = outer_split['train']
test = outer_split['test']

# Stage 2: take 10% of the remaining rows as validation, which leaves
# roughly 81% train / 9% validation / 10% test overall.
inner_split = train_val.train_test_split(test_size=0.1, seed=42)
train = inner_split['train']
validation = inner_split['test']

# Collect the three splits under their conventional names.
final_dataset = DatasetDict({
    'train': train,
    'validation': validation,
    'test': test,
})

print(final_dataset)



Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 36699
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 4078
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 4531
    })
})


In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
max_source_length = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize one batch of source/target pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``;
            the 'source' and 'target' entries are read.

    Returns:
        The tokenizer output for the sources (padded/truncated to
        ``max_source_length``) with an added 'labels' entry holding the
        target token ids (padded/truncated to ``max_target_length``), where
        every padding position is replaced by -100.
    """
    # Tokenize source (English)
    model_inputs = tokenizer(
        examples['source'],
        max_length=max_source_length,
        padding='max_length',
        truncation=True,
    )

    # Tokenize target (Vietnamese). `text_target=` is the supported
    # replacement for the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['target'],
        max_length=max_target_length,
        padding='max_length',
        truncation=True,
    )

    # The loss ignores label positions equal to -100. Without this masking
    # the model is also trained to predict the padding that fills every
    # target up to max_target_length.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Apply tokenization to all splits
tokenized_datasets = final_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/36699 [00:00<?, ? examples/s]



Map:   0%|          | 0/4078 [00:00<?, ? examples/s]

Map:   0%|          | 0/4531 [00:00<?, ? examples/s]

In [6]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters for the fine-tuning run. Evaluation and checkpointing both
# happen once per epoch; only the two most recent checkpoints are kept.
training_args = Seq2SeqTrainingArguments(
    output_dir='./t5_evb_finetuned',
    eval_strategy='epoch',  # current name of the deprecated `evaluation_strategy`
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    # Enable if running on GPU.
    # NOTE(review): T5 checkpoints are reported to overflow under fp16;
    # if the loss turns NaN, try bf16 or full precision -- confirm on this setup.
    fp16=True,
    logging_steps=200,
    save_strategy='epoch',
    eval_accumulation_steps=4,
    report_to=["wandb"],
    run_name="t5-small-finetune-run",
)




In [7]:
from transformers import TrainerCallback

class PrintLossCallback(TrainerCallback):
    """Print the most recently logged training loss when an epoch finishes."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Report the latest 'loss' entry found in ``state.log_history``.

        The history can be empty (no logging step reached yet) and its last
        record is not guaranteed to carry a 'loss' key, so the entries are
        scanned from newest to oldest instead of indexing ``[-1]['loss']``,
        which would raise IndexError/KeyError in those cases.
        """
        latest_loss = next(
            (entry['loss'] for entry in reversed(state.log_history) if 'loss' in entry),
            None,
        )
        if latest_loss is None:
            latest_loss = 'n/a (no training loss logged yet)'
        print(f"\n✅ Epoch {state.epoch:.0f} Finished — Training Loss: {latest_loss}")


In [8]:
from transformers import Seq2SeqTrainer
# Wire the model, hyperparameters and tokenized splits together.
# `processing_class` is the current name for the deprecated `tokenizer`
# argument; passing `tokenizer=` is what triggers the warning recorded in
# this cell's output.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    processing_class=tokenizer,
    callbacks=[PrintLossCallback()],
)




  trainer = Seq2SeqTrainer(


In [9]:
import wandb
# Open the Weights & Biases run that the trainer reports to.
wandb.init(
    project="t5_evbc_translation",
    name="t5-small-finetune-run",
)



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mphamlonghai060504[0m ([33mphamlonghai060504-hanoi-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# Start fine-tuning. The recorded run of this cell ended in a
# KeyboardInterrupt, so re-run it to completion before relying on the
# cells that follow.
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Write the model and the tokenizer into the same output directory.
export_dir = './t5_evb_translation_model'
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
# Smoke-test sentences.
# NOTE(review): these carry a "Translate English to Vietnamese: " prefix;
# confirm that the 'source' column used for training was prefixed the same way.
test_sentences = [
    "Translate English to Vietnamese: Hello, how are you?",
    "Translate English to Vietnamese: I love reading books.",
    "Translate English to Vietnamese: Vietnam is a beautiful country."
]

# Put the model in eval mode so dropout cannot perturb the generated output
# if the module was left in training mode.
model.eval()

for sentence in test_sentences:
    # Tokenize on the fly and move the tensors to the model's device.
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_length=50)
    print(f"Input: {sentence}")
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Python strings are already Unicode; the former encode/decode round trip
    # was a no-op and has been dropped.
    print(output_text)

