In [9]:
pip install transformers datasets torch sentencepiece


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from datasets import DatasetDict, load_dataset

# Read the CSV and keep the 'train' split that load_dataset returns for it.
dataset = load_dataset('csv', data_files='t5_training_data_full.csv')['train']

# Hold out 10% of all rows as the test set.
outer_split = dataset.train_test_split(test_size=0.1, seed=42)
train_val, test = outer_split['train'], outer_split['test']

# Take 10% of the remaining 90% for validation: ~81% train / ~9% validation overall.
inner_split = train_val.train_test_split(test_size=0.1, seed=42)
train, validation = inner_split['train'], inner_split['test']

# Collect the three splits under their conventional names.
dataset_dict = {'train': train, 'validation': validation, 'test': test}
final_dataset = DatasetDict(dataset_dict)

print(final_dataset)


DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 36699
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 4078
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 4531
    })
})


In [11]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# The tokenizer and the seq2seq model are loaded from the same pretrained checkpoint.
checkpoint = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
max_source_length = 512
max_target_length = 128

# Label value skipped by the cross-entropy loss (PyTorch's default ignore_index).
LABEL_IGNORE_INDEX = -100


def preprocess_function(examples):
    """Tokenize a batch of source/target pairs into model inputs and labels.

    Args:
        examples: Batch mapping with 'source' and 'target' entries, each a
            list of strings (the form ``Dataset.map(batched=True)`` passes in).

    Returns:
        The tokenizer output for the sources with an extra 'labels' entry that
        holds the target token ids. Padding positions in the labels are set to
        -100 so they do not contribute to the training loss.
    """
    # Tokenize the source side, padded/truncated to a fixed length.
    model_inputs = tokenizer(
        examples['source'],
        max_length=max_source_length,
        padding='max_length',
        truncation=True,
    )

    # Tokenize the target side. `text_target=` is the replacement for the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['target'],
        max_length=max_target_length,
        padding='max_length',
        truncation=True,
    )

    # Targets are padded to max_target_length; without masking, every pad
    # position would be scored by the loss as if it were a real token.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token if token != pad_id else LABEL_IGNORE_INDEX for token in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs


# Apply tokenization to all splits
tokenized_datasets = final_dataset.map(preprocess_function, batched=True)


In [13]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
from transformers import Seq2SeqTrainingArguments

import torch

# fp16 mixed precision is only meant for GPU runs (see the original
# "Enable if running on GPU" note), so turn it on only when CUDA is present
# instead of hard-coding True; on a CUDA machine this is identical to before.
use_fp16 = torch.cuda.is_available()

training_args = Seq2SeqTrainingArguments(
    output_dir='./t5_evb_finetuned',
    evaluation_strategy='epoch',       # evaluate once per epoch
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,                # keep at most two checkpoints on disk
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=use_fp16,
    logging_steps=200,
    save_strategy='epoch',             # checkpoint once per epoch
    eval_accumulation_steps=4,
    report_to=["wandb"],               # send logs to Weights & Biases
    run_name="t5-small-finetune-run"   # name shown for the wandb run
)


In [15]:
from transformers import TrainerCallback

class PrintLossCallback(TrainerCallback):
    """Print the most recently logged training loss whenever an epoch ends."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Report the latest 'loss' value found in ``state.log_history``.

        The newest history entry is not guaranteed to carry a 'loss' key: the
        history can still be empty, or its last record can be of another kind
        (for example an evaluation record). Indexing ``[-1]['loss']`` would
        then raise IndexError/KeyError and abort training, so the history is
        scanned backwards for the latest entry that does have a loss.
        """
        latest_loss = next(
            (entry['loss'] for entry in reversed(state.log_history) if 'loss' in entry),
            None,
        )
        if latest_loss is None:
            print(f"\n✅ Epoch {state.epoch:.0f} Finished — Training Loss: n/a (no training loss logged yet)")
        else:
            print(f"\n✅ Epoch {state.epoch:.0f} Finished — Training Loss: {latest_loss}")


In [16]:
from transformers import Seq2SeqTrainer
# Pick the tokenized splits used for optimisation and for per-epoch evaluation.
train_split = tokenized_datasets['train']
validation_split = tokenized_datasets['validation']

# Callback instance that prints the training loss at the end of each epoch.
loss_printer = PrintLossCallback()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    tokenizer=tokenizer,
    callbacks=[loss_printer],
)




In [17]:
import wandb
# Start a Weights & Biases run in the given project under the given run name.
wandb.init(
    project="t5_evbc_translation",
    name="t5-small-finetune-run",
)



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mphamlonghai060504[0m ([33mphamlonghai060504-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [18]:
# Run the training loop of the `trainer` object built in an earlier cell.
# NOTE(review): the recorded output below shows ~14.7 s/it and a manual
# KeyboardInterrupt, so this run did not complete — confirm before relying
# on anything saved afterwards.
trainer.train()


  0%|          | 7/22940 [01:37<93:39:19, 14.70s/it]

KeyboardInterrupt: 

In [None]:
# Write the model (via the trainer) and the tokenizer files into the same folder.
export_dir = './t5_evb_translation_model'
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)
