In [None]:
# Reference: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
!pip install datasets




In [2]:
!pip install evaluate



In [3]:
# Source: https://huggingface.co/docs/transformers/tasks/translation

In [4]:
from transformers import AutoTokenizer

model_name = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# prompt: code to get gpu device

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_name = "google-t5/t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [7]:
# Greedy search
src_sentence = "Translate English to Italian: The boy eats rice."

input_ids = tokenizer(src_sentence, return_tensors="pt").input_ids

outputs = model.generate(
    input_ids.to(device),
    do_sample = False,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
)
print([tokenizer.decode(i, skip_special_tokens=True) for i in outputs])



['Der Junge hat Reis.']


In [8]:
source_lang = "en"
target_lang = "it"
prefix = "translate English to Italian: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

In [20]:
import evaluate

metric = evaluate.load("bleu")

In [21]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [12]:
!pip install accelerate -U



In [15]:
from datasets import load_dataset

books = load_dataset("opus_books", "en-it")
tokenized_books = books.map(preprocess_function, batched=True)


Downloading readme:   0%|          | 0.00/25.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Map:   0%|          | 0/32332 [00:00<?, ? examples/s]

In [22]:

training_args = Seq2SeqTrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["train"], # The dataset has no val or test split. So, I will just use the same. -Aditya
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,3.3157,2.924352,0.0032,18.1429
2,3.2532,2.867725,0.0034,18.1356


Checkpoint destination directory ./output/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=4042, training_loss=3.3219066319755846, metrics={'train_runtime': 1411.5532, 'train_samples_per_second': 45.811, 'train_steps_per_second': 2.864, 'total_flos': 1699696357933056.0, 'train_loss': 3.3219066319755846, 'epoch': 2.0})

In [23]:
# Greedy search
src_sentence = "Translate English to Italian: The boy eats rice."

input_ids = tokenizer(src_sentence, return_tensors="pt").input_ids

outputs = model.generate(
    input_ids.to(device),
    do_sample = False,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
)
print([tokenizer.decode(i, skip_special_tokens=True) for i in outputs])

['Le bambin mange rice.']
