In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [23]:
from datasets import load_dataset

books = load_dataset("opus_books", "en-fr")
books

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

In [3]:
books = books["train"].train_test_split(test_size=0.2)

In [4]:
books["train"][0]

{'id': '101876',
 'translation': {'en': 'Pencroft and Neb, during this time, had gone to fetch the boat, moored a few hundred feet higher up, on the bank of the Mercy, and by the time they returned, Ayrton was ready to start.',
  'fr': 'Pencroff et Nab, pendant ce temps, étaient allés chercher la pirogue, amarrée quelques centaines de pas plus haut, sur la berge de la Mercy, et, quand ils revinrent, Ayrton était prêt à partir.'}}

In [5]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [7]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [9]:
import evaluate

metric = evaluate.load("sacrebleu")

In [10]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [11]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_opus_books_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/12710 [00:00<?, ?it/s]

{'loss': 2.1128, 'grad_norm': 1.3909549713134766, 'learning_rate': 1.9213217938631003e-05, 'epoch': 0.08}
{'loss': 2.002, 'grad_norm': 1.2674825191497803, 'learning_rate': 1.8426435877261997e-05, 'epoch': 0.16}
{'loss': 1.9818, 'grad_norm': 1.2479631900787354, 'learning_rate': 1.7639653815893e-05, 'epoch': 0.24}
{'loss': 1.9346, 'grad_norm': 1.1362398862838745, 'learning_rate': 1.6852871754524e-05, 'epoch': 0.31}
{'loss': 1.9303, 'grad_norm': 1.224910855293274, 'learning_rate': 1.6067663257277734e-05, 'epoch': 0.39}
{'loss': 1.9153, 'grad_norm': 1.3740708827972412, 'learning_rate': 1.5280881195908735e-05, 'epoch': 0.47}
{'loss': 1.888, 'grad_norm': 1.3049758672714233, 'learning_rate': 1.4494099134539735e-05, 'epoch': 0.55}
{'loss': 1.8998, 'grad_norm': 1.3628268241882324, 'learning_rate': 1.3707317073170734e-05, 'epoch': 0.63}
{'loss': 1.8733, 'grad_norm': 1.4406628608703613, 'learning_rate': 1.292210857592447e-05, 'epoch': 0.71}
{'loss': 1.8443, 'grad_norm': 1.3113832473754883, 'learn



  0%|          | 0/1589 [00:00<?, ?it/s]

{'eval_loss': 1.6306092739105225, 'eval_bleu': 5.5635, 'eval_gen_len': 17.6017, 'eval_runtime': 366.9432, 'eval_samples_per_second': 69.267, 'eval_steps_per_second': 4.33, 'epoch': 1.0}
{'loss': 1.8459, 'grad_norm': 1.656699299812317, 'learning_rate': 9.776553894571205e-06, 'epoch': 1.02}
{'loss': 1.8388, 'grad_norm': 1.1598753929138184, 'learning_rate': 8.989771833202203e-06, 'epoch': 1.1}
{'loss': 1.8375, 'grad_norm': 1.5924371480941772, 'learning_rate': 8.202989771833204e-06, 'epoch': 1.18}
{'loss': 1.831, 'grad_norm': 1.3244783878326416, 'learning_rate': 7.416207710464202e-06, 'epoch': 1.26}
{'loss': 1.838, 'grad_norm': 1.0659116506576538, 'learning_rate': 6.630999213217939e-06, 'epoch': 1.34}
{'loss': 1.832, 'grad_norm': 1.2019505500793457, 'learning_rate': 5.844217151848939e-06, 'epoch': 1.42}
{'loss': 1.8319, 'grad_norm': 1.353132963180542, 'learning_rate': 5.057435090479937e-06, 'epoch': 1.49}
{'loss': 1.8105, 'grad_norm': 1.3671542406082153, 'learning_rate': 4.270653029110937e



  0%|          | 0/1589 [00:00<?, ?it/s]

{'eval_loss': 1.6068373918533325, 'eval_bleu': 5.7232, 'eval_gen_len': 17.5852, 'eval_runtime': 361.9954, 'eval_samples_per_second': 70.214, 'eval_steps_per_second': 4.39, 'epoch': 2.0}
{'train_runtime': 2256.1358, 'train_samples_per_second': 90.126, 'train_steps_per_second': 5.634, 'train_loss': 1.8749637054515391, 'epoch': 2.0}


TrainOutput(global_step=12710, training_loss=1.8749637054515391, metrics={'train_runtime': 2256.1358, 'train_samples_per_second': 90.126, 'train_steps_per_second': 5.634, 'train_loss': 1.8749637054515391, 'epoch': 2.0})

In [13]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [14]:
from transformers import pipeline

translator = pipeline("translation_xx_to_yy", model="my_awesome_opus_books_model")
translator(text)

[{'translation_text': 'Les légumes partagent les ressources avec les bactéries fixatrice'}]

In [18]:
text = "praise the lord"
from transformers import pipeline

translator = pipeline("translation_xx_to_yy", model="my_awesome_opus_books_model")
translator(text)

[{'translation_text': 'el praise the Lord'}]