Fine-tune a pretrained English-to-French translation model (Marian, `Helsinki-NLP/opus-mt-en-fr`) on a small sample of the KDE4 dataset, as demonstrated in the tutorial linked below.

Note: Make sure to understand the process thoroughly.

[Translation task](https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt)

In [2]:
!pip install datasets



In [3]:
from datasets import load_dataset

# Load the "kde4" dataset for the English ("en") / French ("fr") language pair.
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Display the loaded splits with their features and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [6]:
# Keep only the first 100 rows of the train split (presumably to keep this run
# short — the full split has 210173 rows).
raw_datasets['train'] = raw_datasets['train'].select(range(100))
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 100
    })
})

In [7]:
# Split the rows 90% / 10% into train and test; the seed is fixed so the split
# is the same on every run.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 90
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 10
    })
})

In [8]:
# Rename the held-out "test" split to "validation". The membership check makes
# this cell safe to re-run: once "test" has been popped, a second bare pop()
# would raise KeyError.
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")

In [9]:
# Look at the English/French pair stored in training example 1.
split_datasets["train"][1]["translation"]

{'en': 'ROLES_OF_TRANSLATORS', 'fr': '& traducteurJeromeBlanc;'}

In [10]:
from transformers import pipeline

# Baseline: run the pretrained checkpoint through a translation pipeline on a
# single sentence, before any fine-tuning happens in this notebook.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")



[{'translation_text': 'Par défaut pour les threads élargis'}]

In [12]:
# Look at the English/French pair stored in training example 5.
split_datasets["train"][5]["translation"]

{'en': 'The Folder Filter', 'fr': 'Le Filtre de dossier'}

In [13]:
# Translate a sentence that contains a "%1" placeholder with the pretrained pipeline.
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

[{'translation_text': "Impossible d'importer %1 en utilisant le plugin d'importateur OFX. Ce fichier n'est pas le bon format."}]

In [14]:
# Translate an informal, free-form sentence with the pretrained pipeline.
translator(
    "hi , i am artificial intelligence student "
)

[{'translation_text': 'Salut, je suis étudiant en intelligence artificielle'}]

## Fine-tuning the model

In [15]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`. Passing it here had no effect (the encodings printed
# further down are still plain Python lists), so it is dropped.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [16]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

# The English sentence is the input; the French sentence is passed as
# `text_target`, and its token ids come back under the "labels" key.
inputs = tokenizer(en_sentence, text_target=fr_sentence)
inputs

{'input_ids': [11278, 11928, 817, 40376, 817, 8886, 323, 28928, 142, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [402, 41586, 6079, 35509, 27108, 50, 0]}

In [17]:
# Tokenize the French sentence as a plain input (no `text_target`) and compare
# the resulting tokens with those of the labels built in the previous cell.
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁&', '▁tra', 'duct', 'eur', 'J', 'er', 'ome', 'Blanc', ';', '</s>']
['▁&', '▁traducteur', 'Je', 'rome', 'Blanc', ';', '</s>']


In [18]:
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/French translation pairs.

    Reads the "en" and "fr" entries of every item in
    ``examples["translation"]`` and hands the English texts (as inputs) and the
    French texts (as ``text_target``) to the notebook-level ``tokenizer``,
    truncating to ``max_length``. The tokenizer's result is returned unchanged.
    """
    source_texts, target_texts = [], []
    for pair in examples["translation"]:
        source_texts.append(pair["en"])
        target_texts.append(pair["fr"])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

In [19]:
# Tokenize every split in batches. The original columns are removed so that
# only the columns returned by `preprocess_function` remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [20]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [21]:
from transformers import DataCollatorForSeq2Seq

# The collator is given the model as well as the tokenizer; the batch built in
# the next cell additionally contains "decoder_input_ids".
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [22]:
# Collate training examples 1 and 2 into a single batch and list its fields.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [23]:
# The shorter label sequence is padded with -100 up to the length of the longer one.
batch["labels"]

tensor([[  402, 41586,  6079, 35509, 27108,    50,     0,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [   60,  5893,  9151,   188, 25028,    27,   402,  2635,   186,  8577,
           259,    50,    88,  2043,    34,  1400,  2323,    39,   272,     5,
          8418,   188, 25028,   483,  3089,     3,     0]])

In [24]:
# Decoder inputs: the labels shifted one position to the right, starting with
# id 59513 (which is also used as the padding value here).
batch["decoder_input_ids"]

tensor([[59513,   402, 41586,  6079, 35509, 27108,    50,     0, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513],
        [59513,    60,  5893,  9151,   188, 25028,    27,   402,  2635,   186,
          8577,   259,    50,    88,  2043,    34,  1400,  2323,    39,   272,
             5,  8418,   188, 25028,   483,  3089,     3]])

In [25]:
# Print the un-collated labels of the same two examples for comparison with the batch.
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[402, 41586, 6079, 35509, 27108, 50, 0]
[60, 5893, 9151, 188, 25028, 27, 402, 2635, 186, 8577, 259, 50, 88, 2043, 34, 1400, 2323, 39, 272, 5, 8418, 188, 25028, 483, 3089, 3, 0]


In [26]:
!pip install sacrebleu



In [27]:
!pip install evaluate



In [28]:
import evaluate

# Load the sacreBLEU metric; `compute_metrics` below uses it to score translations.
metric = evaluate.load("sacrebleu")

In [29]:
# Sanity check on a close paraphrase. `references` holds one list of acceptable
# references per prediction, hence the nested list.
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [30]:
# Degenerate prediction that repeats a single word: the score collapses.
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

In [31]:
# Prediction that is correct but far too short: every word matches, yet the
# score is 0.0 (there are no 3-grams or 4-grams and the brevity penalty is tiny).
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 0.0,
 'counts': [2, 1, 0, 0],
 'totals': [2, 1, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 0.004086771438464067,
 'sys_len': 2,
 'ref_len': 13}

In [32]:
import numpy as np


def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with sacreBLEU.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        ``{"bleu": score}``, where ``score`` is the "score" entry returned by
        the notebook-level ``metric``.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded.
    # The labels carry it, and the predictions may as well when sequences of
    # different lengths are padded together, so both are mapped to the pad
    # token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace, and wrap every reference in
    # its own list because the metric takes a list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [35]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; later cells push to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the Trainer-based fine-tuning run.
# The first argument is a plain string: the original f-prefix had no
# placeholders to interpolate.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
args = Seq2SeqTrainingArguments(
    "marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="no",  # no evaluation during training; evaluate() is called explicitly
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,  # compute_metrics decodes token ids, so evaluation must generate
    fp16=True,
    push_to_hub=True,
)



In [37]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator and BLEU
# metric function into a single trainer.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [38]:
# Baseline: score the pretrained model on the validation split before training.
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.5093746185302734,
 'eval_bleu': 27.4771299254944,
 'eval_runtime': 23.4129,
 'eval_samples_per_second': 0.427,
 'eval_steps_per_second': 0.043}

In [39]:
# Fine-tune with the configuration defined in `args` (3 epochs).
trainer.train()

Step,Training Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=9, training_loss=1.06378173828125, metrics={'train_runtime': 319.2761, 'train_samples_per_second': 0.846, 'train_steps_per_second': 0.028, 'total_flos': 4259011756032.0, 'train_loss': 1.06378173828125, 'epoch': 3.0})

In [40]:
# Score the fine-tuned model on the validation split, to compare with the baseline above.
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.2243977785110474,
 'eval_bleu': 29.44568546617384,
 'eval_runtime': 15.3364,
 'eval_samples_per_second': 0.652,
 'eval_steps_per_second': 0.065,
 'epoch': 3.0}

In [41]:
# Upload the final result to the Hub, tagged as a translation model.
trainer.push_to_hub(tags="translation", commit_message="Training complete")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


events.out.tfevents.1722260377.a222ae59bddf.29122.0:   0%|          | 0.00/6.39k [00:00<?, ?B/s]

events.out.tfevents.1722260721.a222ae59bddf.29122.1:   0%|          | 0.00/401 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/bushra1dajam/marian-finetuned-kde4-en-to-fr/commit/37e53141af1ba565273fb6e4211d299127fa69d9', commit_message='Training complete', commit_description='', oid='37e53141af1ba565273fb6e4211d299127fa69d9', pr_url=None, pr_revision=None, pr_num=None)

In [42]:
from torch.utils.data import DataLoader

# Switch the tokenized splits to the "torch" format and build the loaders for a
# hand-written training loop. Only the training loader shuffles.
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [43]:
# transformers' own AdamW is deprecated in favour of the PyTorch
# implementation, so import it from torch instead.
from torch.optim import AdamW

# weight_decay=0.0 and eps=1e-6 mirror the defaults of the transformers AdamW
# this cell used before; torch.optim.AdamW would otherwise default to 0.01 and
# 1e-8.
# NOTE(review): `model` is the object already fine-tuned by `trainer.train()`
# earlier in this notebook — reload the checkpoint first if this second
# training setup is meant to start from the pretrained weights.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0, eps=1e-6)



In [44]:
from accelerate import Accelerator

# Pass the model, optimizer and both dataloaders through `accelerator.prepare`
# and rebind the names to the prepared objects it returns.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [45]:
from transformers import get_scheduler

# The step count is taken from the dataloader length, so this cell must run
# after `accelerator.prepare` has replaced `train_dataloader`.
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warmup steps over the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [46]:
from huggingface_hub import Repository, get_full_repo_name

# Build the full Hub repository id (namespace/model_name) for the logged-in account.
model_name = "marian-finetuned-kde4-en-to-fr-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'bushra1dajam/marian-finetuned-kde4-en-to-fr-accelerate'