# Eng - Fr Translator using transformers

In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collec

In [3]:
# Install Hugging Face transformers at a fixed version so reruns resolve the same release.
!pip install transformers==4.28.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.3 transformers-4.28.1


### About the dataset

The KDE4 dataset is a parallel corpus containing sentence pairs in various languages. It was created by extracting parallel sentences from KDE4 localization files, hence the name KDE4. The dataset contains sentence pairs in languages such as English, French, German, Spanish, Italian, Dutch, Portuguese, Catalan, and more. The dataset is commonly used for machine translation and other natural language processing tasks.

In [4]:
# Load the English-French language pair of the KDE4 parallel corpus.
from datasets import load_dataset
data = load_dataset('kde4',lang1='en',lang2='fr')
# Display the resulting DatasetDict (it exposes a single `train` split).
data

Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading and preparing dataset kde4/en-fr to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac...


Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

Dataset kde4 downloaded and prepared to /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [None]:
# Keep a random 50% of the corpus (considered enough for this notebook).
# The subset size is derived from the dataset itself instead of a hard-coded
# row count, and the fixed seed keeps the selection reproducible.
small = data['train'].shuffle(seed=42).select(range(len(data['train']) // 2))

In [None]:
# Split the subset into train/test parts with a fixed seed for reproducibility
# (no test size is given, so the library default proportion applies).
split = small.train_test_split(seed=42)

In [None]:
# Display the train/test splits produced above.
split

In [None]:
# Inspect the first training example (an id plus a `translation` entry).
split["train"][0]

### Why use Helsinki-NLP/opus-mt-en-fr
The Helsinki-NLP/opus-mt-en-fr model is a machine translation model that translates English to French. It is based on the Transformer architecture, which was introduced in the paper "Attention Is All You Need" by Vaswani et al.  
The Helsinki-NLP/opus-mt-en-fr model uses a specific variant of the Transformer architecture known as the Marian architecture. This architecture includes both an encoder and a decoder, each of which consists of multiple Transformer layers. The encoder processes the input sequence, while the decoder generates the output sequence.

In [None]:
# Load the tokenizer that matches the pretrained English->French checkpoint.
from transformers import AutoTokenizer
# Checkpoint name; reused later when loading the model weights.
helsinki = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(helsinki)

In [None]:
# Pull the English and French sides of one training pair (row 10) for the
# tokenization demos that follow.
en, fr = (split['train'][10]['translation'][lang] for lang in ('en', 'fr'))

en, fr

In [None]:
# Tokenize the English (source) sentence and display the resulting encoding.

inputs = tokenizer(en)
inputs

In [None]:
# Tokenize the French sentence as a *target*: `text_target` makes the tokenizer
# use its target-language settings. This replaces the deprecated
# `as_target_tokenizer()` context manager.

targets = tokenizer(text_target=fr)
targets

In [None]:
# Map the target token ids back to their token strings to inspect the tokenization.
tokenizer.convert_ids_to_tokens(targets['input_ids'])

In [None]:
# Visualize the distribution of English (input) sentence lengths.
# NOTE: len() on a string counts characters, not words or tokens.

import matplotlib.pyplot as plt

# All translation pairs of the training split (reused by the plots below).
train = split["train"]["translation"]

input_lens = [len(pair["en"]) for pair in train]

fig, ax = plt.subplots(figsize=(16, 8))
ax.hist(input_lens, bins=500, edgecolor="black")
ax.set_title("English sentence lengths (training split)")
ax.set_xlabel("Input length (characters)")
ax.set_ylabel("No. of sentences")
plt.show()

In [None]:
# Same histogram of English sentence lengths, zoomed in on the 0-500 character
# range where most sentences fall.
fig, ax = plt.subplots(figsize=(16, 8))
ax.hist(input_lens, bins=500, edgecolor="black")
ax.set_title("English sentence lengths (training split, 0-500 characters)")
ax.set_xlabel("Input length (characters)")
ax.set_ylabel("No. of sentences")
ax.set_xlim(0, 500)  # restrict the x-axis to the dense region
plt.show()

In [None]:
# Distribution of French (target) sentence lengths, measured in characters.
target_lens = [len(pair["fr"]) for pair in train]

fig, ax = plt.subplots(figsize=(16, 8))
ax.hist(target_lens, bins=500, edgecolor="black")
ax.set_title("French sentence lengths (training split)")
# The axis describes the targets, not the inputs.
ax.set_xlabel("Target length (characters)")
ax.set_ylabel("No. of sentences")
plt.show()

In [None]:
# Same histogram of French sentence lengths, zoomed in on the 0-500 character range.
fig, ax = plt.subplots(figsize=(16, 8))
ax.hist(target_lens, bins=500, edgecolor="black")
ax.set_title("French sentence lengths (training split, 0-500 characters)")
# The axis describes the targets, not the inputs.
ax.set_xlabel("Target length (characters)")
ax.set_ylabel("No. of sentences")
ax.set_xlim(0, 500)  # restrict the x-axis to the dense region
plt.show()

In [None]:
# Maximum number of tokens kept per source / target sentence; longer ones are truncated.
max_input_len = 256
max_target_len = 256
def tokenizer_fn(batch):
  """Tokenize a batch of translation pairs for seq2seq training.

  Args:
    batch: mapping with a "translation" entry holding a list of dicts, each
      with an "en" (source) and an "fr" (target) string.

  Returns:
    The tokenizer output for the English sentences, extended with a "labels"
    entry that holds the token ids of the French sentences.
  """
  inputs = [pair["en"] for pair in batch["translation"]]
  targets = [pair["fr"] for pair in batch["translation"]]

  tokenized_inputs = tokenizer(
      inputs,
      max_length=max_input_len,
      truncation=True,
  )

  # `text_target` tokenizes with the target-language settings; it replaces the
  # deprecated `as_target_tokenizer()` context manager.
  tokenized_targets = tokenizer(
      text_target=targets,
      max_length=max_target_len,
      truncation=True,
  )

  tokenized_inputs["labels"] = tokenized_targets["input_ids"]
  return tokenized_inputs

In [None]:
# Tokenize every split in batches and store the result. The original columns
# are dropped so that only the tokenizer outputs (plus labels) remain.

tokenized_datasets = split.map(
    tokenizer_fn,
    batched=True,
    remove_columns = split['train'].column_names,
)

In [None]:
# View the tokenized datasets (splits and their columns).
tokenized_datasets

In [None]:
# Print two tokenized training examples (indices 1 and 2) to inspect their fields.
print(tokenized_datasets['train'][1],tokenized_datasets['train'][2])

In [None]:
# Create the seq2seq model from the same pretrained checkpoint as the tokenizer.
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(helsinki)

### DataCollator  
data_collator expects input as a list of samples, where each sample is a dictionary containing input_ids and attention_mask for the source language sequence, and labels for the target language sequence.

When data_collator is called with a list of n samples, it returns a dictionary with the following keys:

- 'input_ids': a tensor of shape (n, max_seq_length) containing the encoded input sequences for the source language.
- 'attention_mask': a tensor of shape (n, max_seq_length) containing a mask to indicate which elements of the input sequences should be attended to and which should not.
- 'labels': a tensor of shape (n, max_seq_length) containing the encoded target sequences for the target language.
- 'decoder_input_ids': a tensor of shape (n, max_seq_length) containing the encoded input sequences for the target language, shifted one position to the right.

In the code below, batch is obtained by calling data_collator with a list of two samples (the training samples at indices 1 and 2), so the resulting dictionary has keys 'input_ids', 'attention_mask', 'labels', and 'decoder_input_ids'.

In [None]:
# Use a data collator to pad each batch and convert it into torch tensors.
# The model is passed in as well; presumably this is what lets the collator
# build `decoder_input_ids` from the labels — TODO confirm.

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

In [None]:
# Test out the data collator.
# It expects its input as a list of samples; here the training samples at
# indices 1 and 2 are used.

batch = data_collator([tokenized_datasets["train"][i] for i in range(1,3)])
batch.keys()

In [None]:
# Inspect the collated labels of the two-sample batch.
batch.labels

In [None]:
# Check which container type the collator produced for the labels.
print(type(batch["labels"]))

In [None]:
# Think of seq2seq RNNs: the decoder must be given "some input" at every step.
# It is just like a language model, where the next word is predicted from the previous words.
batch['decoder_input_ids']
# decoder_input_ids are just the shifted version of the targets

In [None]:
# Check which container type the collator produced for the decoder inputs.
print(type(batch["decoder_input_ids"]))

In [None]:
# Decode the first sample's decoder inputs back to tokens: the first token is a
# pad, i.e. the targets shifted one position to the right.
tokenizer.convert_ids_to_tokens(batch["decoder_input_ids"][0])

In [None]:
# Show the raw (untokenized) training pair at index 1 for comparison with the tokens above.
split['train'][1]

In [None]:
# Load the two evaluation metrics used in this notebook.
# NOTE(review): these metrics presumably need the `sacrebleu` and `bert_score`
# packages, which no visible cell installs — confirm for a fresh runtime.
# NOTE(review): `load_metric` has been superseded by the separate `evaluate`
# library in newer `datasets` releases — verify before upgrading.
from datasets import load_metric

bleu_metric = load_metric("sacrebleu")
bert_metric = load_metric("bertscore")

### Bleu metric
BLEU (bilingual evaluation understudy) is a widely used metric for evaluating the quality of machine translation outputs. It compares the similarity between a machine-generated translation and one or more human-generated reference translations based on n-gram overlaps. By definition the BLEU score ranges from 0 to 1, with 1 indicating a perfect match; the sacreBLEU implementation used below reports it on a 0–100 scale.

In [None]:
# References must be given as a list per prediction: for BLEU there can be
# multiple acceptable reference translations.

bleu_metric.compute(predictions=["I love cats."],references = [["I love cats."]])

In [None]:
# NOTE(review): `reformer` is not referenced in any visible cell; this import
# looks accidental (e.g. an editor auto-import) — confirm and remove.
from transformers.models import reformer

# Score a sentence against itself as a sanity check of the BLEU metric.
s = "Unreal Engine is a 3D computer"
bleu_metric.compute(predictions=[s],references=[[s]])

### BERT metric

bert_score is a popular metric for evaluating the quality of generated text in natural language processing tasks. It is based on contextualized word embeddings produced by pre-trained BERT models. The bert_score package provides a score() function that can be used to compute precision, recall, and F1 scores for generated text in comparison to reference text. It is a useful metric for evaluating the fluency and semantic coherence of generated text.

In [None]:
# BLEU works on n-grams, so it is not good at detecting similar words or
# words with the same meaning.
# bert_metric is useful here: it uses cosine similarities.

bert_metric.compute(
    predictions=["I love cats. what do you think"],
    references=[["I like cats. what do you think"]],
    lang="en"
)

In [None]:
# Same prediction/reference pair scored with BLEU, for comparison with the
# BERTScore result above ("love" vs. "like" is an n-gram mismatch).
bleu_metric.compute(
    predictions=["I love cats. what do you think"],
    references=[["I like cats. what do you think"]],
)

In [None]:
# Upgrade `accelerate` before training.
# NOTE(review): presumably required by the Trainer used below — confirm. The
# version is not pinned, so reruns may resolve a different release.
!pip install --upgrade accelerate

In [None]:
# Print the transformers environment report, useful when debugging version issues.
!transformers-cli env


### Explain training args

- `"finetuned-model"`: This is the name of the directory where the trained model will be saved.
- `evaluation_strategy="no"`: This means that no evaluation will be performed during training.
- `save_strategy="epoch"`: This means that the model will be saved after each epoch of training.
- `learning_rate=2e-5`: This sets the learning rate of the optimizer to 2e-5.
- `per_device_train_batch_size=32`: This sets the batch size for training to 32 sequences per device (GPU or CPU).
- `per_device_eval_batch_size=64`: This sets the batch size for evaluation to 64 sequences per device.
- `weight_decay=0.01`: This sets the weight decay (L2 regularization) factor to 0.01.
- `save_total_limit=3`: This sets the maximum number of saved models to 3.
- `num_train_epochs=3`: This sets the number of epochs for training to 3.
- `predict_with_generate=True`: This enables autoregressive decoding during evaluation, meaning that the decoder generates one token at a time, conditioned on the previously generated tokens.
- `fp16=True`: This enables mixed-precision training, which can speed up training by using half-precision (float16) instead of full-precision (float32) for some computations.

In [None]:
import numpy as np
def compute_metrics(preds_labels):
    """Compute BLEU and BERTScore for a batch of generated translations.

    Args:
        preds_labels: pair ``(preds, labels)`` of token-id arrays as handed over
            by the trainer. ``preds`` may also be a tuple whose first element
            holds the generated token ids.

    Returns:
        dict with ``"bleu"`` (the sacreBLEU score) and ``"bert"`` (the mean
        BERTScore F1 over the batch).
    """
    preds, labels = preds_labels
    if isinstance(preds, tuple):
        # Keep only the generated ids when extra outputs are returned alongside.
        preds = preds[0]

    # -100 is the label padding value, not a real token id, so it cannot be
    # decoded. Swap it for the pad token, which is dropped while decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # The keyword is `skip_special_tokens` (plural). A misspelt keyword is
    # silently ignored, which leaves pad/EOS tokens in the decoded text and
    # distorts both metrics.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # Each prediction gets a list of references (a single one here).
    decoded_labels = [[label.strip()] for label in decoded_labels]

    bleu = bleu_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )
    bert_score = bert_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        lang='fr',
    )

    return {"bleu": bleu["score"], "bert": np.mean(bert_score['f1'])}



In [None]:
# Training configuration for fine-tuning the translation model.
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    "finetuned-model",  # output directory for checkpoints
    evaluation_strategy="no",  # no evaluation while training
    save_strategy="epoch",  # save a checkpoint after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most 3 checkpoints
    num_train_epochs=3,
    predict_with_generate=True, # makes evaluation autoregressive (uses generation)
    fp16=True,  # mixed-precision training
)


### What is Seq2SeqTrainer

Seq2SeqTrainer is a class provided by the transformers library in Python, which is used to train sequence-to-sequence (seq2seq) models. Seq2seq models are a type of neural network architecture commonly used in natural language processing (NLP) for tasks such as machine translation, summarization, and text generation.

In [None]:
# Build the trainer from the model, the training arguments, the tokenized
# train/test splits, the padding collator and the metric function.
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the configuration above (long-running cell).
trainer.train()

In [None]:
# NOTE(review): evaluation is left disabled. With evaluation_strategy="no" in
# the training arguments, this call looks like the only place where
# compute_metrics would actually run — confirm whether it should be enabled.
#trainer.evaluate(max_length=max_target_len)

In [None]:
# Save the fine-tuned model to a local directory; the pipeline below loads it from there.
trainer.save_model("en_fr_translator")

In [None]:
# Wrap the saved model in a translation pipeline for inference.
# NOTE(review): device=0 presumably selects the first GPU, so this cell would
# need a GPU runtime — confirm.
from transformers import pipeline
translator = pipeline("translation",model="en_fr_translator" ,device=0)

In [None]:
# Try the fine-tuned translator on a sample English sentence.
translator("I love natural language processing")