In [1]:
! pip install sacrebleu --quiet

[0m

In [2]:
# let's import all necessary dependencies

import numpy as np
import pandas as pd
import tensorflow as tf
import os
from datasets import Dataset

import warnings
warnings.filterwarnings('ignore')

os.environ["WANDB_DISABLED"] = 'true'

In [3]:
dataset = pd.read_csv('/kaggle/input/english-french-translation/fr-en-translation.csv', nrows=100000, encoding_errors='ignore')

dataset = dataset.loc[:, dataset.columns != 'Unnamed: 2']

dataset.head()

Unnamed: 0,1,2
0,You thought to this day that there were tyrants?,Vous avez cru jusqu'à ce jour qu'il y avait de...
1,"How do you feed your family?""","Comment nourrissez-vous votre famille ?"""
2,The first group shows God creating the Heavens...,Le premier ciel est une voûte à laquelle la te...
3,It is said after this he split to a thousand p...,"Il est dit après cela, qu'il s'est divisé en m..."
4,"They are subservient to him, and created for a...","Ils sont serviles à son égard, et créés pour u..."


In [4]:
# let's use HuggingFace's Dataset object
dataset = Dataset.from_pandas(dataset)

dataset

Dataset({
    features: ['1', '2'],
    num_rows: 100000
})

In [5]:
# let's split the dataset into train and validation datasets
from sklearn.model_selection import train_test_split
from typing import Tuple

def split_dataset(dataset: Dataset) -> Tuple[Dataset, Dataset]:
    """
    Split a HuggingFace Dataset object into training and validation sets.

    Args:
        dataset (Dataset): The dataset to split.

    Returns:
        Tuple[Dataset, Dataset]: A tuple containing the training and validation datasets.
    """
    # Extract the texts and labels from the dataset
    texts = dataset['1']
    labels = dataset['2']
    
    # Split the dataset into training and validation sets
    train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

    # Create new Dataset objects for the training and validation sets
    train_dataset = Dataset.from_dict({'english': train_texts, 'french': train_labels})
    val_dataset = Dataset.from_dict({'english': val_texts, 'french': val_labels})
    
    return train_dataset, val_dataset

In [6]:
train_dataset, val_dataset = split_dataset(dataset)
print(len(train_dataset), len(val_dataset))

80000 20000


In [7]:
train_dataset['english'][:3]

['Lincoln wanted to win; Davis wanted to be right.',
 'But he also notes that "if Joshua Prawer were alive today he would no doubt deny any linkage between his Zionist political beliefs and the model of segregation that he developed."',
 'On 1 September 1920, the Weimar Republic and Austria concluded an economic agreement.']

In [8]:
train_dataset['french'][:3]

['Lincoln voulait gagner\xa0; Davis voulait être juste.',
 "Il nota aussi que «\xa0si Joshua Prawer était vivant aujourd'hui, il nierait sans aucun doute toute filiation entre ses croyances politiques sionistes et le modèle de ségrégation qu'il a développé\xa0».",
 "Le 1er septembre 1920, la République de Weimar et l'Autriche signèrent un accord économique."]

### Loading From HuggingFace

In [9]:
# some stuff 

maxlen = 40
batch_size = 64
epochs = 5

In [10]:
# load the tokenizer from huggingface
from transformers import AutoTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-fr'

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

In [11]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [12]:
text = "This is how the model tokenizes text."

tokenized_text = tokenizer(text, add_special_tokens = False)
tokenized_text

{'input_ids': [160, 32, 541, 4, 2223, 12, 7106, 3317, 9, 1863, 3], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
def preprocess_dataset(dataset):
    """Utility function to batch encode the texts in the dataset"""
    
    inputs = [text for text in dataset['english']]
    targets = [text for text in dataset['french']]
    model_inputs = tokenizer(inputs, max_length = maxlen, truncation = True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length = maxlen, truncation = True)
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [14]:
train_dataset = train_dataset.map(preprocess_dataset, batched = True)
val_dataset = val_dataset.map(preprocess_dataset, batched = True)

  0%|          | 0/80 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [15]:
train_dataset

Dataset({
    features: ['english', 'french', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 80000
})

In [16]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer,  model = model)

## Training

In [17]:
from transformers import Seq2SeqTrainingArguments

training_arguments = Seq2SeqTrainingArguments(output_dir = 'model_checkpoint', #evaluation_strategy = 'epoch',
                                             learning_rate=2e-5, per_device_train_batch_size = batch_size,
                                              #per_device_eval_batch_size=batch_size,
                                             weight_decay = 0.01, num_train_epochs = epochs,
                                             save_total_limit = 3, fp16 = True)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [18]:
from datasets import load_metric

metric = load_metric('sacrebleu')

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [19]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    
    return preds, labels

In [20]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens = True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    
    return result

In [21]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(model, training_arguments, train_dataset = train_dataset, 
                         data_collator = data_collator, #eval_dataset = val_dataset,
                         tokenizer = tokenizer, compute_metrics = compute_metrics)

Using cuda_amp half precision backend


In [22]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: french, english. If french, english are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 80000
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 6250
  Number of trainable parameters = 74609664


Step,Training Loss
500,0.6852
1000,0.6425
1500,0.6126
2000,0.5824
2500,0.5818
3000,0.5374
3500,0.5429
4000,0.5269
4500,0.511
5000,0.5142


Saving model checkpoint to model_checkpoint/checkpoint-500
Configuration saved in model_checkpoint/checkpoint-500/config.json
Configuration saved in model_checkpoint/checkpoint-500/generation_config.json
Model weights saved in model_checkpoint/checkpoint-500/pytorch_model.bin
tokenizer config file saved in model_checkpoint/checkpoint-500/tokenizer_config.json
Special tokens file saved in model_checkpoint/checkpoint-500/special_tokens_map.json
Saving model checkpoint to model_checkpoint/checkpoint-1000
Configuration saved in model_checkpoint/checkpoint-1000/config.json
Configuration saved in model_checkpoint/checkpoint-1000/generation_config.json
Model weights saved in model_checkpoint/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in model_checkpoint/checkpoint-1000/tokenizer_config.json
Special tokens file saved in model_checkpoint/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to model_checkpoint/checkpoint-1500
Configuration saved in model_checkpoint/

TrainOutput(global_step=6250, training_loss=0.5581571008300781, metrics={'train_runtime': 1658.8493, 'train_samples_per_second': 241.131, 'train_steps_per_second': 3.768, 'total_flos': 4131498819059712.0, 'train_loss': 0.5581571008300781, 'epoch': 5.0})

## Using the Model

In [27]:
from transformers import pipeline

model_checkpoint = "/kaggle/working/model_checkpoint/checkpoint-6000"

In [28]:
def translate_text(text):
    
    translator = pipeline('translation', model = model_checkpoint)
    translated_text = translator(text)[0]['translation_text']
    
    return translated_text

In [29]:
translate_text('Akorede is my middle name.')

loading configuration file /kaggle/working/model_checkpoint/checkpoint-6000/config.json
Model config MarianConfig {
  "_name_or_path": "/kaggle/working/model_checkpoint/checkpoint-6000",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2"

'Akodede est mon second prénom.'

In [30]:
translate_text("I love to eat banana and mango.")

loading configuration file /kaggle/working/model_checkpoint/checkpoint-6000/config.json
Model config MarianConfig {
  "_name_or_path": "/kaggle/working/model_checkpoint/checkpoint-6000",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2"

"J'adore manger de la banane et de la mangue."

In [31]:
translate_text("This is not very special.")

loading configuration file /kaggle/working/model_checkpoint/checkpoint-6000/config.json
Model config MarianConfig {
  "_name_or_path": "/kaggle/working/model_checkpoint/checkpoint-6000",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      59513
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 59513,
  "decoder_vocab_size": 59514,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2"

"Ce n'est pas très spécial."

### Resources

[HuggingFace](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)
[HuggingFace](https://huggingface.co/course/chapter7/4?fw=tf#processing-the-data)