In [1]:
! pip install sacrebleu --quiet

[0m

In [2]:
# let's import all necessary dependencies

import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow import keras
from keras import models, layers
from tensorflow.python.ops.numpy_ops import np_config
from datasets import Dataset

import warnings
# Suppress everything emitted through the `warnings` module from here on.
warnings.filterwarnings('ignore')

# Disable Weights & Biases reporting via environment variable. The output of
# the training-arguments cell reports this variable as deprecated in favour
# of the `report_to` setting.
os.environ["WANDB_DISABLED"] = 'true'

In [3]:
# Read the first 100,000 rows of the English-French CSV (encoding errors are
# ignored) and keep every column whose label is not 'Unnamed: 2'.
dataset = (
    pd.read_csv(
        '/kaggle/input/english-french-translation/fr-en-translation.csv',
        nrows=100000,
        encoding_errors='ignore',
    )
    .loc[:, lambda frame: frame.columns != 'Unnamed: 2']
)

# Preview the first rows.
dataset.head()

Unnamed: 0,1,2
0,You thought to this day that there were tyrants?,Vous avez cru jusqu'à ce jour qu'il y avait de...
1,"How do you feed your family?""","Comment nourrissez-vous votre famille ?"""
2,The first group shows God creating the Heavens...,Le premier ciel est une voûte à laquelle la te...
3,It is said after this he split to a thousand p...,"Il est dit après cela, qu'il s'est divisé en m..."
4,"They are subservient to him, and created for a...","Ils sont serviles à son égard, et créés pour u..."


In [4]:
# Convert the pandas DataFrame into a HuggingFace `Dataset` object.
# NOTE: this rebinds `dataset` from a DataFrame to a Dataset, so re-running
# this cell on its own would hand a Dataset (not a DataFrame) to `from_pandas`.
dataset = Dataset.from_pandas(dataset)

# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['1', '2'],
    num_rows: 100000
})

### Loading From HuggingFace

In [5]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = 't5-base'

# Per-device training batch size handed to the training arguments.
batch_size = 64

# Task prefix prepended to every source sentence before tokenization.
prefix = "translate English to French: "

# Maximum length in tokens; source and target sequences are truncated to it.
maxlen = 40

In [6]:
# Load the pretrained tokenizer that belongs to `model_name` from HuggingFace.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [7]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence model for the same checkpoint name
# as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [8]:
# Sanity check: tokenize a single prefixed sentence and inspect the result.
text = "translate English to French: This is how the model tokenizes text."

# Special tokens are switched off for this preview only.
tokenized_text = tokenizer(text, add_special_tokens = False)
tokenized_text

{'input_ids': [13959, 1566, 12, 2379, 10, 100, 19, 149, 8, 825, 14145, 1737, 7, 1499, 5], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
def preprocess_dataset(dataset):
    """Batch-encode the source and target sentences of a dataset batch.

    Args:
        dataset: Mapping with column '1' (English source sentences) and
            column '2' (French target sentences), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        token ids of the targets stored under the ``'labels'`` key.
    """
    inputs = [prefix + text for text in dataset['1']]
    targets = list(dataset['2'])

    # `text_target` tokenizes the targets in the same call and stores their
    # ids under 'labels'; it replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. Both sides are
    # truncated to `maxlen` tokens.
    return tokenizer(inputs, text_target = targets, max_length = maxlen, truncation = True)

In [10]:
# Tokenize the whole dataset batch by batch; the original '1' and '2' text
# columns stay in the result next to the newly added columns.
tokenized_dataset = dataset.map(preprocess_dataset, batched = True)

  0%|          | 0/100 [00:00<?, ?ba/s]

In [11]:
# Inspect the columns and row count after tokenization.
tokenized_dataset

Dataset({
    features: ['1', '2', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100000
})

In [12]:
from transformers import DataCollatorForSeq2Seq

# Collator the trainer uses to assemble tokenized examples into batches for
# the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer,  model = model)

In [13]:
from transformers import Seq2SeqTrainingArguments

# Fine-tuning hyper-parameters. `report_to = 'none'` switches the logging
# integrations off explicitly, which is the replacement the library recommends
# for the deprecated WANDB_DISABLED environment variable.
training_arguments = Seq2SeqTrainingArguments(
    output_dir = 'model_checkpoints',
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    weight_decay = 0.01,
    num_train_epochs = 3,
    save_total_limit = 3,  # keep at most three checkpoints on disk
    fp16 = True,           # mixed-precision training
    report_to = 'none',
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
from datasets import load_metric

# Load the sacreBLEU metric used by `compute_metrics`.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package — confirm against the installed version.
metric = load_metric('sacrebleu')

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [15]:
def postprocess_text(preds, labels):
    """Strip whitespace and shape decoded texts for the sacreBLEU metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings, one per prediction.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, because
        sacreBLEU expects a list of references for every prediction.
    """
    preds = [pred.strip() for pred in preds]
    # Wrap each reference in its own list: one list of references per prediction.
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [16]:
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for evaluation predictions.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays; ``preds``
            may also be a tuple whose first element holds the token ids.

    Returns:
        Dict with ``'bleu'`` (the metric's score) and ``'gen_len'`` (mean
        number of non-padding tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded, so replace it with
    # the pad token id in the predictions as well as in the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens = True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions = decoded_preds, references = decoded_labels)

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}

    return {k: round(v, 4) for k, v in result.items()}

In [17]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the model, its training arguments, the tokenized
# training data, the batch collator and the metric callback. Only a training
# dataset is supplied here.
trainer = Seq2SeqTrainer(
    model = model,
    args = training_arguments,
    data_collator = data_collator,
    train_dataset = tokenized_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

Using cuda_amp half precision backend


In [18]:
# Run fine-tuning; checkpoints are written under the configured output directory.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: 1, 2. If 1, 2 are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100000
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 4689


Step,Training Loss
500,0.614
1000,0.5798
1500,0.57
2000,0.5573
2500,0.5502
3000,0.5455
3500,0.542
4000,0.5361
4500,0.5341


Saving model checkpoint to model_checkpoints/checkpoint-500
Configuration saved in model_checkpoints/checkpoint-500/config.json
Model weights saved in model_checkpoints/checkpoint-500/pytorch_model.bin
tokenizer config file saved in model_checkpoints/checkpoint-500/tokenizer_config.json
Special tokens file saved in model_checkpoints/checkpoint-500/special_tokens_map.json
Copy vocab file to model_checkpoints/checkpoint-500/spiece.model
Saving model checkpoint to model_checkpoints/checkpoint-1000
Configuration saved in model_checkpoints/checkpoint-1000/config.json
Model weights saved in model_checkpoints/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in model_checkpoints/checkpoint-1000/tokenizer_config.json
Special tokens file saved in model_checkpoints/checkpoint-1000/special_tokens_map.json
Copy vocab file to model_checkpoints/checkpoint-1000/spiece.model
Saving model checkpoint to model_checkpoints/checkpoint-1500
Configuration saved in model_checkpoints/checkpoint-150

TrainOutput(global_step=4689, training_loss=0.5579330662365126, metrics={'train_runtime': 3178.5799, 'train_samples_per_second': 94.382, 'train_steps_per_second': 1.475, 'total_flos': 1.426879681265664e+16, 'train_loss': 0.5579330662365126, 'epoch': 3.0})

### Resources

[HuggingFace translation example notebook](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)