In [None]:
!pip install transformers datasets evaluate sacrebleu
!pip uninstall accelerate
!pip install accelerate

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
import numpy as np
import torch
import evaluate
from transformers import pipeline
import json
from google.colab import files

In [3]:
# Using the GPU
device = torch.device("cuda")

## Data Pre-Processing

In [66]:
books = load_dataset("opus_books", "en-it")
books = books["train"].train_test_split(test_size=0.2)

In [67]:
# Model Definition
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [68]:
source_lang = "it"
target_lang = "en"

def preprocess_function(examples):
    inputs = [example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [69]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/25865 [00:00<?, ? examples/s]

Map:   0%|          | 0/6467 [00:00<?, ? examples/s]

In [70]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [71]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [72]:
train_data = tokenized_books["train"].select(range(20000))
val_data = tokenized_books["train"].select(range(20000,25000))

## Training

In [73]:
# Fine-tuning
training_args = Seq2SeqTrainingArguments(
    output_dir="translator_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    seed = 25, #25, 42
    push_to_hub=False,

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,3.8749,3.542521,0.6667,17.35
2,3.7653,3.498298,0.7904,17.2148


TrainOutput(global_step=2500, training_loss=3.860327880859375, metrics={'train_runtime': 417.1992, 'train_samples_per_second': 95.877, 'train_steps_per_second': 5.992, 'total_flos': 1307118430322688.0, 'train_loss': 3.860327880859375, 'epoch': 2.0})

In [74]:
trainer.save_model("./trained_translator")
model.from_pretrained("./trained_translator")
print("Model Loaded")

Model Loaded


In [26]:
d = trainer.state.log_history
file = open("log_history_stranslator.json", "w")
json.dump(d, file)
file.close()

In [27]:
# Only needed when Colab "locale bug" arise (still an open issue)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [88]:
!zip -r /content/trained_translator /content/trained_translator/


files.download("/content/trained_translator.zip")

updating: content/trained_translator/ (stored 0%)
updating: content/trained_translator/generation_config.json (deflated 29%)
updating: content/trained_translator/training_args.bin (deflated 49%)
updating: content/trained_translator/special_tokens_map.json (deflated 86%)
updating: content/trained_translator/pytorch_model.bin (deflated 10%)
updating: content/trained_translator/tokenizer_config.json (deflated 83%)
updating: content/trained_translator/config.json (deflated 62%)
updating: content/trained_translator/tokenizer.json (deflated 74%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Evaluation

In [75]:
test_data = books["test"].select(range(500))
inputs = []
targets = []

for i in range(0,len(test_data)):
  inputs.append(test_data['translation'][i][source_lang])
  targets.append(test_data['translation'][i][target_lang])

In [76]:
tk_inputs = tokenizer(inputs, padding=True, truncation=True,return_tensors="pt").input_ids.to('cuda')
outputs = model.generate(tk_inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

In [77]:
outputs_str = []
for i in range(0,len(test_data)):
  outputs_str.append(tokenizer.decode(outputs[i], skip_special_tokens=True))

In [78]:
result = metric.compute(predictions=outputs_str, references=targets)
result = {"bleu": result["score"]}
result

{'bleu': 0.7687404283825323}