In [2]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM

model_name = "mrm8488/t5-small-finetuned-imdb-sentiment"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [3]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

## Text Classification

In [4]:
input = "This book is normal."
label = "positive"

In [5]:
encoded_input = tokenizer(input, return_tensors="pt")
encoded_input

{'input_ids': tensor([[ 100,  484,   19, 1389,    5,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [6]:
encoded_label = tokenizer(label, return_tensors="pt")
encoded_label

{'input_ids': tensor([[1465,    1]]), 'attention_mask': tensor([[1, 1]])}

In [7]:
# Forward pass with `labels` supplied so the returned object carries a loss.
# The variable holds the whole model output (not just logits), hence the name.
# The attention mask is passed explicitly, matching the generate() call that
# follows in the notebook.
outputs = model(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    labels=encoded_label.input_ids
)
outputs.loss

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


tensor(0.9455, grad_fn=<NllLossBackward0>)

In [8]:
# Generate the predicted label for the encoded review.
# max_length=2 caps the returned sequence at two ids; in the recorded output
# the first id is 0 (presumably the decoder start token — confirm), so only a
# single label token is actually produced.
pred = model.generate(
    encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    max_length=2
)
pred

tensor([[   0, 1389]])

In [9]:
# Decode the sequence that generate() actually returned rather than a token id
# copied by hand from the previous cell's output, so this cell stays correct
# when the input sentence (and therefore the prediction) changes.
# skip_special_tokens drops the leading special token from the decoded text.
tokenizer.decode(pred[0], skip_special_tokens=True)

'normal'

## Translation

In [10]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM

In [11]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
input = 'I go to school'
output = 'Ich besuche die Schule'

In [13]:
# Prepend the task prefix used for English->German translation.
# The guard keeps this cell idempotent: re-running it must not stack the
# prefix a second time onto an already-prefixed sentence.
TASK_PREFIX = "translate English to German: "
if not input.startswith(TASK_PREFIX):
    input = TASK_PREFIX + input
input

'translate English to German: I go to school'

In [14]:
encoded_input = tokenizer(input, return_tensors="pt")
encoded_input

{'input_ids': tensor([[13959,  1566,    12,  2968,    10,    27,   281,    12,   496,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
encoded_output = tokenizer(output, return_tensors="pt")
encoded_output

{'input_ids': tensor([[ 1674,     3, 27024,    15,    67, 12853,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [16]:
# Forward pass with the reference translation as `labels` so the returned
# object carries a loss. The variable holds the whole model output (not just
# logits), hence the name. The attention mask is passed explicitly, matching
# the generate() call that follows in the notebook.
outputs = model(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    labels=encoded_output.input_ids
)
outputs.loss

tensor(0.3751, grad_fn=<NllLossBackward0>)

In [17]:
pred = model.generate(
    encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    max_length=20
)

In [18]:
tokenizer.batch_decode(pred)

['<pad> Ich besuche die Schule</s>']

## Fine-tuning mBART50 for EN-VI Machine Translation

In [19]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
from datasets import load_dataset

ds = load_dataset("thainq107/iwslt2015-en-vi")
ds

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

## Tokenizer

In [21]:
from transformers import AutoTokenizer

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

## Encoding

In [22]:
import torch

MAX_LEN = 75

def preprocess_function(examples):
    """Tokenize a batch of EN/VI sentence pairs into fixed-length model inputs.

    Args:
        examples: A batch mapping with an "en" list of source sentences and a
            "vi" list of target sentences (the shape produced by
            ``ds.map(..., batched=True)``).

    Returns:
        A dict of tensors, each of width MAX_LEN:
            "input_ids":      source token ids, padded/truncated to MAX_LEN.
            "attention_mask": 1 for real source tokens, 0 for padding.
            "labels":         target token ids with padding replaced by -100.
    """
    # Keep the full tokenizer output for the source side. Because padding to
    # MAX_LEN happens here, the attention mask produced by the tokenizer is the
    # only record of which positions are padding; dropping it would leave the
    # encoder unable to tell padding from real tokens during training.
    model_inputs = tokenizer(
        examples["en"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )

    # NOTE(review): the target text is encoded with the same call as the source,
    # so it gets whatever language/special tokens the tokenizer uses for source
    # text — confirm this is intended for the VI side.
    labels = tokenizer(
        examples["vi"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )["input_ids"]

    # Replace pad ids with -100 (the sentinel compute_metrics maps back to the
    # pad id; conventionally the index ignored by the loss).
    labels = [[-100 if item == tokenizer.pad_token_id else item
               for item in label] for label in labels]
    return {
        "input_ids": torch.tensor(model_inputs["input_ids"]),
        "attention_mask": torch.tensor(model_inputs["attention_mask"]),
        "labels": torch.tensor(labels)
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

## Model

In [23]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

## Evaluate

In [24]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Trim surrounding whitespace and shape the texts for the BLEU metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, each holding
        one stripped reference string.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    # Each reference is wrapped in its own list: one reference per prediction.
    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with the loaded metric.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        A dict ``{"bleu": <score>}`` taken from the metric's "score" field.
    """
    generated, references = eval_preds
    if isinstance(generated, tuple):
        generated = generated[0]

    pad_id = tokenizer.pad_token_id
    decode_kwargs = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # -100 is not a valid token id, so swap it for the pad id before decoding.
    generated = np.where(generated != -100, generated, pad_id)
    references = np.where(references != -100, references, pad_id)

    pred_texts = tokenizer.batch_decode(generated, **decode_kwargs)
    ref_texts = tokenizer.batch_decode(references, **decode_kwargs)

    pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)
    scores = metric.compute(predictions=pred_texts, references=ref_texts)
    return {"bleu": scores["score"]}

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## Trainer

In [25]:
import os
os.environ["WANDB_DISABLED"] = 'true'

In [None]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

# Training configuration: log, evaluate and checkpoint every 1000 steps.
training_args = Seq2SeqTrainingArguments(
    output_dir="./en-vi-mbart50",
    logging_dir="logs",
    logging_steps=1000,
    predict_with_generate=True, # for sequence generation task
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=1,
    num_train_epochs=3,
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the BLEU score that compute_metrics
    # returns (higher is better) rather than by the default eval loss.
    metric_for_best_model="bleu",
    greater_is_better=True,
    # Disable external experiment trackers explicitly instead of relying only
    # on the WANDB_DISABLED environment variable.
    report_to="none",
)

# Batches features for the seq2seq model; the model is passed so the collator
# can use it when preparing decoder inputs.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=preprocessed_ds['train'],
    eval_dataset=preprocessed_ds['validation'],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

## Inference

In [26]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "thainq107/en-vi-mbart50"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

### Greedy Search

In [27]:
# Translate one English sentence with the fine-tuned checkpoint loaded above.
# NOTE(review): the sample sentence mentions "T5", but the checkpoint id loaded
# in this section is "thainq107/en-vi-mbart50" — the text is just example input.
src_text = "I will train machine translation by fine-tuning T5 model."

# No decoding arguments are passed to generate(), so the decoding strategy
# comes from the checkpoint's defaults.
encoded_text = tokenizer(src_text, return_tensors="pt")
generated_token = model.generate(**encoded_text)

# skip_special_tokens=True drops special tokens from the decoded string.
tokenizer.batch_decode(generated_token, skip_special_tokens=True)

['Tôi sẽ luyện tập phiên dịch máy móc bằng mô hình T5 tinh chỉnh .']

### Beam Search

In [28]:
src_text = "In the next step, we consider the next possible tokens for each of the three branches we created in the previous step."

encoded_text = tokenizer(src_text, return_tensors="pt")
generated_token = model.generate(**encoded_text, num_beams=5)

tokenizer.batch_decode(generated_token, skip_special_tokens=True)

['Bước tiếp theo , chúng tôi xem xét các token tiềm năng tiếp theo cho mỗi trong ba nhánh mà chúng tôi tạo ra ở bước trước .']

## Pipeline

In [30]:
from transformers import pipeline

translator = pipeline(model="thainq107/en-vi-mbart50")

Device set to use cuda:0


In [31]:
translated_text = translator("I go to school", num_beams=1, do_sample=False)
translated_text



[{'generated_text': 'Tôi đi học'}]

In [32]:
pred_sentences = translator(ds['test']['en'], batch_size=32, num_beams=5)
pred_sentences = [pred_sentence['generated_text'] for pred_sentence in pred_sentences]

In [33]:
import sacrebleu

# Corpus-level BLEU of the pipeline translations against the test split's
# Vietnamese column. The references are wrapped in an outer list, i.e. a single
# reference set is supplied for the whole corpus.
# NOTE(review): force=True presumably suppresses sacrebleu's check for input
# that already looks tokenized — confirm against the sacrebleu docs.
bleu_score = sacrebleu.corpus_bleu(pred_sentences, [ds['test']['vi']], force=True)
bleu_score

BLEU = 34.17 66.5/42.2/28.0/18.9 (BP = 0.980 ratio = 0.980 hyp_len = 33060 ref_len = 33738)