In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torch.utils.data import DataLoader
from torchmetrics.text import BLEUScore
from transformers.optimization import AdamW
from tqdm.notebook import tqdm_notebook
import torchmetrics
from torch.utils.tensorboard import SummaryWriter

In [2]:
writer = SummaryWriter('experiments_results')

In [3]:
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [4]:
tokenizer.src_lang = "de_DE"
tokenizer.tgt_lang = "en_XX"

In [5]:
article_de = ["Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche."]
model_inputs = tokenizer(article_de, return_tensors="pt",padding=True,truncation=True,)
with tokenizer.as_target_tokenizer():
    labels = tokenizer('Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.', return_tensors="pt").input_ids
generated_tokens = model.generate(
    **model_inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]).to('cuda')
with tokenizer.as_target_tokenizer():
    translate = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    print(translate)


#----Two young, White males are outside near many bushes  --> Dataset

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['Two young white men are outdoors near many bushes.']


In [6]:
BATCH_SIZE = 16
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'
EPOCHS = 10
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [7]:
train_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE)

In [8]:
len(list(train_dataloader))

64

In [9]:
from torchmetrics.text import BLEUScore
bleu = BLEUScore()
eval = []
def eval_bleu(tgt,pred):
    for t, p in zip(tgt,pred):
        res = bleu([p], [[t]])
        eval.append(res.item())
    val =sum(eval)/len(eval)
    return val
    

In [10]:
def evaluate():
    model.eval()
    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=16)
    arr = []
    batch_iterator = tqdm_notebook(train_dataloader,total=len(list(val_dataloader)))
    for src, tgt in batch_iterator:
        model_inputs = tokenizer(src, return_tensors="pt",padding=True,truncation=True,)
        generated_tokens = model.generate(
            **model_inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
        )
        with tokenizer.as_target_tokenizer():
            translate = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        val = eval_bleu(tgt,translate)
        arr.append(val)
    res =sum(arr)/len(arr)
    if writer:
            metric = torchmetrics.CharErrorRate()
            cer = metric(tgt, translate)
            writer.add_scalar('validation cer', cer)
            writer.flush()

            metric = torchmetrics.WordErrorRate()
            wer = metric(tgt, translate)
            writer.add_scalar('validation wer', wer)
            writer.flush()

            writer.add_scalar('validation BLEU', res)
            writer.flush()
    

In [11]:
def fine_tuning():
    print('Start fine-tuning')  
    optimizer = AdamW(model.parameters(), lr=1e-4)
    model.train()
    for i in range(EPOCHS):
        losses = 0
        batch_iterator = tqdm_notebook(train_dataloader,total=len(list(train_dataloader)) ,desc=f"Processing Epoch {i:02d}")
        for src, tgt in batch_iterator:
                model_inputs = tokenizer(src, return_tensors="pt",padding=True,truncation=True)
                with tokenizer.as_target_tokenizer():
                        labels = tokenizer(list(tgt), return_tensors="pt",padding=True,truncation=True).input_ids
                optimizer.zero_grad()
                output = model(**model_inputs,labels=labels)
                loss = output.loss
                loss.backward()
                optimizer.step()
                losses += loss.item()
        losses = losses / len(list(train_dataloader))
        writer.add_scalar('train loss', losses, i)
        writer.flush()
        print(f'Epoch: {i}, Losses: {losses}')
        evaluate()
    
        

In [12]:
print('-------Start process-------')
fine_tuning()
evaluate()
print('-------End process------')

-------Start process-------
Start fine-tuning




Processing Epoch 00:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 1.623460367321968


  0%|          | 0/64 [00:00<?, ?it/s]



Processing Epoch 01:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.30189129314385355


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 02:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.14805128891021013


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 03:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.0983183472417295


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 04:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.07369236648082733


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 05:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.05989678291371092


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 06:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.04806271899724379


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 07:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.04730839545663912


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 08:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.07323535207251552


  0%|          | 0/64 [00:00<?, ?it/s]

Processing Epoch 09:   0%|          | 0/64 [00:00<?, ?it/s]

Epoch: 10, Losses: 0.03457020396308508


  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

-------End process------
