In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartTokenizer, BartForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate import meteor_score
from datasets import load_dataset

import time

  from .autonotebook import tqdm as notebook_tqdm


### Define Models and Tokenizers

In [2]:
# Checkpoint identifiers, named once so each tokenizer/model pair is
# guaranteed to be loaded from the same checkpoint string.
t5_checkpoint = "google-t5/t5-base"
bart_checkpoint = "facebook/bart-base"

# T5: tokenizer and conditional-generation model from the same checkpoint.
tokenizer_t5_base = T5Tokenizer.from_pretrained(t5_checkpoint)
model_t5_base = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)

# BART: tokenizer and conditional-generation model from the same checkpoint.
tokenizer_bart_base = BartTokenizer.from_pretrained(bart_checkpoint)
model_bart_base = BartForConditionalGeneration.from_pretrained(bart_checkpoint)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Define datasets

In [3]:
# Load the "de-en" configuration of the "wmt/wmt19" dataset and print the
# resulting object so its splits and row counts are visible in the output.
ds_de_en = load_dataset(
    path="wmt/wmt19",
    name="de-en",
)
print(ds_de_en)


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 34782245
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2998
    })
})


### Check example outputs

In [4]:
# Show the first five training pairs (German source, English reference).
train_split = ds_de_en['train']
for i in range(5):
    example = train_split[i]
    pair = example['translation']
    # One print call with newline separators emits the same text as three
    # consecutive prints: header, German line, English line, blank line.
    print(
        f"Example {i+1}:",
        f"De : {pair['de']}",
        f"En : {pair['en']}\n",
        sep="\n",
    )


Example 1:
De : Wiederaufnahme der Sitzungsperiode
En : Resumption of the session

Example 2:
De : Ich erkläre die am Freitag, dem 15. Dezember 2000, unterbrochene Sitzungsperiode des Europäischen Parlaments für wieder aufgenommen.
En : I declare resumed the session of the European Parliament adjourned on Friday, 15 December 2000.

Example 3:
De : Erklärungen der Präsidentin
En : Statements by the President

Example 4:
De : Werte Kolleginnen und Kollegen, wie Sie wissen, hat ein weiteres Erdbeben in Mittelamerika in dieser bereits mehrfach seit Beginn des zwanzigsten Jahrhunderts schwer getroffenen Region verheerenden Schaden angerichtet.
En : Ladies and gentlemen, on Saturday, as you know, an earthquake struck Central America once again, with tragic consequences. This is an area which has already been seriously affected on a number of occasions since the beginning of the twentieth century.

Example 5:
De : Die vorläufige, schreckliche Bilanz in El Salvador lautet zurzeit bereits: 350 

In [5]:
# Quick T5 smoke test: encode one prompt, generate, decode the first sequence.
# NOTE(review): the prompt asks for English -> French, while the dataset
# loaded in this notebook (ds_de_en) is German/English - confirm this is intended.
input_text_t5 = "translate English to French: I like to eat pizza"

# Encode the prompt; return_tensors="pt" is passed so the ids can be fed
# straight to generate().
t5_input_ids = tokenizer_t5_base.encode(
    input_text_t5,
    return_tensors="pt",
)

# Generate with the model's default generation settings (none are overridden here).
t5_outputs = model_t5_base.generate(t5_input_ids)

# Decode only the first generated sequence, dropping special tokens.
# NOTE(review): t5_output_text is assigned but not displayed by this cell.
first_sequence = t5_outputs[0]
t5_output_text = tokenizer_t5_base.decode(
    first_sequence,
    skip_special_tokens=True,
)

: 

### Main code starts here