In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import MT5ForConditionalGeneration, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import evaluate
import time
import nltk
import numpy as np

nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jinhyunpark/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Init

In [2]:
# Function to generate the output from the model
def generate_output(model, tokenizer, example, max_new_tokens=512):
    """Generate text from a seq2seq model for a single prompt.

    Args:
        model: Object exposing ``generate(inputs, max_new_tokens=...)``
            (here, a Hugging Face seq2seq model).
        tokenizer: Object exposing ``encode(text, return_tensors="pt")`` and
            ``decode(ids, skip_special_tokens=True)``.
        example: Prompt string to encode and feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens. Must be
            a finite integer; the default of 512 is a generous cap for
            sentence-level translation (TODO confirm it suits longer inputs).

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    inputs = tokenizer.encode(example, return_tensors="pt")
    # A finite integer budget replaces the previous ``np.inf``: an infinite
    # float is not a valid token count and would let generation run without
    # bound if the model never emitted an end-of-sequence token.
    outputs = model.generate(inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

### Define metrics

In [3]:
# jhpark: verified that this is a correct way to use this 
def calculate_rouge(true_sentence, predicted_sentence):
    """Score a predicted sentence against its reference with ROUGE.

    Args:
        true_sentence: Reference text; passed as the first argument to
            ``RougeScorer.score``.
        predicted_sentence: Model output; passed as the second argument.

    Returns:
        The result of ``RougeScorer.score`` for the 'rouge1', 'rouge2' and
        'rougeL' variants.
    """
    # rouge1 / rouge2: n-gram based scoring.
    # rougeL: longest-common-subsequence based scoring.
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    # Stemming is enabled, so tokens are stemmed before matching.
    return rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True).score(
        true_sentence, predicted_sentence
    )

In [4]:
# jhpark: verified that this is a correct way to use this 
def calculate_bleu(true_tokens, predicted_tokens):
    """Compute a smoothed sentence-level BLEU score.

    Args:
        true_tokens: Forwarded as the first (references) argument of
            ``sentence_bleu``; callers in this notebook pass a list holding
            one tokenized reference.
        predicted_tokens: Tokenized hypothesis, forwarded as the second
            argument.

    Returns:
        The value returned by ``sentence_bleu``.

    Notes:
        * Smoothing reference: "A Systematic Comparison of Smoothing
          Techniques for Sentence-Level BLEU", Boxing Chen and Colin Cherry
          (2014).
        * method1 adds *epsilon* counts to precisions with 0 counts.
        * See https://www.nltk.org/_modules/nltk/translate/bleu_score.html
          for more details.
    """
    smoother = SmoothingFunction()
    return sentence_bleu(
        true_tokens,
        predicted_tokens,
        smoothing_function=smoother.method1,
    )

### Datasets

In [5]:
# english to X is only possible for T5
from datasets import load_dataset, load_from_disk

# ds_de_en = load_dataset("wmt/wmt14", "de-en")
# ds_fr_en = load_dataset("wmt/wmt15", "fr-en")
# ds_ro_en = load_dataset("wmt/wmt16", "ro-en")

# ds_de_en.save_to_disk("../wmt14_de_en")
# ds_fr_en.save_to_disk("../wmt15_fr_en")
# ds_ro_en.save_to_disk("../wmt16_ro_en")

# Load the three WMT datasets from their on-disk copies, in the order
# de-en, fr-en, ro-en.
ds_de_en, ds_fr_en, ds_ro_en = (
    load_from_disk(saved_dir)
    for saved_dir in ("../wmt14_de_en", "../wmt15_fr_en", "../wmt16_ro_en")
)

### Models

In [6]:
# Load tokenizer + seq2seq model for each T5 checkpoint (tokenizer first,
# then model), announcing completion after each pair.
t5_tokenizer_small, t5_model_small = (
    AutoTokenizer.from_pretrained('t5-small'),
    AutoModelForSeq2SeqLM.from_pretrained('t5-small'),
)
print("Done with small")

t5_tokenizer_base, t5_model_base = (
    AutoTokenizer.from_pretrained('t5-base'),
    AutoModelForSeq2SeqLM.from_pretrained('t5-base'),
)
print("Done with base")

Done with small
Done with base


### 1. Generate translations (English to German)

In [7]:
# Translate the first five training pairs with t5-small and score each one.
# The prompt asks for English -> German, so the English side is the input and
# the German side is the reference.
for i in range(5):
    # Label fixed: this cell translates English to German, not the reverse.
    print("English to German")
    print(f"------------ Sample {i+1} ------------")

    # Fetch the row once instead of indexing the dataset twice.
    pair = ds_de_en['train'][i]['translation']
    input_text = f"translate English to German: {pair['en']}"
    true_translation = pair['de']
    predicted_translation = generate_output(model=t5_model_small, tokenizer=t5_tokenizer_small, example=input_text)
    assert isinstance(true_translation, str)
    assert isinstance(predicted_translation, str)

    # NOTE(review): BLEU and METEOR below are computed on the T5 tokenizer's
    # tokens rather than on words - confirm this is the intended granularity.
    true_tokens = t5_tokenizer_small.tokenize(true_translation)
    predicted_tokens = t5_tokenizer_small.tokenize(predicted_translation)
    assert isinstance(true_tokens, list)
    assert isinstance(predicted_tokens, list)

    print("[Sentences]")
    print(" Input:", input_text)
    print(" True Translation:", true_translation)
    print(" Predicted Translation:", predicted_translation)

    print("\n[Scores]")
    # 1. BLEU (the single reference is wrapped in a list of references)
    bleu = calculate_bleu([true_tokens], predicted_tokens)
    print(" BLEU score:", bleu)
    # 2. METEOR (# jhpark: verified that this is a correct way to use this.)
    meteor = meteor_score([true_tokens], predicted_tokens)
    print(" METEOR score:", meteor)
    # 3. ROUGE (works on the raw sentences, not on tokens)
    rouge = calculate_rouge(true_translation, predicted_translation)
    print(" ROUGE scores:", rouge)

    print(" ")

German to English
------------ Sample 1 ------------
[Sentences]
 Input: translate English to German: Resumption of the session
 True Translation: Wiederaufnahme der Sitzungsperiode
 Predicted Translation: Wiederaufnahme der Sitzungsperiode

[Scores]
 BLEU score: 1.0
 METEOR score: 0.9993141289437586
 ROUGE scores: {'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}
 
German to English
------------ Sample 2 ------------
[Sentences]
 Input: translate English to German: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
 True Translation: Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel

### 2. Generate translations (English to French)

In [8]:
# Translate the first five training pairs with t5-small and score each one.
# The prompt asks for English -> French, so the English side is the input and
# the French side is the reference.
for i in range(5):
    # Label fixed: this cell translates English to French, not the reverse.
    print("English to French")
    print(f"------------ Sample {i+1} ------------")

    # Fetch the row once instead of indexing the dataset twice.
    pair = ds_fr_en['train'][i]['translation']
    input_text = f"translate English to French: {pair['en']}"
    true_translation = pair['fr']
    predicted_translation = generate_output(model=t5_model_small, tokenizer=t5_tokenizer_small, example=input_text)
    assert isinstance(true_translation, str)
    assert isinstance(predicted_translation, str)

    # NOTE(review): BLEU and METEOR below are computed on the T5 tokenizer's
    # tokens rather than on words - confirm this is the intended granularity.
    true_tokens = t5_tokenizer_small.tokenize(true_translation)
    predicted_tokens = t5_tokenizer_small.tokenize(predicted_translation)
    assert isinstance(true_tokens, list)
    assert isinstance(predicted_tokens, list)

    print("[Sentences]")
    print(" Input:", input_text)
    print(" True Translation:", true_translation)
    print(" Predicted Translation:", predicted_translation)

    print("\n[Scores]")
    # 1. BLEU (the single reference is wrapped in a list of references)
    bleu = calculate_bleu([true_tokens], predicted_tokens)
    print(" BLEU score:", bleu)
    # 2. METEOR (# jhpark: verified that this is a correct way to use this.)
    meteor = meteor_score([true_tokens], predicted_tokens)
    print(" METEOR score:", meteor)
    # 3. ROUGE (works on the raw sentences, not on tokens)
    rouge = calculate_rouge(true_translation, predicted_translation)
    print(" ROUGE scores:", rouge)

    print(" ")

French to English
------------ Sample 1 ------------
[Sentences]
 Input: translate English to French: Resumption of the session
 True Translation: Reprise de la session
 Predicted Translation: Reprise de la session

[Scores]
 BLEU score: 1.0
 METEOR score: 0.996
 ROUGE scores: {'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}
 
French to English
------------ Sample 2 ------------
[Sentences]
 Input: translate English to French: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
 True Translation: Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.
 Predicted Translat

### 3. Generate translations (English to Romanian)

In [9]:
# Translate the first five training pairs with t5-small and score each one.
# The prompt asks for English -> Romanian, so the English side is the input
# and the Romanian side is the reference.
for i in range(5):
    # Label fixed: this cell translates English to Romanian, not the reverse.
    print("English to Romanian")
    print(f"------------ Sample {i+1} ------------")

    # Fetch the row once instead of indexing the dataset twice.
    pair = ds_ro_en['train'][i]['translation']
    input_text = f"translate English to Romanian: {pair['en']}"
    true_translation = pair['ro']
    predicted_translation = generate_output(model=t5_model_small, tokenizer=t5_tokenizer_small, example=input_text)
    assert isinstance(true_translation, str)
    assert isinstance(predicted_translation, str)

    # NOTE(review): BLEU and METEOR below are computed on the T5 tokenizer's
    # tokens rather than on words - confirm this is the intended granularity.
    true_tokens = t5_tokenizer_small.tokenize(true_translation)
    predicted_tokens = t5_tokenizer_small.tokenize(predicted_translation)
    assert isinstance(true_tokens, list)
    assert isinstance(predicted_tokens, list)

    print("[Sentences]")
    print(" Input:", input_text)
    print(" True Translation:", true_translation)
    print(" Predicted Translation:", predicted_translation)

    print("\n[Scores]")
    # 1. BLEU (the single reference is wrapped in a list of references)
    bleu = calculate_bleu([true_tokens], predicted_tokens)
    print(" BLEU score:", bleu)
    # 2. METEOR (# jhpark: verified that this is a correct way to use this.)
    meteor = meteor_score([true_tokens], predicted_tokens)
    print(" METEOR score:", meteor)
    # 3. ROUGE (works on the raw sentences, not on tokens)
    rouge = calculate_rouge(true_translation, predicted_translation)
    print(" ROUGE scores:", rouge)

    print(" ")

Romanian to English
------------ Sample 1 ------------
[Sentences]
 Input: translate English to Romanian: Membership of Parliament: see Minutes
 True Translation: Componenţa Parlamentului: a se vedea procesul-verbal
 Predicted Translation: Componenţa Parlamentului: a se vedea procesul-verbal

[Scores]
 BLEU score: 1.0
 METEOR score: 0.9998177842565598
 ROUGE scores: {'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}
 
Romanian to English
------------ Sample 2 ------------
[Sentences]
 Input: translate English to Romanian: Approval of Minutes of previous sitting: see Minutes
 True Translation: Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal
 Predicted Translation: Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal

[Scores]
 BLEU score: 1.0
 METEOR score: 0.999958905235473
 ROUGE scores: {'rouge1': Score(pr