# Calculate the BLEU score of different translation models on the same dataset.

In [None]:
import pandas as pd
import numpy as np

import os
import glob
from tqdm import tqdm
import torch
import pickle

from datasets import load_dataset, load_metric
from datasets import Dataset
from datasets import DatasetDict

In [None]:
# Load the metric from a local script instead of downloading it (offline use).
# bleu.py requires bleu1.py to run
# NOTE(review): presumably bleu1.py has to sit next to bleu.py -- confirm on the target machine.
metric = load_metric("/export/home/cse200093/Expe_Translation/bleu.py")
# Bare expression: shows the loaded metric object as the notebook cell output.
metric

In [None]:
# Example usage of the metric on a toy input.
# `predictions` holds one tokenized hypothesis per sample.
predictions = [
    "hello there general kenobi".split(),  # first sample
    "foo bar foobar".split(),              # second sample
]
# `references` holds, per sample, a list of tokenized references
# (a sample may have more than one reference).
references = [
    ["hello there general kenobi".split(), "hello there !".split()],  # first sample: 2 references
    ["foo bar foobar".split()],                                       # second sample: 1 reference
]

metric.compute(references=references, predictions=predictions)

In [None]:
from easynmt import EasyNMT, models
# Load the model before fine-tuning (FT) from a local directory.
# This model serves as the baseline in the comparison below.
model_fr_en = EasyNMT(translator = models.AutoModel('/export/home/cse200093/opus-mt-fr-en'))

In [None]:
from googletrans import Translator
# Google Translate client, used as a third system in the comparison.
# NOTE(review): presumably this needs network access, unlike the locally loaded models -- confirm.
translator = Translator()

In [None]:
# Import repeated here so this cell can be run on its own.
from easynmt import EasyNMT, models
# Load the model after fine-tuning (FT) from a local directory.
model_fr_en_FT = EasyNMT(translator = models.AutoModel('/export/home/cse200093/Expe_Translation/opus-mt-fr-en-finetuned-fr-to-en/FT_opus_model'))

In [None]:
# Load the pickled data set from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('raw_datasets_wmt_biomed_2016.pkl', "rb") as open_file:
    dataset = pickle.load(open_file)
# Bare expression: shows the loaded data set as the notebook cell output.
dataset

In [None]:
# Parallel English / French sentences of the test split.
en = dataset['test']['en']
fr = dataset['test']['fr']

# Only the first N_SAMPLES documents are translated, to save time.
# NOTE: downstream cells slice `en` / `fr` with the literal 500; keep them in
# sync if this value is changed.
N_SAMPLES = 500
fr_sample = fr[:N_SAMPLES]

# Baseline Opus-MT model (before fine-tuning).
opus = [
    model_fr_en.translate(sent_fr, source_lang='fr', target_lang='en')
    for sent_fr in fr_sample
]

# Google Translate. The language pair is passed explicitly so that all three
# systems translate fr -> en, instead of relying on source auto-detection.
google = [
    translator.translate(sent_fr, src='fr', dest='en').text
    for sent_fr in fr_sample
]

# Fine-tuned Opus-MT model. tqdm wraps the sliced sample so that the progress
# bar total matches the number of sentences actually translated.
opus_FT = [
    model_fr_en_FT.translate(sent_fr, source_lang='fr', target_lang='en')
    for sent_fr in tqdm(fr_sample)
]

In [None]:
# Side-by-side table: reference English, source French, and the output of
# each of the three translation systems for the first 500 sentences.
df1 = pd.DataFrame(
    dict(
        English=en[:500],
        French=fr[:500],
        google=google,
        opus=opus,
        opus_FT=opus_FT,
    )
)
# Bare expression: shows the table as the notebook cell output.
df1

In [None]:
# Whitespace-tokenize each system's output: one token list per sentence.
predictions_goo = [sentence.split() for sentence in google]
predictions_opus = [sentence.split() for sentence in opus]
predictions_opus_FT = [sentence.split() for sentence in opus_FT]

# Each sample has exactly one reference, so every tokenized English sentence
# is wrapped in a single-element list.
references = [[sentence.split()] for sentence in en[:500]]

In [None]:
# Score the Google Translate output against the English references.
results_goo = metric.compute(predictions=predictions_goo, references=references)
# Bare expression: shows the result as the notebook cell output.
results_goo

In [None]:
# Score the baseline (not fine-tuned) Opus-MT output against the English references.
results_opus = metric.compute(predictions=predictions_opus, references=references)
# Bare expression: shows the result as the notebook cell output.
results_opus

In [None]:
# Score the fine-tuned Opus-MT output against the English references.
results_opus_FT = metric.compute(predictions=predictions_opus_FT, references=references)
# Bare expression: shows the result as the notebook cell output.
results_opus_FT