<a href="https://colab.research.google.com/github/CBaffelli/CAS-NLP_Machine-translation/blob/main/04_CAS_NLP_final_project_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets evaluate sacrebleu accelerate -U bert_score rouge_score sacremoses

# **Benchmark and evaluation**

This script is used to carry out the benchmark for the fine-tuned model against the baseline and company model.

In [3]:
#@title Imports and varia
from tabulate import tabulate
import numpy as np
import pandas as pd
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
#@title Mount GDrive
from google.colab import drive
drive.mount('/content/drive')

In [4]:
#@title Load data
#Load the datasets
italian = pd.read_excel('IT.xlsx', dtype=str, usecols=[0, 2, 4])
french = pd.read_excel('FR.xlsx', dtype=str, usecols=[0, 2, 4])
spanish = pd.read_excel('ES.xlsx', dtype=str, usecols=[0, 2, 4])
romanian = pd.read_excel('RO.xlsx', dtype=str, usecols=[0, 2, 4])
portuguese = pd.read_excel('PT.xlsx', dtype=str, usecols=[0, 2, 4])

#Mapping for the dataset
languages = {
    'Italian': italian,
    'French': french,
    'Spanish' : spanish,
    'Romanian' : romanian,
    'Portuguese' : portuguese
}

#Mapping for the prefixes
prefix_mapping_OPUS =  {
    'Italian' : '>>ita<< ',
    'French' : '>>fra<< ',
    'Spanish' : '>>spa<< ',
    'Romanian' : '>>ron<< ',
    'Portuguese' : '>>por<< '
}

In [5]:
#@title Load metrics
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load('meteor')
bert = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

### 1. **Company model**

In [None]:
for language_name, language_df in languages.items():
  #Print the language name
  print("Language:", language_name)
  predictions = language_df['Proposed translation'].values.tolist()
  revised_translation_list = language_df['Revised translation'].tolist()
  references = [[item] for item in revised_translation_list]
  sacrebleu_score = sacrebleu.compute(predictions=predictions,references=references)
  rouge_score = rouge.compute(predictions=predictions,references=references,rouge_types=["rouge1", "rouge2", "rougeL"])
  meteor_score = meteor.compute(predictions=predictions,references=references)
  bert_score = bert.compute(predictions=predictions,references=references, model_type="bert-base-multilingual-cased")
  table = [["Metric", "Score"],
         ["SacreBLEU", sacrebleu_score["score"]],
         ["ROUGE1", rouge_score['rouge1']],
         ["ROUGE2", rouge_score['rouge2']],
         ["ROUGEL", rouge_score['rougeL']],
         ["METEOR", meteor_score["meteor"]],
         ["BERTScore - precision ", np.mean(bert_score['precision'])],
         ["BERTScore - recall ", np.mean(bert_score['recall'])],
         ["BERTScore - F1 ", np.mean(bert_score['f1'])]
         ]
  df = pd.DataFrame(table[1:], columns=table[0])
  print(df)


Language: Italian


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

                   Metric      Score
0               SacreBLEU  67.330242
1                  ROUGE1   0.814173
2                  ROUGE2   0.709258
3                  ROUGEL   0.790893
4                  METEOR   0.761506
5  BERTScore - precision    0.936790
6     BERTScore - recall    0.930335
7         BERTScore - F1    0.933350
Language: French
                   Metric      Score
0               SacreBLEU  64.877260
1                  ROUGE1   0.827018
2                  ROUGE2   0.732669
3                  ROUGEL   0.819354
4                  METEOR   0.800756
5  BERTScore - precision    0.939521
6     BERTScore - recall    0.934192
7         BERTScore - F1    0.936644
Language: Spanish
                   Metric      Score
0               SacreBLEU  77.623090
1                  ROUGE1   0.846418
2                  ROUGE2   0.763394
3                  ROUGEL   0.838747
4                  METEOR   0.825963
5  BERTScore - precision    0.955264
6     BERTScore - recall    0.950966
7  

### 2. **Helsinki-NLP/opus-mt-en-roa baseline**

In [None]:
checkpoint = "Helsinki-NLP/opus-mt-en-roa"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/786k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/793k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/295M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
for language_name, language_df in languages.items():
  prefix = prefix_mapping_OPUS[language_name]
  sources = language_df['Source'].apply(lambda x: prefix + x if isinstance(x, str) else x)
  sources = sources.values.tolist()
  revised_translation_list = language_df['Revised translation'].tolist()
  references = [[item] for item in revised_translation_list]
  predictions = []

  for text in sources:
    inputs = tokenizer(text, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_new_tokens=80, do_sample=True, top_k=30, top_p=0.95)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(translated_text)

  sacrebleu_score = sacrebleu.compute(predictions=predictions, references=references)
  rouge_score = rouge.compute(predictions=predictions, references=references, rouge_types=["rouge1", "rouge2", "rougeL"])
  meteor_score = meteor.compute(predictions=predictions, references=references)
  bert_score = bert.compute(predictions=predictions, references=references, model_type="bert-base-multilingual-cased")

  table = [
            ["Metric", "Score"],
            ["SacreBLEU", sacrebleu_score["score"]],
            ["ROUGE1", rouge_score['rouge1']],
            ["ROUGE2", rouge_score['rouge2']],
            ["ROUGEL", rouge_score['rougeL']],
            ["METEOR", meteor_score["meteor"]],
            ["BERTScore - precision", np.mean(bert_score['precision'])],
            ["BERTScore - recall", np.mean(bert_score['recall'])],
            ["BERTScore - F1", np.mean(bert_score['f1'])]
        ]

  df = pd.DataFrame(table[1:], columns=table[0])
  #Print the language name
  print("Language:", language_name)
  print(df)

### 3. **Fine-tuned model**

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = 'Klarly/multilingual-MT_Medical-Diagnostics_ROM'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


tokenizer_config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/786k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/793k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/293M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

In [13]:
for language_name, language_df in languages.items():
  prefix = prefix_mapping_OPUS[language_name]
  sources = language_df['Source'].apply(lambda x: prefix + x if isinstance(x, str) else x)
  sources = sources.values.tolist()
  revised_translation_list = language_df['Revised translation'].tolist()
  references = [[item] for item in revised_translation_list]
  predictions = []

  for text in sources:
    inputs = tokenizer(text, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_new_tokens=80, do_sample=True, top_k=30, top_p=0.95)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(translated_text)

  sacrebleu_score = sacrebleu.compute(predictions=predictions, references=references)
  rouge_score = rouge.compute(predictions=predictions, references=references, rouge_types=["rouge1", "rouge2", "rougeL"])
  meteor_score = meteor.compute(predictions=predictions, references=references)
  bert_score = bert.compute(predictions=predictions, references=references, model_type="bert-base-multilingual-cased")

  table = [
            ["Metric", "Score"],
            ["SacreBLEU", sacrebleu_score["score"]],
            ["ROUGE1", rouge_score['rouge1']],
            ["ROUGE2", rouge_score['rouge2']],
            ["ROUGEL", rouge_score['rougeL']],
            ["METEOR", meteor_score["meteor"]],
            ["BERTScore - precision", np.mean(bert_score['precision'])],
            ["BERTScore - recall", np.mean(bert_score['recall'])],
            ["BERTScore - F1", np.mean(bert_score['f1'])]
        ]

  df = pd.DataFrame(table[1:], columns=table[0])
  #Print the language name
  print("Language:", language_name)
  print(df)
  df_output = pd.DataFrame()
  df_output['Source'] = sources
  df_output['Translation'] = predictions
  df_output.to_csv(f'{language_name}_output.csv', index=False)

Language: Italian
                  Metric      Score
0              SacreBLEU  60.674500
1                 ROUGE1   0.815483
2                 ROUGE2   0.683191
3                 ROUGEL   0.788287
4                 METEOR   0.763731
5  BERTScore - precision   0.939494
6     BERTScore - recall   0.936767
7         BERTScore - F1   0.938061
Language: French
                  Metric      Score
0              SacreBLEU  43.038874
1                 ROUGE1   0.743125
2                 ROUGE2   0.546680
3                 ROUGEL   0.732812
4                 METEOR   0.674928
5  BERTScore - precision   0.943287
6     BERTScore - recall   0.938543
7         BERTScore - F1   0.940789
Language: Spanish
                  Metric      Score
0              SacreBLEU  53.789455
1                 ROUGE1   0.770548
2                 ROUGE2   0.616311
3                 ROUGEL   0.755575
4                 METEOR   0.738432
5  BERTScore - precision   0.952110
6     BERTScore - recall   0.949526
7         B