# Evaluación de traducción de imágenes

## Dependencias

In [None]:
# Jupyter widgets
!pip install ipywidgets
# Install nltk for text processing
!pip install nltk
# Install sacreBLEU for evaluation
!pip install sacrebleu
# Install transformers for model support
!pip install transformers accelerate torch bitsandbytes sentencepiece
# Install my latest version of Doctr from GitHub with PyTorch and visualization support
!pip install python-doctr[torch,viz]@git+https://github.com/BlaiPuchol/doctr.git

## Hugging Face login

In [2]:
from huggingface_hub import login

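# Log in to the Hugging Face Hub (uncomment the call below; presumably required to download gated checkpoints such as the Llama model used later)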
# login()

## Imports

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

# Import docTR libraries
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# Import the MTEvaluation class from my TFG
from mt_evaluation import MTEvaluation

## Datos

In [5]:
# Dataset configuration and creation of the evaluation helper.
import os  # imported here because this cell runs before the cell that imports os

# Language pair (source -> target)
source_lang = 'en'
target_lang = 'fr'

# Main directory of the dataset. The IWSLT_DATASET_DIR environment variable
# overrides the default so the notebook is not tied to one machine's layout.
DATASET_DIR = os.environ.get('IWSLT_DATASET_DIR', '/home/blai/TFM/Datasets/IWSLT/')

# Image folders and text corpora. os.path.join copes with a DATASET_DIR given
# with or without a trailing separator; the final '' keeps the trailing
# separator on the image folders, as before.
IMAGES_EN = os.path.join(DATASET_DIR, 'iwslt17.fr-en-images', 'test_en', '')
IMAGES_FR = os.path.join(DATASET_DIR, 'iwslt17.fr-en-images', 'test_fr', '')
CORPUS_EN = os.path.join(DATASET_DIR, 'iwslt17.fr-en', 'test.en')
CORPUS_FR = os.path.join(DATASET_DIR, 'iwslt17.fr-en', 'test.fr')

# Engines to evaluate: display name -> Hugging Face model identifier
engines = {
    'euroLLM': 'utter-project/EuroLLM-1.7B',
    'LLaMA': 'meta-llama/Llama-3.2-1B-Instruct'
}

# Create an instance of the MTEvaluation class, fed with the English corpus as
# source and the French corpus as references
mt_eval = MTEvaluation(source_lang, target_lang, engines=engines, source=CORPUS_EN, references=CORPUS_FR)

## Extracción del texto de las imágenes

In [6]:
# Build the OCR predictor from pretrained detection and recognition models
predictor = ocr_predictor(
    'fast_base',      # Text Detection
    'crnn_vgg16_bn',  # Text Recognition
    pretrained=True,
    assume_straight_pages=True,
    resolve_lines=True,
    resolve_blocks=True,
    # Only has an effect with assume_straight_pages=False and/or
    # straighten_pages=True and/or detect_orientation=True.
    disable_page_orientation=True,
    # Only has an effect with assume_straight_pages=False and/or straighten_pages=True.
    disable_crop_orientation=True,
    # Paragraph break threshold, in relation to the height of the text line bounding box.
    paragraph_break=1.6,
)  # .cuda().half()  # Use .cpu() if you don't have a GPU

In [7]:
from random import shuffle
import os

# Number of images to evaluate
N = 100

# Make a list of the image numbers (file-name stems) to evaluate
def get_image_numbers(n, image_dir=None, seed=None):
    """Randomly pick up to ``n`` image identifiers from a directory.

    Only files whose name ends in ``.jpg`` or ``.png`` are considered. The
    filter is applied *before* the cut-off, so other files in the directory
    never reduce the number of images returned.

    Args:
        n: Maximum number of identifiers to return; values <= 0 give ``[]``.
        image_dir: Directory to sample from. Defaults to the notebook-level
            ``IMAGES_EN`` folder.
        seed: Optional seed. When given, the selection is reproducible across
            runs; when ``None`` the module-level ``shuffle`` is used.

    Returns:
        A list of at most ``n`` identifiers, each being the part of the file
        name before its first dot, in random order.
    """
    if image_dir is None:
        image_dir = IMAGES_EN

    # Sorting gives a stable starting order, so a seeded shuffle is repeatable
    # regardless of the order in which the OS lists the directory.
    candidates = sorted(
        filename.split('.')[0]
        for filename in os.listdir(image_dir)
        if filename.endswith(('.jpg', '.png'))
    )

    if seed is None:
        shuffle(candidates)
    else:
        from random import Random  # local import: only needed for seeded runs
        Random(seed).shuffle(candidates)

    return candidates[:max(n, 0)]

# Randomly select up to N image identifiers (file-name stems) to evaluate
numbers = get_image_numbers(N)

# Map each selected image name to the path of its file inside a directory
def get_image_paths(image_dir, numbers):
    """Build a ``{image name: file path}`` dict for the selected images.

    Args:
        image_dir: Directory that is listed to find the image files.
        numbers: Iterable of image names, i.e. the part of the file name
            before its first dot.

    Returns:
        A dict whose keys are the names found in ``image_dir`` and whose values
        are the corresponding paths (``image_dir`` joined with the file name).
        Entries follow the directory listing order, not the order of ``numbers``.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole selection once per file in the directory.
    wanted = set(numbers)
    image_paths = {}
    for filename in os.listdir(image_dir):
        stem = filename.split('.')[0]
        if stem in wanted:
            image_paths[stem] = os.path.join(image_dir, filename)
    return image_paths

# Resolve the selected names to file paths in the English folder and report the count
image_paths_en = get_image_paths(IMAGES_EN, numbers)
print(f"Selected {len(image_paths_en)} images from {IMAGES_EN}")

# Same selection, resolved against the French folder
image_paths_fr = get_image_paths(IMAGES_FR, numbers)
print(f"Selected {len(image_paths_fr)} images from {IMAGES_FR}")

# Load the English images; the OCR itself is run in a later cell
images_en = DocumentFile.from_images(list(image_paths_en.values()))

Selected 100 images from /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/
Selected 100 images from /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_fr/


In [8]:
import time

# Process the images with the OCR predictor and time the call.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments.
start_time = time.perf_counter()
results_en = predictor(images_en)
end_time = time.perf_counter()
print(f"Processed {len(results_en.pages)} images in {end_time - start_time:.2f} seconds.")

# Preview the text extracted from the first few images only
PREVIEW_COUNT = 10
for n, page in list(zip(image_paths_en.keys(), results_en.pages))[:PREVIEW_COUNT]:
    print(f"Image {n}: {image_paths_en[n]}")
    # Flatten multi-line renderings so each image prints on a single line
    print("Text:", page.render().replace('\n', ' '))
    print()

Processed 100 images in 173.13 seconds.
Image 5224: /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/5224.jpg
Text: we need every one of you to understand the secrets of domestic violence.

Image 7445: /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/7445.jpg
Text: forms got big, forms got bold and colorful.

Image 3058: /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/3058.jpg
Text: it's just a quick video of what we do.

Image 364: /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/364.jpg
Text: this IS a gamer verge On the epic Ofan win.

Image 619: /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/619.jpg
Text: out you can't count my fingers right now can you?  

Image 7351: /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/7351.jpg
Text: and 'm a  journalist, - neeo  something to write about.

Image 7847: /home/blai/TFM/Datasets/IWSLT/iwslt17.fr-en-images/test_en/7847.jpg
Text: IS she conscious?

Image 5217: /home/blai/TFM/Da

## Comprobar texto extraído con el del dataset

In [9]:
# Import necessary libraries for evaluation metrics
from sacrebleu.metrics.bleu import BLEU
from sacrebleu.metrics.chrf import CHRF
from sacrebleu.metrics.ter import TER

# Initialize evaluation metrics. The metric objects keep their own names so
# they are not overwritten by the score objects computed further down.
bleu_metric = BLEU()
chrf_metric = CHRF()
ter_metric = TER()

# Fetch both corpora once instead of on every loop iteration
corpus_source = mt_eval.get_source()
corpus_references = mt_eval.get_references()
# NOTE(review): the set_*_from_list calls at the end of this cell replace the
# sentences held by mt_eval, so re-running this cell without first re-creating
# mt_eval would presumably index into the OCR subset instead of the full corpus — verify.

# Prepare the extracted sentences, source sentences and reference sentences for evaluation
extracted_sentences = []
source_sentences = []
reference_sentences = []

# Pair every processed image (at most N) with its corpus sentences; the numeric
# image name is used as the index into the source and reference corpora
for n, page in list(zip(image_paths_en.keys(), results_en.pages))[:N]:
    # OCR output, flattened to a single line
    extracted_sentences.append(page.render().replace('\n', ' '))
    # Ground-truth sentence from the source corpus
    source_sentences.append(corpus_source[int(n)])
    # Target-language sentence from the reference corpus
    reference_sentences.append(corpus_references[int(n)])

# Compute the evaluation metrics: the OCR output is the hypothesis and the
# original source text is the single reference stream
bleu = bleu_metric.corpus_score(extracted_sentences, [source_sentences])
chrf = chrf_metric.corpus_score(extracted_sentences, [source_sentences])
ter = ter_metric.corpus_score(extracted_sentences, [source_sentences])

# Print the evaluation results
print(f"BLEU: {bleu.score:.2f}")
print(f"CHRF: {chrf.score:.2f}")
print(f"TER: {ter.score:.2f}")

# Set new source and reference sentences for the MTEvaluation class, so the
# OCR output (not the clean corpus text) is what mt_eval holds as source
mt_eval.set_source_from_list(extracted_sentences)
mt_eval.set_references_from_list(reference_sentences)


BLEU: 66.25
CHRF: 84.91
TER: 12.54


## Traducción del texto extraído

In [10]:
# Translate the source sentences using the engines specified in the MTEvaluation class.
# The source held by mt_eval at this point is the OCR-extracted text set in the previous section.
# NOTE(review): save=True / folder='translations' presumably write the outputs
# under ./translations — confirm in mt_evaluation.
# This is the slowest cell of the notebook: the recorded run took ~79 s for
# 'euroLLM' and ~1943 s for 'LLaMA'.

mt_eval.translate(save=True, folder='translations')

Translating with 'euroLLM'


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_to

Translation with 'euroLLM' done in 78.9019 seconds
Translating with 'LLaMA'


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VE

Translation with 'LLaMA' done in 1943.1527 seconds


In [14]:
# Show every engine's output, labelling each sentence with the image it came from
for engine_name, engine_output in mt_eval.mt.items():
    print(f"Translations for {engine_name}:")
    labelled = zip(image_paths_en.keys(), engine_output.segments())
    for image_name, sentence in labelled:
        print(f"Image {image_name}: {sentence}")
    print()

Translations for euroLLM:
Image 5224: Nous avons besoin de chacun d’entre vous pour comprendre les secrets de la violence domestique.
Image 7445: Les formes sont devenues grandes, les formes sont devenues audacieuses et colorées.
Image 3058: C'est juste un petit vidéo de ce que nous faisons.
Image 364: C'est une victoire épique pour le joueur sur l'Ofan.
Image 619: Tu ne peux pas compter mes doigts maintenant, tu peux?
Image 7351: Je suis journaliste, - je veux écrire quelque chose.
Image 7847: Est-elle consciente?
Image 5217: pour finir  - J'ai pu raconter ma propre histoire d'amour folle en brisant le silence.
Image 4645: Ils utilisent des attaques en ligne pour faire beaucoup d'argent, et de l'argent.  lots and lots
Image 6086: En conséquence, j'ai la paralysie cérébrale, ce qui signifie que je secoue tout le temps.
Image 1010: Dans cette vision, mais même nous avons limité beaucoup de choses à faire.
Image 4670: Diginotar est un C.a.
Image 6888: La crise de la midlife post-millenni

## Evaluar los resultados de cada modelo

In [15]:
# Evaluate every engine's translations at corpus level; the recorded output
# lists BLEU, CHRF and TER per engine.
# NOTE(review): a TER far above 100 (as recorded for 'LLaMA') means the edits
# outnumber the reference words, i.e. the hypotheses are presumably much longer
# than the references — check the raw generations for that engine.
mt_eval.corpus_evaluate(to_json=False)

Engine: euroLLM
BLEU:  27.593990919658307
CHRF:  53.31222466798231
TER:  67.63602251407129

Engine: LLaMA
BLEU:  2.5667879741955995
CHRF:  23.65798369485121
TER:  500.281425891182

