In [1]:
import os
from tqdm import tqdm
import csv
from sacrebleu import sentence_bleu

import nltk
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
#nltk.download('punkt')

In [2]:
def compute_bleu(reference, hypothesis):
    """
    Computes BLEU score for a single reference-hypothesis pair.
    """
    return sentence_bleu(hypothesis, [reference]).score / 100  # Normalize to [0, 1]

def compute_meteor(reference, hypothesis):
    """
    Computes METEOR score for a single reference-hypothesis pair.
    Tokenizes the input to match the expected format.
    """
    # Tokenize both reference and hypothesis
    tokenized_reference = word_tokenize(reference)
    tokenized_hypothesis = word_tokenize(hypothesis)

    # Calculate METEOR score
    return meteor_score([tokenized_reference], tokenized_hypothesis)

In [3]:
from bleurt.score import BleurtScorer

# Load the BLEURT scorer with a pre-trained checkpoint
bleurt_checkpoint = "bleurt_checkpoints/BLEURT-20"
scorer = BleurtScorer(bleurt_checkpoint)

def compute_bleurt(reference, hypothesis):
    """
    Computes BLEURT score for a single reference-hypothesis pair.
    """
    score = scorer.score(references=[reference], candidates=[hypothesis])
    return score[0] 


INFO:tensorflow:Reading checkpoint bleurt_checkpoints/BLEURT-20.


AssertionError: Could not find BLEURT checkpoint bleurt_checkpoints/BLEURT-20

In [4]:
def evaluate_from_folders(reference_folder, hypothesis_folder, lang="en"):
    """
    Evaluate translations by processing reference and hypothesis files directly from folders.

    Parameters:
    - reference_folder (str): Path to the folder containing reference files.
    - hypothesis_folder (str): Path to the folder containing hypothesis files.
    - lang (str): Language for BERTScore (default is "en").

    Returns:
    - A dictionary with per-file scores for METEOR and BLEU.
    """
    # Find all files in both folders
    reference_files = {
        os.path.basename(f).replace(" English", ""): os.path.join(reference_folder, f)
        for f in os.listdir(reference_folder)
    }
    hypothesis_files = {
        os.path.basename(f): os.path.join(hypothesis_folder, f)
        for f in os.listdir(hypothesis_folder)
    }

    # Ensure matching filenames exist in both folders
    common_files = set(reference_files.keys()) & set(hypothesis_files.keys())
    if not common_files:
        raise ValueError("No matching files found in the reference and hypothesis folders.")

    # Initialize per-file scores
    file_scores = {}

    # Process each matching file
    for filename in tqdm(common_files, desc="Evaluating Files"):
        print(f"Processing: {filename}")

        with open(reference_files[filename], 'r', encoding='utf-8') as ref_file:
            reader = csv.DictReader(ref_file)
            reference = [row["Text"] for row in reader]
        with open(hypothesis_files[filename], 'r', encoding='utf-8') as hyp_file:
            reader = csv.DictReader(hyp_file)
            hypothesis = [row["Text"] for row in reader]

        # Concatenate all texts into single strings
        reference = " ".join(reference)
        hypothesis = " ".join(hypothesis)

        #print(f"Reference: {reference}")
        #print(f"Hypothesis: {hypothesis}")
        # Compute metrics
        meteor = compute_meteor(reference, hypothesis)
        bleu = compute_bleu(reference, hypothesis)

        # Save per-file scores
        file_scores[filename] = {
            "METEOR": meteor,
            "BLEU": bleu,
        }

    return file_scores

In [5]:
# Define folder paths
reference_folder = "../results/Parkinson/en"
hypothesis_folder = "../results/Parkinson/translation_fra_to_eng"

# Evaluate translations
scores = evaluate_from_folders(reference_folder, hypothesis_folder, lang="en")
scores

Evaluating Files:   0%|          | 0/8 [00:00<?, ?it/s]

Processing: 7-1_script_interview_clinique_1.csv





KeyError: 'Text'