# 5.3 Gloss Evaluation

This notebook compares generated gloss output with a reference gloss using string-level metrics like:
- Levenshtein (Edit) Distance
- Token-level Precision, Recall, and F1-score

In [1]:
# ✅ Step 1: Install necessary packages
!pip install -q evaluate nltk
import nltk
nltk.download('punkt')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# ✅ Step 2: Import libraries
import evaluate
from nltk import word_tokenize
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

In [3]:
# ✅ Step 3: Define generated and reference glosses
generated = "COULD TALK SLOW"
reference = "COULD YOU PLEASE TALK SLOWER"  # Replace with gold gloss if available

print("Generated Gloss:", generated)
print("Reference Gloss:", reference)

Generated Gloss: COULD TALK SLOW
Reference Gloss: COULD YOU PLEASE TALK SLOWER


In [4]:
!pip install -q python-Levenshtein

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import Levenshtein

# ✅ Step 4: Compute Levenshtein Edit Distance
edit_distance = Levenshtein.distance(generated, reference)
print("Levenshtein Edit Distance:", edit_distance)

In [6]:
# ✅ Step 5: Token-level Precision/Recall/F1
import nltk
nltk.download('punkt_tab') # Download the necessary resource

gen_tokens = word_tokenize(generated)
ref_tokens = word_tokenize(reference)

all_tokens = sorted(list(set(gen_tokens + ref_tokens)))

# Binary vectors
y_true = [1 if tok in ref_tokens else 0 for tok in all_tokens]
y_pred = [1 if tok in gen_tokens else 0 for tok in all_tokens]

p = precision_score(y_true, y_pred)
r = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Token Precision: {p:.2f}")
print(f"Token Recall: {r:.2f}")
print(f"Token F1 Score: {f1:.2f}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Token Precision: 0.67
Token Recall: 0.40
Token F1 Score: 0.50


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import pandas as pd

# Load generated glosses
generated_glosses_path = '/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/isl_train_meta_with_generated_glosses.csv'
try:
    df_generated = pd.read_csv(generated_glosses_path)
    print(f"Generated glosses loaded successfully from: {generated_glosses_path}")
    display(df_generated.head())
except FileNotFoundError:
    print(f"Error: Generated glosses file not found at: {generated_glosses_path}")
    df_generated = None

# Load original glosses
original_glosses_path = '/content/drive/MyDrive/IETGenAI-SLT/Chapter 4/isl_train_meta.csv'
try:
    df_original = pd.read_csv(original_glosses_path)
    print(f"Original glosses loaded successfully from: {original_glosses_path}")
    display(df_original.head())
except FileNotFoundError:
    print(f"Error: Original glosses file not found at: {original_glosses_path}")
    df_original = None

Generated glosses loaded successfully from: /content/drive/MyDrive/IETGenAI-SLT/Chapter 5/isl_train_meta_with_generated_glosses.csv


Unnamed: 0,Sentences,target_text,input_text,generated_gloss
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE
1,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL
2,do me a favour,DO FAVOUR ME,translate English to gloss: do me a favour,DO MONEY FOUR
3,do not worry,DONOT WORRY,translate English to gloss: do not worry,DO NOT HELP ME
4,do not abuse him,HIM ABUSE DONOT,translate English to gloss: do not abuse him,YOU MISS him


Original glosses loaded successfully from: /content/drive/MyDrive/IETGenAI-SLT/Chapter 4/isl_train_meta.csv


Unnamed: 0,Sentences,File location,gloss_sequence,signer_id,sample_id
0,it does not make any difference to me,ISL_CSLRT_Corpus\Videos_Sentence_Level\it does...,IT MAKE ANY DIFFERENCE ME DO NOT,6,ISL_0278_S6
1,tell me truth,ISL_CSLRT_Corpus\Videos_Sentence_Level\tell me...,TELL TRUTH,6,ISL_0341_S6
2,do me a favour,ISL_CSLRT_Corpus\Videos_Sentence_Level\do me a...,DO FAVOUR ME,4,ISL_0046_S4
3,do not worry,ISL_CSLRT_Corpus\Videos_Sentence_Level\do not ...,DONOT WORRY,4,ISL_0065_S4
4,do not abuse him,ISL_CSLRT_Corpus\Videos_Sentence_Level\do not ...,HIM ABUSE DONOT,5,ISL_0048_S5


In [11]:
# Merge the dataframes based on the 'Sentences' column
if df_generated is not None and df_original is not None:
    # Select only the necessary columns from the original dataframe to avoid duplicate columns
    df_original_subset = df_original[['Sentences', 'gloss_sequence']]

    # Perform the merge
    # Using an inner merge to keep only sentences present in both dataframes
    df_merged = pd.merge(df_generated, df_original_subset, on='Sentences', how='inner')

    print("Dataframes merged successfully based on 'Sentences' column.")
    print(f"Number of merged samples: {len(df_merged)}")
    display(df_merged.head())
else:
    print("One or both dataframes were not loaded. Cannot merge.")
    df_merged = None

Dataframes merged successfully based on 'Sentences' column.
Number of merged samples: 500


Unnamed: 0,Sentences,target_text,input_text,generated_gloss,gloss_sequence
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT
1,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT
2,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT
3,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH
4,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH


In [13]:
# Save the merged dataframe to a CSV file
if df_merged is not None:
    output_path = '/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/merged_glosses_evaluation.csv'
    df_merged.to_csv(output_path, index=False)
    print(f"Merged dataframe saved successfully to: {output_path}")
else:
    print("No merged dataframe to save.")

Merged dataframe saved successfully to: /content/drive/MyDrive/IETGenAI-SLT/Chapter 5/merged_glosses_evaluation.csv


In [15]:
# ✅ Step 1: Implement Jaccard Similarity and Step 2: Calculate for the dataset
def jaccard_similarity(str1, str2):
    """Calculates Jaccard Similarity between two strings."""
    tokens1 = set(word_tokenize(str(str1).lower())) # Tokenize and convert to lowercase
    tokens2 = set(word_tokenize(str(str2).lower())) # Tokenize and convert to lowercase

    if not tokens1 and not tokens2:
        return 1.0 # Both empty, consider perfect match
    if not tokens1 or not tokens2:
        return 0.0 # One is empty, no overlap

    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    return len(intersection) / len(union)

if df_merged is not None:
    # Calculate Jaccard Similarity for each row
    df_merged['jaccard_similarity'] = df_merged.apply(
        lambda row: jaccard_similarity(row['generated_gloss'], row['gloss_sequence']),
        axis=1
    )

    print("Jaccard Similarity calculated and added to the merged DataFrame.")
    display(df_merged.head())
else:
    print("Merged DataFrame not loaded. Cannot calculate Jaccard Similarity.")

Jaccard Similarity calculated and added to the merged DataFrame.


Unnamed: 0,Sentences,target_text,input_text,generated_gloss,gloss_sequence,levenshtein_distance,token_precision,token_recall,token_f1_score,jaccard_similarity
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222
1,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222
2,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222
3,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0
4,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0


In [16]:
# ✅ Step 5: Aggregate and summarize all metrics
if df_merged is not None:
    print("\n--- Summary Statistics of all calculated metrics ---")
    display(df_merged[['levenshtein_distance', 'token_precision', 'token_recall', 'token_f1_score', 'jaccard_similarity']].describe())
else:
    print("Merged DataFrame not loaded. Cannot summarize metrics.")


--- Summary Statistics of all calculated metrics ---


Unnamed: 0,levenshtein_distance,token_precision,token_recall,token_f1_score,jaccard_similarity
count,500.0,500.0,500.0,500.0,500.0
mean,7.314,0.583705,0.531152,0.548523,0.484167
std,5.681546,0.391599,0.363022,0.368461,0.368511
min,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.25,0.333333,0.285714,0.2
50%,7.0,0.666667,0.5,0.571429,0.4
75%,11.0,1.0,1.0,1.0,1.0
max,22.0,1.0,1.0,1.0,1.0


# BLEU, WER, and ROUGE
Calculate BLEU, WER, and ROUGE scores for generated and reference glosses from the provided CSV files, add these scores to the existing DataFrame, and then aggregate and summarize all calculated metrics. Use the generated glosses from "/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/isl_train_meta_with_generated_glosses.csv" and the original glosses from "/content/drive/MyDrive/IETGenAI-SLT/Chapter 4/isl_train_meta.csv".

## Install necessary libraries

Install libraries that provide implementations for BLEU, WER, and ROUGE (e.g., `evaluate`, `nltk`, `jiwer`).


In [17]:
!pip install -q jiwer evaluate

In [18]:
# ✅ Step 1: Import the evaluate library (already imported in previous cells)
# ✅ Step 2: Load the 'bleu' metric from the evaluate library
bleu = evaluate.load("bleu")

# ✅ Step 3: Initialize an empty list to store the BLEU scores
bleu_scores = []

# ✅ Step 4: Iterate through each row of the df_merged DataFrame
# ✅ Step 5: For each row, get the generated_gloss and gloss_sequence
# ✅ Step 6: Calculate the BLEU score for the current pair of glosses
# ✅ Step 7: Append the calculated BLEU score to the list
if df_merged is not None:
    for index, row in df_merged.iterrows():
        generated_gloss = str(row['generated_gloss'])
        reference_gloss = str(row['gloss_sequence']) # Reference needs to be a list of strings

        # The evaluate bleu metric expects a list of references
        # and a list of predictions (can be multiple predictions for one reference)
        # Here we have one generated and one reference per row
        bleu_score = bleu.compute(predictions=[generated_gloss], references=[[reference_gloss]])
        bleu_scores.append(bleu_score['bleu'])

    # ✅ Step 8: Add the list of BLEU scores as a new column named 'bleu_score' to the df_merged DataFrame
    df_merged['bleu_score'] = bleu_scores

    print("BLEU scores calculated and added to the merged DataFrame.")
    display(df_merged.head())
else:
    print("Merged DataFrame not loaded. Cannot calculate BLEU scores.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

BLEU scores calculated and added to the merged DataFrame.


Unnamed: 0,Sentences,target_text,input_text,generated_gloss,gloss_sequence,levenshtein_distance,token_precision,token_recall,token_f1_score,jaccard_similarity,bleu_score
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0
1,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0
2,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0
3,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0
4,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0


In [20]:
import jiwer

# ✅ Step 2: Initialize an empty list to store the WER scores
wer_scores = []

# ✅ Step 3: Iterate through each row of the df_merged DataFrame
# ✅ Step 4: For each row, get the generated_gloss and gloss_sequence values. Ensure they are treated as strings.
# ✅ Step 5: Calculate the Word Error Rate using jiwer.wer()
# ✅ Step 6: Append the calculated WER score to the list
if df_merged is not None:
    for index, row in df_merged.iterrows():
        generated_gloss = str(row['generated_gloss'])
        reference_gloss = str(row['gloss_sequence'])

        wer = jiwer.wer(reference_gloss, generated_gloss)
        wer_scores.append(wer)

    # ✅ Step 7: Add the list of WER scores as a new column named 'wer_score' to the df_merged DataFrame.
    df_merged['wer_score'] = wer_scores

    # ✅ Step 8: Print a confirmation message and display the head of the updated DataFrame.
    print("Word Error Rate (WER) calculated and added to the merged DataFrame.")
    display(df_merged.head())
else:
    print("Merged DataFrame not loaded. Cannot calculate WER scores.")

Word Error Rate (WER) calculated and added to the merged DataFrame.


Unnamed: 0,Sentences,target_text,input_text,generated_gloss,gloss_sequence,levenshtein_distance,token_precision,token_recall,token_f1_score,jaccard_similarity,bleu_score,wer_score
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143
1,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143
2,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143
3,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0,1.0
4,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0,1.0


## Calculate rouge scores

Use a library to calculate the ROUGE score for each pair.


In [22]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [23]:
# ✅ Step 1: Load the 'rouge' metric from the evaluate library.
rouge = evaluate.load("rouge")

# ✅ Step 2: Initialize an empty list to store the ROUGE scores.
rouge_scores = []

# ✅ Step 3: Iterate through each row of the df_merged DataFrame.
# ✅ Step 4: For each row, get the generated_gloss and gloss_sequence values, ensuring they are treated as strings.
# ✅ Step 5: Calculate the ROUGE score for the current pair of glosses using the loaded ROUGE metric.
# ✅ Step 6: Append the calculated ROUGE score (specifically the 'rougeL' score, which is commonly used) to the list.
if df_merged is not None:
    for index, row in df_merged.iterrows():
        generated_gloss = str(row['generated_gloss'])
        reference_gloss = str(row['gloss_sequence'])

        # The evaluate rouge metric expects lists of strings for both predictions and references
        rouge_score = rouge.compute(predictions=[generated_gloss], references=[[reference_gloss]])
        rouge_scores.append(rouge_score['rougeL'])

    # ✅ Step 7: Add the list of ROUGE scores as a new column named 'rougeL_score' to the df_merged DataFrame.
    df_merged['rougeL_score'] = rouge_scores

    # ✅ Step 8: Print a confirmation message and display the head of the updated DataFrame.
    print("ROUGE-L scores calculated and added to the merged DataFrame.")
    display(df_merged.head())
else:
    print("Merged DataFrame not loaded. Cannot calculate ROUGE scores.")

ROUGE-L scores calculated and added to the merged DataFrame.


Unnamed: 0,Sentences,target_text,input_text,generated_gloss,gloss_sequence,levenshtein_distance,token_precision,token_recall,token_f1_score,jaccard_similarity,bleu_score,wer_score,rougeL_score
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143,0.181818
1,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143,0.181818
2,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143,0.181818
3,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Aggregate and summarize all metrics

In [24]:
# ✅ Step 1: Check if the df_merged DataFrame exists and is not None.
if df_merged is not None:
    # ✅ Step 2: Select the columns containing the calculated metrics.
    metric_columns = [
        'levenshtein_distance',
        'token_precision',
        'token_recall',
        'token_f1_score',
        'jaccard_similarity',
        'bleu_score',
        'wer_score',
        'rougeL_score'
    ]

    # ✅ Step 3: Calculate and display the descriptive statistics for these selected columns.
    print("\n--- Summary Statistics of all calculated metrics ---")
    display(df_merged[metric_columns].describe())
else:
    # ✅ Step 4: If the DataFrame does not exist, print a message indicating that the metrics cannot be summarized.
    print("Merged DataFrame not loaded. Cannot summarize metrics.")


--- Summary Statistics of all calculated metrics ---


Unnamed: 0,levenshtein_distance,token_precision,token_recall,token_f1_score,jaccard_similarity,bleu_score,wer_score,rougeL_score
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,7.314,0.583705,0.531152,0.548523,0.484167,0.080183,0.578486,0.550879
std,5.681546,0.391599,0.363022,0.368461,0.368511,0.26498,0.475888,0.353442
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.25,0.333333,0.285714,0.2,0.0,0.25,0.333333
50%,7.0,0.666667,0.5,0.571429,0.4,0.0,0.6,0.535714
75%,11.0,1.0,1.0,1.0,1.0,0.0,1.0,0.857143
max,22.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0


## Display updated results


In [25]:
# ✅ Step 1: Check if the df_merged DataFrame exists and is not None.
if df_merged is not None:
    # ✅ Step 2: Display the head of the DataFrame to show the newly added metric columns
    print("DataFrame head with all calculated metrics:")
    display(df_merged.head())
else:
    # ✅ Step 3: If the DataFrame does not exist, print a message indicating that the DataFrame is not loaded.
    print("Merged DataFrame not loaded. Cannot display the head.")


DataFrame head with all calculated metrics:


Unnamed: 0,Sentences,target_text,input_text,generated_gloss,gloss_sequence,levenshtein_distance,token_precision,token_recall,token_f1_score,jaccard_similarity,bleu_score,wer_score,rougeL_score
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143,0.181818
1,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143,0.181818
2,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE,IT MAKE ANY DIFFERENCE ME DO NOT,19,0.5,0.285714,0.363636,0.222222,0.0,0.857143,0.181818
3,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL,TELL TRUTH,8,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Summary:

### Data Analysis Key Findings

*   The `jiwer`, `evaluate`, and `rouge_score` libraries were successfully installed to enable the calculation of WER, BLEU, and ROUGE scores.
*   BLEU scores were calculated for each pair of generated and reference glosses and added as a new column named `bleu_score` to the DataFrame.
*   Word Error Rate (WER) was calculated for each pair of generated and reference glosses using the `jiwer` library and added as a new column named `wer_score`.
*   ROUGE-L scores were calculated for each pair of generated and reference glosses using the `evaluate` library and added as a new column named `rougeL_score`.
*   Summary statistics were recalculated and displayed for all metric columns, including the newly added `bleu_score`, `wer_score`, and `rougeL_score`, along with existing metrics like Levenshtein distance, token precision, recall, F1-score, and Jaccard similarity.

### Insights or Next Steps

*   The newly calculated metrics provide a more comprehensive view of the performance of the gloss generation process, complementing the previously calculated metrics.
*   Analyze the distribution and correlation of the new metrics (BLEU, WER, ROUGE-L) with the existing metrics to gain deeper insights into the strengths and weaknesses of the generated glosses.


In [None]:
# Install necessary libraries
!pip install -q jiwer evaluate rouge_score

import jiwer
import evaluate
import pandas as pd # Assuming pandas is needed for the DataFrame
from nltk import word_tokenize # Assuming word_tokenize is still needed for ROUGE/BLEU if not handled by evaluate

# Load metrics from evaluate library
try:
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    # WER might be directly from jiwer, or an evaluate metric if available
    # Check evaluate.list_datasets() or evaluate.list_metrics() for WER if needed
except FileNotFoundError as e:
    print(f"Error loading evaluation metrics: {e}")
    print("Please ensure you are connected to the internet or the metrics are available locally.")

# Initialize lists to store the results
bleu_scores = []
wer_scores = []
rougeL_scores = [] # Using rougeL as an example, can add others

if 'df_merged' in locals() and df_merged is not None:
    # Assuming the merged DataFrame has columns named 'generated_gloss' and 'gloss_sequence'
    generated_gloss_col = 'generated_gloss'
    reference_gloss_col = 'gloss_sequence'

    for index, row in df_merged.iterrows():
        generated_gloss = str(row[generated_gloss_col])
        reference_gloss = str(row[reference_gloss_col])

        # Calculate BLEU
        try:
            # evaluate bleu metric expects a list of predictions and a list of references (each reference can be a list of strings)
            bleu_score = bleu.compute(predictions=[generated_gloss], references=[[reference_gloss]])
            bleu_scores.append(bleu_score['bleu'])
        except Exception as e:
            print(f"Error calculating BLEU for row {index}: {e}")
            bleu_scores.append(None) # Append None if calculation fails

        # Calculate WER
        try:
            wer = jiwer.wer(reference_gloss, generated_gloss)
            wer_scores.append(wer)
        except Exception as e:
            print(f"Error calculating WER for row {index}: {e}")
            wer_scores.append(None) # Append None if calculation fails

        # Calculate ROUGE
        try:
             # evaluate rouge metric expects lists of strings for both predictions and references
            rouge_score = rouge.compute(predictions=[generated_gloss], references=[[reference_gloss]])
            rougeL_scores.append(rouge_score['rougeL']) # Using rougeL
        except Exception as e:
            print(f"Error calculating ROUGE for row {index}: {e}")
            rougeL_scores.append(None) # Append None if calculation fails


    # Add the metrics as new columns to the DataFrame
    df_merged['bleu_score'] = bleu_scores
    df_merged['wer_score'] = wer_scores
    df_merged['rougeL_score'] = rougeL_scores # Add other rouge scores if needed

    print("BLEU, WER, and ROUGE scores calculated and added to the merged DataFrame.")
    display(df_merged.head())

    # Aggregate and summarize all metrics
    print("\n--- Summary Statistics of all calculated metrics ---")
    # Include previous metrics if they exist in the DataFrame
    all_metric_columns = [col for col in ['levenshtein_distance', 'token_precision', 'token_recall', 'token_f1_score', 'jaccard_similarity', 'bleu_score', 'wer_score', 'rougeL_score'] if col in df_merged.columns]
    display(df_merged[all_metric_columns].describe())

else:
    print("Merged DataFrame not found or is None. Please ensure 'df_merged' is loaded before running this cell.")