In [33]:
!pip install datasets evaluate transformers

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [35]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.5-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.5 rapidfuzz-3.11.0


In [40]:

from transformers import pipeline
from datasets import load_dataset
import evaluate
import torch
import sacrebleu

# Load the model
model_name = "farahabdou/whisper-arabic-english-end2end"
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Load dataset
dataset = load_dataset("farahabdou/FLEURS-AR-EN-split", split="test")

def transcribe_audio(example):
    transcription = asr_pipeline(
        example["audio"]["array"],
        chunk_length_s=30,
        stride_length_s=5,
        generate_kwargs={
            "task": "translate",
            "language": "ar"
        }
    )["text"]
    return {"prediction": transcription}

# Process dataset and calculate metrics
results = dataset.map(transcribe_audio)

def calculate_metrics(results):
    references = results["english"]  # Using text_en for end-to-end model
    predictions = results["prediction"]

    bleu = sacrebleu.corpus_bleu(predictions, [references])
    chrf = sacrebleu.corpus_chrf(predictions, [references], word_order=2)

    return bleu, chrf

# Calculate WER for Arabic transcription
wer = evaluate.load("wer")
wer_score = wer.compute(
    references=results["arabic"],  # Using text_ar for Arabic reference
    predictions=results["prediction"]
)

# Compute and display scores
bleu_score, chrf_score = calculate_metrics(results)

print(f"BLEU Score: {bleu_score.score:.2f}")
print(f"ChrF++ Score: {chrf_score.score:.2f}")
print(f"Word Error Rate: {wer_score * 100:.2f}%")

print("\nDetailed BLEU:")
print(f" - System score: {bleu_score.score:.2f}")
print(f" - Precisely matched n-grams: {bleu_score.counts}")
print(f" - Total n-grams considered: {bleu_score.totals}")

print("\nSample Translations:")
for i in range(3):
    print(f"Reference: {results[i]['arabic']}")
    print(f"Predicted: {results[i]['prediction']}\n")

Device set to use cpu


BLEU Score: 15.06
ChrF++ Score: 39.03
Word Error Rate: 139.53%

Detailed BLEU:
 - System score: 15.06
 - Precisely matched n-grams: [3236, 1426, 776, 456]
 - Total n-grams considered: [7930, 7651, 7372, 7093]

Sample Translations:
Reference: خسر موراي المجموعة الأولى في شوط كسر التعادل، بعد كسر كلا اللاعبين لجميع ضربات الإرسال بالمجموعة بالكامل.
Predicted: The first group lost their first meeting in the break-in shoot and after a break, all the messages were sent to the whole group.

Reference: وعلى نحو مماثل، فمن خلال الحصول على تأشيرة شنغن، لن تكون في احتياج إلى التقدم بطلب للحصول على تأشيرات الدخول لكل دولة من البلدان الأعضاء في شنغن على حدة، وبالتالي توفير الوقت والمال والأعمال الورقية.
Predicted: The example is that, through the Schengen Procedure, you need to advance the requirement for the entry fees to each of the Schengen Procedure's population. Thus, the time and the written work are available.

Reference: ولا شيء يمكن رؤيته غير السماء الصافية الجميلة فوق الجبال الكثيرة المحي

In [45]:
from evaluate import load
import torch

# Load the COMET metric
comet_metric = load('comet')

def calculate_metrics(results):
    references = results["english"]  # English references
    predictions = results["prediction"]  # Model predictions
    sources = results["arabic"]  # Arabic source texts

    # Calculate BLEU and ChrF++ as before
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    chrf = sacrebleu.corpus_chrf(predictions, [references], word_order=2)

    # Calculate COMET score
    comet_scores = comet_metric.compute(
        predictions=predictions,
        references=references,
        sources=sources,
        gpus=1 if torch.cuda.is_available() else 0
    )

    return bleu, chrf, comet_scores['mean_score']

# Compute all metrics
bleu_score, chrf_score, comet_score = calculate_metrics(results)

print(f"BLEU Score: {bleu_score.score:.2f}")
print(f"ChrF++ Score: {chrf_score.score:.2f}")
print(f"COMET Score: {comet_score:.4f}")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/f49d328952c3470eff6bb6f545d62bfdb6e66304/checkpoints/model.ckpt`
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


BLEU Score: 15.06
ChrF++ Score: 39.03
COMET Score: 0.6517
