In [4]:
# Install necessary packages
!pip install datasets transformers openai-whisper jiwer
!pip install soundfile
!pip install librosa
!pip install evaluate

from datasets import load_dataset
import numpy as np
import whisper
import jiwer
import evaluate
import torch

# Load the LibriSpeech dataset (using a small subset for demonstration)
dataset = load_dataset("librispeech_asr", "clean", split="test[:1%]", trust_remote_code=True)

# Load the pre-trained Whisper model
model = whisper.load_model("base")

# Function to transcribe audio using the Whisper model
def transcribe(batch):
    audio = batch["audio"]
    # Whisper model expects 16kHz audio
    audio_array = torch.tensor(audio["array"], dtype=torch.float32)
    result = model.transcribe(audio_array, fp16=False)
    batch["transcription"] = result["text"]
    return batch

# Apply transcription to the dataset
dataset = dataset.map(transcribe)

# Load the CER metric using evaluate
cer_metric = evaluate.load("cer", trust_remote_code=True)

# Compute CER
transcriptions = dataset["transcription"]
references = dataset["text"]
cer = cer_metric.compute(predictions=transcriptions, references=references)

print(f"CER: {cer:.4f}")


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
   ---------------------------------------- 0.0/84.1 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/84.1 kB ? eta -:--:--
   --------- ------------------------------ 20.5/84.1 kB 330.3 kB/s eta 0:00:01
   ------------------------ --------------- 51.2/84.1 kB 440.4 kB/s eta 0:00:01
   ---------------------------------------- 84.1/84.1 kB 527.8 kB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2



Map: 100%|██████████| 26/26 [01:18<00:00,  3.03s/ examples]
Downloading builder script: 100%|██████████| 5.60k/5.60k [00:00<00:00, 5.54MB/s]


CER: 0.8285
