In [1]:
import os
import numpy as np

import torch
import pandas as pd
import whisper
from whisper.audio import (
    log_mel_spectrogram,
    pad_or_trim,
)
from tqdm import tqdm

# Reproducibility setup: seed every RNG this notebook has in scope and force
# cuDNN onto deterministic kernels so repeated runs are comparable.
seed = 42
torch.backends.cudnn.deterministic = True
# cuDNN autotuning may pick different kernels between runs, so keep it off.
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
# NumPy's global RNG was previously left unseeded. Seed it as well in case any
# downstream randomness (e.g. noise injection in the data pipeline) draws from
# it -- TODO confirm which RNG the loader actually uses.
np.random.seed(seed)

# Pick the compute device once; the CUDA generators only need seeding when a
# GPU is actually present.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(seed)

In [5]:
from data import load_dataset

# LibriSpeech "test-other" split served one utterance per batch.
# `extra_noise=0.01` is forwarded to the project-local loader (presumably the
# strength of the noise added to each waveform -- confirm in `data.load_dataset`).
dataset = load_dataset(['test-other'], 'librispeech', 'LibriSpeech', batch_size=1, extra_noise=0.01)

# Predict without timestamps for short-form transcription.
# Half precision is requested only on CUDA: DEVICE can fall back to "cpu", and
# calling `model.decode` directly does not switch to fp32 on its own there.
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=True,
    fp16=(DEVICE == "cuda"),
)

# Load the English-only "tiny" checkpoint and place it on the selected device.
model_name = 'tiny.en'
model = whisper.load_model(model_name)
model = model.to(DEVICE)

Read text:   0%|          | 0/2939 [00:00<?, ?it/s]

Read text: 100%|██████████| 2939/2939 [00:00<00:00, 74625.33it/s]


[INFO]    There are 2939 samples.


In [6]:
# Transcribe every batch, collecting the decoded text next to the ground-truth
# transcript so the two lists stay index-aligned.
hypotheses = []
references = []

for batch in tqdm(dataset):
    # `lens` and `files` are part of the batch tuple but are not needed here.
    lens, wavs, texts, files = batch

    # Only the first waveform of the batch is decoded (the loader above is
    # created with batch_size=1): pad/trim it, then convert to a log-Mel
    # spectrogram on the target device.
    wav = pad_or_trim(wavs[0])
    mel = log_mel_spectrogram(wav).to(DEVICE)

    with torch.no_grad():
        outputs = model.decode(mel, options)

    # `decode` returns a bare result for an un-batched (2-D) mel and a list for
    # a batched (3-D) one; normalise so the code below handles both.
    if not isinstance(outputs, (list, tuple)):
        outputs = [outputs]

    # Fail fast on misalignment instead of surfacing it later as a pandas
    # "arrays must be of the same length" error.
    if len(outputs) != len(texts):
        raise ValueError(
            f"Decoded {len(outputs)} utterance(s) but the batch holds "
            f"{len(texts)} reference transcript(s)."
        )

    hypotheses.extend(result.text for result in outputs)
    references.extend(texts)

100%|██████████| 2939/2939 [02:36<00:00, 18.78it/s]


In [7]:
# Pair each decoded transcript with its reference, one row per utterance,
# and display the resulting table.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,I am willing to enter into competitions the ag...,I AM WILLING TO ENTER INTO COMPETITION WITH TH...
1,"In speaking of confectionary, seeking a mark t...",IN SPEAKING OF CONFECTIONARY IT SHOULD BE REMA...
2,He dwells with considerable force and energy o...,HE DWELT WITH CONSIDERABLE FORCE AND ENERGY ON...
3,The Egyptian obey and his master crossed the w...,THE EGYPTIAN OBEYED AND HIS MASTER CROSSED THE...
4,This he said in a source that weathered with a...,THIS HE SET IN A SAUCER WETTED WITH A LITTLE W...
...,...,...
2934,How? Why?,HOW WHY
2935,"No, wait.",NO WAIT
2936,You? Bye.,YOU I
2937,Well...,WELL


# Calculating the word error rate

Now we use Whisper's `EnglishTextNormalizer` to standardize both the hypotheses and the reference transcripts, and then compute the word error rate (WER) with `jiwer`.

In [8]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# Text normalizer imported from `whisper.normalizers`; it is applied to both
# hypotheses and references before the WER is computed.
normalizer = EnglishTextNormalizer()

In [9]:
# Add normalized copies of both transcript columns (the raw text is kept),
# then display the extended table.
data["hypothesis_clean"] = data["hypothesis"].map(normalizer)
data["reference_clean"] = data["reference"].map(normalizer)
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,I am willing to enter into competitions the ag...,I AM WILLING TO ENTER INTO COMPETITION WITH TH...,i am willing to enter into competitions the ag...,i am willing to enter into competition with th...
1,"In speaking of confectionary, seeking a mark t...",IN SPEAKING OF CONFECTIONARY IT SHOULD BE REMA...,in speaking of confectionary seeking a mark th...,in speaking of confectionary it should be rema...
2,He dwells with considerable force and energy o...,HE DWELT WITH CONSIDERABLE FORCE AND ENERGY ON...,he dwells with considerable force and energy o...,he dwelt with considerable force and energy on...
3,The Egyptian obey and his master crossed the w...,THE EGYPTIAN OBEYED AND HIS MASTER CROSSED THE...,the egyptian obey and his master crossed the w...,the egyptian obeyed and his master crossed the...
4,This he said in a source that weathered with a...,THIS HE SET IN A SAUCER WETTED WITH A LITTLE W...,this he said in a source that weathered with a...,this he set in a saucer wetted with a little w...
...,...,...,...,...
2934,How? Why?,HOW WHY,how why,how why
2935,"No, wait.",NO WAIT,no wait,no wait
2936,You? Bye.,YOU I,you bye,you i
2937,Well...,WELL,well .,well


In [10]:
# Word error rate over the normalized transcripts; jiwer takes the references
# first and the hypotheses second.
wer = jiwer.wer(
    data["reference_clean"].tolist(),
    data["hypothesis_clean"].tolist(),
)

# Report as a percentage with two decimals.
print(f"WER: {wer * 100:.2f} %")

WER: 22.88 %


In [11]:
# Persist raw and normalized transcripts for later inspection. The file name
# hard-codes the model ('tiny.en') and noise level (0.01) used above, so update
# it by hand if either setting changes. The DataFrame index is written as the
# first (unnamed) CSV column, which is the pandas default.
data.to_csv('./ori_tiny_en_0.01.csv')