In [1]:
import os

import numpy as np
import pandas as pd
import torch
import whisper
from tqdm import tqdm
from whisper.audio import (
    log_mel_spectrogram,
    pad_or_trim,
)


# Reproducibility and device selection for the whole notebook.
seed = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the CPU generator always, and every CUDA generator when a GPU is present.
torch.manual_seed(seed)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [2]:
from data import load_dataset

# Build the evaluation loader via the project-local `data.load_dataset`.
# One sample per batch and no extra noise; the meaning of the positional
# arguments is defined in that module (not visible from this notebook).
dataset = load_dataset(['test-other'], 'librispeech', 'LibriSpeech', batch_size=1, extra_noise=0.)

# predict without timestamps for short-form transcription
# `fp16` defaults to True in DecodingOptions; half-precision decoding is only
# enabled on CUDA so that the CPU fallback selected for DEVICE keeps working.
options = whisper.DecodingOptions(
    language="en",
    beam_size=4,
    without_timestamps=True,
    fp16=(DEVICE == "cuda"),
)

# Load the Whisper checkpoint and place it on the selected device.
model_name = 'base'
model = whisper.load_model(model_name)
model = model.to(DEVICE)

Read text: 100%|██████████| 2939/2939 [00:00<00:00, 75906.47it/s]


[INFO]    There are 2939 samples.


In [3]:
# Smoke test: decode only the first batch and display the raw decode output.
for batch in dataset:
    lens, wavs, texts, files = batch
    # Pad/trim the first waveform, compute its log-Mel spectrogram, add a
    # leading batch axis and move it to the target device.
    wav = pad_or_trim(wavs[0])
    mel = log_mel_spectrogram(wav).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        outputs = model.decode(mel, options)
    break
outputs

([DecodingResult(audio_features=tensor([[-1.6299, -0.6377, -1.3389,  ..., -0.4067,  1.8369,  1.1494],
          [-0.6606,  0.0308, -0.1426,  ...,  0.0850,  1.0166,  1.6172],
          [ 0.0123,  0.0227,  0.3367,  ...,  0.3865, -0.2915, -0.0286],
          ...,
          [ 0.7446, -1.5283, -0.6187,  ...,  0.1627, -1.2100, -0.6240],
          [ 0.4531, -1.8066, -1.0576,  ...,  0.0733, -0.7373, -0.5044],
          [-1.0342,  1.2314, -0.0084,  ...,  0.1088, -0.6094, -1.2344]],
         device='cuda:0', dtype=torch.float16), language='en', language_probs=None, tokens=[286, 669, 4950, 281, 3242, 666, 6211, 365, 264, 34856, 791, 293, 841, 1075, 281, 27650, 552, 13, 1171, 1670, 729, 2440, 1708, 294, 597, 286, 1027, 264, 38647, 295, 19291, 49517, 11, 286, 362, 3264, 370, 709, 300, 286, 393, 586, 5258, 1400, 1101, 3755, 295, 264, 733, 13, 286, 519, 286, 393, 611, 484, 2595, 264, 13561, 286, 13159, 337, 17380, 7336, 271, 29173, 11, 597, 366, 920, 5167, 294, 1090, 4065, 443, 13, 682, 411, 9060, 11

In [14]:
# Stack the tensors in the second element of the decode output along a new
# leading axis (presumably one entry per decoding step -- TODO confirm).
logits = torch.stack(outputs[1])
logits

tensor([[[ 5.0547,  7.1719,  2.8066,  ...,  4.0547,  4.8359,  3.4648],
         [ 5.0547,  7.1719,  2.8066,  ...,  4.0547,  4.8359,  3.4648],
         [ 5.0547,  7.1719,  2.8066,  ...,  4.0547,  4.8359,  3.4648],
         [ 5.0547,  7.1719,  2.8066,  ...,  4.0547,  4.8359,  3.4648]],

        [[30.0781, 30.4844, 25.6875,  ..., 27.1562, 26.7188, 25.7656],
         [17.2812, 16.0469, 13.9219,  ..., 16.0781, 16.2188, 14.1328],
         [21.7656, 19.5625, 16.3125,  ..., 18.2656, 18.2656, 16.5156],
         [21.1875, 20.1562, 16.2031,  ..., 18.8438, 18.7969, 16.8594]],

        [[15.6250, 15.0234,  9.6797,  ..., 10.1562, 11.1797, 10.1484],
         [19.7969, 18.6875, 13.4922,  ..., 14.9141, 15.5312, 14.6094],
         [22.3750, 20.5469, 18.9375,  ..., 19.9688, 20.4219, 19.3438],
         [17.7969, 16.0469, 13.4688,  ..., 14.6406, 14.8906, 13.1719]],

        ...,

        [[39.4375, 35.2500, 32.6250,  ..., 32.7812, 33.1250, 31.8906],
         [39.2500, 35.0000, 32.1875,  ..., 32.2188, 32.59

In [15]:
# Inspect the dimensions of the stacked logits.
logits.size()

torch.Size([119, 4, 51865])

In [17]:
# Index dim 1 of the stacked logits with `outputs[2][0]`
# (presumably the selected beam index -- TODO confirm).
tmp = logits[:, outputs[2][0]]
tmp.shape

torch.Size([119, 1, 51865])

In [4]:
# Transcribe every utterance in the dataset, collecting the model hypothesis
# next to the reference transcript. `tqdm` comes from the notebook's import cell.
hypotheses = []
references = []

for batch in tqdm(dataset):
    lens, wavs, texts, files = batch
    # The loader is created with batch_size=1, so only the first waveform is decoded.
    wav = pad_or_trim(wavs[0])
    mel = log_mel_spectrogram(wav)

    with torch.no_grad():
        mel = mel.to(DEVICE)
        outputs = model.decode(mel, options)

    # Upstream whisper returns a bare DecodingResult (not a list) when given a
    # single 2-D mel; wrap it so that form and a sequence of results both work.
    # NOTE(review): the smoke-test cell shows `decode` returning a tuple for 3-D
    # input in this environment -- confirm which whisper build is installed.
    results = [outputs] if hasattr(outputs, "text") else outputs
    hypotheses.extend([result.text for result in results])
    references.extend(texts)

  0%|          | 0/2939 [00:00<?, ?it/s]

In [5]:
# One row per utterance: model hypothesis next to its reference transcript.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data

Unnamed: 0,hypothesis,reference
0,I am willing to enter into competition with th...,I AM WILLING TO ENTER INTO COMPETITION WITH TH...
1,In speaking of confectionaries should be remar...,IN SPEAKING OF CONFECTIONARY IT SHOULD BE REMA...
2,He dwelt with considerable force and energy on...,HE DWELT WITH CONSIDERABLE FORCE AND ENERGY ON...
3,The Egyptian obeying and his master crossed th...,THE EGYPTIAN OBEYED AND HIS MASTER CROSSED THE...
4,This he said in a saucer wetted with a little ...,THIS HE SET IN A SAUCER WETTED WITH A LITTLE W...
...,...,...
2934,How? Why?,HOW WHY
2935,"No, wait.",NO WAIT
2936,you i,YOU I
2937,Well...,WELL


# Calculating the word error rate

Next, we use Whisper's `EnglishTextNormalizer` to standardize both the hypotheses and the references, and then calculate the WER with `jiwer`.

In [6]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

# Callable text normalizer; applied to both hypotheses and references before
# the word error rate is computed.
normalizer = EnglishTextNormalizer()

In [7]:
# Add normalized copies of both text columns so WER is computed on
# standardized strings.
data["hypothesis_clean"] = list(map(normalizer, data["hypothesis"]))
data["reference_clean"] = list(map(normalizer, data["reference"]))
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,I am willing to enter into competition with th...,I AM WILLING TO ENTER INTO COMPETITION WITH TH...,i am willing to enter into competition with th...,i am willing to enter into competition with th...
1,In speaking of confectionaries should be remar...,IN SPEAKING OF CONFECTIONARY IT SHOULD BE REMA...,in speaking of confectionaries should be remar...,in speaking of confectionary it should be rema...
2,He dwelt with considerable force and energy on...,HE DWELT WITH CONSIDERABLE FORCE AND ENERGY ON...,he dwelt with considerable force and energy on...,he dwelt with considerable force and energy on...
3,The Egyptian obeying and his master crossed th...,THE EGYPTIAN OBEYED AND HIS MASTER CROSSED THE...,the egyptian obeying and his master crossed th...,the egyptian obeyed and his master crossed the...
4,This he said in a saucer wetted with a little ...,THIS HE SET IN A SAUCER WETTED WITH A LITTLE W...,this he said in a saucer wetted with a little ...,this he set in a saucer wetted with a little w...
...,...,...,...,...
2934,How? Why?,HOW WHY,how why,how why
2935,"No, wait.",NO WAIT,no wait,no wait
2936,you i,YOU I,you i,you i
2937,Well...,WELL,well .,well


In [8]:
# Corpus-level word error rate over the normalized transcripts
# (references first, hypotheses second).
reference_texts = data["reference_clean"].tolist()
hypothesis_texts = data["hypothesis_clean"].tolist()
wer = jiwer.wer(reference_texts, hypothesis_texts)

print(f"WER: {wer * 100:.2f} %")

WER: 10.82 %


In [9]:
# Write the per-utterance results table to CSV in the working directory.
data.to_csv(path_or_buf='./no_noise_base_beam.csv')