In [1]:
import os
import numpy as np
from nltk.translate.bleu_score import sentence_bleu

import torch
import numpy as np
import pandas as pd
import whisper
from whisper.audio import (
    log_mel_spectrogram,
    pad_or_trim,
)
from tqdm import tqdm

seed = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
from datasets import load_dataset
dataset = load_dataset("covost2", 'pt_en', data_dir='../covost2_pt',split='test', trust_remote_code=True)

In [None]:
# from data import load_dataset
# dataset = load_dataset(['test-other'], 'librispeech', 'LibriSpeech', batch_size=1, extra_noise=0.01)
# dataset = load_dataset(name='aishell3', path='./aishell_test', batch_size=32)
# predict without timestamps for short-form transcription
options = whisper.DecodingOptions(language="pt", task='translation', without_timestamps=True)
model_name = 'base'
model = whisper.load_model(model_name)
model = model.to(DEVICE)

In [None]:
hypotheses = []
references = []
ori_sentences = []

count = 0
for batch in dataset:
    print(batch)
    count+=1
    if count > 10:
        break
    wavs = torch.Tensor(batch['audio']['array'])
    if model_name == 'large':
        mel = log_mel_spectrogram(pad_or_trim(wavs), n_mels=128).unsqueeze(0)
    else:
        mel = log_mel_spectrogram(pad_or_trim(wavs)).unsqueeze(0)
    with torch.no_grad():
        mel = mel.to(DEVICE)
        outputs = model.decode(mel, options)
    ori_sentences.append(batch['sentence'])
    hypotheses.extend([result.text for result in outputs[0]])
    references.append(batch['translation'])


NameError: name 'dataset' is not defined

In [7]:
import nltk

# hypothesis = ['It', 'is', 'a', 'cat', 'at', 'room']
# reference = ['It', 'is', 'a', 'cat', 'inside', 'the', 'room']

scores = []
for i in range(len(hypotheses)):
    reference = [word for word in references[i].split(' ')]
    hypothesis = [word for word in hypotheses[i].split(' ')]
    BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
    scores.append(BLEUscore)
np.array(scores).mean()

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


2.388938713042209e-155

In [None]:
hypotheses = []
references = []

for batch in tqdm(dataset):
    lens, wavs, texts, files = batch
    mel = torch.Tensor()
    for wav in wavs:
        mel = torch.cat((mel, log_mel_spectrogram(pad_or_trim(wav)).unsqueeze(0)), dim=0)

    with torch.no_grad():
        mel = mel.to(DEVICE)
        # mel_list = mel_list.to(DEVICE)
        outputs = model.decode(mel, options)
    hypotheses.extend([result.text for result in outputs[0]])
    references.extend(texts)

In [5]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data

Unnamed: 0,hypothesis,reference
0,常知识五乡线红水镇杨婆装春一户破旧不堪的愿落中,长治市武乡县洪水镇阳坡庄村一户破旧不堪的院落中
1,将会导致现有的车主不会想要再贡猛下一辆特斯拉起车,将会导致现有的车主不会想要再购买下一辆特斯拉汽车
2,电脑板看爆炸新闻紧张用火机是漏气引爆小电,店老板看爆炸新闻紧张用火机试漏气引爆小店
3,腰上 认断胃 左脸 脱伤,验伤认断为左脸挫伤
4,废陆调低 则进一步加到了其生存压力,费率调低则进一步加大了其生存压力
...,...,...
24795,而是灰色的,而是灰色的
24796,我不会吃的,我不会吃的
24797,我是普通的,我是普通的
24798,记收到,济州岛


In [6]:
data.to_csv('./tiny_aishell_prompt.csv')

In [7]:
import jiwer
wer = jiwer.wer(list(data["reference"]), list(data["hypothesis"]))

print(f"WER: {wer * 100:.2f} %")

WER: 91.69 %


# Calculating the word error rate

Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.

In [8]:
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

In [9]:
data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
data["reference_clean"] = [normalizer(text) for text in data["reference"]]
data

Unnamed: 0,hypothesis,reference,hypothesis_clean,reference_clean
0,I am willing to enter into competitions the ag...,I AM WILLING TO ENTER INTO COMPETITION WITH TH...,i am willing to enter into competitions the ag...,i am willing to enter into competition with th...
1,"In speaking of confectionary, seeking a mark t...",IN SPEAKING OF CONFECTIONARY IT SHOULD BE REMA...,in speaking of confectionary seeking a mark th...,in speaking of confectionary it should be rema...
2,He dwells with considerable force and energy o...,HE DWELT WITH CONSIDERABLE FORCE AND ENERGY ON...,he dwells with considerable force and energy o...,he dwelt with considerable force and energy on...
3,The Egyptian obey and his master crossed the w...,THE EGYPTIAN OBEYED AND HIS MASTER CROSSED THE...,the egyptian obey and his master crossed the w...,the egyptian obeyed and his master crossed the...
4,This he said in a source that weathered with a...,THIS HE SET IN A SAUCER WETTED WITH A LITTLE W...,this he said in a source that weathered with a...,this he set in a saucer wetted with a little w...
...,...,...,...,...
2934,How? Why?,HOW WHY,how why,how why
2935,"No, wait.",NO WAIT,no wait,no wait
2936,You? Bye.,YOU I,you bye,you i
2937,Well...,WELL,well .,well


In [10]:
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"WER: {wer * 100:.2f} %")

WER: 22.88 %


In [11]:
data.to_csv('./ori_tiny_en_0.01.csv')