In [1]:
from collections import defaultdict

import torch
from transformers import pipeline
import librosa
import io
import os

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [2]:
def transcribe(filename, model_name):
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30, stride_length_s=(10, 5),
        device=device,
    )
    with open(filename, "rb") as f:
        audio_data = f.read()
        audio_input, sample_rate = librosa.load(io.BytesIO(audio_data), sr=16000)
        return pipe(
            audio_input,
            batch_size=int(os.getenv('WHISPER_PARALLEL', '3')),
            return_timestamps=True,
            generate_kwargs={
                'language': 'ru',
                'num_beams': int(os.getenv('NUM_BEAMS', '10')),
                'temperature': float(os.getenv('WHISPER_TEMPERATURE', '0.1')),
            }
        )["text"]

In [3]:
eval_texts = defaultdict(list)

In [17]:
import os
from collections import defaultdict
from tqdm import tqdm

models = ['', '', '']

audio_dir = 'audio'
audio_file_list = filter(lambda x: x.endswith(".mp3"), os.listdir(audio_dir))


def eval(model_name):
    print(model_name)
    for audio_file in audio_file_list:
        print(audio_file)
        with open(audio_dir + "/" + audio_file.replace('.mp3', '.txt'), mode="r", encoding="utf-8") as original:
            eval_texts[model_name].append((original.read(), transcribe(audio_dir + "/" + audio_file, model_name=model_name)))    

In [5]:
eval('openai/whisper-large-v3')

openai/whisper-large-v3
gora.mp3


Device set to use cuda:0
You have passed language=ru, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=ru.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


peregovor.mp3


Device set to use cuda:0


In [10]:
eval('antony66/whisper-large-v3-russian')

antony66/whisper-large-v3-russian
gora.mp3


Device set to use cuda:0


peregovor.mp3


Device set to use cuda:0


In [18]:
eval('pyannote/speaker-diarization-3.1')

pyannote/speaker-diarization-3.1
gora.mp3


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/pyannote/speaker-diarization-3.1.
401 Client Error. (Request ID: Root=1-6823cb88-335c2cb433ed3f9e11721b74;74d3f9af-ab1f-4809-84a0-2a9a10cb0917)

Cannot access gated repo for url https://huggingface.co/pyannote/speaker-diarization-3.1/resolve/main/config.json.
Access to model pyannote/speaker-diarization-3.1 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [16]:
import numpy as np

def calculate_wer(reference, hypothesis):
    reference = reference.lower().replace("-", "").replace("«", "").replace("»", "")
    hypothesis = hypothesis.lower().replace("-", "").replace("«", "").replace("»", "")
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    # Counting the number of substitutions, deletions, and insertions
    substitutions = sum(1 for ref, hyp in zip(ref_words, hyp_words) if ref != hyp)
    deletions = len(ref_words) - len(hyp_words)
    insertions = len(hyp_words) - len(ref_words)
    # Total number of words in the reference text
    total_words = len(ref_words)
    # Calculating the Word Error Rate (WER)
    wer = (substitutions + deletions + insertions) / total_words
    return wer

model_evals = {}
for model, texts in eval_texts.items():
    model_evals[model] = [calculate_wer(x[0], x[1]) for x in texts]

print(model_evals)

{'openai/whisper-large-v3': [0.9725061684878392, 0.9918116683725691], 'antony66/whisper-large-v3-russian': [0.9876630243214664, 0.9815762538382804], 'nvidia/stt_ru_conformer_ctc_large': [], 'nvidia/parakeet-tdt-0.6b-v2': []}
