In [14]:
import os

import numpy as np
import pandas as pd
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, Wav2Vec2Processor

# Transcribe audio.wav with each listed wav2vec2 checkpoint, processing the
# recording in fixed-length segments, and write one CSV row per model.

models = [
    "classla/wav2vec2-xls-r-parlaspeech-hr-lm",
    "classla/wav2vec2-xls-r-parlaspeech-hr",
    "classla/wav2vec2-large-slavic-parlaspeech-hr",
    "classla/wav2vec2-large-slavic-parlaspeech-hr-lm",
]

# Length of each segment passed to the model in one forward call.
segment_seconds = 10 * 60
# A trailing remainder no longer than this is merged into the previous
# segment instead of being decoded on its own.
overlap_seconds = 1

# NOTE(review): `device` is defined but the model and inputs are never moved
# to it, so inference runs wherever `from_pretrained` places the model
# (presumably CPU). Confirm memory requirements for 10-minute segments
# before moving the computation to the GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _segment_bounds(n_samples, segment_len, min_tail):
    """Split ``range(n_samples)`` into consecutive ``(start, stop)`` slices.

    Segments are ``segment_len`` samples long and together cover every sample
    exactly once. If the final remainder is at most ``min_tail`` samples long
    (and is not the only segment), it is merged into the preceding segment.

    Returns an empty list when ``n_samples`` is 0.
    """
    starts = list(range(0, n_samples, segment_len))
    if len(starts) > 1 and n_samples - starts[-1] <= min_tail:
        starts.pop()
    # The last segment always ends at n_samples, so no trailing audio is lost.
    return list(zip(starts, starts[1:] + [n_samples]))


def _transcribe_segment(model, processor, speech_segment, sample_rate):
    """Run one forward pass over ``speech_segment`` and return its transcript."""
    inputs = processor(
        speech_segment, sampling_rate=sample_rate, return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    # The LM-backed processor decodes raw logits; the plain processor decodes
    # the greedy (argmax) token ids. Choose explicitly so that genuine
    # decoding errors are not swallowed.
    if isinstance(processor, Wav2Vec2ProcessorWithLM):
        return processor.batch_decode(logits.numpy()).text[0]
    prediction_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(prediction_ids)[0]


# The audio and its segmentation do not depend on the model: compute once.
speech, sample_rate = sf.read("audio.wav")
segments = _segment_bounds(
    speech.shape[0],
    segment_seconds * sample_rate,
    overlap_seconds * sample_rate,
)

results = list()
for model_name in models:
    # Checkpoints whose name ends in "-lm" are loaded with the LM processor.
    if model_name.endswith("-lm"):
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
    else:
        processor = Wav2Vec2Processor.from_pretrained(model_name, pad_token="[PAD]")
    model = Wav2Vec2ForCTC.from_pretrained(model_name)

    transcripts = [
        _transcribe_segment(model, processor, speech[start:stop], sample_rate)
        for start, stop in segments
    ]
    results.append({"model": model_name, "transcription": " ".join(transcripts)})

pd.DataFrame(data=results).to_csv("003_all_models_transcriptions.csv", index=False)


Only 0 unigrams passed as vocabulary. Is this small or artificial data?
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/214 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/463 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/36.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/407 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/868M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Only 0 unigrams passed as vocabulary. Is this small or artificial data?


Downloading:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

In [17]:
# Build corrected result rows whose "model" field holds the checkpoint name
# from `models` (paired with `results` by position). New dicts are created so
# the entries of `results` are left untouched; replacing an existing key keeps
# its position, so the CSV column order is unchanged.
results_corrected = [
    {**entry, "model": model_true_name}
    for model_true_name, entry in zip(models, results)
]


pd.DataFrame(data=results_corrected).to_csv("003_all_models_transcriptions.csv", index=False)

In [16]:
# Display the "model" field stored in the first result row.
results[0]["model"]

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elemen