In [4]:
import os

import numpy as np
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, Wav2Vec2Processor

models = [
    "classla/wav2vec2-xls-r-parlaspeech-hr",
    # "classla/wav2vec2-xls-r-parlaspeech-hr-lm",
    "classla/wav2vec2-large-slavic-parlaspeech-hr",
    # "classla/wav2vec2-large-slavic-parlaspeech-hr-lm",
]
results = list()
for model in models:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # load model and tokenizer
    if model.endswith("-lm"):
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(model)
    else:
        processor = Wav2Vec2Processor.from_pretrained(model)
    model = Wav2Vec2ForCTC.from_pretrained(model)
    # read the wav file
    speech, sample_rate = sf.read("audio.wav")

    overlap_seconds = 1

    indices = np.arange(
        0, speech.shape[0], 10 * 60 * sample_rate, dtype=int
    ).tolist() + [-1]

    transcripts = list()
    for start, stop in zip(indices[0:-2], indices[1:]):
        # If overlap would make the segment go
        # over the end, correct stop variable:
        if stop + overlap_seconds * sample_rate >= speech.shape[0]:
            stop = -1
        speech_segment = speech[start:stop]
        inputs = processor(
            speech_segment, sampling_rate=sample_rate, return_tensors="pt"
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        transcription = processor.batch_decode(logits.numpy()).text[0]
        transcripts.append(transcription)
    results.append(
        {"model": model,
        "transcription": " ".join(transcripts)}
    )

import pandas as pd
pd.DataFrame(data=results).to_csv("003_all_models_transcriptions.csv", index=False)


loading feature extractor configuration file https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/resolve/main/preprocessor_config.json from cache at /home/peterr/.cache/huggingface/transformers/79aeaf04f96e1442b11c8797bc51a782c55deb7f34336137ce7636efa35238eb.bbc1eb890a39c82e710a893223b8452ac5b78e8b57083b2f893aa7dc59d4ed69
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

loading configuration file https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/resolve/main/config.json from cache at /home/peterr/.cache/huggingface/transformers/222cc5b63534bcf9de32ef1072698e562023b540e116ec1834027eb74c80f10b.42d844bcc0df00c157d82044dfe3f27d0ec30a9f5ce9326d90dcfd67a773b373
Model config Wav2Vec2Config {
  "_name_or_path": "classla/wav2vec2-xls-r-parlaspeech-hr",
  "activation_dro

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

storing https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/resolve/main/pytorch_model.bin in cache at /home/peterr/.cache/huggingface/transformers/6779820bbd24a9113be27f213546a1f7f8261f8717439477e57aeb9f21411f71.5db5465366049b3610c99611f1e65d59b1ac7e41636a1f9778a6b16e08f823f5
creating metadata file for /home/peterr/.cache/huggingface/transformers/6779820bbd24a9113be27f213546a1f7f8261f8717439477e57aeb9f21411f71.5db5465366049b3610c99611f1e65d59b1ac7e41636a1f9778a6b16e08f823f5
loading weights file https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/resolve/main/pytorch_model.bin from cache at /home/peterr/.cache/huggingface/transformers/6779820bbd24a9113be27f213546a1f7f8261f8717439477e57aeb9f21411f71.5db5465366049b3610c99611f1e65d59b1ac7e41636a1f9778a6b16e08f823f5
All model checkpoint weights were used when initializing Wav2Vec2ForCTC.

All the weights of Wav2Vec2ForCTC were initialized from the model checkpoint at classla/wav2vec2-xls-r-parlaspeech-hr.
If your task is 

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'

In [12]:
processor.batch_decode(logits.numpy()[::1])

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'

In [13]:
logits.numpy().shape

(1, 29999, 50)