In [None]:
# Hugging Face model identifiers to compare; each is passed to `transcribe`.
models = [
    "classla/wav2vec2-large-slavic-parlaspeech-hr-lm",
    "classla/wav2vec2-xls-r-parlaspeech-hr-lm",
]

# Absolute paths of the three sample recordings (sample1.wav .. sample3.wav).
files = [f"/home/peterr/macocu/task18/data/sample{i}.wav" for i in range(1, 4)]

from pathlib import Path
def transcribe(model_str, audio_filepath):
    """Transcribe one audio file with a wav2vec2 CTC model and its LM decoder.

    Args:
        model_str: Identifier passed to ``from_pretrained`` for both the
            processor (``Wav2Vec2ProcessorWithLM``) and the model
            (``Wav2Vec2ForCTC``).
        audio_filepath: Path of the audio file; it is read with
            ``soundfile.read``.

    Returns:
        The decoded text for the file (first item of the decoded batch).

    Note:
        The processor and the model are loaded anew on every call.
    """
    # Heavy dependencies are imported lazily so that merely defining the
    # function does not require them.
    import soundfile as sf
    import torch
    from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

    processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_str)
    model = Wav2Vec2ForCTC.from_pretrained(model_str)

    speech, sample_rate = sf.read(audio_filepath)
    # NOTE(review): presumably the recordings are mono and already at the
    # sampling rate the processor expects -- confirm for new data.

    # Run feature extraction exactly once and keep the tensors on the CPU:
    # the model is never moved to a GPU, so a `.cuda()` copy of the inputs
    # would be unused and would raise on machines without CUDA.
    inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # The LM-backed decoder consumes NumPy logits; `.text` holds one string
    # per batch item, and the batch here contains a single recording.
    transcription = processor.batch_decode(logits.numpy()).text
    return transcription[0]
# Transcribe every sample file with every model and collect one record per
# (file, model) pair, files in the outer loop and models in the inner loop.
results = []
for file in files:
    # Only the base name of the recording is stored in the record.
    file_name = Path(file).name
    for model in models:
        transcription = transcribe(model, file)
        record = dict(model=model, file=file_name, transcription=transcription)
        results.append(record)
        
import json
# Persist the collected records as JSON Lines: one JSON object per line.
with open("001_results.jsonl", "w") as f:
    for result in results:
        # The trailing newline is what separates records; without it all
        # objects are concatenated on one line and cannot be parsed
        # line by line.
        f.write(json.dumps(result) + "\n")

