In [1]:
import pandas as pd
from datasets import load_metric
df = pd.read_json("30_dev_and_test_split_transcriptions_for_inspection.jsonl", lines=True)
wer = load_metric("wer")
cer = load_metric("cer")

r = dict()
for split in "dev test".split():
    subset = df[df.split == split]
    gold = subset.preprocessed_original_text.tolist()
    transcriptions = subset.model_output.tolist()
    w = wer.compute(predictions = transcriptions,
            references = gold)
    c = cer.compute(predictions = transcriptions,
            references = gold)
    r[split] = {"WER": w, "CER": c}
    
print(pd.DataFrame(r).to_markdown())

|     |      dev |     test |
|:----|---------:|---------:|
| WER | 0.295206 | 0.290094 |
| CER | 0.140766 | 0.137642 |


In [5]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2Processor,Wav2Vec2CTCTokenizer
import soundfile as sf
import torch
import os

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load model and tokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "./",
    #unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token=" "
    )

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer)

model = Wav2Vec2ForCTC.from_pretrained("model_01_preprocessed/checkpoint-2440/")


# download the example wav files:
os.system("wget https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/raw/main/00020570a.flac.wav")

# read the wav file 
speech, sample_rate = sf.read("00020570a.flac.wav")
input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

# remove the raw wav file
os.system("rm 00020570a.flac.wav")

# retrieve logits
logits = model.to(device)(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

transcription

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
--2022-08-18 12:33:02--  https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/raw/main/00020570a.flac.wav
Resolving www-proxy.ijs.si (www-proxy.ijs.si)... 2001:1470:ff80::3128:1, 193.2.4.4
Connecting to www-proxy.ijs.si (www-proxy.ijs.si)|2001:1470:ff80::3128:1|:8080... connected.
Proxy request sent, awaiting response... 200 OK
Length: 170412 (166K) [audio/wave]
Saving to: ‘00020570a.flac.wav’

     0K .......... .......... .......... .......... .......... 30%  230K 1s
    50K .......... .......... .......... .......... .......... 60%  458K 0s
   100K .......... .......... .......... .......... .......... 90% 34.1M 0s
   150K .......... ......                                     100%  126M=0.3s

2022-08-18 12:33:03 (508 KB/s) - ‘00020570a.flac.wav’ saved [170412/170412]



've[PAD]l[PAD]i[PAD]k[PAD] [PAD]b[PAD]ro[PAD]j[PAD] [PAD]p[PAD]o[PAD]s[PAD]lo[PAD]v[PAD]nih [PAD]s[PAD]u[PAD]b[PAD]je[PAD]k[PAD]a[PAD]t[PAD]a[PAD] [PAD]p[PAD]o[PAD]s[PAD]lu[PAD]je[PAD] [PAD]s[PAD]a[PAD] [PAD]m[PAD]i[PAD]n[PAD]o[PAD]s[PAD]o[PAD]m[PAD] [PAD]v[PAD]e[PAD]l[PAD]i[PAD]k[PAD] [PAD]d[PAD]e[PAD]o[PAD]'

In [6]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import torch
import os

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load model and tokenizer
processor = Wav2Vec2Processor.from_pretrained(
    "5roop/wav2vec2-xls-r-juznevesti-sr")
model = Wav2Vec2ForCTC.from_pretrained("5roop/wav2vec2-xls-r-juznevesti-sr")


# download the example wav files:
os.system("wget https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/raw/main/00020570a.flac.wav")

# read the wav file 
speech, sample_rate = sf.read("00020570a.flac.wav")
input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

# remove the raw wav file
os.system("rm 00020570a.flac.wav")

# retrieve logits
logits = model.to(device)(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

transcription # 'velik broj poslovnih subjekata posluje sa minosom velik deo'


Downloading:   0%|          | 0.00/214 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/121 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

--2022-08-18 12:47:33--  https://huggingface.co/classla/wav2vec2-xls-r-parlaspeech-hr/raw/main/00020570a.flac.wav
Resolving www-proxy.ijs.si (www-proxy.ijs.si)... 2001:1470:ff80::3128:1, 193.2.4.4
Connecting to www-proxy.ijs.si (www-proxy.ijs.si)|2001:1470:ff80::3128:1|:8080... connected.
Proxy request sent, awaiting response... 200 OK
Length: 170412 (166K) [audio/wave]
Saving to: ‘00020570a.flac.wav’

     0K .......... .......... .......... .......... .......... 30%  230K 1s
    50K .......... .......... .......... .......... .......... 60%  457K 0s
   100K .......... .......... .......... .......... .......... 90% 42.7M 0s
   150K .......... ......                                     100% 50.2M=0.3s

2022-08-18 12:47:33 (508 KB/s) - ‘00020570a.flac.wav’ saved [170412/170412]



'velik broj poslovnih subjekata posluje sa minosom velik deo'