In [1]:
modelname = "classla/wav2vec2-xls-r-parlaspeech-hr"

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

processor = Wav2Vec2Processor.from_pretrained(modelname)
model = Wav2Vec2ForCTC.from_pretrained(modelname)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Using cached https://github.com/kpu/kenlm/archive/master.zip
Building wheels for collected packages: kenlm
  Building wheel for kenlm (setup.py) ... [?25ldone
[?25h  Created wheel for kenlm: filename=kenlm-0.0.0-cp38-cp38-linux_x86_64.whl size=2350747 sha256=98ae35e0d37bcc34373da120f35cc5713347742bf9b75f6df9fb40ed813f259c
  Stored in directory: /tmp/pip-ephem-wheel-cache-hi1c1vzg/wheels/ff/08/4e/a3ddc0e786e0f3c1fcd2e7a82c4324c02fc3ae2638471406d2
Successfully built kenlm


In [3]:
with open("5gram.arpa", "r") as read_file, open("5gram_correct.arpa", "w") as write_file:
  has_added_eos = False
  from tqdm.auto import tqdm
  for line in tqdm(read_file):
    if not has_added_eos and "ngram 1=" in line:
      count=line.strip().split("=")[-1]
      write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
    elif not has_added_eos and "<s>" in line:
      write_file.write(line)
      write_file.write(line.replace("<s>", "</s>"))
      has_added_eos = True
    else:
      write_file.write(line)

0it [00:00, ?it/s]

In [4]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(modelname,    eos_token="</s>", bos_token="<s>")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

In [6]:
from pyctcdecode import build_ctcdecoder

decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path="5gram_correct.arpa",
)

Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.


In [7]:
from transformers import Wav2Vec2ProcessorWithLM

processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder,

)

# Reevaluating

In [8]:
import pandas as pd
import numpy as np
import os

transferdir = "transfer/"

def process(text: str):
    from parse import compile
    from string import punctuation

    p = compile("{hit:d}.")
    in_list = text.split()
    out_list = list()
    for seg in in_list:
        parse_result = p.parse(seg)
        if parse_result:
            # We got a number with a dot afterward:
            out_list.append(seg.lower())
        else:
            out_list.append(seg.translate(str.maketrans("", "", punctuation)).lower())
    return " ".join(out_list)

df = pd.read_json("ParlaSpeech-HR.v1.0.jsonl", orient="records", lines=True)
df = df.rename(columns={"path":"hashname"})
df = df.loc[df.split.isin("test,dev".split(",")), :]

df["sentence"] = df.words.apply(" ".join).apply(process)
df["path"] = df.hashname.apply(lambda s: os.path.join(transferdir, s))

In [9]:
from tqdm.auto import tqdm


from itertools import groupby
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer
import soundfile as sf

import os

model = Wav2Vec2ForCTC.from_pretrained(modelname).cuda()

def get_transcript(audio_filepath:str):
    speech, sample_rate = sf.read(audio_filepath)
    input_values = processor_with_lm(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()

    logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor_with_lm.decode(predicted_ids[0]).lower()


    words = [w for w in transcription.split(' ') if len(w) > 0]

    return " ".join(words)

from tqdm.auto import tqdm
df["predictions"] = [get_transcript(path) for path in tqdm(df.path.values)]


  0%|          | 0/1013 [00:00<?, ?it/s]

ValueError: Input logits of size 995, but vocabulary is size 50

In [None]:
from datasets import load_metric
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

for splt in ["dev", "test"]:
    print("Evaluating on ", splt)
    wer = wer_metric.compute(
        references=df.loc[df.split==splt, "sentence"],
        predictions=df.loc[df.split==splt, "predictions"]
    )

    cer = cer_metric.compute(
        references=df.loc[df.split==splt, "sentence"],
        predictions=df.loc[df.split==splt, "predictions"]
    )

    print(f"{wer=:0.4f}, {cer=:0.4f}")

In [11]:
processor.tokenizer.get_vocab()

{' ': 0,
 '[PAD]': 1,
 '[UNK]': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28,
 'ä': 29,
 'ü': 30,
 'ć': 31,
 'č': 32,
 'đ': 33,
 'š': 34,
 'ž': 35,
 'ӧ': 36,
 '1': 37,
 '2': 38,
 '3': 39,
 '4': 40,
 '5': 41,
 '6': 42,
 '7': 43,
 '8': 44,
 '9': 45,
 '0': 46,
 '.': 47,
 '<s>': 48,
 '</s>': 49}