In [1]:
# Checkpoint directory; the later cells that load a processor or the model
# read this name.
modelname = "38_unnormalised/checkpoint-6700/"

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load the processor and the CTC model from the same checkpoint directory.
processor = Wav2Vec2Processor.from_pretrained(modelname)
model = Wav2Vec2ForCTC.from_pretrained(modelname)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode

In [None]:
# Disabled one-off preprocessing, kept for reference: copies 5gram.arpa to
# 5gram_correct.arpa, adding one to the "ngram 1=" count and writing a "</s>"
# line duplicated from the first line that contains "<s>".
# with open("5gram.arpa", "r") as read_file, open("5gram_correct.arpa", "w") as write_file:
#   has_added_eos = False
#   from tqdm.auto import tqdm
#   for line in tqdm(read_file):
#     if not has_added_eos and "ngram 1=" in line:
#       count=line.strip().split("=")[-1]
#       write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
#     elif not has_added_eos and "<s>" in line:
#       write_file.write(line)
#       write_file.write(line.replace("<s>", "</s>"))
#       has_added_eos = True
#     else:
#       write_file.write(line)

In [2]:
from transformers import AutoProcessor

# Reload the processor from the checkpoint, this time passing explicit
# sentence-boundary tokens.
processor = AutoProcessor.from_pretrained(
    modelname,
    eos_token="</s>",
    bos_token="<s>",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Order the tokenizer vocabulary by token id. The decoder labels built from
# this dict are positional, so every token must be kept: dropping one entry
# (e.g. the padding token) would shift all following labels by one position.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = dict(sorted(vocab_dict.items(), key=lambda item: item[1]))

In [12]:
# Display the tokens in the order they will be handed to the decoder.
sorted_vocab_dict.keys()

dict_keys([' ', '[PAD]', '[UNK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ü', 'ć', 'č', 'đ', 'š', 'ž', 'ӧ', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '.', '<s>', '</s>'])

In [14]:
from pyctcdecode import build_ctcdecoder

# Build the pyctcdecode decoder from the vocabulary tokens (in the iteration
# order of sorted_vocab_dict) and the KenLM model file.
# NOTE(review): the recorded output of this cell warns that unigrams cannot be
# determined from a non-ARPA LM file and that decoding accuracy might be reduced.
decoder = build_ctcdecoder(labels=list(sorted_vocab_dict), kenlm_model_path="5gram.bin")

Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
No known unigrams provided, decoding results might be a lot worse.


In [15]:
from transformers import Wav2Vec2ProcessorWithLM

# Combine the decoder with the feature extractor and tokenizer of the plain
# processor into a single LM-aware processor.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    decoder=decoder,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)

In [18]:
# Write the LM-aware processor to the export directory.
processor_with_lm.save_pretrained("38_with_lm/wav2vec2-xls-r-parlaspeech-hr")

In [19]:
# Write the model weights to the same export directory as the processor.
model.save_pretrained("38_with_lm/wav2vec2-xls-r-parlaspeech-hr")

# Reevaluating

In [None]:
import pandas as pd
import numpy as np
import os

# Directory prefix that is joined onto each hashname to build the audio path.
transferdir = "transfer/"

def process(text: str):
    """Normalise a transcript: lowercase it and strip ASCII punctuation.

    The text is split on whitespace. A token that the ``"{hit:d}."`` pattern
    parses in full (a number followed by a dot) keeps its dot and is only
    lowercased; every other token has all ``string.punctuation`` characters
    removed before lowercasing. Tokens that end up empty (they consisted of
    punctuation only) are dropped so the result never contains consecutive
    spaces.

    Args:
        text: Raw sentence.

    Returns:
        The remaining tokens joined by single spaces.
    """
    from parse import compile
    from string import punctuation

    number_with_dot = compile("{hit:d}.")
    # Build the deletion table once per call instead of once per token.
    strip_punctuation = str.maketrans("", "", punctuation)

    out_list = []
    for seg in text.split():
        if number_with_dot.parse(seg):
            # We got a number with a dot afterward: keep the dot.
            cleaned = seg.lower()
        else:
            cleaned = seg.translate(strip_punctuation).lower()
        # A punctuation-only token (e.g. "-") is empty by now; skip it.
        if cleaned:
            out_list.append(cleaned)
    return " ".join(out_list)

# Load the corpus metadata, keep only the dev/test rows, and derive the
# normalised reference sentence and the on-disk audio path for each row.
df = (
    pd.read_json("ParlaSpeech-HR.v1.0.jsonl", orient="records", lines=True)
    .rename(columns={"path": "hashname"})
    .loc[lambda frame: frame["split"].isin("test,dev".split(","))]
    .assign(
        sentence=lambda frame: frame["words"].apply(" ".join).apply(process),
        path=lambda frame: frame["hashname"].apply(lambda s: os.path.join(transferdir, s)),
    )
)

In [None]:
from tqdm.auto import tqdm


from itertools import groupby
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer
import soundfile as sf

import os

# Reload the checkpoint and move the model to the GPU for the evaluation below.
model = Wav2Vec2ForCTC.from_pretrained(modelname).cuda()

def get_transcript(audio_filepath:str):
    """Transcribe one audio file with the LM-boosted decoder.

    Reads the file with ``soundfile``, runs the module-level ``model`` on the
    GPU and decodes the resulting logits with ``processor_with_lm``.

    Args:
        audio_filepath: Path of the audio file to transcribe.

    Returns:
        The lowercased transcription with empty words removed and the
        remaining words joined by single spaces.
    """
    speech, sample_rate = sf.read(audio_filepath)
    input_values = processor_with_lm(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_values).logits

    # The LM decoder consumes the logit matrix of one utterance as a NumPy
    # array (not argmax token ids) and returns an output object whose `.text`
    # attribute holds the decoded string.
    transcription = processor_with_lm.decode(logits[0].cpu().numpy()).text.lower()

    words = [w for w in transcription.split(' ') if len(w) > 0]

    return " ".join(words)

from tqdm.auto import tqdm
# Transcribe every file in order, showing a progress bar while doing so.
df["predictions"] = list(map(get_transcript, tqdm(df.path.values)))


In [None]:
from datasets import load_metric
# Word and character error rate metrics.
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

# Score each split separately: WER first, then CER, on the same rows.
for splt in ["dev", "test"]:
    print("Evaluating on ", splt)
    wer, cer = (
        metric.compute(
            references=df.loc[df.split==splt, "sentence"],
            predictions=df.loc[df.split==splt, "predictions"],
        )
        for metric in (wer_metric, cer_metric)
    )

    print(f"{wer=:0.4f}, {cer=:0.4f}")

In [None]:
# Display the token-to-id mapping of the tokenizer.
processor.tokenizer.get_vocab()

In [None]:
# Decode a single file with the LM-aware processor.
audio_filepath = df.path.values[2]
speech, sample_rate = sf.read(audio_filepath)
input_values = processor_with_lm(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()

# Inference only: skip building the autograd graph.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy token ids, kept for inspection.
predicted_ids = torch.argmax(logits, dim=-1)
# The LM decoder takes the batch of logits as a NumPy array (not the argmax
# ids) and returns an output object whose `.text` lists one string per item.
transcription = processor_with_lm.batch_decode(logits.cpu().numpy()).text
transcription


In [None]:
# Greedy decoding with the plain processor: take the argmax over the last
# dimension of the logits and decode the ids of the first item.
input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()
logits = model(input_values).logits

predicted_ids = logits.argmax(dim=-1)
transcription = processor.decode(predicted_ids[0]).lower()

logits.shape

In [None]:
# Recompute the logits from features produced by the LM-aware processor and
# display them.
input_values = processor_with_lm(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()
logits = model(input_values).logits

predicted_ids = logits.argmax(dim=-1)
#transcription = processor_with_lm.decode(predicted_ids[0]).lower()

logits

In [None]:
# Display the argmax token ids of the first item.
predicted_ids[0]