In [1]:
import pandas as pd
import numpy as np
import os

# Directory prefix for the audio clips; it is joined with each clip's file
# name further down to build the clip's path.
transferdir = "transfer/"

def process(text: str):
    """Normalise a transcript so it can be compared with model output.

    The text is split on whitespace and every token is lower-cased. Tokens
    accepted by the ``parse`` pattern ``"{hit:d}."`` (a number with a dot
    afterward, e.g. ``"9."``) are kept as they are, dot included. Every other
    token has all characters from ``string.punctuation`` removed. Tokens that
    end up empty after that removal (for instance a lone ``"-"``) are dropped,
    so the result never contains consecutive spaces.

    Args:
        text: Raw transcript text.

    Returns:
        The normalised tokens joined by single spaces.
    """
    # Aliased on import so the built-in ``compile`` is not shadowed.
    from parse import compile as compile_pattern
    from string import punctuation

    number_with_dot = compile_pattern("{hit:d}.")
    # Loop-invariant: build the deletion table once instead of once per token.
    strip_punctuation = str.maketrans("", "", punctuation)

    out_list = []
    for seg in text.split():
        if number_with_dot.parse(seg):
            # We got a number with a dot afterward: keep the dot.
            out_list.append(seg.lower())
            continue
        cleaned = seg.translate(strip_punctuation).lower()
        # A token made only of punctuation is now empty; skipping it avoids
        # a double space in the joined sentence.
        if cleaned:
            out_list.append(cleaned)
    return " ".join(out_list)

# Load the ParlaSpeech-HR metadata (one JSON record per line). The original
# "path" column is renamed to "hashname" so that "path" can be rebuilt below
# with the transfer directory prepended.
df = pd.read_json("ParlaSpeech-HR.v1.0.jsonl", orient="records", lines=True)
df = df.rename(columns={"path": "hashname"})
# Keep only the dev and test splits. The explicit copy makes the filtered
# frame independent of the full one, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df = df.loc[df["split"].isin(["test", "dev"]), :].copy()

# Reference text: join the word list into one string, then normalise it.
df["sentence"] = df["words"].apply(" ".join).apply(process)
# Location of each clip inside the transfer directory.
df["path"] = df["hashname"].apply(lambda s: os.path.join(transferdir, s))
df.head(2)

Unnamed: 0,hashname,orig_file,start,end,words,word_start_times,norm_words,norm_words_start_times,utterance_id_start,utterance_id_end,speaker_info,split,norm_words_edited,sentence,path
1,Ki_SnDM_EkQ_2917.58-2937.5.wav,"18 10 2018 - 9. sjednica, 9. saziv [Ki_SnDM_Ek...",2917.58,2937.5,"[izraelska, tvrtka, prodaje, avione, Hrvatskoj...","[0, 0.65, 1.04, 1.54, 1.92, 2.51, 3.11, 3.5700...","[izraelska, tvrtka, prodaje, avione, hrvatskoj...","[0, 0.65, 1.04, 1.54, 1.92, 2.51, 3.11, 3.5700...",ParlaMint-HR_S09.u4267,ParlaMint-HR_S09.u4267,"{'Speaker_role': 'Regular', 'Speaker_type': 'M...",dev,,izraelska tvrtka prodaje avione hrvatskoj dire...,transfer/Ki_SnDM_EkQ_2917.58-2937.5.wav
2,XguZsDKdRh4_13797.59-13811.74.wav,"20 1 2017 - 3. sjednica, 9. saziv [XguZsDKdRh4...",13797.59,13811.74,"[jučer, trećem,, sutra, ili, prekosutra, četvr...","[0, 0.43, 1.0, 1.4, 1.52, 2.01, 2.55, 3.12, 3....","[jučer, trećem, sutra, ili, prekosutra, četvrt...","[0, 0.43, 1.0, 1.4, 1.52, 2.01, 2.55, 3.12, 3....",ParlaMint-HR_S03.u9702,ParlaMint-HR_S03.u9702,"{'Speaker_role': 'Regular', 'Speaker_type': 'M...",dev,,jučer trećem sutra ili prekosutra četvrtoj oso...,transfer/XguZsDKdRh4_13797.59-13811.74.wav


In [15]:
import os
from tqdm.auto import tqdm
transferdir = "transfer/"

# Walk the file list once and remember every clip that is missing on disk,
# instead of scanning everything a second time when something is missing.
missing_files = [
    hashname
    for hashname in tqdm(df.hashname)
    if not os.path.exists(os.path.join(transferdir, hashname))
]

if missing_files:
    for hashname in missing_files:
        print(f"File {hashname} doesn't exist!")
else:
    print("All files exist!")

  0%|          | 0/1013 [00:00<?, ?it/s]

All files exist!


In [3]:

from itertools import groupby
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer
import soundfile as sf

##############
# load model & audio and run audio through model
##############
import os
import shutil

# Choose the vocabulary by copying it to "vocab.json" in the working
# directory; "vocab_december.json" is the alternative file.
# NOTE(review): presumably this is the vocab.json the tokenizer below picks
# up, which assumes the notebook runs from the tokenizer directory - confirm.
# shutil.copyfile raises when the copy fails, whereas the exit status of
# `os.system("cp ...")` was ignored and a stale vocab.json could be used.
shutil.copyfile("vocab_300_witn_numbers.json", "vocab.json")

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "/home/peterr/macocu/task8/",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token=" ",
)

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# The processor bundles feature extraction and token decoding.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer)


model_name = '/home/peterr/macocu/task8/40_unnormalised/checkpoint-6700'

model = Wav2Vec2ForCTC.from_pretrained(model_name).cuda()

def get_transcript(audio_filepath:str, model):
    """Transcribe one audio file with a CTC model and return lower-cased text.

    The audio is read with ``soundfile``, converted to model input by the
    notebook-level ``processor``, and decoded greedily (arg-max over the
    logits). Only the text is returned; the per-word start/end times that
    used to be computed here were never returned and have been removed.

    Args:
        audio_filepath: Path of the audio file to transcribe.
        model: Model whose forward output exposes ``.logits``; the input is
            moved to the device of the model's parameters.

    Returns:
        The recognised words joined by single spaces.

    Raises:
        AssertionError: If the number of word groups found in the predicted
            ids differs from the number of decoded words.
    """
    speech, sample_rate = sf.read(audio_filepath)
    # Follow the model's device rather than assuming CUDA.
    device = next(model.parameters()).device
    input_values = processor(
        speech, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values.to(device)

    # Inference only: skip building the autograd graph for every clip.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)[0]
    transcription = processor.decode(predicted_ids).lower()
    words = [w for w in transcription.split(' ') if len(w) > 0]

    # Sanity check: after removing padding ids, the runs of ids between word
    # delimiters must line up one-to-one with the decoded words.
    pad_id = processor.tokenizer.pad_token_id
    delimiter_id = processor.tokenizer.word_delimiter_token_id
    emitted_ids = [_id for _id in predicted_ids.tolist() if _id != pad_id]
    word_group_count = sum(
        1
        for is_delimiter, _group in groupby(emitted_ids, lambda _id: _id == delimiter_id)
        if not is_delimiter
    )
    assert word_group_count == len(words), (
        f"{word_group_count} id groups but {len(words)} decoded words "
        f"for {audio_filepath}"
    )

    return " ".join(words)

from tqdm.auto import tqdm
# Transcribe every clip listed in `df` with the currently loaded model.
df["predictions"] = [
    get_transcript(audio_path, model)
    for audio_path in tqdm(df["path"].values)
]



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1013 [00:00<?, ?it/s]

In [4]:
from datasets import load_metric
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

# Score every listed checkpoint of runs 38 and 40 on the dev and test splits,
# collecting one record per (run, checkpoint, split).
results = []
for model_number in (38, 40):
    for checkpoint in (1340, 2680, 4020, 5360, 6700):
        # These rebind the notebook-level `model_name` and `model`; after the
        # loop they refer to the last checkpoint evaluated.
        model_name = f'/home/peterr/macocu/task8/{model_number}_unnormalised/checkpoint-{checkpoint}'
        model = Wav2Vec2ForCTC.from_pretrained(model_name).cuda()
        for splt in ["dev", "test"]:
            print(f"Evaluating model {model_number}, checkpoint {checkpoint} on ", splt)

            # Select the split once and reuse it for both the audio paths and
            # the reference sentences.
            subset = df.loc[df["split"] == splt, ["path", "sentence"]]
            references = subset["sentence"].tolist()
            predictions = [get_transcript(path, model) for path in tqdm(subset["path"].values)]

            wer = wer_metric.compute(references=references, predictions=predictions)
            cer = cer_metric.compute(references=references, predictions=predictions)

            print(f"{wer=:0.4f}, {cer=:0.4f}")
            results.append({
                "model_nr": model_number,
                "checkpoint": checkpoint,
                "wer": wer,
                "cer": cer,
                "split": splt,
            })

Evaluating model 38, checkpoint 1340 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.1383, cer=0.0408
Evaluating model 38, checkpoint 1340 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.1056, cer=0.0289
Evaluating model 38, checkpoint 2680 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.1218, cer=0.0366
Evaluating model 38, checkpoint 2680 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0854, cer=0.0249
Evaluating model 38, checkpoint 4020 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.1118, cer=0.0349
Evaluating model 38, checkpoint 4020 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0813, cer=0.0246
Evaluating model 38, checkpoint 5360 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.1046, cer=0.0335
Evaluating model 38, checkpoint 5360 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0761, cer=0.0234
Evaluating model 38, checkpoint 6700 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.1023, cer=0.0323
Evaluating model 38, checkpoint 6700 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0740, cer=0.0232
Evaluating model 40, checkpoint 1340 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.1100, cer=0.0341
Evaluating model 40, checkpoint 1340 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0815, cer=0.0242
Evaluating model 40, checkpoint 2680 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.0952, cer=0.0305
Evaluating model 40, checkpoint 2680 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0702, cer=0.0221
Evaluating model 40, checkpoint 4020 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.0932, cer=0.0314
Evaluating model 40, checkpoint 4020 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0701, cer=0.0224
Evaluating model 40, checkpoint 5360 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.0928, cer=0.0312
Evaluating model 40, checkpoint 5360 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0666, cer=0.0222
Evaluating model 40, checkpoint 6700 on  dev


  0%|          | 0/500 [00:00<?, ?it/s]

wer=0.0921, cer=0.0311
Evaluating model 40, checkpoint 6700 on  test


  0%|          | 0/513 [00:00<?, ?it/s]

wer=0.0679, cer=0.0222


In [8]:
# Tabulate the collected scores. The epoch column rescales the checkpoint
# number linearly so that checkpoint 6700 maps to epoch 8.0.
rezdf = pd.DataFrame(results).assign(
    epoch=lambda frame: frame["checkpoint"] * 8 / 6700
)
rezdf.to_json("38_and_40_per_checkpoint_eval.json", orient="records", lines=True)
# Display only the rows for checkpoints 4020 and 6700.
rezdf.loc[rezdf["checkpoint"].isin([6700, 4020])]

Unnamed: 0,model_nr,checkpoint,wer,cer,split,epoch
4,38,4020,0.11182,0.034887,dev,4.8
5,38,4020,0.081251,0.02459,test,4.8
8,38,6700,0.102332,0.032324,dev,8.0
9,38,6700,0.073984,0.023217,test,8.0
14,40,4020,0.09324,0.031436,dev,4.8
15,40,4020,0.070076,0.022351,test,4.8
18,40,6700,0.092054,0.031101,dev,8.0
19,40,6700,0.067874,0.022248,test,8.0


In [21]:
# Write the reference sentences, the predictions and the clip identifiers to
# CSV.
# NOTE(review): the file name says model 38, while the visible cell that
# fills `predictions` loads 40_unnormalised/checkpoint-6700 - confirm which
# model produced this column before relying on the name.
export_columns = ["sentence", "predictions", "split", "hashname"]
df[export_columns].to_csv("38_model_output_for_evaluation.csv", index=False)