In [1]:
import pandas as pd

# Load the CSV (presumably crawled juznevesti data, judging by the file name —
# TODO confirm) into a DataFrame.
# NOTE(review): `df` is not referenced by any of the cells visible in this part
# of the notebook.
df = pd.read_csv("006_crawling_juznevesti.csv")


In [2]:
# Checkpoint identifier passed to the `from_pretrained` calls further down in
# this cell (both the CTC tokenizer and the Wav2Vec2ForCTC model are loaded from it).
model_name = "classla/wav2vec2-large-slavic-parlaspeech-hr"


from itertools import groupby
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer
import soundfile as sf

##############
# load model & audio and run audio through model
##############

# Use the GPU when one is available, otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the first `max_audio_seconds` seconds of the recording are transcribed.
max_audio_seconds = 60

# Only the unknown-token is overridden here; every other special token is left
# at whatever the checkpoint's tokenizer files / class defaults provide.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name, unk_token="[UNK]")

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer)

model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)


audio_filepath = "audio_2.wav"
speech, sample_rate = sf.read(audio_filepath)
if speech.ndim > 1:
    # Multi-channel files come back as a 2-D (frames, channels) array; average
    # the channels so the processor receives a single mono waveform.
    speech = speech.mean(axis=1)

input_values = processor(
    speech[: max_audio_seconds * sample_rate],
    sampling_rate=sample_rate,
    return_tensors="pt",
).input_values.to(device)

# Inference only: skip autograd bookkeeping so no graph is kept alive for the
# (potentially large) logits tensor.
with torch.no_grad():
    logits = model(input_values).logits

# Greedy CTC decoding: most likely token id per output frame, then let the
# processor turn the id sequence of the single batch item into text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0]).lower()

##############
# this is where the logic starts to get the start and end timestamp for each word
##############
# str.split() without arguments never yields empty strings, so no extra filter is needed.
words = transcription.split()
predicted_ids = predicted_ids[0].tolist()
duration_sec = input_values.shape[1] / sample_rate

# Attach a timestamp to every output frame (frame index scaled linearly onto
# the audio duration) and drop frames where the model emitted the pad token.
pad_token_id = processor.tokenizer.pad_token_id
ids_w_time = [
    (i / len(predicted_ids) * duration_sec, _id)
    for i, _id in enumerate(predicted_ids)
    if _id != pad_token_id
]
times_and_tokens = [
    (stamp, processor.tokenizer.convert_ids_to_tokens(_id)) for stamp, _id in ids_w_time
]

# Collapse each run of identical consecutive tokens to its LAST frame
# (single pass; same result as popping every entry that equals its successor).
times_and_tokens = [
    list(run)[-1] for _, run in groupby(times_and_tokens, key=lambda pair: pair[1])
]

# Tokens that separate words: a literal space token, plus whatever delimiter
# the tokenizer is configured with (decode() renders that one as a space too).
word_delimiters = {" ", processor.tokenizer.word_delimiter_token}

word_starts = []
word_ends = []
in_word = False
for stamp, token in times_and_tokens:
    if token in word_delimiters:
        # A delimiter only closes a word that is actually open; a leading
        # delimiter must not create an empty word, otherwise every timestamp
        # would be shifted by one relative to `words`.
        if in_word:
            word_ends.append(stamp)
            in_word = False
        continue
    if not in_word:
        word_starts.append(stamp)
        in_word = True

# Close the final word when the sequence does not end on a delimiter
# (and do not append a second end when it does).
if in_word:
    word_ends.append(times_and_tokens[-1][0])

for word, start, end in zip(words, word_starts, word_ends):
    print(word, f"{start:0.3f}", f"{end:0.3f}", sep="\t")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


upoznatom	21.187	21.827
meselu	21.847	22.427
ljudskoj	22.467	22.848
izvanrednoj	22.888	23.528
deceni	23.568	24.588
i	24.608	24.668
saia	24.728	25.128
berlin	25.168	25.549
je	25.589	25.649
tcvridio	25.729	26.649
da	26.689	26.809
bi	26.889	27.309
citiram	27.369	28.149
teško	28.209	28.630
bilo	28.710	29.050
u	29.070	29.150
ljusi	29.190	29.610
19.	29.930	30.270
vijeka	30.310	31.050
pronaći	31.090	31.851
jednu	31.911	32.231
jedinu	32.331	32.751
političku	32.831	33.391
i	33.431	33.491
društvenu	33.531	34.031
ideju	34.071	34.832
nastalu	34.872	35.552
na	35.592	35.692
domaćem	35.752	36.272
tlu	36.332	37.713
zavješen	38.173	38.633
citat	38.693	39.433
te	39.473	39.593
da	39.613	39.733
se	39.793	39.893
sa	40.433	40.674
eventualnim	40.714	41.514
izuzetkom	41.554	42.254
postojevih	42.314	43.094
ideja	43.134	43.615
o	43.655	44.115
nepružanju	44.135	44.915
otporja	44.975	46.255
sve	46.315	46.536
ideje	46.576	46.896
koje	46.956	47.136
nalazimo	47.176	47.736
ne	47.796	47.916
samo	47.996	48.256
što	48.2

In [3]:
# Sanity check: show the lengths of the three per-word lists side by side
# (they are zipped together above, so they are expected to match).
[len(seq) for seq in (words, word_starts, word_ends)]

[76, 76, 76]