In [2]:
import pandas as pd

df = pd.read_csv("006_crawling_juznevesti.csv")


In [25]:
model_name = "classla/wav2vec2-xls-r-parlaspeech-hr"


from itertools import groupby
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer
import soundfile as sf

##############
# load model & audio and run audio through model
##############

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    model_name, unk_token="[UNK]", 
    #pad_token="[PAD]", 
    #word_delimiter_token="|"
    )

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer)

model = Wav2Vec2ForCTC.from_pretrained(model_name).cuda()


audio_filepath = df.path[0]
speech, sample_rate = sf.read(audio_filepath)
input_values = processor(speech[0:40*sample_rate], sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()

logits = model(input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0]).lower()

##############
# this is where the logic starts to get the start and end timestamp for each word
##############
words = [w for w in transcription.split() if len(w) > 0]
predicted_ids = predicted_ids[0].tolist()
duration_sec = input_values.shape[1] / sample_rate


ids_w_time = [(i / len(predicted_ids) * duration_sec, _id) for i, _id in enumerate(predicted_ids) if _id != processor.tokenizer.pad_token_id]
times_and_tokens = [(i, processor.tokenizer.convert_ids_to_tokens(j) )for i, j in ids_w_time]
indices_to_pop = list()
for i, tt in enumerate(times_and_tokens):
    try:
        if tt[1] == times_and_tokens[i+1][1]:
            indices_to_pop.append(i)
    except IndexError:
        continue
for i in sorted(indices_to_pop)[::-1]:
    times_and_tokens.pop(i)
word_starts = []
word_ends = []
word_started = True
for i, (time, token) in enumerate(times_and_tokens):
    if word_started:
        word_starts.append(time)
        word_started = False
    if token == " ":
        word_ends.append(time)
        word_started = True
    if i == len(times_and_tokens) -1:
        word_ends.append(time)
for word, start, end in zip(words, word_starts, word_ends):
    print(word, f"{start:0.3f}", f"{end:0.3f}", sep="\t")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


words=['upreko', 's', 'pričama', 'da', 'srbiji', 'ide', 'sve', 'bolje', 'izgleda', 'da', 'građani', 'to', 'ne', 'osjećaju', 'na', 'svojoj', 'koži', 'o', 'tome', 'koliko', 'su', 'pohvale', 'na', 'račun', 'ekonomije', 'u', 'srbiji', 'realne', 'razgovaramo', 'sa', 'redovnim', 'profesorom', 'ekonomskog', 'fakulteta', 'unišu', 'stšiprofeor', 'kada', 'su', 'mediji', 'nedavno', 'prenijeli', 'informaciju', 'gradske', 'uprave', 'za', 'privredu', 'da', 'industrijska', 'proizvodnja', 'unišu', 'beleži', 'konstantni', 'rast', 'ova', 'vezi', 'je', 'sudeći', 'bar', 'prema', 'komentarima', 'na', 'društvenim', 'mrežama', 'nasmela', 'mnoge'], word_starts=[12.566283141570784, 12.906453226613305, 12.986493246623311, 13.466733366683341, 13.586793396698349, 13.886943471735869, 14.08704352176088, 14.367183591795898, 15.18759379689845, 15.52776388194097, 15.607803901950977, 16.048024012006003, 16.208104052026012, 16.34817408704352, 16.76838419209605, 16.888444222111055, 17.228614307153578, 18.06903451725863, 

In [70]:
cur_ids_w_time

(12.56837891927952, 23)

In [23]:
times_and_tokens = [(i, processor.tokenizer.convert_ids_to_tokens(j) )for i, j in ids_w_time]
indices_to_pop = list()
for i, tt in enumerate(times_and_tokens):
    try:
        if tt[1] == times_and_tokens[i+1][1]:
            indices_to_pop.append(i)
    except IndexError:
        continue
for i in sorted(indices_to_pop)[::-1]:
    times_and_tokens.pop(i)
word_starts = []
word_ends = []
word_started = True
for i, (time, token) in enumerate(times_and_tokens):
    if word_started:
        word_starts.append(time)
        word_started = False
    if token == " ":
        word_ends.append(time)
        word_started = True
    if i == len(times_and_tokens) -1:
        word_ends.append(time)
len(word_ends), len(word_starts)
    

(65, 65)

In [24]:
len(words)

65

In [21]:
times_and_tokens

[(12.566283141570784, 'u'),
 (12.666333166583293, 'p'),
 (12.686343171585792, 'r'),
 (12.726363181590797, 'e'),
 (12.7863931965983, 'k'),
 (12.826413206603302, 'o'),
 (12.866433216608304, ' '),
 (12.906453226613305, 's'),
 (12.96648324162081, ' '),
 (12.986493246623311, 'p'),
 (13.026513256628313, 'r'),
 (13.086543271635819, 'i'),
 (13.186593296648324, 'č'),
 (13.266633316658329, 'a'),
 (13.346673336668333, 'm'),
 (13.366683341670836, 'a'),
 (13.446723361680842, ' '),
 (13.466733366683341, 'd'),
 (13.486743371685844, 'a'),
 (13.546773386693348, ' '),
 (13.586793396698349, 's'),
 (13.646823411705853, 'r'),
 (13.746873436718358, 'b'),
 (13.76688344172086, 'i'),
 (13.786893446723361, 'j'),
 (13.806903451725862, 'i'),
 (13.846923461730864, ' '),
 (13.886943471735869, 'i'),
 (13.946973486743373, 'd'),
 (13.986993496748374, 'e'),
 (14.047023511755878, ' '),
 (14.08704352176088, 's'),
 (14.167083541770886, 'v'),
 (14.207103551775887, 'e'),
 (14.287143571785892, ' '),
 (14.367183591795898, 'b'

In [43]:

word_start_times = []
word_end_times = []




In [34]:
processor.tokenizer.sep_token

Using sep_token, but it is not set yet.


In [26]:
words

['upreko',
 's',
 'pričama',
 'da',
 'srbiji',
 'ide',
 'sve',
 'bolje',
 'izgleda',
 'da',
 'građani',
 'to',
 'ne',
 'osjećaju',
 'na',
 'svojoj',
 'koži',
 'o',
 'tome',
 'koliko',
 'su',
 'pohvale',
 'na',
 'račun',
 'ekonomije',
 'u',
 'srbiji',
 'realne',
 'razgovaramo',
 'sa',
 'redovnim',
 'profesorom',
 'ekonomskog',
 'fakulteta',
 'u',
 'nišu',
 'bostejaviedobrodošliprofesore',
 'kada',
 'su',
 'mediji',
 'nedavno',
 'preneli']

In [12]:
input_values.shape

torch.Size([1, 480000])

In [13]:
processor.tokenizer.pad_token_id

1

In [14]:
cur_ids_w_time

[(12.56837891927952, 23),
 (12.66844563042028, 18),
 (12.688458972648432, 20),
 (12.728485657104738, 7),
 (12.788525683789192, 13),
 (12.808539026017344, 17),
 (12.84856571047365, 0),
 (12.8685790527018, 0),
 (12.888592394929953, 21),
 (12.908605737158105, 21),
 (12.948632421614409, 0),
 (12.968645763842561, 0),
 (12.988659106070713, 18),
 (13.02868579052702, 20),
 (13.068712474983322, 11),
 (13.088725817211474, 11),
 (13.188792528352234, 32),
 (13.268845897264843, 3),
 (13.3288859239493, 15),
 (13.348899266177451, 15),
 (13.368912608405603, 3),
 (13.388925950633755, 0),
 (13.408939292861907, 0),
 (13.428952635090061, 0),
 (13.448965977318213, 0),
 (13.468979319546364, 6),
 (13.488992661774516, 3),
 (13.509006004002668, 0),
 (13.529019346230822, 0),
 (13.549032688458972, 0),
 (13.569046030687124, 21),
 (13.649099399599734, 20),
 (13.729152768512343, 4),
 (13.749166110740493, 4),
 (13.769179452968645, 11),
 (13.789192795196797, 12),
 (13.80920613742495, 11),
 (13.829219479653103, 11),
 