In [37]:
from itertools import groupby
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer
import soundfile as sf

def get_words_and_times(
    speech, sample_rate, model_name="classla/wav2vec2-xls-r-parlaspeech-hr"
):
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
        model_name,
        unk_token="[UNK]",
        # pad_token="[PAD]",
        # word_delimiter_token=" "
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=sample_rate,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(
        feature_extractor=feature_extractor, tokenizer=tokenizer
    )
    model = Wav2Vec2ForCTC.from_pretrained(model_name).cuda()
    input_values = processor(
        speech, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values.cuda()

    logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()

    ##############
    # this is where the logic starts to get the start and end timestamp for each word
    ##############
    words = [w for w in transcription.split() if len(w) > 0]
    predicted_ids = predicted_ids[0].tolist()
    duration_sec = input_values.shape[1] / sample_rate

    ids_w_time = [
        (i / len(predicted_ids) * duration_sec, _id)
        for i, _id in enumerate(predicted_ids)
        if _id != processor.tokenizer.pad_token_id
    ]
    times_and_tokens = [
        (i, processor.tokenizer.convert_ids_to_tokens(j)) for i, j in ids_w_time
    ]
    indices_to_pop = list()
    for i, tt in enumerate(times_and_tokens):
        try:
            if tt[1] == times_and_tokens[i + 1][1]:
                indices_to_pop.append(i)
        except IndexError:
            continue
    for i in sorted(indices_to_pop)[::-1]:
        times_and_tokens.pop(i)
    word_starts = []
    word_ends = []
    word_started = True
    for i, (time, token) in enumerate(times_and_tokens):
        if word_started:
            word_starts.append(time)
            word_started = False
        if token == " ":
            word_ends.append(time)
            word_started = True
        if i == len(times_and_tokens) - 1:
            word_ends.append(time)
    return words, word_starts, word_ends


def process_file(
    filename, model_name="classla/wav2vec2-xls-r-parlaspeech-hr", lim_minutes=1
):
    import numpy as np

    speech, sample_rate = sf.read(filename)
    overlap_seconds = 1
    indices = np.arange(
        0, speech.shape[0], lim_minutes * 60 * sample_rate, dtype=int
    ).tolist() + [-1]
    length_of_audio_in_s = len(speech) / sample_rate
    transcript = ""
    word_starts = list()
    word_ends = list()
    print(f"{indices=}")
    print(f"{[i/sample_rate for i in indices] = }")
    for start, stop in zip(indices[0:-2], indices[1:]):
        # If overlap would make the segment go
        # over the end, correct stop variable:
        # if stop + overlap_seconds * sample_rate >= speech.shape[0]:
        #     stop = -1
        speech_segment = speech[start:stop].copy()
        words, starts, stops = get_words_and_times(
            speech_segment, sample_rate, model_name
        )
        transcript = (
            transcript
            + " "
            + " ".join(words)
            .replace("[pad]", "")
            .replace("<pad>", "")
            .replace("[PAD]", "")
            .replace("<PAD>", "")
        )
        #print(f"{start=}, {min(starts)=}, {max(stops)=}, ")
        word_starts.extend([i + start / sample_rate for i in starts])
        word_ends.extend([i + start / sample_rate for i in stops])
    last_word_end = word_ends[-1]
    k = (length_of_audio_in_s - last_word_end) / length_of_audio_in_s
    word_starts = [i + k*i for i in word_starts]
    word_ends = [i + k*i for i in word_ends]
    return transcript, word_starts, word_ends


In [38]:
import pandas as pd

df = pd.read_csv("006_crawling_juznevesti.csv",)

df.path[0]

'audio/00DtyZ2sCUM.wav'

# Interlude:

process a new file from youtube:

In [39]:
words, starts, ends = process_file("audio/s1iBR07bVrg_clipped.wav",
model_name="classla/wav2vec2-xls-r-parlaspeech-hr-lm",  lim_minutes = 0.25)

indices=[0, 240000, 480000, 720000, 960000, 1200000, 1440000, 1680000, 1920000, 2160000, 2400000, 2640000, 2880000, 3120000, 3360000, 3600000, 3840000, 4080000, 4320000, 4560000, 4800000, 5040000, 5280000, 5520000, 5760000, 6000000, 6240000, 6480000, 6720000, 6960000, 7200000, 7440000, 7680000, 7920000, 8160000, 8400000, 8640000, 8880000, 9120000, 9360000, 9600000, 9840000, 10080000, 10320000, 10560000, 10800000, 11040000, 11280000, 11520000, 11760000, 12000000, 12240000, 12480000, 12720000, 12960000, 13200000, 13440000, 13680000, 13920000, 14160000, 14400000, 14640000, 14880000, 15120000, 15360000, 15600000, 15840000, 16080000, 16320000, 16560000, 16800000, 17040000, -1]
[i/sample_rate for i in indices] = [0.0, 15.0, 30.0, 45.0, 60.0, 75.0, 90.0, 105.0, 120.0, 135.0, 150.0, 165.0, 180.0, 195.0, 210.0, 225.0, 240.0, 255.0, 270.0, 285.0, 300.0, 315.0, 330.0, 345.0, 360.0, 375.0, 390.0, 405.0, 420.0, 435.0, 450.0, 465.0, 480.0, 495.0, 510.0, 525.0, 540.0, 555.0, 570.0, 585.0, 600.0, 615.

In [44]:
list(map(len, (words.split(), starts, ends)))

[2196, 2210, 2210]

In [45]:
for w, s, e in zip(words.split(), starts, ends):
    print(f"{w:<20}{s:0.2f}-{e:0.2f}")

konačni             1.14-1.48
prijedlog           1.50-1.89
zakona              1.91-2.40
o                   2.42-2.48
izmjenama           2.50-2.90
i                   2.92-2.98
dopunama            3.01-3.45
zakona              3.47-3.78
o                   3.80-3.84
pravu               3.88-4.18
na                  4.20-4.30
pristup             4.33-4.69
informacijama       4.73-7.47
to                  7.49-7.57
je                  7.59-7.68
prijedlog           7.72-8.04
zakona              8.06-8.39
br                  8.43-9.00
199to               9.02-10.13
nie                 10.17-10.40
u                   10.44-10.50
drugom              10.52-10.88
čitanju             10.92-11.61
sa                  11.66-11.82
svim                11.86-12.49
karakteristikama    12.55-13.54
koje                13.56-13.71
imaju               13.75-13.93
u                   13.95-13.99
zakon               14.01-14.36
u                   14.38-14.40
drugom              14.44-14.68
čitanju      

In [46]:
len(starts), len(ends), len(words.split())

(2210, 2210, 2196)

New rewrite:


In [77]:
from itertools import groupby
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer
import soundfile as sf


def get_words_and_times(
    speech, sample_rate, model_name="classla/wav2vec2-xls-r-parlaspeech-hr"
):
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
        model_name,
        unk_token="[UNK]",
        # pad_token="[PAD]",
        # word_delimiter_token=" "
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=sample_rate,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(
        feature_extractor=feature_extractor, tokenizer=tokenizer
    )
    model = Wav2Vec2ForCTC.from_pretrained(model_name).cuda()
    input_values = processor(
        speech, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values.cuda()

    logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()

    ##############
    # this is where the logic starts to get the start and end timestamp for each word
    ##############
    words = [w for w in transcription.split() if len(w) > 0]
    predicted_ids = predicted_ids[0].tolist()
    duration_sec = input_values.shape[1] / sample_rate

    ids_w_time = [
        (i / len(predicted_ids) * duration_sec, _id)
        for i, _id in enumerate(predicted_ids)
        if _id != processor.tokenizer.pad_token_id
    ]
    times_and_tokens = [
        (i, processor.tokenizer.convert_ids_to_tokens(j)) for i, j in ids_w_time
    ]
    indices_to_pop = list()
    for i, tt in enumerate(times_and_tokens):
        try:
            if tt[1] == times_and_tokens[i + 1][1]:
                indices_to_pop.append(i)
        except IndexError:
            continue
    for i in sorted(indices_to_pop)[::-1]:
        times_and_tokens.pop(i)
    word_starts = []
    word_ends = []
    word_started = True
    for i, (time, token) in enumerate(times_and_tokens):
        if word_started:
            word_starts.append(time)
            word_started = False
        if token == " ":
            word_ends.append(time)
            word_started = True
        if i == len(times_and_tokens) - 1:
            word_ends.append(time)
    return words, word_starts, word_ends


def process_file(
    filename, model_name="classla/wav2vec2-xls-r-parlaspeech-hr", lim_minutes=0.25
):
    import numpy as np

    speech, sample_rate = sf.read(filename)
    indices = np.arange(
        0, speech.shape[0], int(lim_minutes * 60 * sample_rate), dtype=int
    )
    transcription = ""
    word_starts = []
    word_ends = []
    overlap_seconds = 1
    for start, stop in zip(indices, indices[1:]):
        speech_segment = speech[start : stop + int(sample_rate * overlap_seconds)]
        words, starts, ends = get_words_and_times(
            speech_segment, sample_rate, model_name=model_name
        )
        transcription += " " + " ".join(words)
        word_starts += [i + start / sample_rate for i in starts]
        word_ends += [i + start / sample_rate for i in ends]
    start, stop = indices[-1], speech.shape[0]
    speech_segment = speech[start:]
    words, starts, ends = get_words_and_times(
        speech_segment, sample_rate, model_name=model_name
    )
    transcription += " " + " ".join(words)
    word_starts += [i + start / sample_rate for i in starts]
    word_ends += [i + start / sample_rate for i in ends]

    transcription = transcription.replace("[pad]", "")
    return transcription, word_starts, word_ends


words, starts, ends = process_file(
    "audio/s1iBR07bVrg_clipped.wav",
    model_name="classla/wav2vec2-xls-r-parlaspeech-hr-lm",
    lim_minutes=0.25,
)


In [78]:
for w, s, e in zip(words.split(), starts, ends):
    print(f"{w.replace('[pad]', ''):<20}{s:0.2f}-{e:0.2f}")

len(starts), len(ends), len(words.split())

konačni             1.12-1.46
prijedlog           1.48-1.86
zakona              1.88-2.36
o                   2.38-2.44
izmjenama           2.46-2.86
i                   2.88-2.94
dopunama            2.96-3.40
zakona              3.42-3.72
o                   3.74-3.78
pravu               3.82-4.13
na                  4.15-4.25
pristup             4.27-4.63
informacijama       4.67-7.37
to                  7.39-7.47
je                  7.49-7.57
prijedlog           7.61-7.91
zakona              7.93-8.27
br                  8.31-8.87
199                 8.89-9.81
to                  9.83-9.99
nije                10.03-10.25
u                   10.29-10.35
drugom              10.37-10.73
čitanju             10.77-11.45
sa                  11.49-11.65
svim                11.69-12.32
karakteristikama    12.38-13.36
koje                13.38-13.52
imaju               13.56-13.80
zakon               13.82-14.16
u                   14.18-14.20
drugom              14.24-14.48
čitanju         

(2388, 2388, 2375)

In [79]:
words, starts, ends = process_file(
    "audio/clipped_clipped.wav",
    model_name="classla/wav2vec2-xls-r-parlaspeech-hr-lm",
    lim_minutes=0.25,
)
for w, s, e in zip(words.split(), starts, ends):
    print(f"{w.replace('[pad]', ''):<20}{s:0.2f}-{e:0.2f}")

len(starts), len(ends), len(words.split())

nadalje             1.90-2.30
utvrđuje            2.32-2.70
se                  2.74-2.84
obveza              2.88-3.16
tijela              3.18-3.44
javne               3.46-3.72
vlasti              3.74-4.13
da                  4.15-4.25
kada                4.27-4.83
obavještava         4.85-5.43
korisnika           5.49-6.15
da                  6.17-6.27
je                  6.29-6.37
informaciju         7.51-8.09
već                 8.11-8.33
dobio               8.37-9.31
ili                 9.33-9.45
je                  9.47-9.55
informacija         9.57-10.43
javno               10.49-10.81
objavljena          10.83-11.63
korisnika           11.67-12.28
treba               12.30-12.58
obavijesti          12.60-13.22
na                  13.24-13.34
koji                13.36-13.58
način               13.60-14.44
i                   14.46-14.52
kojim               14.56-14.86
aktom               14.88-15.32
je                  15.36-15.42
informacija         15.44-15.98
to           

(169, 169, 169)