In [1]:
## get time info dataframe from textgrid
import re
import pandas as pd
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch


def get_audio_df(textgrid_dir):
    
    with open(textgrid_dir, 'r') as file:
        lines = [line.rstrip() for line in file]
       
    text_list = []
    xmin_list = []
    xmax_list = []
    interval_list = []
    for line in lines[14:]:  #informations needed begin on the 9th lines
        if 'text =' in line:
            line = line.split('"')[1]
            if line != "":
                text_list.append(line)
                #print (len(text_list), len(xmin_list), len(xmax_list), len(interval_list), line, interval_list[-2:])
            else:
                xmin_list = xmin_list[:-1]
                xmax_list = xmax_list[:-1]
                interval_list = interval_list[:-1]
        if 'xmin' in line:
            time_min = line.split('=')[1].strip(' ')
            xmin_list.append(time_min)
        if 'xmax' in line:
            time_max = line.split('=')[1].strip(' ')
            xmax_list.append(time_max)
        if 'intervals [' in line:
            interval = line.split('[')[1].strip(']:')
            interval_list.append(interval)
        if 'intervals: size' in line:
            xmin_list = xmin_list[:-1]
            xmax_list = xmax_list[:-1]
    xmin_list = [float(item) for item in xmin_list]
    xmax_list = [float(item) for item in xmax_list]
    interval_list = [int(item) for item in interval_list]

    df = pd.DataFrame()
    df['text'] = text_list
    df['xmin'] = xmin_list
    df['xmax'] = xmax_list
    df['interval'] = interval_list
    df['tier'] = get_interval(df['interval'].tolist())
    df['start'] = df['xmin'].apply(lambda x: int(np.floor(x*16000)))
    df['end'] = df['xmax'].apply(lambda x: int(np.ceil(x*16000)))
    return df

def get_interval(time_list):
    interval_all = []
    interval_curr = 1
    for i in range(len(time_list)):
        if i == 0:
            interval_all.append(1)
        else:
            if time_list[i] > time_list[i - 1]:
                interval_all.append(interval_curr)
                continue
            else:
                interval_curr = interval_curr + 1
                interval_all.append(interval_curr)
    return interval_all



In [3]:
textgrid_dir = '/Users/irisz/downloads/2023_ELIC/Croatian_annotated/ckm001-2022-01-16-Trviž/ckm001-Trviž test data for speech to text/ckm001-2022-01-16-Trviž_01.TextGrid'

df = get_audio_df(textgrid_dir)

In [4]:
df

Unnamed: 0,text,xmin,xmax,interval,tier,start,end
0,Dobro.,0.155012,0.456584,2,1,2480,7306
1,"((noise, smack))",0.456584,1.659126,3,1,7305,26547
2,Onda moremo eh započet.,1.659126,4.068000,4,1,26546,65089
3,((paper rustling)),4.068000,6.757714,5,1,65088,108124
4,S obziron da san i ja govornik,6.757714,8.268000,6,1,108123,132288
...,...,...,...,...,...,...,...
489,je,173.966988,174.032100,414,4,2783471,2784514
490,to,174.032100,174.202306,415,4,2784513,2787237
491,ča,174.358361,174.650000,417,4,2789733,2794400
492,se,174.650000,174.793584,418,4,2794400,2796698


In [3]:
## classla
import librosa
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
processor = Wav2Vec2Processor.from_pretrained("classla/wav2vec2-xls-r-parlaspeech-hr")
model = Wav2Vec2ForCTC.from_pretrained("classla/wav2vec2-xls-r-parlaspeech-hr")

# resample audio
audio_dir = '/Users/irisz/downloads/2023_ELIC/ckm001-2022-01-16-Trviž_01.wav'
speech, sample_rate = librosa.load(audio_dir, sr=16000)
def get_classla_text(speech_curr, processor, model):
    input_values = processor(speech_curr, sampling_rate=16000, return_tensors="pt").input_values.to(device)
    logits = model.to(device)(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0]).lower()
    return transcription

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
df['classla'] = df.apply(lambda x: get_classla_text(speech[x.start:x.end], processor, model), axis =1)

In [9]:
# classla_large
processor = Wav2Vec2Processor.from_pretrained("classla/wav2vec2-large-slavic-parlaspeech-hr")
model = Wav2Vec2ForCTC.from_pretrained("classla/wav2vec2-large-slavic-parlaspeech-hr")
df['classla_lg'] = df.apply(lambda x: get_classla_text(speech[x.start:x.end], processor, model), axis =1)

In [12]:
## nvdia
import nemo.collections.asr as nemo_asr
from pydub import AudioSegment

asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("nvidia/stt_hr_conformer_ctc_large")

def get_nvdia_text(t1, t2, audio_dir):
    t1 = t1 * 1000 #Works in milliseconds
    t2 = t2 * 1000
    newAudio = AudioSegment.from_wav(audio_dir)
    newAudio = newAudio[t1:t2]
    newAudio.export('/Users/irisz/downloads/2023_ELIC/slice.wav', format="wav")
    text = asr_model.transcribe(['/Users/irisz/downloads/2023_ELIC/slice.wav'])
    return text

[NeMo I 2023-10-02 12:55:08 mixins:170] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2023-10-02 12:55:08 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 16
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: ''
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    
[NeMo W 2023-10-02 12:55:08 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 8
    pin_

[NeMo I 2023-10-02 12:55:08 features:289] PADDING: 0
[NeMo I 2023-10-02 12:55:11 save_restore_connector:249] Model EncDecCTCModelBPE was successfully restored from /Users/irisz/.cache/huggingface/hub/models--nvidia--stt_hr_conformer_ctc_large/snapshots/1cd2542c83a8ef5b172f2edb1d6132fab5b559e7/stt_hr_conformer_ctc_large.nemo.


In [13]:
df['nvdia'] = df.apply(lambda x: get_nvdia_text(x.xmin, x.xmax, audio_dir), axis = 1)

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
## whisper
import whisper_timestamped as whisper
model = whisper.load_model("base", device="cpu")

def get_whisper_text(t1, t2, audio_dir):
    t1 = t1 * 1000 #Works in milliseconds
    t2 = t2 * 1000
    newAudio = AudioSegment.from_wav(audio_dir)
    newAudio = newAudio[t1:t2]
    newAudio.export('/Users/irisz/downloads/2023_ELIC/slice.wav', format="wav")
    audio = whisper.load_audio('/Users/irisz/downloads/2023_ELIC/slice.wav')
    result = whisper.transcribe(model, audio, language="Croatian")
    return result

def get_whisper_text_padding(t1, t2, audio_dir):
    if t2 - t1 > 1:
        return np.nan
    else:
        pad = (1 - (t2 - t1))/2
    t1 = (t1 - pad) * 1000 #Works in milliseconds
    t2 = (t2 + pad) * 1000
    newAudio = AudioSegment.from_wav(audio_dir)
    newAudio = newAudio[t1:t2]
    newAudio.export('/Users/irisz/downloads/2023_ELIC/slice.wav', format="wav")
    audio = whisper.load_audio('/Users/irisz/downloads/2023_ELIC/slice.wav')
    result = whisper.transcribe(model, audio, language="Croatian")
    result = result['text']
    return result

In [18]:
df['whisper'] = df.apply(lambda x: get_whisper_text(x.xmin, x.xmax, audio_dir), axis = 1)

100%|██████████| 30/30 [00:01<00:00, 27.03frames/s]
  0%|          | 0/120 [00:01<?, ?frames/s]
100%|██████████| 240/240 [00:01<00:00, 205.01frames/s]
100%|██████████| 268/268 [00:06<00:00, 39.60frames/s]
100%|██████████| 151/151 [00:01<00:00, 113.33frames/s]
  0%|          | 0/43 [00:01<?, ?frames/s]
100%|██████████| 143/143 [00:01<00:00, 106.03frames/s]
100%|██████████| 57/57 [00:06<00:00,  8.61frames/s]
100%|██████████| 326/326 [00:01<00:00, 213.73frames/s]
  0%|          | 0/42 [00:01<?, ?frames/s]
100%|██████████| 476/476 [00:01<00:00, 296.64frames/s]
  0%|          | 0/48 [00:01<?, ?frames/s]
100%|██████████| 133/133 [00:01<00:00, 109.52frames/s]
100%|██████████| 31/31 [00:06<00:00,  4.77frames/s]
100%|██████████| 277/277 [00:01<00:00, 196.66frames/s]
100%|██████████| 64/64 [00:01<00:00, 55.75frames/s]
100%|██████████| 30/30 [00:01<00:00, 27.23frames/s]
100%|██████████| 36/36 [00:06<00:00,  5.56frames/s]
100%|██████████| 60/60 [00:01<00:00, 45.94frames/s]
100%|██████████| 44/44 [

100%|██████████| 51/51 [00:01<00:00, 46.49frames/s]
  0%|          | 0/10 [00:01<?, ?frames/s]
  0%|          | 0/17 [00:01<?, ?frames/s]
  0%|          | 0/26 [00:01<?, ?frames/s]
100%|██████████| 49/49 [00:01<00:00, 48.64frames/s]
100%|██████████| 21/21 [00:06<00:00,  3.25frames/s]
  0%|          | 0/40 [00:01<?, ?frames/s]
100%|██████████| 8/8 [00:06<00:00,  1.23frames/s]
  0%|          | 0/54 [00:01<?, ?frames/s]
100%|██████████| 69/69 [00:01<00:00, 62.66frames/s]
100%|██████████| 57/57 [00:01<00:00, 51.01frames/s]
100%|██████████| 73/73 [00:01<00:00, 65.68frames/s]
100%|██████████| 49/49 [00:01<00:00, 46.27frames/s]
  0%|          | 0/16 [00:01<?, ?frames/s]
  0%|          | 0/9 [00:01<?, ?frames/s]
  0%|          | 0/11 [00:01<?, ?frames/s]
100%|██████████| 28/28 [00:01<00:00, 24.78frames/s]
  0%|          | 0/24 [00:01<?, ?frames/s]
100%|██████████| 47/47 [00:01<00:00, 39.03frames/s]
  0%|          | 0/7 [00:01<?, ?frames/s]
  0%|          | 0/8 [00:01<?, ?frames/s]
  0%|       

In [20]:
df['whisper_text'] = df['whisper'].apply(lambda x: x['text'])

In [23]:
df['whisper_pad'] = df.apply(lambda x: get_whisper_text_padding(x.xmin, x.xmax, audio_dir), axis = 1)

0frames [00:00, ?frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
100%|██████████| 100/100 [00:07<00:00, 14.18frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
100%|██████████| 100/100 [00:01<00:00, 85.81frames/s]
100%|██████████| 100/100 [00:01<00:00, 86.42frames/s]
100%|██████████| 82/82 [00:01<00:00, 60.31frames/s]
0frames [00:00, ?frames/s]
100%|██████████| 100/100 [00:01<00:00, 86.35frames/s]
100%|██████████| 100/100 [00:01<00:00, 85.06frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
100%|██████████| 100/100 [00:01<00:00, 89.45frames/s]
100%|██████████| 100/100 [00:01<00:00, 79.87frames/s]
100%|██████████| 100/100 [00:01<00:00, 75.78frames/s]
100%|██████████| 100/100 [00:01<00:00, 76.58frames/s]
100%|██████████| 100/100 [00:01<00:00, 74.74frames/s]
100%|██████████| 100/100 [00:01<00:00, 82.64frames/s]
100%|██████████| 100/100 [00:01<00:00, 85.62frames/s]
100%|██████████| 100/100 [00:01<00:00, 87.65frames/s]
100%|██████████| 100/100 [00:06<00:00, 15.19frames/s]
100%|█

100%|██████████| 100/100 [00:01<00:00, 83.17frames/s]
100%|██████████| 100/100 [00:01<00:00, 87.09frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
100%|██████████| 100/100 [00:01<00:00, 82.20frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
100%|██████████| 100/100 [00:01<00:00, 70.56frames/s]
100%|██████████| 100/100 [00:01<00:00, 79.30frames/s]
100%|██████████| 100/100 [00:01<00:00, 88.70frames/s]
100%|██████████| 100/100 [00:01<00:00, 80.50frames/s]
100%|██████████| 100/100 [00:01<00:00, 83.23frames/s]
100%|██████████| 100/100 [00:07<00:00, 14.10frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
100%|██████████| 100/100 [00:01<00:00, 70.62frames/s]
100%|██████████| 100/100 [00:01<00:00, 67.91frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
  0%|          | 0/100 [00:01<?, ?frames/s]
100%|██████████| 100/100 [00:01<00:00, 75.86frames/s]
100%|██████████| 100/100 [00:01<00:00, 74.70

In [25]:
df

Unnamed: 0,text,xmin,xmax,interval,tier,start,end,classla,classla_lg,nvdia,whisper,whisper_text,whisper_pad
0,Dobro.,0.155012,0.456584,2,1,2480,7306,dobro,dobro,[za mra],"{'text': ' Dobro.', 'segments': [{'id': 0, 'se...",Dobro.,
1,"((noise, smack))",0.456584,1.659126,3,1,7305,26547,pa,o,[],"{'text': '', 'segments': [], 'language': 'Croa...",,
2,Onda moremo eh započet.,1.659126,4.068000,4,1,26546,65089,onda moramo započet,onda moramo započet,[onda moramo započeti],"{'text': ' on da morimo zapocet.', 'segments':...",on da morimo zapocet.,
3,((paper rustling)),4.068000,6.757714,5,1,65088,108124,take,š,[do],{'text': ' Snišlišlišlišlišlišlišlišlišlišlišl...,Snišlišlišlišlišlišlišlišlišlišlišlišlišlišli...,
4,S obziron da san i ja govornik,6.757714,8.268000,6,1,108123,132288,s obzirom da sam ja govornik,s obzirom da sam ja govornik,[es obzirom da sam i ja govornik],"{'text': ' so bezvendace na jago bornik.', 'se...",so bezvendace na jago bornik.,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,je,173.966988,174.032100,414,4,2783471,2784514,,i,[],{'text': ' Kako je da je da je da je da je da ...,Kako je da je da je da je da je da je da je d...,"To, to, to."
490,to,174.032100,174.202306,415,4,2784513,2787237,to,to,[to],"{'text': '', 'segments': [], 'language': 'Croa...",,To je to.
491,ča,174.358361,174.650000,417,4,2789733,2794400,ute,če,[če],{'text': ' Kako je da je da je da je da je da ...,Kako je da je da je da je da je da je da je d...,
492,se,174.650000,174.793584,418,4,2794400,2796698,o,,[eu],"{'text': '', 'segments': [], 'language': 'Croa...",,Degončali.


In [28]:
df_word.columns

Index(['text', 'xmin', 'xmax', 'interval', 'tier', 'start', 'end', 'classla',
       'classla_lg', 'nvdia', 'whisper', 'whisper_text', 'whisper_pad'],
      dtype='object')

In [5]:
df_word = df[df['tier'].isin([2,4])]
df_word = df_word[['text', 'xmin', 'xmax', 'interval', 'tier', 'start', 'end', 'classla','classla_lg', 'nvdia', 'whisper_text', 'whisper_pad']]
df_word # use this to get some prediction

KeyError: "['classla', 'classla_lg', 'nvdia', 'whisper_text', 'whisper_pad'] not in index"

In [32]:
df = df[['text', 'xmin', 'xmax', 'interval', 'tier', 'start', 'end', 'classla','classla_lg', 'nvdia', 'whisper_text', 'whisper_pad']]
df

Unnamed: 0,text,xmin,xmax,interval,tier,start,end,classla,classla_lg,nvdia,whisper_text,whisper_pad
0,Dobro.,0.155012,0.456584,2,1,2480,7306,dobro,dobro,[za mra],Dobro.,
1,"((noise, smack))",0.456584,1.659126,3,1,7305,26547,pa,o,[],,
2,Onda moremo eh započet.,1.659126,4.068000,4,1,26546,65089,onda moramo započet,onda moramo započet,[onda moramo započeti],on da morimo zapocet.,
3,((paper rustling)),4.068000,6.757714,5,1,65088,108124,take,š,[do],Snišlišlišlišlišlišlišlišlišlišlišlišlišlišli...,
4,S obziron da san i ja govornik,6.757714,8.268000,6,1,108123,132288,s obzirom da sam ja govornik,s obzirom da sam ja govornik,[es obzirom da sam i ja govornik],so bezvendace na jago bornik.,
...,...,...,...,...,...,...,...,...,...,...,...,...
489,je,173.966988,174.032100,414,4,2783471,2784514,,i,[],Kako je da je da je da je da je da je da je d...,"To, to, to."
490,to,174.032100,174.202306,415,4,2784513,2787237,to,to,[to],,To je to.
491,ča,174.358361,174.650000,417,4,2789733,2794400,ute,če,[če],Kako je da je da je da je da je da je da je d...,
492,se,174.650000,174.793584,418,4,2794400,2796698,o,,[eu],,Degončali.


In [36]:
df['nvdia_text'] = df['nvdia'].apply(lambda x: x[0])
df_word['nvdia_text'] = df_word['nvdia'].apply(lambda x: x[0])


In [44]:
df_snt = df[df['tier'].isin([1,3])]
df_snt

Unnamed: 0,text,xmin,xmax,interval,tier,start,end,classla,classla_lg,nvdia,whisper_text,whisper_pad,nvdia_text
0,Dobro.,0.155012,0.456584,2,1,2480,7306,dobro,dobro,[za mra],Dobro.,,za mra
1,"((noise, smack))",0.456584,1.659126,3,1,7305,26547,pa,o,[],,,
2,Onda moremo eh započet.,1.659126,4.068000,4,1,26546,65089,onda moramo započet,onda moramo započet,[onda moramo započeti],on da morimo zapocet.,,onda moramo započeti
3,((paper rustling)),4.068000,6.757714,5,1,65088,108124,take,š,[do],Snišlišlišlišlišlišlišlišlišlišlišlišlišlišli...,,do
4,S obziron da san i ja govornik,6.757714,8.268000,6,1,108123,132288,s obzirom da sam ja govornik,s obzirom da sam ja govornik,[es obzirom da sam i ja govornik],so bezvendace na jago bornik.,,es obzirom da sam i ja govornik
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,#,170.700000,171.037673,68,3,2731200,2736603,okuto,,[odluke],Kako je da je da je da je da je da je da je d...,No...,odluke
133,mmm kantana maša.,171.037673,172.235189,69,3,2736602,2755764,kamdana maša,kamdanamašao,[pandana mašea],Kada ne mašio.,,pandana mašea
134,"((smack, breath))",172.235189,173.463225,70,3,2755763,2775412,ot,,[u],,,u
135,To je/ to je to.,173.463225,174.202306,71,3,2775411,2787237,to je to,to je to,[to je to],"To, to, to.","To, to, to.",to je to


In [45]:
df_snt.head(30)

Unnamed: 0,text,xmin,xmax,interval,tier,start,end,classla,classla_lg,nvdia,whisper_text,whisper_pad,nvdia_text
0,Dobro.,0.155012,0.456584,2,1,2480,7306,dobro,dobro,[za mra],Dobro.,,za mra
1,"((noise, smack))",0.456584,1.659126,3,1,7305,26547,pa,o,[],,,
2,Onda moremo eh započet.,1.659126,4.068,4,1,26546,65089,onda moramo započet,onda moramo započet,[onda moramo započeti],on da morimo zapocet.,,onda moramo započeti
3,((paper rustling)),4.068,6.757714,5,1,65088,108124,take,š,[do],Snišlišlišlišlišlišlišlišlišlišlišlišlišlišli...,,do
4,S obziron da san i ja govornik,6.757714,8.268,6,1,108123,132288,s obzirom da sam ja govornik,s obzirom da sam ja govornik,[es obzirom da sam i ja govornik],so bezvendace na jago bornik.,,es obzirom da sam i ja govornik
5,((noise)),8.268,8.702694,7,1,132288,139244,ed,,[],,,
6,materinskega idioma,8.702694,10.140975,8,1,139243,162256,materinskoga idioma,materinskoga edioma,[materinskoga dioma],Matrenj se ga je dioma.,,materinskoga dioma
7,((breath)),10.140975,10.717607,9,1,162255,171482,d,,[],Kako je da je da je da je da je da je da je d...,Daj naš da je da je da je da je da je da je d...,
8,"onda ćemo govorit, ću provat po domaće pa, za ...",10.717607,13.978445,10,1,171481,223656,onda ćemo govoriti ćekulatko do majhe pa za po...,onda ćemo govorit ehlatko do maćepa za početak,[onda ćemo govoriti čehva po domate pa za poče...,"onda ćemo govoriti čekrat koja da mati pa, za...",,onda ćemo govoriti čehva po domate pa za početak
9,"((breath, noise))",13.978445,14.399799,11,1,223655,230397,op,,[o],,,o


In [37]:
df.to_csv('/Users/irisz/downloads/2023_ELIC/ckm001-2022-01-16-Trviž_01.csv')
df_word.to_csv('/Users/irisz/downloads/2023_ELIC/ckm001-2022-01-16-Trviž_01_word.csv')

In [46]:
df_snt.to_csv('/Users/irisz/downloads/2023_ELIC/ckm001-2022-01-16-Trviž_01_sentence.csv')


In [38]:
df_word

Unnamed: 0,text,xmin,xmax,interval,tier,start,end,classla,classla_lg,nvdia,whisper_text,whisper_pad,nvdia_text
16,dobro,0.154668,0.456584,2,2,2474,7306,dobro,dobro,[zamra],Dobro.,,zamra
17,onda,1.659126,2.020000,4,2,26546,32320,onda,onda,[onda],Kako je da je da je da je da je da je da je d...,"Ovo, da moram.",onda
18,moremo,2.020000,2.629858,5,2,32320,42078,moramo,morano,[moramo],Moja demora.,Moja ima...,moramo
19,eh,2.629858,3.070000,6,2,42077,49120,,a,[i],A.,,i
20,započet,3.210000,4.067719,8,2,51360,65084,započet,započeti,[započet],zapocet.,zapocet.,započet
...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,je,173.966988,174.032100,414,4,2783471,2784514,,i,[],Kako je da je da je da je da je da je da je d...,"To, to, to.",
490,to,174.032100,174.202306,415,4,2784513,2787237,to,to,[to],,To je to.,to
491,ča,174.358361,174.650000,417,4,2789733,2794400,ute,če,[če],Kako je da je da je da je da je da je da je d...,,če
492,se,174.650000,174.793584,418,4,2794400,2796698,o,,[eu],,Degončali.,eu


In [39]:
def get_acc(trans):
    text = df_word['text'].tolist()
    trans_text = df_word[trans].tolist()
    acc = 0
    for i in range(len(text)):
        if text[i] == trans_text[i]:
            acc = acc + 1
        else:
            continue
    acc_value = acc/len(text)
    return acc_value
get_acc('nvdia_text')

0.3071253071253071

In [40]:
get_acc('classla')

0.40540540540540543

In [41]:
get_acc('classla_lg')

0.371007371007371

In [42]:
get_acc('whisper_text')

0.0

In [43]:
get_acc('whisper_pad')

0.0

In [None]:
## functions from: classla, nvdia, whisper
## potentially self-trained model using eplis etc.



# generate textgrid file from df

In [None]:
## get word and sentence string

In [6]:
df_word = df[df['tier'].isin([2,4])]
df_word

Unnamed: 0,text,xmin,xmax,interval,tier,start,end
16,dobro,0.154668,0.456584,2,2,2474,7306
17,onda,1.659126,2.020000,4,2,26546,32320
18,moremo,2.020000,2.629858,5,2,32320,42078
19,eh,2.629858,3.070000,6,2,42077,49120
20,započet,3.210000,4.067719,8,2,51360,65084
...,...,...,...,...,...,...,...
489,je,173.966988,174.032100,414,4,2783471,2784514
490,to,174.032100,174.202306,415,4,2784513,2787237
491,ča,174.358361,174.650000,417,4,2789733,2794400
492,se,174.650000,174.793584,418,4,2794400,2796698


In [18]:
df_sent = df[df['tier'].isin([1,3])]
df_sent

Unnamed: 0,text,xmin,xmax,interval,tier,start,end
0,Dobro.,0.155012,0.456584,2,1,2480,7306
1,"((noise, smack))",0.456584,1.659126,3,1,7305,26547
2,Onda moremo eh započet.,1.659126,4.068000,4,1,26546,65089
3,((paper rustling)),4.068000,6.757714,5,1,65088,108124
4,S obziron da san i ja govornik,6.757714,8.268000,6,1,108123,132288
...,...,...,...,...,...,...,...
132,#,170.700000,171.037673,68,3,2731200,2736603
133,mmm kantana maša.,171.037673,172.235189,69,3,2736602,2755764
134,"((smack, breath))",172.235189,173.463225,70,3,2755763,2775412
135,To je/ to je to.,173.463225,174.202306,71,3,2775411,2787237


In [23]:
df_sent.sort_values('start').iloc[30*1:30*2]

Unnamed: 0,text,xmin,xmax,interval,tier,start,end
81,eh koji su mi eh dan danas ((noise)) neki usta...,50.049048,54.179343,17,3,800784,866870
82,((breath)),54.179343,55.028524,18,3,866869,880457
83,Eh najlepče iz djetinjstva eh su raznorazne eh...,55.028524,63.829778,19,3,880456,1021277
84,"((smack, breath))",63.829778,64.771086,20,3,1021276,1036338
85,"Ne znan, igrali smo se na primjer eh „restoran...",64.771086,69.290358,21,3,1036337,1108646
86,((breath)),69.290358,69.924432,22,3,1108645,1118791
87,"mmm van sve eh kalotine, i jena kalotina malo ...",69.924432,76.895482,23,3,1118790,1230328
88,((breath)),76.895482,77.364,24,3,1230327,1237824
89,eh karege.,77.364,80.029552,25,3,1237824,1280473
90,#,80.029552,80.2297,26,3,1280472,1283676


In [17]:
df_word.sort_values('start').iloc[30*7:30*8]

Unnamed: 0,text,xmin,xmax,interval,tier,start,end
297,eh,98.62,98.855752,194,4,1577920,1581693
298,skrivača,98.855752,99.67,195,4,1581692,1594720
299,ili,99.67,100.268678,196,4,1594720,1604299
300,eh,100.268678,100.686766,197,4,1604298,1610989
301,plan,101.090005,101.581159,199,4,1617440,1625299
302,plan,101.86,102.15,201,4,1629760,1634400
303,grada,102.15,102.49,202,4,1634400,1639840
304,pa,102.49,102.59,203,4,1639840,1641440
305,smo,102.59,102.79,204,4,1641440,1644640
306,se,102.79,102.95,205,4,1644640,1647200


In [24]:
word = df_word.sort_values('start')['text'].tolist()
text = " ".join(word)
text

'dobro onda moremo eh započet s obziron da san i ja govornik materinskega idioma onda ćemo govorit ću provat po domaće pa za početak moreš nan eh reć nešto u kratko o sebe kade si rođena kade si kade si se zgojila i tako nešto ča te domišja na djetinjstvo eh znači rodila san se u Trvižu eh jenen eh mestu srednjovjekovnen gradiću eh blizu Pazina eh živila san z mamon i z ćaćon z nonićima i z braton i sestron eh djet za eh djetinstvo me vežu jako lepe uspomene eh najviše mmm bezbrižna igra sa svojin susedima eh koji su mi eh dan danas neki ustali i preteli eh najlepče iz djetinjstva eh su raznorazne eh n ne znan igre vezane za ne znan igrali smo se na primjer eh restorana pa smo si stavili mmm van sve eh kalotine i jena kalotina malo veća nan je bila kako miza a sve okolo su bili eh karege eh onda smo se igrali eh z muškema smo se s obziron da ni bilo još preveć igračkah ki je ča ima to smo se igrali z muškema smo se igrali z veturicama u sablonu eh najveć smo delali kućice na drevu eh a