In [1]:
import huggingsound
from huggingsound import SpeechRecognitionModel
import os
import pandas as pd
import numpy as np
import torch
import json
import ast
import librosa

  from .autonotebook import tqdm as notebook_tqdm


# Erstellen der CSV

In [2]:
#Lädt Referenz
path_csv     = r"D:\Masterarbeit\LABELD\test.csv"
csv          = pd.read_csv(path_csv)

#Lädt Datein
folder_test  = r"D:\Masterarbeit\SAMPLES PROCESSED\Voice\TEST"
files_test   = csv.filename.apply(lambda filename: os.path.join(folder_test,filename))

In [3]:
#Lädt Model
model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")

03/31/2023 15:57:12 - INFO - huggingsound.speech_recognition.model - Loading model...


In [4]:
result = model.transcribe(files_test)
csv["result_wav2vec2"] = result

100%|██████████| 1024/1024 [26:14<00:00,  1.54s/it]


In [5]:
csv.to_csv("result_wav2vec2.csv",index=False)

# Auswertung

In [2]:
csv = pd.read_csv("result_wav2vec2.csv")
csv["result_wav2vec2"] = csv.result_wav2vec2.apply(ast.literal_eval)

In [4]:
start_wav2vec2 = csv.result_wav2vec2.apply(lambda json: min(json["start_timestamps"]) / 1000)
end_wav2vec2   = csv.result_wav2vec2.apply(lambda json: max(json["end_timestamps"])   / 1000)

diff_start = np.abs((start_wav2vec2 - csv.start) * 1000)
diff_end   = np.abs((end_wav2vec2 - csv.end) * 1000)

total_diff = diff_start + diff_end
csv["total_diff_wav2vec2"] = total_diff

In [12]:
csv["total_diff_wav2vec2"].mean()

155.654296875

# Analyse Über DB Treshhold

In [18]:
#Konstante
SAMPLE_RATE = 16000

#Funktion lädt Tensor
def load_file_as_tensor(file_path):

    waveform, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE, mono=True, dtype="float64")
    
    #Tensor
    waveform = torch.from_numpy(waveform).to(torch.float32)

    #Return
    return waveform, sample_rate

In [25]:
#Erstellt Pfad
folder_test  = r"D:\Masterarbeit\SAMPLES PROCESSED\Voice\TEST"
files_test   = csv.filename.apply(lambda filename: os.path.join(folder_test,filename))

#Lädt Datein
waveforms_samplerates = files_test.apply(load_file_as_tensor)
waveforms = waveforms_samplerates.apply(lambda ws: ws[0])

In [13]:
#AMP to DB und umgekehrt
def amp_to_db(tensor):
    return tensor.log10() * 20

def db_to_amp(tensor):
    return 10.0**(0.5 * tensor/10)

#Berechnet RMS
def rms(tensor):
    tensor = tensor.square()
    tensor = tensor.mean(dim=-1)
    tensor = tensor.sqrt()
    return tensor

def db(tensor):
    return amp_to_db(rms(tensor))

In [31]:
#Konstanten
NON_SILENT_SAMPLE_LENGTH = 0.03
NON_SILENT_HOP_LENGTH    = 0.01
MAX_GAP_LENGTH           = 0.6

NON_SILENT_SAMPLE_LENGTH   = librosa.time_to_samples(NON_SILENT_SAMPLE_LENGTH, sr=SAMPLE_RATE)
NON_SILENT_HOP_LENGTH      = librosa.time_to_samples(NON_SILENT_HOP_LENGTH,    sr=SAMPLE_RATE)
MAX_GAP_LENGTH             = librosa.time_to_samples(MAX_GAP_LENGTH,           sr=SAMPLE_RATE)

In [33]:
#Erzeugt Matrix welche zum Mulitplizieren verwendet wird
matrix_m_number_of_rows  = 1000
matrix_n_waveform_length = librosa.time_to_samples(15,sr=SAMPLE_RATE)

multiplication_matix = []

for idx in range(matrix_m_number_of_rows):
    
    #erzeugt neue Zeile
    row = torch.zeros(matrix_n_waveform_length)
    
    #Setzt der 1en
    current_pos = idx * NON_SILENT_HOP_LENGTH
    row[current_pos:current_pos + NON_SILENT_SAMPLE_LENGTH] = 1

    multiplication_matix.append(row)

multiplication_matix = torch.vstack(multiplication_matix)

In [34]:
@torch.no_grad()
def get_non_silent(waveform, treshhold = 20):

    #Sichert das der Übergebense Tensor die Form[Datenpunkt]
    if len(waveform.shape) != 1:
        raise Exception("BAD TENSOR SHAPE")
    
    #Speichert Wert für Später
    waveform_length = waveform.shape[-1]

    #Fügt 0 am Tensor an damit über die Gesamte Länge die Sprache erkannt werden kann
    waveform = torch.cat([waveform, torch.zeros(NON_SILENT_SAMPLE_LENGTH - 1)])

    #Transformiert in Shape [X, NON_SILENT_SAMPLE_LENGTH]
    waveform = waveform.unfold(
        dimension = 0,
        size      = NON_SILENT_SAMPLE_LENGTH,
        step      = NON_SILENT_HOP_LENGTH
    )

    #Berechnet DBFS
    waveform_db = db(waveform)

    #Treshhold
    waveform_db.gt_(waveform_db.max() - treshhold)

    #Mulitpliziert mit Matrix um Abdeckung zu vergleichen
    waveform_db = waveform_db.diag()
    result   = torch.matmul(waveform_db, multiplication_matix[:waveform_db.shape[-1], :waveform_length].to(waveform.dtype))

    return result.sum(dim=0).gt_(0)

In [45]:
non_silent = waveforms.apply(lambda waveform: get_non_silent(waveform))
nonzero    = non_silent.apply(lambda tensor: tensor.flatten().nonzero().flatten())

In [55]:
#Kalkuliert Start und Endpunkte
start_selfmade = nonzero.apply(lambda tensor: tensor.min().item())
start_selfmade = start_selfmade.apply(lambda startpoint: librosa.samples_to_time(samples=startpoint,sr=SAMPLE_RATE))

end_selfmade   = nonzero.apply(lambda tensor: tensor.max().item())
end_selfmade   = end_selfmade.apply(lambda end_selfmade: librosa.samples_to_time(samples=end_selfmade,sr=SAMPLE_RATE))

In [59]:
diff_start = np.abs((start_selfmade - csv.start) * 1000)
diff_end   = np.abs((end_wav2vec2 - csv.end) * 1000)

total_diff = diff_start + diff_end
csv["total_diff_selfmade"] = total_diff

In [61]:
csv["total_diff_selfmade"].mean()

219.443359375

# Kombination aus beidem

In [68]:
files_duration = files_test.apply(lambda filepath: librosa.get_duration(path=filepath))

In [170]:
search_window_size = 0.2

#Start
search_start_lower = start_wav2vec2 - search_window_size / 2
search_start_lower[search_start_lower < 0] = 0
search_start_upper = start_wav2vec2 + search_window_size / 2
search_start_upper[search_start_upper > files_duration] = files_duration[search_start_upper > files_duration]

#Ende
search_end_lower = end_wav2vec2 - search_window_size / 2
search_end_lower[search_start_lower < 0] = 0
search_end_upper = end_wav2vec2 + search_window_size / 2
search_end_upper[search_end_upper > files_duration] = files_duration[search_end_upper > files_duration]

#to Frames
search_start_lower = search_start_lower.apply(lambda time: librosa.time_to_samples(times=time,sr=SAMPLE_RATE))
search_start_upper = search_start_upper.apply(lambda time: librosa.time_to_samples(times=time,sr=SAMPLE_RATE))
search_end_lower   = search_end_lower.apply(lambda time: librosa.time_to_samples(times=time,sr=SAMPLE_RATE))
search_end_upper   = search_end_upper.apply(lambda time: librosa.time_to_samples(times=time,sr=SAMPLE_RATE))


In [171]:
_calc = pd.DataFrame({
    "nonzero": nonzero,
    "search_start_lower": search_start_lower,
    "search_start_upper": search_start_upper,
    "w2v_start"         : start_wav2vec2.apply(lambda time: librosa.time_to_samples(times=time,sr=SAMPLE_RATE)).apply(torch.tensor),

    "search_end_lower": search_end_lower,
    "search_end_upper": search_end_upper,
    "w2v_end"         : end_wav2vec2.apply(lambda time: librosa.time_to_samples(times=time,sr=SAMPLE_RATE)).apply(torch.tensor),
})

calc_start = _calc.apply(lambda row: row["nonzero"][ torch.logical_and (row["nonzero"] >= row["search_start_lower"], row["nonzero"] <= row["search_start_upper"]) ] ,axis=1)
calc_start[calc_start.apply(lambda t: t.shape[0]) == 0] = _calc[calc_start.apply(lambda t: t.shape[0]) == 0].w2v_start
calc_end   = _calc.apply(lambda row: row["nonzero"][ torch.logical_and (row["nonzero"] >= row["search_end_lower"],   row["nonzero"] <= row["search_end_upper"]) ] ,axis=1)
calc_end[calc_end.apply(lambda t: t.shape[0]) == 0] = _calc[calc_end.apply(lambda t: t.shape[0]) == 0].w2v_end

In [173]:
calc_start = calc_start.apply(lambda t: t.min()).apply(lambda t: librosa.samples_to_time(samples=t,sr=SAMPLE_RATE))
calc_end   = calc_end.apply(lambda t: t.max()).apply(lambda t: librosa.samples_to_time(samples=t,sr=SAMPLE_RATE))

In [174]:
diff_start = np.abs((calc_start - csv.start) * 1000)
diff_end   = np.abs((calc_end - csv.end) * 1000)

total_diff = diff_start + diff_end
csv["total_diff_combined"] = total_diff
csv["total_diff_combined"].mean()

110.37896728515625