In [1]:
%load_ext autoreload
%autoreload 2

from importlib.util import find_spec
# If the `core` package cannot be located on the current import path, add the
# parent directory to sys.path so that later `core.*` imports can be resolved.
if find_spec("core") is None:
    import sys
    sys.path.append('..')
    
import pandas as pd
import numpy as np

import os
import sys
import glob
from tqdm.notebook import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

import librosa
import librosa.display
from IPython.display import Audio

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split


#import cv2
#from PIL import Image

#from typing import Dict, Any
#import math

import torch
from torch import LongTensor, FloatTensor, HalfTensor, Tensor
from torch.utils.data import Dataset, DataLoader
from copy import deepcopy
from torch import nn, optim
import torch.nn.functional as F
import inspect

from catalyst import dl, utils
import timm

from resemblyzer import preprocess_wav, VoiceEncoder
from spectralcluster import SpectralClusterer
from resemblyzer.audio import sampling_rate as resempler_sampling_rate
# Notebook-wide speaker encoder, built with default arguments (the captured
# output reports it loaded on cpu). The trailing "cuda:0" is a commented-out
# alternative — presumably a device argument; TODO confirm before enabling.
resemblyzer_encoder = VoiceEncoder()#"cuda:0"

#from torch.fft import fft, ifft
#from torchaudio.functional import bandpass_biquad
#from nnAudio.Spectrogram import CQT1992v2
#import torchvision.models as models

%matplotlib inline

Loaded the voice encoder model on cpu in 0.04 seconds.


In [None]:
from core.datasets import BasicAudioDataset

In [2]:
# Recording used for this walkthrough; the path is relative to the notebook's
# working directory. (Plain string: the literal has no placeholders to format.)
audio_file_path = '../datasets/audios_for_testing/happy_43f17262de5f95e736b13225da0f28ab.wav'

# sr=None keeps the file's native sampling rate instead of resampling on load;
# duration=120 reads at most the first 120 seconds of the file.
input_audio_wave, audio_sampling_rate = librosa.load(audio_file_path, sr=None, duration=120)

In [3]:
# Sanity check: sample count and native sampling rate, then an inline player.
print(f"{input_audio_wave.shape} {audio_sampling_rate}")
Audio(data=input_audio_wave, rate=audio_sampling_rate)

(1263401,) 22050


In [4]:
# Bring the waveform to the sampling rate the Resemblyzer encoder works at.
# The rates are passed by keyword: librosa >= 0.10 made `orig_sr`/`target_sr`
# keyword-only, and the keyword form is also accepted by older releases.
audio_wave_resempler_sr = librosa.resample(
    input_audio_wave,
    orig_sr=audio_sampling_rate,
    target_sr=resempler_sampling_rate,
)
input_audio_wave.shape, audio_wave_resempler_sr.shape

((1263401,), (916754,))

In [5]:
# Run Resemblyzer's waveform preprocessing on the resampled signal; the printed
# shapes show how many samples remain afterwards.
audio_without_silence = preprocess_wav(audio_wave_resempler_sr)
print(f"{audio_wave_resempler_sr.shape} {audio_without_silence.shape}")
Audio(data=audio_without_silence, rate=resempler_sampling_rate)

(916754,) (881280,)


In [6]:
# Embed the preprocessed audio. With return_partials=True the encoder returns
# three values: the utterance-level embedding, the per-window (partial)
# embeddings, and the sample slices each partial embedding was computed from.
(
    mean_normalized_slice_audio_embedding,
    slice_audio_embedding,
    time_slices,
) = resemblyzer_encoder.embed_utterance(audio_without_silence, return_partials=True)

# Cluster the partial embeddings into 2-3 groups (one group per speaker).
# Tuning knobs tried earlier and left disabled: p_percentile=0.90, gaussian_blur_sigma=1
spectral_clusterer = SpectralClusterer(min_clusters=2, max_clusters=3)

# One cluster label per partial embedding / time slice.
cluster_labels_slices = spectral_clusterer.predict(slice_audio_embedding)


def create_labelling_spectral_clusterer(cluster_labels_slices, time_slices, sampling_rate=None):
    """Collapse per-slice cluster labels into contiguous labelled segments.

    Each slice is represented by the time (in seconds) of its centre sample.
    A segment is closed whenever the label of a slice differs from the label
    of the previous slice; the boundary is placed at the centre of the slice
    where the change is observed. The final segment ends at the centre of the
    last slice.

    Args:
        cluster_labels_slices: Indexable sequence with one cluster label per
            slice (same order as ``time_slices``).
        time_slices: Iterable of ``slice`` objects whose ``start``/``stop``
            are sample indices into the clustered waveform.
        sampling_rate: Samples per second used to convert sample indices to
            seconds. When ``None`` (default) the notebook-level
            ``resempler_sampling_rate`` is used, which keeps existing calls
            working unchanged.

    Returns:
        list[tuple[str, float, float]]: ``(label, start_seconds, stop_seconds)``
        tuples in chronological order. Empty when ``time_slices`` is empty.

    Note:
        If the label changes exactly at the last slice, the last tuple has
        ``start == stop`` (a zero-length segment); callers that cut audio by
        these bounds should be prepared for an empty slice.
    """
    if sampling_rate is None:
        # Backward-compatible default: fall back to the notebook-wide rate.
        sampling_rate = resempler_sampling_rate

    # Centre of every slice, converted from sample index to seconds.
    slice_centres = [((s.start + s.stop) / 2) / sampling_rate for s in time_slices]

    labelling = []
    start_time = 0
    last_index = len(slice_centres) - 1

    for i, centre in enumerate(slice_centres):
        # Label changed between the previous slice and this one: close the
        # running segment under the previous label and open a new one here.
        if i > 0 and cluster_labels_slices[i] != cluster_labels_slices[i - 1]:
            labelling.append((str(cluster_labels_slices[i - 1]), start_time, centre))
            start_time = centre
        # Always flush the segment that is still open at the final slice.
        if i == last_index:
            labelling.append((str(cluster_labels_slices[i]), start_time, centre))

    return labelling

# Merge the per-slice cluster labels into (person, start, stop) rows, one row
# per contiguous run of the same label, and preview the resulting table.
labelling_spectral_clusterer = create_labelling_spectral_clusterer(
    cluster_labels_slices=cluster_labels_slices,
    time_slices=time_slices,
)
phrase_df = pd.DataFrame(
    data=labelling_spectral_clusterer,
    columns=['person', 'start', 'stop'],
)
phrase_df.head()

Unnamed: 0,person,start,stop
0,1,0.0,1.57
1,0,1.57,5.42
2,1,5.42,8.5
3,0,8.5,13.12
4,1,13.12,26.98


In [40]:
# The dataset object supplies both the sampling rate the model expects and the
# routine that turns a raw audio slice into model-ready samples.
audio_preparator_dataset = BasicAudioDataset()
target_sampling_rate = audio_preparator_dataset.sample_rate

# Resample the preprocessed audio from the Resemblyzer rate to the dataset rate.
# The rates are passed by keyword: librosa >= 0.10 made `orig_sr`/`target_sr`
# keyword-only, and the keyword form is also accepted by older releases.
audio_without_silence_dataset_sr = librosa.resample(
    audio_without_silence,
    orig_sr=resempler_sampling_rate,
    target_sr=target_sampling_rate,
)

prepared_audio_samples_list = []
for index_phrase, phrase in phrase_df.iterrows():
    # `start`/`stop` are in seconds; convert them to sample indices at the
    # dataset sampling rate before cutting the phrase out of the waveform.
    first_sample = int(target_sampling_rate * phrase['start'])
    last_sample = int(target_sampling_rate * phrase['stop'])
    audio_slice_phrase = audio_without_silence_dataset_sr[first_sample:last_sample]
    prepared_audio_sample = audio_preparator_dataset.slice_prepare_test_audio(
        audio_slice_phrase, target_sampling_rate)
    prepared_audio_samples_list.append(np.array(prepared_audio_sample))

# One entry per phrase row, in the same order as the rows of phrase_df.
phrase_df['list_of_audios_for_model'] = prepared_audio_samples_list
phrase_df.head(1)

Unnamed: 0,person,start,stop,list_of_audios_for_model
0,1,0.0,1.57,"[[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -..."
