In [7]:
from pathlib import Path

from MakeSlices.AudioSlicer import AudioSlicer

def slicer(path:str):
    """Cut one audio file into slices and return them as a list.

    The slicing backend is picked from the file suffix: a suffix containing
    'mp3' selects the Pydub implementation (winlen 1000, stride 500); any
    other suffix selects the Librosa implementation (winlen 0.5, stride 0.5).
    The units of those numbers are defined by the respective backend.

    Args:
        path: Location of the audio file.

    Returns:
        list: Every item produced by the configured AudioSlicer, in order.
    """
    audio_file = Path(path)

    # The backend modules are imported inside the branch that needs them.
    if 'mp3' in audio_file.suffix:
        from MakeSlices.PydubSlices import MakeSlices, SliceLoader
        slices_kwargs = {'winlen' : 1000, 'stride' : 500}
    else:
        from MakeSlices.LibrosaSlices import MakeSlices, SliceLoader
        slices_kwargs = {'winlen' : 0.5, 'stride' : 0.5}

    # Loader options are the same for both backends.
    loader_kwargs = {'samplerate' : None, 'to_torch' : True}

    load_audio = AudioSlicer(
        MakeSlices=MakeSlices,
        SliceLoader=SliceLoader,
        SlicesKwargs=slices_kwargs,
        LoaderKwargs=loader_kwargs,
    )

    return list(load_audio(audio_file))

# audio_slices = slicer('/home/n2202857e/Documents/pyannote-audio/data/pyannote/amicorpus/ES2012c/audio/ES2012c.Mix-Headset.wav')



In [1]:
import umap
import numpy as np
def umap_projection(data : np.array, n_components : int = 2):
    """Fit a UMAP reducer on ``data`` and return the projected points.

    Args:
        data: Numpy array (num_samples X feature_dim).
        n_components: Number of output dimensions passed to ``umap.UMAP``.

    Returns:
        Whatever ``UMAP.fit_transform`` returns for ``data``.
    """
    return umap.UMAP(n_components=n_components).fit_transform(data)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_mask(data, sample_rate=16000, min_utterance_length_ms=25, merge_dist_ms=32,
             silence_value=0, silence_rtol=1e-1, silence_atol=1e-2):
    """Label every sample of a signal as active (1) or silent (0).

    A sample counts as silent when ``numpy.ma.masked_values`` considers it
    equal to ``silence_value`` under the given tolerances.  The raw labels
    are then smoothed in two passes: silent gaps lying between two active
    runs are filled when they are short enough (``merge_dist_ms``), and
    active runs that end too soon after they start
    (``min_utterance_length_ms``) are erased.

    Args:
        data: Signal samples; anything that is not an ndarray is converted
            with ``np.array`` and must then be one-dimensional.
        sample_rate: Samples per second, used to convert the ``*_ms`` arguments.
        min_utterance_length_ms: Threshold for the run-removal pass.
        merge_dist_ms: Threshold for the gap-filling pass.
        silence_value: Reference value treated as silence.
        silence_rtol: Relative tolerance forwarded to ``masked_values``.
        silence_atol: Absolute tolerance forwarded to ``masked_values``.

    Returns:
        numpy.ndarray of dtype int8 with one label per input sample.
    """
    import numpy as np
    from numpy import ma

    # Both thresholds are converted from milliseconds to (fractional) sample counts.
    min_run = min_utterance_length_ms * sample_rate / 1000.0
    max_gap = merge_dist_ms * sample_rate / 1000.0

    if not isinstance(data, np.ndarray):
        data = np.array(data)
        assert len(data.shape) == 1

    silent = ma.getmaskarray(
        ma.masked_values(data, silence_value, rtol=silence_rtol, atol=silence_atol)
    )
    active = np.logical_not(silent).astype(np.int8)
    total = active.shape[0]

    # Pass 1: fill short silent gaps.  ``anchor`` remembers the index of the
    # last active sample before the most recent gap (-1 when there is none).
    anchor = -1
    for pos in range(1, total):
        if active[pos - 1] == 1 and active[pos] == 0:
            anchor = pos - 1
            continue
        if active[pos - 1] == 0 and active[pos] == 1 and anchor > -1 and pos - anchor <= max_gap:
            active[anchor:pos] = 1
            anchor = -1

    # Pass 2: erase short active runs.  ``anchor`` is (re)armed whenever it is
    # unset or a silent-to-active transition is seen.
    anchor = -1
    for pos in range(total):
        if anchor == -1 or (active[pos - 1] == 0 and active[pos] == 1):
            anchor = pos - 1
            continue
        if active[pos - 1] == 1 and active[pos] == 0 and pos - anchor < min_run:
            active[anchor:pos] = 0
            anchor = -1

    return active

In [3]:
import matplotlib.pyplot as plt
def show_emb_label(emb,label):
    """Scatter-plot the first two columns of ``emb``, coloured by ``label``."""
    xs, ys = emb[:, 0], emb[:, 1]
    plt.scatter(xs, ys, s=10, c=label)
    plt.show()
def show_emb_nolabel(emb):
    """Scatter-plot the first two columns of ``emb`` without colour labels."""
    xs, ys = emb[:, 0], emb[:, 1]
    plt.scatter(xs, ys, s=10)
    plt.show()

In [4]:
def majorityElement(arr, n) :
    """Return the strict-majority value among the first ``n`` sorted elements.

    A value is a majority when it occurs more than ``n // 2`` times.  The
    input is sorted into a new list, so the caller's sequence (for example
    a NumPy slice that is a view into a larger array) is left untouched.

    Args:
        arr: Sequence of mutually comparable values (list or 1-D array).
        n: Number of elements to consider; expected to be at most ``len(arr)``.

    Returns:
        The majority element, or -1 when no element occurs more than
        ``n // 2`` times (including when ``arr`` is empty or ``n <= 0``).
    """
    if n <= 0 or len(arr) == 0:
        return -1

    # Work on a sorted copy instead of calling arr.sort() on the caller's data.
    window = sorted(arr)[:n]

    # In sorted order, a value occupying more than half of the positions must
    # cover the middle index, so that is the only possible candidate.
    candidate = window[n // 2]
    if window.count(candidate) > n // 2:
        return candidate
    return -1

In [5]:
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
import torch, torchaudio

# Load the pretrained feature extractor and the WavLMForXVector model from the
# same checkpoint name.
# NOTE(review): the name `model` is rebound to NeuralNetwork() in a later cell;
# re-running the embedding cell after that point would use the wrong object.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sv")
model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv")

In [97]:
import os
# Root folder with one sub-directory per recording; each recording's audio is
# expected at <name>/audio/<name>.Mix-Headset.wav.
# NOTE(review): absolute, machine-specific path.
directory = '/home/n2202857e/Documents/pyannote-audio/data/pyannote/amicorpus'

# Per recording: the list of slices (audio_list) and the torchaudio metadata
# (metadata), stored at the same index.
audio_list = []
metadata = []
count = 0

# Recordings are visited in sorted name order and processing stops after 10.
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename,"audio",filename+'.Mix-Headset.wav')
    
    metadata.append(torchaudio.info(f))
    audio_list.append(slicer(f))
    
    # Progress output: index of the recording that was just processed.
    print(count)
    count = count + 1
    if (count==10):
        break
    else:
        continue

0
1
2
3
4
5
6
7
8
9


In [79]:
len(audio_list[0])

10501

In [6]:
# waveform, sample_rate = torchaudio.load('/home/n2202857e/Documents/pyannote-audio/data/pyannote/amicorpus/ES2012c/audio/ES2012c.Mix-Headset.wav')
# metadata = torchaudio.info('/home/n2202857e/Documents/pyannote-audio/data/pyannote/amicorpus/ES2012c/audio/ES2012c.Mix-Headset.wav')
# print(waveform[0].shape, type(waveform),metadata)


torch.Size([35339094]) <class 'torch.Tensor'> AudioMetaData(sample_rate=16000, num_frames=35339094, num_channels=1, bits_per_sample=16, encoding=PCM_S)


In [15]:
# Build one 0/1 label per 8000-frame slice for the first 10 RTTM files.
# pandas is imported here because this cell calls pd.read_csv and no active
# import of pandas exists elsewhere in the notebook.
import pandas as pd

count = 0
directory = '/home/n2202857e/Documents/pyannote-audio/data/only_words/rttms/train'

sliced_mask_list = []

for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)

    rttm = pd.read_csv(f,sep=' ',header=None)
    rttm.columns=['label_type', 'utt_id', 'channel', 'start', 'duration', 'x', 'y', 'spk_id', 'z','v']

    # NOTE(review): metadata[count] assumes the sorted RTTM files line up
    # index-for-index with the recordings loaded in the audio cell — confirm.
    mask = np.zeros((metadata[count].num_frames))
    sampling_rate = metadata[count].sample_rate

    # Mark every frame covered by an RTTM segment (start/duration in seconds).
    for start, duration in zip(rttm['start'], rttm['duration']):
        mask[int(start*sampling_rate):int((start+duration)*sampling_rate)] = 1

    # Reduce the frame-level mask to one label per full slice.
    sliced_mask = []
    num_frames_per_slice = 8000
    for i in range(int(metadata[count].num_frames/num_frames_per_slice)):
        sliced_mask.append(majorityElement(mask[i*num_frames_per_slice:(i+1)*num_frames_per_slice],num_frames_per_slice))
    # One extra 0 label is appended after the full slices
    # (presumably for the trailing partial slice — confirm against slicer output).
    sliced_mask.append(0)

    sliced_mask_list.append(sliced_mask)

    print(count)
    count = count + 1
    if count == 10:
        break

0
1
2
3
4
5
6
7
8
9


In [18]:
metadata[0].sample_rate

16000

In [13]:
# import pandas as pd
# rttm = pd.read_csv('/home/n2202857e/Documents/pyannote-audio/data/only_words/rttms/train/ES2012c.rttm',sep=' ',header=None)
# rttm.columns=['label_type', 'utt_id', 'channel', 'start', 'duration', 'x', 'y', 'spk_id', 'z','v']

In [14]:
# type(rttm)

pandas.core.frame.DataFrame

In [9]:
# mask = np.zeros((metadata.num_frames))
# sampling_rate = metadata.sample_rate
# for i in range(len(rttm)):
#     mask[int(rttm['start'][i]*sampling_rate):int((rttm['start'][i]+rttm['duration'][i])*sampling_rate)] = 1

In [36]:
# sliced_mask = []
# for i in range(int(metadata.num_frames/8000)):
#     sliced_mask.append(majorityElement(mask[i*8000:(i+1)*8000],8000))

In [37]:
# count = 0
# for i in sliced_mask:
#     if i==2000:
#         count = count+1
# print(count)

0


In [33]:
# sliced_mask

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [476]:
# type(audio_slices)

list

In [None]:
# the resulting embeddings can be used for cosine similarity-based retrieval
# cosine_sim = torch.nn.CosineSimilarity(dim=-1)
# similarity = cosine_sim(embeddings[0], embeddings[1])
# threshold = 0.7  # the optimal threshold is dataset-dependent
# if similarity < threshold:
#     print("Speakers are not the same!")
# round(similarity.item(), 2)

In [None]:
# model

In [98]:
# One all-zero tensor of 8000 samples, wrapped in a single-element list.
# `silences` is what the insertion cell further down passes to np.insert.
silence = torch.zeros(8000)
silences = []
silences.append(silence)

In [99]:
silences

[tensor([0., 0., 0.,  ..., 0., 0., 0.])]

In [101]:
# Insert silence entries into every recording and matching 0 labels into its mask.
# NOTE(review): both lists are overwritten in place, so running this cell a
# second time inserts silences again on top of the previous result.
for i in range(len(audio_list)):
#     print(len(j))

    # Seed with the recording index so the chosen positions are repeatable.
    np.random.seed(i)
    # Draw 80% as many insertion positions as there are slices; the `replace`
    # argument is left at its default, so positions may repeat — TODO confirm intended.
    a = np.random.choice(len(audio_list[i]), size=int(len(audio_list[i])*0.8))
    
    # The same positions `a` are used for audio and labels so they stay aligned.
    # NOTE(review): how np.insert treats a list of tensors plus `silences` is
    # not visible here — verify the resulting element type and shape.
    audio_list[i] = np.insert(audio_list[i], a, silences)
    sliced_mask_list[i] = np.insert(sliced_mask_list[i], a, 0)

In [104]:
for i in range(10):
    print(len(audio_list[i])==len(sliced_mask_list[i]))

True
True
True
True
True
True
True
True
True
True


In [None]:
# Compute one L2-normalised embedding per audio slice, grouped per recording.
# NOTE(review): `model` must still be the WavLMForXVector instance here; a
# later cell rebinds the same name to NeuralNetwork().
count = 0
embeddings_list = []
for audio_slices in audio_list:
    final_emb = []
    for audio_slice in audio_slices:
        inputs = feature_extractor(audio_slice, sampling_rate=16000, return_tensors="pt",padding=True)
        with torch.no_grad():
            # Single forward pass; the embedding is read from this one output
            # object instead of calling the model a second time.
            outputs = model(**inputs)

        embeddings = torch.nn.functional.normalize(outputs.embeddings, dim=-1).cpu()
        final_emb.append(embeddings)
    embeddings_list.append(final_emb)

    # Progress output: number of recordings finished so far.
    count = count +1
    print(count)

  return (input_length - kernel_size) // stride + 1


In [None]:
# For each recording: convert its embeddings to numpy, project them with UMAP,
# and plot the projection coloured by the per-slice labels.
transformed_embedding = []
count = 0 

for embeddings in embeddings_list:
    # t[0] takes the first row of each stored embedding tensor before conversion.
    transformed_embedding.append([t[0].numpy() for t in embeddings])
    # `count` indexes the recording in both transformed_embedding and sliced_mask_list.
    show_emb_label(umap_projection(transformed_embedding[count]),sliced_mask_list[count])
    
    count = count + 1

In [None]:
# Per recording: 0.8 times its number of 8000-frame slices, truncated to int.
num_silences_list = []
num_frames_per_slice = 8000


# `i` is one metadata entry here (not an index).
for i in metadata:
    num_silences_list.append(int(0.8*(i.num_frames)/num_frames_per_slice))

In [29]:
metadata[0]

<torchaudio.backend.common.AudioMetaData at 0x7f40d5d6d430>

In [44]:
len(transformed_embedding[1])

6904

In [47]:
transformed_embedding[1][0]

array([-3.66830686e-03, -2.50686556e-02, -2.32856013e-02,  3.22603732e-02,
       -1.54198008e-02, -8.02203789e-02, -4.53274809e-02,  3.36432736e-03,
       -3.27243879e-02, -2.47094315e-02, -1.08638518e-02, -2.31245533e-02,
        2.77865049e-03, -2.56632194e-02, -1.03143707e-01, -4.88215964e-03,
       -2.18123645e-02, -2.50225868e-02, -3.00547350e-02, -4.30534668e-02,
       -1.22429198e-02, -3.50289829e-02, -2.98339017e-02,  6.14097156e-03,
       -1.03674401e-02, -2.54389290e-02, -1.07974345e-02, -1.58925913e-02,
       -1.14500746e-02, -7.45474845e-02, -2.94441469e-02, -1.36640649e-02,
       -5.05592208e-03, -1.47956293e-02, -2.89838482e-03, -4.43647653e-02,
       -7.39014223e-02, -4.45848331e-02, -1.63345002e-02, -3.26130278e-02,
       -9.50325374e-03, -1.54562304e-02, -3.86627689e-02, -6.72732294e-03,
       -3.90293300e-02, -2.51860693e-02, -8.19451455e-03, -3.82768409e-03,
       -4.63133864e-02, -9.82267899e-04, -3.24181281e-02, -2.83361338e-02,
       -8.69179261e-04, -

In [38]:
silence = np.zeros((512,))

In [46]:
silence

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [None]:
# for i,j in enumerate(transformed_embedding):
#     transformed_embedding[i] = np.insert(j, np.random.choice(len(j), size=int(len(j)*0.8)), silence)

In [61]:
# Scratch experiment: insert `silence` at n random positions of a 20-element list.
# NOTE(review): the recorded output of this cell is a ValueError (value array
# cannot be broadcast to the 5 selected positions), so it does not run as written.
a = list(np.ones((20)))
n = 5
a = np.insert(a, np.random.choice(20, size=n), silence)
a

ValueError: shape mismatch: value array of shape (8000,) could not be broadcast to indexing result of shape (5,)

In [None]:
# new = [t[0].numpy() for t in final_emb]
# show_emb_label(umap_projection(new),sliced_mask)

In [321]:
import torch
import speechbrain as sb

class SimpleBrain(sb.Brain):
    """Minimal Brain: feeds ``batch["input"]`` to the model and scores it with BCE."""

    def compute_forward(self, batch, stage):
        """Run the module registered as ``model`` on the batch's ``"input"`` entry."""
        network = self.modules.model
        features = batch["input"]
        return network(features)

    def compute_objectives(self, predictions, batch, stage):
        """Return speechbrain's ``bce_loss`` of ``predictions`` against ``batch["target"]``."""
        # An L1 loss on the same pair was tried earlier and is no longer used.
        targets = batch["target"]
        return sb.nnet.losses.bce_loss(predictions, targets)

#model = torch.nn.Sequential(torch.nn.LSTM(input_size = 512,hidden_size=128,num_layers=4,dropout =0.5,bidirectional=True),torch.nn.Linear(128,128),torch.nn.Softmax(2))

In [467]:
class NeuralNetwork(torch.nn.Module):
    """Four-layer bidirectional LSTM followed by two linear layers.

    Takes 512-dimensional feature vectors and produces 2 values per step.
    The LSTM keeps PyTorch's default layout (``batch_first`` is not set).
    """

    def __init__(self):
        super().__init__()
        # 64 hidden units per direction, so the LSTM emits 128 features.
        self.bilstm = torch.nn.LSTM(
            input_size=512,
            hidden_size=64,
            num_layers=4,
            dropout=0.5,
            bidirectional=True,
        )
        self.linear1 = torch.nn.Linear(128, 128)
        self.linear2 = torch.nn.Linear(128, 2)

    def forward(self, x):
        """Return the output of linear2(linear1(LSTM features)); LSTM state is discarded."""
        sequence_features, _ = self.bilstm(x)
        return self.linear2(self.linear1(sequence_features))

In [468]:
# NOTE(review): this rebinds `model`, which an earlier cell set to the
# pretrained WavLMForXVector; cells that need the WavLM model must run before this.
model = NeuralNetwork()
print(model)

NeuralNetwork(
  (bilstm): LSTM(512, 64, num_layers=4, dropout=0.5, bidirectional=True)
  (linear1): Linear(in_features=128, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=2, bias=True)
  (softmax): Softmax(dim=2)
)


In [78]:
# NOTE(review): `new` is only assigned in a commented-out cell of this
# notebook, so this line depends on leftover kernel state.
new_torch = torch.from_numpy(np.array(new))

In [121]:
new_torch.shape

torch.Size([4418, 512])

In [82]:
# Per-slice labels as a tensor; `sliced_mask` is whatever the label-building
# loop left behind for the last recording it processed.
sliced_mask_torch = torch.from_numpy(np.array(sliced_mask))

In [166]:
sliced_mask_torch

tensor([0., 0., 0.,  ..., 1., 1., 0.], dtype=torch.float64)

In [167]:
# Flatten the labels to 1-D for any number of slices (no hard-coded length).
sliced_mask_torch=sliced_mask_torch.reshape(-1)

import torch.nn.functional as Fun
# The labels are cast to int64 before one-hot encoding into 2 classes.
one_hot = Fun.one_hot(sliced_mask_torch.to(torch.int64), num_classes = 2)

In [227]:
one_hot[-2]

tensor([0, 1])

In [417]:
new_torch.shape

torch.Size([4418, 512])

In [419]:
one_hot.shape

torch.Size([4418, 2])

In [420]:
# One training example per embedding row, pairing each row of new_torch with
# the one-hot label at the same index.
assert len(new_torch) == len(one_hot), "features and labels must have the same length"
data_new = [
    {"input": features, "target": target}
    for features, target in zip(new_torch, one_hot)
]

In [450]:
from hyperpyyaml import load_hyperpyyaml

# Hyperparameters are read from a YAML file located relative to the working directory.
hparams_file = "hyp.yaml"

with open(hparams_file) as fin:

    # The training cell reads hparams["dataloader_options"] from this result.
    hparams = load_hyperpyyaml(fin)

In [471]:
# The network is registered under the key "model", which is the attribute
# SimpleBrain.compute_forward looks up on self.modules.
brain = SimpleBrain({"model": model}, opt_class=lambda x: torch.optim.Adam(x))
# NOTE(review): `data` is built here but never passed to fit(); training uses data_new.
data = [{"input": new_torch, "target": one_hot}]
# range(10) is the epoch counter — presumably 10 training epochs; confirm against speechbrain docs.
brain.fit(range(10), data_new,train_loader_kwargs=hparams["dataloader_options"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:07<00:00,  9.31it/s, train_loss=0.494]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:07<00:00,  9.22it/s, train_loss=0.489]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:08<00:00,  8.74it/s, train_loss=0.5]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:07<00:00,  9.01it/s, train_loss=0.465]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:07<00:00,  8.80it/s

In [472]:
# Single-example batch: the 10th-from-last embedding row reshaped to (1, 512).
test = {"input": new_torch[-10].reshape(1,512)}

In [473]:
new_torch[-2].reshape(1,512).shape

torch.Size([1, 512])

In [474]:
one_hot[1]

tensor([1, 0])

In [475]:
# Forward the single-example batch through the trained model.
# SimpleBrain.compute_forward does not use `stage`, so the literal 3 has no effect here.
output = brain.compute_forward(test,stage=3)
print(output.shape)
print(output)

torch.Size([1, 2])
tensor([[-0.3809,  0.5143]], grad_fn=<AddmmBackward0>)


In [237]:
# Count how many entries of sliced_mask_torch are equal to 1.
count = 0
for i in sliced_mask_torch:
    if i == 1:
        count =count +1
print(count)

3645


In [96]:
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_item_by_idx',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replic

In [246]:
for i in model.named_parameters():
    print(i)

('0.weight', Parameter containing:
tensor([[-0.0330, -0.0186,  0.0369,  ..., -0.0313,  0.0146, -0.0516],
        [ 0.0560,  0.0094, -0.0207,  ..., -0.0033,  0.0498,  0.0189],
        [-0.0122,  0.0279, -0.0160,  ...,  0.0153, -0.0415, -0.0350],
        ...,
        [ 0.0605, -0.0034, -0.0140,  ..., -0.0072,  0.0149,  0.0575],
        [-0.0037,  0.0501,  0.0012,  ..., -0.0103,  0.0143, -0.0187],
        [-0.0139, -0.0418, -0.0313,  ..., -0.0159, -0.0233, -0.0401]],
       requires_grad=True))
('0.bias', Parameter containing:
tensor([ 1.3918e-02,  4.1625e-03,  1.9940e-02,  2.6180e-02,  2.2688e-02,
         2.5424e-02, -1.7914e-02, -2.0885e-02, -4.0077e-02, -4.1558e-02,
        -1.0527e-02, -6.8387e-03, -4.8117e-02, -1.8956e-02,  3.4427e-02,
        -4.6843e-02,  3.4231e-02, -5.4439e-02,  5.2945e-02, -2.6281e-03,
         2.5140e-02, -4.4587e-02,  1.3619e-02,  4.0298e-02, -1.9755e-02,
        -2.3687e-02, -7.6622e-03,  1.4093e-02, -1.8238e-02, -1.5738e-02,
        -1.0142e-02, -1.7558e-02