In [1]:
import os
import torch
import librosa
import zipfile
import torchaudio
import soundfile as sf
import numpy as np
from tqdm import tqdm
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


Распаковка трейна и теста с Google Drive

In [4]:
with zipfile.ZipFile('./drive/MyDrive/YVector/Voxceleb/features.zip', 'r') as zip_ref:
    zip_ref.extractall('./')

In [5]:
path_to_zip_file = './drive/MyDrive/YVector/Voxceleb/vox1_test_wav.zip'
directory_to_extract_to = './test'
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

In [26]:
!cp ./drive/MyDrive/YVector/Voxceleb/veri_test.txt ./

In [6]:
num_classes = 1251
embedding_dim = 128

В этот лоадер загружаются заранее обрезанные и нормализованные данные в виде npy массивов

In [7]:
class train_ds_loader(Dataset):
    def __init__(self):
        self.ds = []
        self.ds_len = 0

        for r, d, files in os.walk('./features/'):
            for file in files:
              self.ds_len+=1

    def __getitem__(self, idx):
        loaded = np.load('./features/' + str(idx) +'.npy', allow_pickle=True)
        waveform = torch.from_numpy(loaded[0])
        waveform = torch.unsqueeze(waveform, 0)
        label = loaded[1] - 1
        return waveform, label
    
    def __len__(self):
        return self.ds_len

In [8]:
train_ds = train_ds_loader()

In [9]:
train_dl = DataLoader(train_ds, batch_size=80, shuffle=True)

#Squeeze and excitation блок из статьи (tf-SE)

In [10]:
class SE(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.fgate = nn.Sequential(nn.Linear(channels, channels), nn.Sigmoid())
        self.tgate = nn.Sequential(nn.Linear(channels, 1), nn.Sigmoid())

    def forward(self, x):
        fg = self.fgate(x.mean(dim=-1))
        x = x * fg.unsqueeze(-1)
        tg = x.permute(0, 2, 1).contiguous().view(-1, x.shape[1])
        tg = self.tgate(tg).view(x.shape[0], x.shape[2]).unsqueeze(1)
        out = x * tg

        return out

Y = tf-SE(ReLU(Norm(Dropout(Conv(X))) 

DSConv это ReLU(Norm(Dropout(Conv(X))

In [11]:
class DSConv(nn.Module):
    def __init__(self, n_in, n_out, kernel, stride, padding=0):
        super(DSConv, self).__init__()
        self.conv = nn.Conv1d(n_in, n_out, kernel, stride=stride, padding=padding,bias=False)
        self.norm = nn.GroupNorm(n_out, n_out, affine=False)
        self.activation = nn.ReLU()
              
    def forward(self, x):
      out = self.conv(x)
      out = self.norm(out)
      out = self.activation(out)
      return out

#Multi Scale Feature Extraction Module. 
В сравнении с оригинальной работой изменен второй сверточный слой, который в стате называется dimension match convolution. Так вот, в коде автора этот слой не матчил размерности, часть вектора там просто обрезается. Изменен также выход сети, т.к. в коде автора на выход идет три блока skip1, skip2, tfSE3, хотя в самой статье преполагается skip1, skip2, skip3, tfSE3

In [12]:
# Multi Scale Feature Extraction Module
class MSFEM(nn.Module):
    def __init__(self):
        super(MSFEM, self).__init__()

        # MISSMATCHED DIM FROM PAPER
        #par_conv1 = nn.Sequential(nn.Conv1d(1, 90, 12, 6), nn.Conv1d(90, 160, 5, 3))
        #par_conv2 = nn.Sequential(nn.Conv1d(1, 90, 18, 9), nn.Conv1d(90, 160, 5, 2))
        #par_conv3 = nn.Sequential(nn.Conv1d(1, 90, 36, 18), nn.Conv1d(90, 192, 5, 1))

        #Matched Dimesions
        self.par_conv1 = nn.Sequential(nn.Conv1d(1, 90, 12, 6), nn.Conv1d(90, 160, 5, 3))
        self.par_conv2 = nn.Sequential(nn.Conv1d(1, 90, 18, 9), nn.Conv1d(90, 160, 3, 2))
        self.par_conv3 = nn.Sequential(nn.Conv1d(1, 90, 36, 18), nn.Conv1d(90, 160, 1, 1))

        self.skip1 = nn.MaxPool1d(kernel_size=5, stride=8)
        self.skip2 = nn.MaxPool1d(kernel_size=3, stride=4, padding=1)
        self.skip3 = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

        self.tfSE1 = nn.Sequential(DSConv(480, 480, 5, 2), SE(480))
        self.tfSE2 = nn.Sequential(DSConv(480, 480, 3, 2), SE(480))
        self.tfSE3 = nn.Sequential(DSConv(480, 480, 3, 2, padding=1), SE(480))

    def forward(self, x):
        x1 = self.par_conv1(x)
        x2 = self.par_conv2(x)
        x3 = self.par_conv3(x)
        x_filtered_cat = torch.cat((x1, x2, x3), dim=1)
        
        x_skip1 = self.skip1(x_filtered_cat)
        x_SE1 = self.tfSE1(x_filtered_cat)
        x_skip2 = self.skip2(x_SE1)
        x_SE2 = self.tfSE2(x_SE1)
        x_skip3 = self.skip3(x_SE2)
        x_SE3 = self.tfSE3(x_SE2)
        out = torch.cat((x_SE3, x_skip3, x_skip2, x_skip1), dim=1)

        return out

In [13]:
class TDNNLayer(nn.Module):

    # FROM https://github.com/cvqluu/TDNN/blob/master/tdnn.py

    def __init__(self, input_dim, output_dim,
                 context_size, dilation=1):
        super(TDNNLayer, self).__init__()
        self.context_size = context_size
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dilation = dilation
        self.kernel = nn.Linear(input_dim*context_size, output_dim)
        
    def forward(self, inputs):
        # ----------Convolution = unfold + matmul + fold
        x = inputs
        _, d, _ = x.shape
        assert (d == self.input_dim), 'Input dimension was wrong. Expected ({}), got ({})'.format(self.input_dim, d)
        x = x.unsqueeze(1)
        
        # Unfold input into smaller temporal contexts
        x = torch.nn.functional.unfold(x, (self.input_dim, self.context_size), 
                     stride=(self.input_dim, 1), 
                     dilation=(1, self.dilation))

        # N, output_dim*context_size, new_t = x.shape
        x = x.transpose(1, 2)
        x = self.kernel(x) # matmul
        
        # transpose to channel first
        x = x.transpose(1, 2)

        return x

In [14]:
class TDNN(nn.Module):
    def __init__(self, input_dim, output_dim=460, context_size=5, dilation=1):
        super(TDNN, self).__init__()

        self.tdnn_layer = TDNNLayer(input_dim, output_dim, context_size, dilation)
        self.norm = nn.GroupNorm(1, output_dim, affine=True)
        self.activation = nn.ReLU()

    def forward(self, x):
        out = self.tdnn_layer(x)
        out = self.norm(out)
        out = self.activation(out)
        return out

#TDNN 

In [15]:
class AgregationTDNN(nn.Module):
    def __init__(self, feature_dim=460, embed_dim=512, p_dropout=0.0):
        super(AgregationTDNN, self).__init__()
        self.tdnn = nn.Sequential(
            TDNN(feature_dim, 460, 5, 1),
            TDNN(460, 460, 3, 2),
            TDNN(460, 460, 3, 3),
            TDNN(460, 460, 1, 1),
            TDNN(460, 1500, 1, 1),
        )

        self.fc1 = nn.Linear(3000, 460)
        self.bn = nn.BatchNorm1d(460)
        self.dropout_fc1 = nn.Dropout(p=p_dropout)
        self.lrelu = nn.LeakyReLU(0.2)
        self.fc2 = nn.Linear(460, embed_dim)

    def forward(self, x):
        # Note: x must be (batch_size, feat_dim, chunk_len)
        x = self.tdnn(x)
        
        stats = torch.cat((x.mean(dim=2), x.std(dim=2)), dim=1)
        
        x = self.dropout_fc1(self.lrelu(self.bn(self.fc1(stats))))
        x = self.fc2(x)
        
        return x

#YVector 

In [16]:
class YVector(nn.Module):
    def __init__(self):
        super(YVector, self).__init__()
        self.MSFEM = MSFEM()
        self.tdnn_aggregator = AgregationTDNN(feature_dim=480*4, embed_dim=embedding_dim)

    def forward(self, x):
        out = self.MSFEM(x)
        out = self.tdnn_aggregator(out)
        return out

#Additive Margin Softmax

In [17]:
class AdMSoftmaxLoss(nn.Module):
    # FROM https://github.com/Leethony/Additive-Margin-Softmax-Loss-Pytorch/blob/master/AdMSLoss.py
    def __init__(self, in_features, out_features, s=30.0, m=0.4):
        '''
        AM Softmax Loss
        '''
        super(AdMSoftmaxLoss, self).__init__()
        self.s = s
        self.m = m
        self.in_features = in_features
        self.out_features = out_features
        self.fc = nn.Linear(in_features, out_features, bias=False)

    def forward(self, x, labels):
        '''
        input shape (N, in_features)
        '''
        assert len(x) == len(labels)
        assert torch.min(labels) >= 0
        assert torch.max(labels) < self.out_features
        
        for W in self.fc.parameters():
            W = nn.functional.normalize(W, dim=1)

        x = nn.functional.normalize(x, dim=1)

        wf = self.fc(x)
        numerator = self.s * (torch.diagonal(wf.transpose(0, 1)[labels]) - self.m)
        excl = torch.cat([torch.cat((wf[i, :y], wf[i, y+1:])).unsqueeze(0) for i, y in enumerate(labels)], dim=0)
        denominator = torch.exp(numerator) + torch.sum(torch.exp(self.s * excl), dim=1)
        L = numerator - torch.log(denominator)
        return -torch.mean(L)

In [72]:
model = YVector().to(device)
loss_function = AdMSoftmaxLoss(embedding_dim, num_classes).to(device)
model.load_state_dict(torch.load('./drive/MyDrive/YVector/v5/yvector_model_v5_e0.pt'))
loss_function.load_state_dict(torch.load('./drive/MyDrive/YVector/v4/yvector_loss_v4_e7.pt'))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [19]:
def train(dataloader, model, loss, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X,y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_function(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        if batch % 50 == 0:
            #torch.cuda.empty_cache()
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [20]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
    test_loss /= num_batches
    epochs_loss.append(test_loss)
    print(f"Avg loss: {test_loss:>8f} \n")

In [31]:
!mkdir ./drive/MyDrive/YVector/v6

In [None]:
epochs_loss = []
epochs = 30
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    torch.save(model.state_dict(), './drive/MyDrive/YVector/v6/yvector_model_v6_e'+ str(t) +'.pt')
    torch.save(loss_function.state_dict(), './drive/MyDrive/YVector/v6/yvector_loss_v6_e'+ str(t) +'.pt')
    train(train_dl, model, loss_function, optimizer)
    #test(test_dataloader, model, loss_function)
print("Done!")

#Equal Error Rate


In [77]:
def eval_eer(positive_sim, negative_sim):
    #From https://github.com/gzhu06/Y-vector/blob/main/eer_monitor.py
    target_scores = sorted(positive_sim)
    nontarget_scores = sorted(negative_sim)

    target_size = len(target_scores)
    nontarget_size = len(nontarget_scores)

    target_position = 0
    for target_position in range(target_size):
        nontarget_n = nontarget_size * target_position * 1.0 / target_size
        nontarget_position = int(nontarget_size - 1 - nontarget_n)
        if nontarget_position < 0:
            nontarget_position = 0
        if nontarget_scores[nontarget_position] < target_scores[target_position]:
            break

    threshold = target_scores[target_position]
    eer = target_position * 1.0 / target_size

    return eer, threshold

In [78]:
with open('./veri_test.txt', 'r') as f:
    data = f.readlines()
    positive_similarity = []
    negative_similarity = []
    prefix = './test/wav/'
    sample_len = int(np.ceil(16000 * 3.9))
    model.eval()
    for line in tqdm(data, desc='Computing cosine similarities'):
        content = line.split(' ')
        file1 = content[1].split('.')[0]
        file2 = content[2][:-1].split('.')[0]
        path1 = prefix + file1 + '.wav'
        path2 = prefix + file2 + '.wav'
        waveform1, sr = librosa.load(path1, sr=None)
        waveform2, sr = librosa.load(path2, sr=None)
        waveform1 = librosa.util.normalize(waveform1)
        waveform2 = librosa.util.normalize(waveform2)
        waveform1 = waveform1[:sample_len]
        waveform2 = waveform2[:sample_len]
        waveform1 = torch.from_numpy(waveform1)
        waveform2 = torch.from_numpy(waveform2)
        waveform1 = torch.unsqueeze(waveform1, 0)
        waveform1 = torch.unsqueeze(waveform1, 0)
        waveform2 = torch.unsqueeze(waveform2, 0)
        waveform2 = torch.unsqueeze(waveform2, 0)
        waveform1 = waveform1.to(device)
        waveform2 = waveform2.to(device)
        anchor_embeds = model(waveform1).squeeze(0)
        pair_embeds = model(waveform2).squeeze(0)
        sims_temp = torch.nn.functional.cosine_similarity(anchor_embeds, pair_embeds, dim=0).cpu().detach().numpy()
        if content[0] == '1':
            positive_similarity.append(sims_temp)
        if content[0] == '0':
            negative_similarity.append(sims_temp)

Computing cosine similarities: 100%|██████████| 37720/37720 [21:17<00:00, 29.53it/s]


In [79]:
positive_similarity = np.array(positive_similarity)
negative_similarity = np.array(negative_similarity)

In [80]:
eer, threshold = eval_eer(positive_similarity, negative_similarity)
print("threshold is --> {:.4f}".format(threshold), "eer is --> {:.4f}%".format(eer*100.0))

threshold is --> 0.3013 eer is --> 28.3775%


In [52]:
#Cuda Memory Release
import gc

model.cpu()
del model
gc.collect()
torch.cuda.empty_cache()