In [1]:
import torch
from torch.nn import CosineSimilarity
from model import ECAPA_TDNN
from ECAPAModel import ECAPAModel
import soundfile
import os
import numpy as np
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create test_list.tsv
# Each entry is one trial: two clip file names separated by a tab.
test_pair_lines = (
    "common_voice_vi_24476184.wav\tcommon_voice_vi_24388665.wav\n",
    "common_voice_vi_23926783.wav\tcommon_voice_vi_23847785.wav\n",
)
with open("../../data/Test/test_list.tsv", 'w') as f_write:
    for pair_line in test_pair_lines:
        f_write.write(pair_line)

In [3]:
# load params
def load_parameters(self, path):
    """Copy weights from a saved checkpoint into ``self`` in place.

    Args:
        self: Module whose ``state_dict()`` tensors receive the weights
            (this notebook calls the function directly, passing the model).
        path: Checkpoint file readable by ``torch.load``; it must hold a
            mapping of parameter name -> tensor.

    A checkpoint key that is missing from the model is retried with every
    "speaker_encoder." occurrence removed. Keys that still do not match, and
    keys whose tensor shape differs from the model's, are reported with
    ``print`` and skipped. Nothing is returned.
    """
    self_state = self.state_dict()
    # Deserialize onto the CPU so a checkpoint saved from a GPU still loads on
    # a CPU-only machine; copy_() below moves the values to whatever device
    # the model's tensors live on.
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    loaded_state = torch.load(path, map_location="cpu")
    for origname, param in loaded_state.items():
        name = origname
        if name not in self_state:
            # Checkpoints saved from the training wrapper prefix encoder keys.
            name = name.replace("speaker_encoder.", "")
            if name not in self_state:
                print("%s is not in the model."%origname)
                continue
        if self_state[name].size() != param.size():
            print("Wrong parameter length: %s, model: %s, loaded: %s"%(origname, self_state[name].size(), param.size()))
            continue
        # In-place copy: state_dict() tensors share storage with the module.
        self_state[name].copy_(param)

In [4]:
# Build the ECAPA-TDNN encoder; C=1024 matches the 1024-channel conv layers in the repr below.
speaker_encoder = ECAPA_TDNN(C=1024)
# NOTE(review): `.parameters` is not called here, so the cell displays the bound
# method's repr (which embeds the module repr) rather than the parameter tensors.
speaker_encoder.parameters

<bound method Module.parameters of ECAPA_TDNN(
  (torchfbank): Sequential(
    (0): PreEmphasis()
    (1): MelSpectrogram(
      (spectrogram): Spectrogram()
      (mel_scale): MelScale()
    )
  )
  (specaug): FbankAug()
  (conv1): Conv1d(80, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
  (relu): ReLU()
  (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Bottle2neck(
    (conv1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
    (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (convs): ModuleList(
      (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (3): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (4): Conv1d(128, 128, kernel_size

In [5]:
# Copy the saved weights into the encoder in place. Checkpoint keys that the
# encoder does not have are printed and skipped (see the message in the output).
load_parameters(speaker_encoder, "../../output/model/model_0030.model")

speaker_loss.weight is not in the model.


In [6]:
# Use the first CUDA device when one is available; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)
# Module.to() returns the module itself, so the chained eval() call is the
# cell's last expression and its repr is what gets displayed.
speaker_encoder.to(device).eval()

cuda:0


ECAPA_TDNN(
  (torchfbank): Sequential(
    (0): PreEmphasis()
    (1): MelSpectrogram(
      (spectrogram): Spectrogram()
      (mel_scale): MelScale()
    )
  )
  (specaug): FbankAug()
  (conv1): Conv1d(80, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
  (relu): ReLU()
  (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Bottle2neck(
    (conv1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
    (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (convs): ModuleList(
      (0): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (3): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
      (4): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(2,), d

In [7]:
# Score every pair in test_list.tsv and write "file_1<TAB>file_2<TAB>score"
# lines to result.tsv.
speaker_encoder.eval()  # repeated here so the cell can be re-run on its own

test_path = "../../data/MSV_CommonVoice_data/vi/clips/"


def _embed_utterance(file_name):
    """Return the L2-normalised embedding of one clip located under ``test_path``."""
    # The sample rate returned by soundfile is discarded.
    # NOTE(review): assumes clips already have the rate the model expects -- TODO confirm.
    audio, _ = soundfile.read(os.path.join(test_path, file_name))
    # Add a leading batch axis of size 1 and move the samples to the model's device.
    data = torch.FloatTensor(np.stack([audio], axis=0)).to(device)
    with torch.no_grad():
        embedding = speaker_encoder.forward(data, aug=False)
        return F.normalize(embedding, p=2, dim=1)


# Built once instead of once per pair; the default dim=1 compares the embedding axis.
cosine_similarity = CosineSimilarity()

# Context managers close both files even if a clip fails to load mid-loop.
with open("../../data/Test/test_list.tsv") as f_read:
    lines = f_read.readlines()

with open("../../output/result.tsv", "w") as f_write:
    for line in lines:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing on the tuple unpack below.
            continue
        file_1, file_2 = line.split("\t")

        score = cosine_similarity(_embed_utterance(file_1), _embed_utterance(file_2))

        f_write.write(file_1 + "\t" + file_2 + "\t" + str(score.item()) + "\n")

# Evaluation

In [8]:
# Create eval_list.tsv
# Each entry is one labelled trial: label, then two clip file names, tab-separated.
eval_trial_lines = (
    "1\tcommon_voice_vi_24476184.wav\tcommon_voice_vi_24388665.wav\n",
    "0\tcommon_voice_vi_23926783.wav\tcommon_voice_vi_23847785.wav\n",
)
with open("../../data/Test/eval_list.tsv", 'w') as f_write:
    for trial_line in eval_trial_lines:
        f_write.write(trial_line)

In [9]:
# Wrap the encoder in ECAPAModel so its eval_network() routine can be used below.
# NOTE(review): the positional arguments are presumably
# (lr, lr_decay, C, n_class, m, s, test_step) -- TODO confirm against ECAPAModel.__init__.
ecapa_model = ECAPAModel(0.01, 0.97, 1024, 96, 0.2, 30, 1)
# Swap in the encoder whose weights were loaded from the checkpoint earlier in this notebook.
ecapa_model.speaker_encoder = speaker_encoder

09-23 10:44:12 Model para number = 14.73


In [10]:
# Labelled trial list written by the "Create eval_list.tsv" cell
# (label<TAB>file_1<TAB>file_2 per line).
eval_list = "../../data/Test/eval_list.tsv"
# Clip directory passed to eval_network; same directory the scoring loop reads from.
eval_path = "../../data/MSV_CommonVoice_data/vi/clips/"

In [11]:
# Evaluate the labelled trials; the call's return value is displayed as the cell output.
# NOTE(review): the returned pair is presumably (EER, minDCF) -- TODO confirm in
# ECAPAModel.eval_network. The list holds only two trials, so the numbers are a
# smoke test rather than a meaningful metric.
ecapa_model.eval_network(eval_list, eval_path)

100%|██████████| 4/4 [00:00<00:00, 30.46it/s]


(0.0, 0.0)