In [1]:
import pickle
import os
import torch
import numpy as np
import random
import librosa
from thirdparty.DeepSpeaker.util.audio import read_mfcc,get_mfcc_from_wave
from thirdparty.DeepSpeaker.util.batcher import sample_from_mfcc
from thirdparty.DeepSpeaker.util.constants import SAMPLE_RATE, NUM_FRAMES
from thirdparty.DeepSpeaker.util.conv_models import DeepSpeakerModel
from thirdparty.DeepSpeaker.util.test import batch_cosine_similarity
from sklearn import metrics
from sklearn.metrics import auc

In [2]:
model = DeepSpeakerModel()
model.m.load_weights(f'thirdparty/DeepSpeaker/model/ResCNN_triplet_training_checkpoint_265.h5', by_name=True)

In [3]:
num_speaker = 80
erroment_num = 16
num_uttrs = 99
device = "cuda:0"
ROOT = 'train_wave_vctk80'

In [4]:
all_speaker = next(iter(os.walk(ROOT)))[1]

In [5]:
def generate_dataset(num_uttrs, rootDir):
    dirName, subdirList, _ = next(os.walk(rootDir))
    print(f"Found directory: {dirName}")
    data_dict = {}
    for j, speaker in enumerate(sorted(subdirList)):
        print("Processing speaker: %s" % speaker)
        _, _, fileList = next(os.walk(os.path.join(dirName, speaker)))
        fileList = fileList[:num_uttrs]
        all_data = []
        for file in fileList:
            path = os.path.join(dirName, speaker, file)
            mfcc = sample_from_mfcc(read_mfcc(path, SAMPLE_RATE), NUM_FRAMES)
            all_data.append(mfcc)
        data_dict[speaker] = all_data
    return data_dict

In [6]:
data_dict = generate_dataset(num_uttrs,ROOT)

Found directory: train_wave_vctk80
Processing speaker: p225
Processing speaker: p226
Processing speaker: p227
Processing speaker: p228
Processing speaker: p229
Processing speaker: p230
Processing speaker: p231
Processing speaker: p232
Processing speaker: p233
Processing speaker: p234
Processing speaker: p236
Processing speaker: p237
Processing speaker: p238
Processing speaker: p239
Processing speaker: p240
Processing speaker: p241
Processing speaker: p243
Processing speaker: p244
Processing speaker: p245
Processing speaker: p246
Processing speaker: p247
Processing speaker: p248
Processing speaker: p249
Processing speaker: p250
Processing speaker: p251
Processing speaker: p252
Processing speaker: p253
Processing speaker: p254
Processing speaker: p255
Processing speaker: p256
Processing speaker: p257
Processing speaker: p258
Processing speaker: p259
Processing speaker: p260
Processing speaker: p261
Processing speaker: p262
Processing speaker: p263
Processing speaker: p264
Processing spea

In [7]:
def get_truth_dv(enrollment_mfcc):
    _dv = np.zeros((1, 512))
    for mfcc in enrollment_mfcc:
        _dv += model.m.predict(np.expand_dims(mfcc, axis=0)) 
    return (_dv / erroment_num).reshape(-1)

In [8]:
def generate_real_cos(all_dv,enrroment_idxs):
    cos_result = np.zeros((num_speaker, num_speaker))
    var_result = np.zeros((num_speaker, num_speaker))
    for i,key in enumerate(data_dict.keys()):
        # Generate embed from remain idx
        tmp_style = []
        print(f"Now process --- {key}")
        all_mfcc = data_dict[key]
        for j in range(num_uttrs):
            if j not in enrroment_idxs:
                mfcc = all_mfcc[j]
                tmp_style.append(model.m.predict(np.expand_dims(mfcc, axis=0)).reshape(-1))

        # Compare with all truth dv
        for k,dv in enumerate(all_dv):
            tmp_cos = []
            for _style in tmp_style:
                cos_ = np.dot(dv,_style) / (np.linalg.norm(dv) * np.linalg.norm(_style))
                tmp_cos.append(cos_)

            cos_result[i][k] = np.array(tmp_cos).mean()
            var_result[i][k] = np.array(tmp_cos).std()
    return cos_result, var_result

In [9]:
def get_ytrue_yscore(cos_res):
        N = len(cos_res)
        y_true, y_score = [], []
        for i in range(N):
            for ele in np.diagonal(cos_res[i]):
                y_score.append(ele)
            for _ in range(len(cos_res[i])):
                y_true.append(1)
            # remove diagonal element
            m = cos_res[i].shape[0]
            strided = np.lib.stride_tricks.as_strided
            s0, s1 = cos_res[i].strides
            outs = (
                strided(cos_res[i].ravel()[1:], shape=(m - 1, m), strides=(s0 + s1, s1))
                .reshape(m, -1)
                .flatten()
            )
            for ele in outs:
                y_score.append(ele)
            for _ in range(len(outs)):
                y_true.append(0)
        return y_true, y_score

In [10]:
all_real_cos_result = []
all_real_var_result = [] 
all_non_real_cos_result = []
all_non_real_var_result = []
all_yt, all_ys = [],[]
for i in range(1):
    all_dv,enrroment_idxs = [],[]
    for _ in range(erroment_num):
        enrroment_idxs.append(random.randint(0,1))
        
    # Generate truth embed
    for key in data_dict.keys():
        #print(f"Now process --- {key}")
        all_mfcc = data_dict[key]
        enrollment_mfcc = []
        for enrroment_idx in enrroment_idxs:
            enrollment_mfcc.append(all_mfcc[enrroment_idx])
        all_dv.append(get_truth_dv(enrollment_mfcc))
    print(f"Generate truth_dv --- done")
    real_cos_result, real_var_result = generate_real_cos(all_dv,enrroment_idxs)
    
    y_true, y_score = get_ytrue_yscore([real_cos_result])
    for ele in y_true:
        all_yt.append(ele)
    for ele in y_score:
        all_ys.append(ele)
    ## Mean of 40 speaker Cos-similarity
    all_real_cos_result.append(np.diagonal(real_cos_result).mean())
    ## Mean of 40 speaker Cos-similarity Variance
    all_real_var_result.append(np.diagonal(real_var_result).mean())
    ##  Mean of 40 speaker Cos-similarity with non_source speaker embedding
    all_non_real_cos_result.append((np.sum(real_cos_result) - (np.sum(np.diagonal(real_cos_result)))) /(num_speaker*(num_speaker-1)))
    ##  Mean of 40 speaker Cos-similarity Variance with non_source speaker embedding
    all_non_real_var_result.append((np.sum(real_var_result) - (np.sum(np.diagonal(real_var_result)))) /(num_speaker*(num_speaker-1)))
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)

Generate truth_dv --- done
Now process --- p225
Now process --- p226
Now process --- p227
Now process --- p228
Now process --- p229
Now process --- p230
Now process --- p231
Now process --- p232
Now process --- p233
Now process --- p234
Now process --- p236
Now process --- p237
Now process --- p238
Now process --- p239
Now process --- p240
Now process --- p241
Now process --- p243
Now process --- p244
Now process --- p245
Now process --- p246
Now process --- p247
Now process --- p248
Now process --- p249
Now process --- p250
Now process --- p251
Now process --- p252
Now process --- p253
Now process --- p254
Now process --- p255
Now process --- p256
Now process --- p257
Now process --- p258
Now process --- p259
Now process --- p260
Now process --- p261
Now process --- p262
Now process --- p263
Now process --- p264
Now process --- p265
Now process --- p266
Now process --- p267
Now process --- p268
Now process --- p269
Now process --- p270
Now process --- p271
Now process --- p272
Now pro

In [11]:
auc(fpr, tpr)

0.9911036392405064

In [12]:
np.array(all_real_cos_result).mean(),np.array(all_real_cos_result).std()

(0.7456258727690876, 0.0)

In [13]:
np.array(all_non_real_cos_result).mean(),np.array(all_non_real_cos_result).std()

(0.40959155662053015, 0.0)