<center><font size=6><bold>实验七、声纹识别</bold></font></center>

### 一、数据预处理

按一定比例划分训练集与数据集并采用滑动窗口方法利用librosa语音处理库对语音的MFCC（梅尔频率倒谱系数）特征进行提取，并保存为.npy格式的文件至指定文件夹，文件表示每一位说话者每一条口语的声纹特征。

In [3]:
import numpy as np
import os
import librosa
import tensorflow as  tf
import time
from tensorflow.contrib import rnn
import random

定义全局配置参数。

In [2]:
class CONFIG(object):
    def __init__(self):
        self.sr = 8000
        self.nfft = 512
        self.window = 0.025
        self.hop = 0.01
        self.tisv_frame = 180
        self.train_path = "./train_tisv"
        self.test_path = "./test_tisv"
        
        self.hidden = 128
        self.proj = 64
        self.num_layer = 3
        self.restore = False
        self.model_path = "./model"
        self.model_num = 5
        
        self.train = False
        self.N = 8
        self.M = 10
        self.loss = "softmax"
        self.optim = "sgd"
        self.lr = 1e-2
        self.beta1 = 0.5
        self.beta2 = 0.9
        self.iteration = 60000

In [3]:
config = CONFIG()
audio_path = "./wav48"

In [4]:
def save_spectrogram_tisv():
    print("start text independent utterance feature extraction")
    os.makedirs(config.train_path, exist_ok=True)   # make folder to save train file
    os.makedirs(config.test_path, exist_ok=True)    # make folder to save test file

    utter_min_len = (config.tisv_frame * config.hop + config.window) * config.sr    # lower bound of utterance length
    total_speaker_num = len(os.listdir(audio_path))
    train_speaker_num= (total_speaker_num//10)*9            # split total data 90% train and 10% test
    print("total speaker number : %d"%total_speaker_num)
    print("train : %d, test : %d"%(train_speaker_num, total_speaker_num-train_speaker_num))
    for i, folder in enumerate(os.listdir(audio_path)):
        if i <= 107:
            continue
        speaker_path = os.path.join(audio_path, folder)     # path of each speaker
        print("%dth speaker processing..."%i)
        utterances_spec = []
        for utter_name in os.listdir(speaker_path):
            utter_path = os.path.join(speaker_path, utter_name)         # path of each utterance
            utter, sr = librosa.core.load(utter_path, config.sr)        # load utterance audio
            intervals = librosa.effects.split(utter, top_db=20)         # voice activity detection
            for interval in intervals:
                if (interval[1]-interval[0]) > utter_min_len:           # If partial utterance is sufficient long,
                    utter_part = utter[interval[0]:interval[1]]         # save first and last 180 frames of spectrogram.
                    S = librosa.core.stft(y=utter_part, n_fft=config.nfft,
                                          win_length=int(config.window * sr), hop_length=int(config.hop * sr))
                    S = np.abs(S) ** 2
                    mel_basis = librosa.filters.mel(sr=config.sr, n_fft=config.nfft, n_mels=40)
                    S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of utterances

                    utterances_spec.append(S[:, :config.tisv_frame])    # first 180 frames of partial utterance
                    utterances_spec.append(S[:, -config.tisv_frame:])   # last 180 frames of partial utterance

        utterances_spec = np.array(utterances_spec)
        print(utterances_spec.shape)
        if i<train_speaker_num:      # save spectrogram as numpy file
            np.save(os.path.join(config.train_path, "speaker%d.npy"%i), utterances_spec)
        else:
            np.save(os.path.join(config.test_path, "speaker%d.npy"%(i-train_speaker_num)), utterances_spec)


In [5]:
save_spectrogram_tisv()

start text independent utterance feature extraction
total speaker number : 109
train : 90, test : 19
108th speaker processing...
(260, 40, 180)


### 二、构建训练模型

1、在步骤一中生成的.npy文件中，随机选择N位说话者并对应选择M条口语声纹特征，组成的batch大小N×M。

In [5]:
def random_batch(speaker_num=config.N, utter_num=config.M, shuffle=True, noise_filenum=None, utter_start=0):
    # data path
    if config.train:
        path = config.train_path
    else:
        path = config.test_path

    np_file_list = os.listdir(path)
    total_speaker = len(np_file_list)

    if shuffle:
        selected_files = random.sample(np_file_list, speaker_num)  # select random N speakers
    else:
        selected_files = np_file_list[:speaker_num]                # select first N speakers

    utter_batch = []
    for file in selected_files:
        utters = np.load(os.path.join(path, file))        # load utterance spectrogram of selected speaker
        if shuffle:
            utter_index = np.random.randint(0, utters.shape[0], utter_num)   # select M utterances per speaker
            utter_batch.append(utters[utter_index])       # each speakers utterance [M, n_mels, frames] is appended
        else:
            utter_batch.append(utters[utter_start: utter_start+utter_num])

    utter_batch = np.concatenate(utter_batch, axis=0)     # utterance batch [batch(NM), n_mels, frames]

    if config.train:
        frame_slice = np.random.randint(140,181)          # for train session, random slicing of input batch
        utter_batch = utter_batch[:,:,:frame_slice]
    else:
        utter_batch = utter_batch[:,:,:160]               # for train session, fixed length slicing of input batch

    utter_batch = np.transpose(utter_batch, axes=(2,0,1))     # transpose [frames, batch, n_mels]

    return utter_batch

2、定义数据归一化函数和相似度函数

In [6]:
def normalize(x):
    """ normalize the last dimension vector of the input matrix
    :return: normalized input
    """
    return x/tf.sqrt(tf.reduce_sum(x**2, axis=-1, keepdims=True)+1e-6)


def cossim(x,y, normalized=True):
    """ calculate similarity between tensors
    :return: cos similarity tf op node
    """
    if normalized:
        return tf.reduce_sum(x*y)
    else:
        x_norm = tf.sqrt(tf.reduce_sum(x**2)+1e-6)
        y_norm = tf.sqrt(tf.reduce_sum(y**2)+1e-6)
        return tf.reduce_sum(x*y)/x_norm/y_norm

3、计算相似度矩阵

![相似度](./图片2.png)

In [7]:
def similarity(embedded, w, b, N=config.N, M=config.M, P=config.proj, center=None):
    """ Calculate similarity matrix from embedded utterance batch (NM x embed_dim) eq. (9)
        Input center to test enrollment. (embedded for verification)
    :return: tf similarity matrix (NM x N)
    """
    embedded_split = tf.reshape(embedded, shape=[N, M, P])

    if center is None:
        center = normalize(tf.reduce_mean(embedded_split, axis=1))              # [N,P] normalized center vectors eq.(1)
        center_except = normalize(tf.reshape(tf.reduce_sum(embedded_split, axis=1, keepdims=True)
                                             - embedded_split, shape=[N*M,P]))  # [NM,P] center vectors eq.(8)
        # make similarity matrix eq.(9)
        S = tf.concat(
            [tf.concat([tf.reduce_sum(center_except[i*M:(i+1)*M,:]*embedded_split[j,:,:], axis=1, keepdims=True) if i==j
                        else tf.reduce_sum(center[i:(i+1),:]*embedded_split[j,:,:], axis=1, keepdims=True) for i in range(N)],
                       axis=1) for j in range(N)], axis=0)
    else :
        # If center(enrollment) exist, use it.
        S = tf.concat(
            [tf.concat([tf.reduce_sum(center[i:(i + 1), :] * embedded_split[j, :, :], axis=1, keepdims=True) for i
                        in range(N)],
                       axis=1) for j in range(N)], axis=0)

    S = tf.abs(w)*S+b   # rescaling

    return S

4、根据相似度矩阵计算损失

In [8]:
def loss_cal(S, type="softmax", N=config.N, M=config.M):
    """ calculate loss with similarity matrix(S) eq.(6) (7) 
    :type: "softmax" or "contrast"
    :return: loss
    """
    S_correct = tf.concat([S[i*M:(i+1)*M, i:(i+1)] for i in range(N)], axis=0)  # colored entries in Fig.1

    if type == "softmax":
        total = -tf.reduce_sum(S_correct-tf.log(tf.reduce_sum(tf.exp(S), axis=1, keepdims=True) + 1e-6))
    elif type == "contrast":
        S_sig = tf.sigmoid(S)
        S_sig = tf.concat([tf.concat([0*S_sig[i*M:(i+1)*M, j:(j+1)] if i==j
                              else S_sig[i*M:(i+1)*M, j:(j+1)] for j in range(N)], axis=1)
                             for i in range(N)], axis=0)
        total = tf.reduce_sum(1-tf.sigmoid(S_correct)+tf.reduce_max(S_sig, axis=1, keepdims=True))
    else:
        raise AssertionError("loss type should be softmax or contrast !")

    return total

5、定义模型优化器函数

In [9]:
def optim(lr):
    """ return optimizer determined by configuration
    :return: tf optimizer
    """
    if config.optim == "sgd":
        return tf.train.GradientDescentOptimizer(lr)
    elif config.optim == "rmsprop":
        return tf.train.RMSPropOptimizer(lr)
    elif config.optim == "adam":
        return tf.train.AdamOptimizer(lr, beta1=config.beta1, beta2=config.beta2)
    else:
        raise AssertionError("Wrong optimizer type!")

6、定义训练模型函数

本实验使用了3层LSTM网络，其输出作为Embedding d-Vector，再进行L2正则化，得到的向量就是说话人的声纹表征。

In [10]:
def train(path):
    tf.reset_default_graph()    # reset graph

    # draw graph
    batch = tf.placeholder(shape= [None, config.N*config.M, 40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype= tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer= np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer= np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
        embedded = outputs[-1]                            # the last ouput is the embedded d-vector
        embedded = normalize(embedded)                    # normalize
    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars= tf.trainable_variables()                # get variable list
    optimizer= optim(lr)                                    # get optimizer (type is determined by configuration)
    grads, vars= zip(*optimizer.compute_gradients(loss))    # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)      # l2 norm clipping by 3
    grads_rescale= [0.01*grad for grad in grads_clip[:2]] + grads_clip[2:]   # smaller gradient scale for w, b
    train_op= optimizer.apply_gradients(zip(grads_rescale, vars), global_step= global_step)   # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        os.makedirs(os.path.join(path, "Check_Point"), exist_ok=True)  # make folder to save model
        os.makedirs(os.path.join(path, "logs"), exist_ok=True)          # make folder to save log
        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1   # lr decay factor ( 1/2 per 10000 iteration)
        loss_acc = 0    # accumulated loss ( for running average of loss)

        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                  feed_dict={batch: random_batch(), lr: config.lr*lr_factor})

            loss_acc += loss_cur    # accumulated loss for each 100 iteration

            if iter % 10 == 0:
                writer.add_summary(summary, iter)   # write at tensorboard
            if (iter+1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter+1),loss_acc/100))
                loss_acc = 0                        # reset accumulated loss
            if (iter+1) % 10000 == 0:
                lr_factor /= 2                      # lr decay
                print("learning rate is decayed! current lr : ", config.lr*lr_factor)
            if (iter+1) % 10000 == 0:
                saver.save(sess, os.path.join(path, "./Check_Point/model.ckpt"), global_step=iter//10000)
                print("model is saved!")

7、定义测试模型函数

利用已处理好的测试数据进行模型的测试使用，并根据评价指标EER分析实验结果。

In [11]:
# Test Session
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(shape=[None, config.N*config.M, 40], dtype=tf.float32) # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N*config.M, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
        embedded = outputs[-1]                            # the last ouput is the embedded d-vector
        embedded = normalize(embedded)                    # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(tf.reduce_mean(tf.reshape(embedded[:config.N*config.M, :], shape= [config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N*config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(model[-1]):    # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(sess, model)  # restore variables from selected ckpt file
                break

        if loaded == 0:
            raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")

        print("test file path : ", config.test_path)

        # return similarity matrix after enrollment and verification
        time1 = time.time() # for check inference time
        S = sess.run(similarity_matrix, feed_dict={enroll:random_batch(shuffle=False),
                                                       verif:random_batch(shuffle=False, utter_start=config.M)})
        S = S.reshape([config.N, config.M, -1])
        time2 = time.time()

        np.set_printoptions(precision=2)
        print("inference time for %d utterences : %0.2fs"%(2*config.M*config.N, time2-time1))
        print(S)    # print similarity matrix

        # calculating EER
        diff = 1; EER=0; EER_thres = 0; EER_FAR=0; EER_FRR=0

        # through thresholds calculate false acceptance ratio (FAR) and false reject ratio (FRR)
        for thres in [0.01*i+0.5 for i in range(50)]:
            S_thres = S>thres

            # False acceptance ratio = false acceptance / mismatched population (enroll speaker != verification speaker)
            FAR = sum([np.sum(S_thres[i])-np.sum(S_thres[i,:,i]) for i in range(config.N)])/(config.N-1)/config.M/config.N

            # False reject ratio = false reject / matched population (enroll speaker = verification speaker)
            FRR = sum([config.M-np.sum(S_thres[i][:,i]) for i in range(config.N)])/config.M/config.N

            # Save threshold when FAR = FRR (=EER)
            if diff> abs(FAR-FRR):
                diff = abs(FAR-FRR)
                EER = (FAR+FRR)/2
                EER_thres = thres
                EER_FAR = FAR
                EER_FRR = FRR

        print("\nEER : %0.2f (thres:%0.2f, FAR:%0.2f, FRR:%0.2f)"%(EER,EER_thres,EER_FAR,EER_FRR))

In [12]:
config.train= True
train("./model")

embedded size:  (80, 64)
similarity matrix size:  (80, 8)
total variables : 210434
(iter : 100) loss: 125.1124
(iter : 200) loss: 102.2868
(iter : 300) loss: 89.9621
(iter : 400) loss: 86.9429
(iter : 500) loss: 79.9499
(iter : 600) loss: 75.2657
(iter : 700) loss: 70.0881
(iter : 800) loss: 65.6341
(iter : 900) loss: 62.9165
(iter : 1000) loss: 61.5927
(iter : 1100) loss: 54.8214
(iter : 1200) loss: 51.2984
(iter : 1300) loss: 48.3105
(iter : 1400) loss: 45.9122
(iter : 1500) loss: 41.6896
(iter : 1600) loss: 43.2339
(iter : 1700) loss: 38.5662
(iter : 1800) loss: 36.0750
(iter : 1900) loss: 35.9866
(iter : 2000) loss: 35.6972
(iter : 2100) loss: 32.1313
(iter : 2200) loss: 32.2583
(iter : 2300) loss: 31.1465
(iter : 2400) loss: 28.6453
(iter : 2500) loss: 29.0325
(iter : 2600) loss: 28.3069
(iter : 2700) loss: 27.3037
(iter : 2800) loss: 26.8961
(iter : 2900) loss: 26.5261
(iter : 3000) loss: 27.4743
(iter : 3100) loss: 26.0053
(iter : 3200) loss: 25.8477
(iter : 3300) loss: 24.0968


(iter : 28600) loss: 6.0508
(iter : 28700) loss: 5.8554
(iter : 28800) loss: 6.1459
(iter : 28900) loss: 6.6405
(iter : 29000) loss: 6.1738
(iter : 29100) loss: 6.7811
(iter : 29200) loss: 6.3596
(iter : 29300) loss: 6.6660
(iter : 29400) loss: 7.1217
(iter : 29500) loss: 7.2853
(iter : 29600) loss: 6.0356
(iter : 29700) loss: 6.9032
(iter : 29800) loss: 6.5223
(iter : 29900) loss: 6.9550
(iter : 30000) loss: 6.4661
learning rate is decayed! current lr :  0.00125
model is saved!
(iter : 30100) loss: 6.4794
(iter : 30200) loss: 6.0957
(iter : 30300) loss: 5.8880
(iter : 30400) loss: 6.8376
(iter : 30500) loss: 5.5698
(iter : 30600) loss: 6.2785
(iter : 30700) loss: 5.2932
(iter : 30800) loss: 5.7432
(iter : 30900) loss: 5.4160
(iter : 31000) loss: 6.5425
(iter : 31100) loss: 5.7405
(iter : 31200) loss: 6.9640
(iter : 31300) loss: 5.5323
(iter : 31400) loss: 6.5483
(iter : 31500) loss: 5.9443
(iter : 31600) loss: 5.8195
(iter : 31700) loss: 5.6073
(iter : 31800) loss: 6.0223
(iter : 3190

(iter : 57200) loss: 4.7432
(iter : 57300) loss: 5.2697
(iter : 57400) loss: 4.3247
(iter : 57500) loss: 5.0200
(iter : 57600) loss: 4.9058
(iter : 57700) loss: 4.7815
(iter : 57800) loss: 5.0891
(iter : 57900) loss: 5.6554
(iter : 58000) loss: 4.7796
(iter : 58100) loss: 4.6124
(iter : 58200) loss: 4.3885
(iter : 58300) loss: 5.4216
(iter : 58400) loss: 5.0885
(iter : 58500) loss: 5.1005
(iter : 58600) loss: 4.5721
(iter : 58700) loss: 4.9611
(iter : 58800) loss: 5.1177
(iter : 58900) loss: 5.3826
(iter : 59000) loss: 5.0858
(iter : 59100) loss: 5.3879
(iter : 59200) loss: 5.5520
(iter : 59300) loss: 4.6062
(iter : 59400) loss: 4.9064
(iter : 59500) loss: 5.0871
(iter : 59600) loss: 4.8817
(iter : 59700) loss: 5.1417
(iter : 59800) loss: 5.0438
(iter : 59900) loss: 5.2398
(iter : 60000) loss: 4.5126
learning rate is decayed! current lr :  0.00015625
model is saved!


In [13]:
config.train = False
test("./model")

embedded size:  (160, 64)
model path : ./model
ckpt file is loaded ! ./model\Check_Point\model.ckpt-5
INFO:tensorflow:Restoring parameters from ./model\Check_Point\model.ckpt-5
test file path :  ./test_tisv
inference time for 160 utterences : 0.59s
[[[ 0.84  0.25 -0.21  0.33  0.12  0.25  0.49  0.11]
  [ 0.85  0.53 -0.42  0.47  0.21  0.44  0.34 -0.02]
  [ 0.88  0.41 -0.15  0.34  0.28  0.47  0.36  0.08]
  [ 0.81  0.55 -0.15  0.43  0.22  0.45  0.22  0.07]
  [ 0.74  0.08 -0.07  0.53  0.03  0.18  0.36  0.01]
  [ 0.8   0.39  0.06 -0.06  0.32  0.33  0.51  0.21]
  [ 0.79  0.35 -0.28  0.21  0.24  0.36  0.16  0.07]
  [ 0.64  0.42  0.12 -0.12  0.33  0.47  0.21  0.21]
  [ 0.36  0.18  0.19 -0.05  0.53  0.24 -0.08  0.07]
  [ 0.68  0.59 -0.34  0.38  0.29  0.56 -0.01 -0.12]]

 [[ 0.34  0.79 -0.58  0.14 -0.12  0.69 -0.14 -0.44]
  [ 0.28  0.74  0.14 -0.02  0.39  0.63 -0.19  0.  ]
  [ 0.3   0.77 -0.34  0.29  0.    0.72 -0.22 -0.6 ]
  [ 0.12  0.86 -0.16 -0.04  0.02  0.74 -0.19 -0.18]
  [ 0.54  0.79 -0.49 