Из [статьи](https://www.researchgate.net/publication/332242485_Angular_Softmax_Loss_for_End-to-end_Speaker_Verification) "angular softmax speaker verification" и еще [одной статьи](https://deepai.org/publication/deep-speaker-embeddings-for-far-field-speaker-recognition-on-short-utterances) "Deep Speaker Embeddings for Speaker Recognition" взяты параметры для достижения цели этого проекта:
    features: MFCC - n_mel = 23, frame_length = 25 ms, track duration = 3 s.
    loss: AMSoftmax(s=3, m=0.35), ArcFace loss, ASoftmax(m = 3)
    backend: cosine similarity

The `TimeDistributed` layer is used as described in this [article](https://ru-keras.com/wrappers-sloi/).

In [8]:
#libs
import tensorflow as tf
import numpy as np
import librosa
import os
import torch

from loss_layers import AMSoftmax, ArcFace
from pathlib import Path
from natsort import natsorted
from sklearn.model_selection import train_test_split
from DataGen import DataGenerator
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Conv1D, Conv2D, LSTM, Dropout,LeakyReLU, PReLU, Reshape, Bidirectional, TimeDistributed
from tensorflow.keras.layers import MaxPooling1D, MaxPooling2D, Flatten, concatenate, GlobalAveragePooling1D, AveragePooling1D, Lambda, PReLU, GRU, LayerNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers, initializers

In [2]:
def amsoftmax_loss(y_true, y_pred, scale=30.0, margin=0.25):
    """Additive-margin softmax loss computed from per-class scores.

    For every sample the score of its ground-truth class is lowered by
    ``margin`` (only when that score is already greater than ``margin``),
    all scores are multiplied by ``scale`` and the result is passed to
    softmax cross-entropy.

    Args:
        y_true: Label matrix of shape (batch, n_classes). The ground-truth
            class of each row is taken as its argmax, and the same matrix is
            used as the cross-entropy target.
        y_pred: Score matrix of shape (batch, n_classes).
            NOTE(review): presumably cosine similarities between the
            normalised embedding and class weights - confirm against the model.
        scale: Multiplier applied to the margin-adjusted scores (``s``).
        margin: Value subtracted from the ground-truth score (``m``).

    Returns:
        Scalar tensor: cross-entropy averaged over the batch.
    """
    # Work in the dtype of the predictions so the loss is not tied to float32.
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    m = tf.cast(margin, y_pred.dtype, name='m')
    s = tf.cast(scale, y_pred.dtype, name='s')

    # Class index of every sample, as a column vector.
    label = tf.reshape(tf.argmax(y_true, axis=-1), shape=(-1, 1))
    label = tf.cast(label, dtype=tf.int32)
    # Row numbers 0 .. batch_size-1, as a column vector.
    batch_range = tf.reshape(tf.range(tf.shape(y_pred)[0]), shape=(-1, 1))
    # (row, class) pairs addressing the ground-truth score of each sample.
    indices_of_groundtruth = tf.concat([batch_range, label], axis=1)
    groundtruth_score = tf.gather_nd(y_pred, indices_of_groundtruth)

    # The margin is applied only where the ground-truth score exceeds it;
    # elsewhere the subtracted value is zero.
    added_margin = tf.cast(tf.greater(groundtruth_score, m), dtype=y_pred.dtype) * m
    added_margin = tf.reshape(added_margin, shape=(-1, 1))
    # s * (score_y - m) for the ground-truth class, s * score_j for the others.
    scaled_logits = tf.subtract(y_pred, y_true * added_margin) * s

    cross_ent = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=scaled_logits)
    return tf.reduce_mean(cross_ent)

In [3]:
# Dataset root. The default is the original local location; set the
# VOX2_DATA_DIR environment variable to run the notebook on another machine.
path = Path(os.environ.get('VOX2_DATA_DIR', 'C:\\datasets_from_D\\train_vox2_2half\\wav_vad\\0\\'))

# Every .wav file below the root, sorted with natsorted so that the seeded
# split below always receives the files in the same order.
all_audio = natsorted(path.rglob('*.wav'))
if not all_audio:
    # Fail here with a readable message instead of inside train_test_split.
    raise ValueError(f'No .wav files found under {path}')

# Seeded, shuffled 80/20 split into training and held-out file lists.
train_audio, test_audio = train_test_split(all_audio, test_size=0.2, random_state=42, shuffle=True)
print(len(train_audio), len(test_audio))

37776 9445


Prepare data generators for training

In [4]:
# Optional smoke test on a small subset (disabled):
#   data = DataGenerator(train_audio[:20])
#   d, cl = next(iter(data))
#   print(d.shape, cl)

# One generator per split, built in the same order: training files first,
# held-out files second.
train_gen, val_gen = (
    DataGenerator(file_list) for file_list in (train_audio, test_audio)
)
print(f'Lenght train_gen {len(train_gen)} batches')

Lenght train_gen 2276 batches


## Model from article "angular softmax speaker verification"


In [5]:
# # If this does not work: 1 - use recurrent layers, 2 - use Conv2D, 3 - replace the AMSoftmax layer with the Arc layer, 4 - use the amsoftmax loss instead of the layer.
# from tensorflow.keras import regularizers
# def create(input_shape=(301, 23)):
#     input_1 = Input(shape=input_shape)
#
#     #features block
#     #1
#     x = Conv1D(23, kernel_size=5, name='Conv_1')(input_1)
#     x = PReLU()(x)
#     x = BatchNormalization(momentum=0.99, name='BN_1')(x)
#     x = MaxPooling1D(pool_size=3, name='MP_1')(x)
#     # x = Conv1D(128, kernel_size=3, activation='relu')(x) kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4)
#     # x = Conv1D(256, kernel_size=3, activation='relu')(x)
#     # x = LSTM(128, return_sequences=True)(x)
#     x = TimeDistributed(Dense(23, activation='relu', name="TD_1"))(x)
#     # x = BatchNormalization(momentum=0.99, name='BN_1')(x)
#
#     #2
#     x = Conv1D(512, 3, activation='relu', name='Conv_2')(x)
#     x = PReLU()(x)
#     x = BatchNormalization(momentum=0.95, name='BN_2')(x)
#     x = MaxPooling1D(pool_size=3, name='MP_2')(x)
#     #x = TimeDistributed(Dense(512, activation='relu', name="TD_2"))(x)
#     # x = BatchNormalization(momentum=0.95, name='BN_2')(x)
#
#     #3
#     x = Conv1D(512, 3, activation='relu', name='Conv_3')(x)
#     x = PReLU()(x)
#     x = BatchNormalization(momentum=0.99, name='BN_3')(x)
#     x = MaxPooling1D(pool_size=3, name='MP_3')(x)
#     #x = TimeDistributed(Dense(512, activation='relu', name="TD_3"))(x)
#     # x = BatchNormalization(momentum=0.99, name='BN_3')(x)
#
#     #4
#     x = Conv1D(512, 1, activation='relu', name='Conv_4')(x)
#     x = PReLU()(x)
#     x = BatchNormalization(momentum=0.95, name='BN_4')(x)
#     x = MaxPooling1D(pool_size=3, name='MP_4')(x)
#     #x = TimeDistributed(Dense(512, activation='relu', name="TD_4"))(x)
#     # x = BatchNormalization(momentum=0.95, name='BN_4')(x)
#
#     #5
#     x = Conv1D(1500, 1, activation='relu', name='Conv_5')(x)
#     x = PReLU()(x)
#     # x = TimeDistributed(Dense(1500, activation='relu', name="TD_5"))(x)
#     #x = BatchNormalization(momentum=0.99, name='BN_5')(x)
#
#     #Statistic block
#     x = LayerNormalization(name='LNorm')(x) # layerNorm with mean = 0, std = 1, standardization before averaging
#     x = AveragePooling1D(name='AVGP')(x)
#
#     #Recurent layers block
#     x = GRU(256,recurrent_activation=None,return_sequences=True)(x)
#     x = BatchNormalization()(x)
#     x = GRU(256,recurrent_activation=None,return_sequences=False)(x)
#     x = BatchNormalization()(x)
#     x = PReLU()(x)
#
#     # Classification block
#     # x = Flatten(name='Flt')(x)
#     x = Dropout(0.5, name='Drop_1')(x)
#     x = Dense(3000, activation='relu', name='D_1')(x)
#     x = BatchNormalization(momentum=0.95, name='BN_6')(x)
#     # x = Dense(512, activation='relu', use_bias=False, name='D_2')(x)
#     # x = BatchNormalization(momentum=0.95, name='BN_7')(x)
#     x = Lambda(lambda c: K.l2_normalize(c, axis=-1))(x) # l2 normalization ?
#     embed = Dense(300, activation='relu', name='embed')(x)
#     outputs = Dense(1000, activation='softmax')(embed)  #  AMSoftmax(1000, s=35, m=0.63, name='AMSoftmax')(embed)    Dense(1000, activation='softmax')(embed)
#
#     out_model = Model(input_1, outputs)
#     out_model.compile(loss=amsoftmax_loss, optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
#
#     return out_model
#
# model = create()
# model.summary()

In [10]:
def create(input_shape=(301, 23), n_classes=1000):
    """Build and compile the frame-wise (TimeDistributed) classifier with an ArcFace head.

    Args:
        input_shape: Shape of one input feature matrix.
            NOTE(review): the default (301, 23) presumably means
            (frames, MFCC coefficients) - confirm against DataGenerator.
        n_classes: Number of target classes. Sets the width of the label
            input and is passed to the ArcFace layer.

    Returns:
        A compiled Keras ``Model`` that takes ``[features, label]`` and
        returns the output of the ArcFace layer. The 300-dimensional
        embedding is the output of the layer named ``'embed'``.
    """
    input_1 = Input(shape=input_shape)
    # Label vector fed to the ArcFace layer as its second input
    # (presumably one-hot - verify against DataGenerator).
    label = Input(shape=(n_classes,))

    # Feature block: one temporal convolution over the input frames.
    x = Conv1D(23, kernel_size=5, name='Conv_1')(input_1)
    x = PReLU()(x)
    x = BatchNormalization(momentum=0.99, name='BN_1')(x)

    # Frame-wise dense stack. Each wrapper gets its own unique name; the name
    # is set on the TimeDistributed wrapper because that is the layer the
    # model registers.
    x = TimeDistributed(Dense(23, activation='relu'), name='TD_1')(x)
    x = BatchNormalization(momentum=0.99, name='BN_2')(x)
    x = TimeDistributed(Dense(512, activation='relu'), name='TD_2')(x)
    x = TimeDistributed(Dense(1500, activation='relu'), name='TD_3')(x)
    x = BatchNormalization(momentum=0.99, name='BN_3')(x)
    x = TimeDistributed(Dense(3000, activation='relu'), name='TD_4')(x)
    # NOTE(review): TD_4 already applies ReLU, so this LeakyReLU only sees
    # non-negative values and acts as the identity. Kept so the layer list is
    # unchanged; consider dropping it or removing the ReLU above.
    x = LeakyReLU()(x)

    # Averaging block: normalise each frame, then average over the time axis.
    x = LayerNormalization()(x)
    x = GlobalAveragePooling1D(name='GLAB_1')(x)

    # Classification block.
    x = Dense(3000, activation='relu', name='D_1')(x)
    x = Dropout(0.3, name='Drop_1')(x)
    x = BatchNormalization(momentum=0.95, name='BN_7')(x)
    x = LeakyReLU(alpha=0.1)(x)

    # Linear embedding layer (no activation).
    embed = Dense(300, name='embed')(x)

    # Output through the custom margin layer.
    # NOTE(review): 'categorical_crossentropy' assumes the ArcFace layer
    # returns class probabilities - confirm in loss_layers.
    outputs = ArcFace(n_classes=n_classes)([embed, label])

    model = Model([input_1, label], outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

    return model

my_model = create()
my_model.summary()


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 301, 23)]    0           []                               
                                                                                                  
 Conv_1 (Conv1D)                (None, 297, 23)      2668        ['input_7[0][0]']                
                                                                                                  
 p_re_lu_3 (PReLU)              (None, 297, 23)      6831        ['Conv_1[0][0]']                 
                                                                                                  
 BN_1 (BatchNormalization)      (None, 297, 23)      92          ['p_re_lu_3[0][0]']              
                                                                                            

In [7]:
# Example: TimeDistributed wrapping a Conv2D layer
# inputs = tf.keras.Input(shape=(10, 128, 128, 3))
# conv_2d_layer = tf.keras.layers.Conv2D(64, (3, 3))
# outputs = tf.keras.layers.TimeDistributed(conv_2d_layer)(inputs)
# outputs.shape


**Callbacks**

In [8]:
# Checkpoint callback: a file is written only when val_loss improves; the
# epoch number and both losses are formatted into the file name.
sv_mod = ModelCheckpoint(
    filepath=(
        'D:\\Andrey\\data\\weights\\ARC_mffc_img\\'
        'model_TDL\\modelTD_b32_301_23_0f_ohe_{epoch:02d}-loss-{loss:.4f}_val_los-{val_loss:.4f}.hdf5'
    ),
    save_best_only=True,
    monitor='val_loss',
    mode='min',
)

# Learning-rate schedule: multiply the rate by 0.9 after 6 epochs without a
# val_loss improvement, never going below 1e-5.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.9,
    patience=6,
    min_lr=1e-5,
    verbose=1,
)

**train**

In [None]:
# Train for up to 100 epochs, validating on the held-out generator and
# running the checkpoint and learning-rate callbacks.
history = my_model.fit(
    x=train_gen,
    validation_data=val_gen,
    epochs=100,
    callbacks=[sv_mod, learning_rate_reduction],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

In [9]:
#  model.save('D:\Andrey\data\weights\ARC_mffc_img\model_TDL\Model_weights.h5')

In [13]:
#my_model.load_weights('D:\Andrey\data\weights\ARC_mffc_img\model_TDL\modelTD_b32_301_23_0f_ohe_28-loss-6.1041_val_los-6.2124.hdf5')