### Speaker Recognition

Использование одномерной сверточной сети с остаточными соединениями для классификации звука.

Что делаем?  
1. Подготавливаем данные.
2. Применяем быстрое преобразование Фурье (БПФ).
3. Обучаем одномерную сверточную сеть предсказывать правильного говорящего по выборке, обработанной БПФ.

### Import

In [1]:
# import os
from pathlib import Path

import numpy as np
import pandas as pd
from tensorflow import keras

# import preparation as pr
import create_dataframe as cd
import dataset_generation as dg
import model as model

### Setup

In [11]:
# Upper bound on the number of training epochs (passed to `model.fit`).
EPOCHS = 100

# Folder handed to `dg.generate_sets` when the datasets are built.
DATASET_AUDIO_PATH = Path('data/splitted/')
# Folder referenced only by the (currently commented-out) preparation step.
FIRST_FOLDER_PATH = Path('data/clips/')

### Create new dataframe

In [3]:
#df = cd.create()

In [4]:
# Load the training metadata table from CSV; its 'client_id' column is what the
# class labels are derived from in the next cell.
df = pd.read_csv('data/main_train.csv')

In [5]:
# Distinct client ids as a NumPy array, in `value_counts` order (most frequent
# first by default).
labels = df['client_id'].value_counts().index.to_numpy(copy=True)

### Preparation

In [6]:
#pr.preparation(FIRST_FOLDER_PATH, df)

### Dataset generation

1. Соединяем label и name.
2. Перемешиваем.
3. Разбиваем на обучающую (training) и валидационную (validation) выборки.
4. Преобразуем звуковую волну в частотную область с помощью Быстрого преобразования Фурье

In [7]:
# Build the training and validation datasets from the class labels and the audio
# folder.
# NOTE(review): presumably this performs the shuffle / split / FFT steps listed
# above — confirm in dataset_generation.generate_sets.
train, val = dg.generate_sets(labels, DATASET_AUDIO_PATH)


Found 4000 files belonging to 20 classes.
Using 3600 files for training.
Using 400 files for validation.


### Model definition

In [8]:
# Re-import the project module under its own alias. The assignment below rebinds
# the notebook-level name `model` from the imported module to the network it
# builds, so calling `model.start_model(...)` directly only works the first time
# this cell is run; going through the alias keeps the cell re-runnable.
import model as model_module

model = model_module.start_model(labels)

model.summary()

# Compile the model using Adam's default learning rate
model.compile(
    optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Add callbacks:
# 'EarlyStopping' stops training once the model no longer improves. No `monitor`
#   is passed, so the Keras default (`val_loss`) is watched, with a patience of
#   10 epochs; the best weights are restored when training stops.
# 'ModelCheckpoint' saves the model whenever `val_accuracy` reaches a new best.
#   The file name is formatted with the epoch number.
model_save_filename = "checkpoints32/model-{epoch:02d}.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 24000, 1)]   0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 24000, 16)    64          input[0][0]                      
__________________________________________________________________________________________________
activation (Activation)         (None, 24000, 16)    0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 24000, 16)    784         activation[0][0]                 
______________________________________________________________________________________________

In [9]:
#model.save('best_result.h5')

### Training

In [None]:
# Fit on the training dataset for at most EPOCHS epochs, validating on `val`;
# the early-stopping and checkpoint callbacks are attached to the run.
history = model.fit(
    x=train,
    validation_data=val,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
    epochs=EPOCHS,
)

In [None]:
# Replace the in-memory model with the one stored in 'best_result.h5'.
# NOTE(review): the only visible writer of this file is the commented-out
# `model.save('best_result.h5')` cell, while the checkpoint callback writes to
# 'checkpoints32/' — confirm the file exists before running this cell.
model = keras.models.load_model('best_result.h5')

### Demonstration

In [112]:
# Show a few validation clips together with the true and the predicted speaker.
#
# NOTE(review): `to_dataset`, `valid_audio_paths`, `valid_labels`, `audio_to_fft`,
# `people_ids`, `Audio`, `BATCH_SIZE`, `SHUFFLE_SEED` and `SAMPLING_RATE` are not
# defined or imported in any cell visible in this notebook; they have to be
# provided before this cell can run on a fresh kernel.
SAMPLES_TO_DISPLAY = 10

test = to_dataset(valid_audio_paths, valid_labels)
test = test.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(BATCH_SIZE)

# The loop variables are deliberately not called `labels`: that name holds the
# notebook-level class array built earlier and must not be overwritten here.
for batch_audios, batch_labels in test.take(1):
    # Get the signal FFT
    ffts = audio_to_fft(batch_audios)
    # Predict
    y_pred = model.predict(ffts)
    # Take random samples (with replacement). Indices are bounded by the real
    # batch length, which can be smaller than BATCH_SIZE for a short dataset.
    rnd = np.random.randint(0, len(y_pred), SAMPLES_TO_DISPLAY)
    sample_audios = batch_audios.numpy()[rnd, :, :]
    sample_labels = batch_labels.numpy()[rnd]
    sample_preds = np.argmax(y_pred, axis=-1)[rnd]

    for index in range(SAMPLES_TO_DISPLAY):
        # Only the first four characters of each id are printed.
        print(
            "Speaker: {} - Predicted: {}".format(
                people_ids[sample_labels[index]][:4],
                people_ids[sample_preds[index]][:4],
            )
        )
        display(Audio(sample_audios[index, :, :].squeeze(), rate=SAMPLING_RATE))

48000
audio_ds: 800
label_ds: 800
Speaker: 2378 - Predicted: 2378


Speaker: c999 - Predicted: c999


Speaker: 6f4f - Predicted: 6f4f


Speaker: 8213 - Predicted: 8213


Speaker: 99ff - Predicted: 99ff


Speaker: 7d37 - Predicted: 7d37


Speaker: 560d - Predicted: 560d


Speaker: bed5 - Predicted: bed5


Speaker: 6a10 - Predicted: 6a10


Speaker: 9bf6 - Predicted: 9bf6
