In [8]:
import os

import numpy as np

import tensorflow as tf
import keras

from pathlib import Path
from IPython.display import display, Audio
import pandas as pd

# Download the dataset from https://www.kaggle.com/kongaevans/speaker-recognition-dataset/

# Upload/extract the dataset so that the "16000_pcm_speeches" folder sits in the notebook's working directory.

In [9]:
DATASET_ROOT = "16000_pcm_speeches"

AUDIO_SUBFOLDER = "audio"

DATASET_AUDIO_PATH = os.path.join(DATASET_ROOT, AUDIO_SUBFOLDER)


VALID_SPLIT = 0.1


SHUFFLE_SEED = 43


SAMPLING_RATE = 16000


BATCH_SIZE = 128
BATCH_SIZE_FOR_VAL = 32
EPOCHS = 1


In [10]:
def paths_and_labels_to_dataset(audio_paths, labels):
    """Constructs A Dataset Of Audios And Labels."""
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(
        lambda x: path_to_audio(x), num_parallel_calls=tf.data.AUTOTUNE
    )
    label_ds = tf.data.Dataset.from_tensor_slices(labels)

    # ZIP For Combine 2 Separate Datasets
    return tf.data.Dataset.zip((audio_ds, label_ds))


def path_to_audio(path):
    """Reads and decodes an audio file."""
    
    # Return Binary Data Of The Audio File 
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio


def audio_to_fft(audio):
    """ Convert the signal from the time domain to frequency domain"""
    
    audio = tf.squeeze(audio, axis=-1)

    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )

    fft = tf.expand_dims(fft, axis=-1)

    # Represents The Positive Frequencies
    return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

In [11]:
# Get The List Of Audio File Paths Along With Their Corresponding Labels
class_names = os.listdir(DATASET_AUDIO_PATH)
print(
    "Our class names: {}".format(
        class_names,
    )
)

Our class names: ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela']


In [12]:
audio_paths = []
labels = []
for label, name in enumerate(class_names):
    print(
        "{}- Processing speaker  {}".format(
            label+1,name
        )
    )

    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]

    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)


print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)
print('*'*40)
print(labels)


1- Processing speaker  Benjamin_Netanyau
2- Processing speaker  Jens_Stoltenberg
3- Processing speaker  Julia_Gillard
4- Processing speaker  Magaret_Tarcher
5- Processing speaker  Nelson_Mandela
Found 7501 files belonging to 5 classes.
****************************************
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
# Shuffle
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

In [14]:
# Split Into Training And Validation
num_val_samples = int(VALID_SPLIT * len(audio_paths))
print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
train_audio_paths = audio_paths[:len(audio_paths) - num_val_samples]
train_labels = labels[:len(audio_paths) - num_val_samples]

Using 6751 files for training.


In [15]:
pd.DataFrame(audio_paths[:len(audio_paths) - num_val_samples])

Unnamed: 0,0
0,16000_pcm_speeches\audio\Jens_Stoltenberg\273.wav
1,16000_pcm_speeches\audio\Magaret_Tarcher\1253.wav
2,16000_pcm_speeches\audio\Jens_Stoltenberg\1225...
3,16000_pcm_speeches\audio\Nelson_Mandela\646.wav
4,16000_pcm_speeches\audio\Nelson_Mandela\1435.wav
...,...
6746,16000_pcm_speeches\audio\Julia_Gillard\555.wav
6747,16000_pcm_speeches\audio\Jens_Stoltenberg\1322...
6748,16000_pcm_speeches\audio\Jens_Stoltenberg\1457...
6749,16000_pcm_speeches\audio\Julia_Gillard\173.wav


In [16]:
print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[-num_val_samples:]
valid_labels = labels[-num_val_samples:]

Using 750 files for validation.


In [17]:
pd.DataFrame(audio_paths[-num_val_samples:])

Unnamed: 0,0
0,16000_pcm_speeches\audio\Benjamin_Netanyau\124...
1,16000_pcm_speeches\audio\Jens_Stoltenberg\259.wav
2,16000_pcm_speeches\audio\Julia_Gillard\1082.wav
3,16000_pcm_speeches\audio\Magaret_Tarcher\830.wav
4,16000_pcm_speeches\audio\Jens_Stoltenberg\97.wav
...,...
745,16000_pcm_speeches\audio\Nelson_Mandela\1179.wav
746,16000_pcm_speeches\audio\Jens_Stoltenberg\391.wav
747,16000_pcm_speeches\audio\Jens_Stoltenberg\371.wav
748,16000_pcm_speeches\audio\Julia_Gillard\1350.wav


In [18]:
# Create 2 Datasets, One For Training And The Other For Validation
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
    BATCH_SIZE
)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=BATCH_SIZE_FOR_VAL * 8, seed=SHUFFLE_SEED).batch(
    BATCH_SIZE_FOR_VAL
)

In [19]:
# Transform Audio Wave To The Frequency Domain Using `audio_to_fft` function
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)

# Prefetching Buffer Size Based On The Available Memory 
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)


In [123]:
def residual_block(x, filters, conv_num=3, activation="relu"):
    s = keras.layers.Conv1D(filters, 1, padding="same")(x)
    for i in range(conv_num - 1):
        x = keras.layers.Conv1D(filters, 3, padding="same")(x)
        x = keras.layers.Activation(activation)(x)
    x = keras.layers.Conv1D(filters, 3, padding="same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation(activation)(x)
    return keras.layers.MaxPool1D(pool_size=2, strides=2)(x)


def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape=input_shape, name="input")

    x = residual_block(inputs, 16, 2)
    x = residual_block(x, 32, 2)
    x = residual_block(x, 64, 3)
    x = residual_block(x, 128, 3)
    x = residual_block(x, 128, 3)

    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dense(128, activation="relu")(x)

    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)


In [126]:
model = build_model((8000, 1), len(class_names))

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input (InputLayer)          [(None, 8000, 1)]            0         []                            
                                                                                                  
 conv1d_55 (Conv1D)          (None, 8000, 16)             64        ['input[0][0]']               
                                                                                                  
 activation_39 (Activation)  (None, 8000, 16)             0         ['conv1d_55[0][0]']           
                                                                                                  
 conv1d_56 (Conv1D)          (None, 8000, 16)             784       ['activation_39[0][0]']       
                                                                                            

In [None]:
model.compile(
    optimizer="Adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


model_save_filename = "model.keras"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

In [81]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)




In [82]:
print(model.evaluate(valid_ds))

[0.07391063123941422, 0.984000027179718]


In [2]:
from keras.models import load_model

# Load the saved model
loaded_model = load_model('model.keras')







In [None]:
import librosa

In [58]:
def predict_single_audio(file_path):
    # Load The Audio File
    audio, sr = librosa.load(file_path, sr=SAMPLING_RATE, mono=True, duration=3)


    # Reshape The Audio To The (None,16000,1) shape
    audio = audio.reshape(-1, 16000, 1) 

    audio_fft = audio_to_fft(audio)

    prediction = loaded_model.predict(audio_fft)
    predicted_class = np.argmax(prediction)

    if predicted_class > len(class_names):
        return False

    return class_names[predicted_class]

In [59]:
file_path = '4.wav'  
predicted_speaker = predict_single_audio(file_path)
print("Accepted " if predicted_speaker in class_names else " Rejected" )


Accepted 
