In [10]:
import pandas as pd
import numpy as np
import os
import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split


In [11]:
# Index file for the dataset: each row pairs the path of a pre-extracted .npy
# feature file with that sample's gender label.
df = pd.read_csv("balanced-all.csv")
# Preview the first rows to check the columns.
df.head()

Unnamed: 0,filename,gender
0,data/cv-other-train/sample-069205.npy,female
1,data/cv-valid-train/sample-063134.npy,female
2,data/cv-other-train/sample-080873.npy,female
3,data/cv-other-train/sample-105595.npy,female
4,data/cv-valid-train/sample-144613.npy,female


In [12]:
# Class-balance check: overall row count plus the number of rows per gender label.
gender_column = df['gender']
n_samples = len(df)
n_male_samples = int((gender_column == 'male').sum())
n_female_samples = int((gender_column == 'female').sum())
# Report the three counts, one per line.
for caption, count in (
    ("Total samples:", n_samples),
    ("Total male samples:", n_male_samples),
    ("Total female samples:", n_female_samples),
):
    print(caption, count)

Total samples: 66938
Total male samples: 33469
Total female samples: 33469


In [13]:
# Integer class codes used for the training labels.
label2int = {
    "male": 1,
    "female": 0
}

def load_data(vector_length=128):
    """Load the gender recognition dataset as a feature matrix and a label column.

    The first call reads every per-sample ``.npy`` feature file listed in
    ``balanced-all.csv`` and bundles the result into ``results/features.npy`` and
    ``results/labels.npy``. Later calls load those two bundles directly, which is
    much faster. A cached bundle is only reused when its shape matches
    ``vector_length``; otherwise it is rebuilt and overwritten.

    Args:
        vector_length: Number of features per sample (width of the returned ``X``).

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_samples, vector_length)`` and
        ``y`` has shape ``(n_samples, 1)`` holding ``label2int`` codes
        (1 for male, 0 for female).
    """
    features_path = "results/features.npy"
    labels_path = "results/labels.npy"
    # make sure results folder exists (exist_ok: no failure when it is already there)
    os.makedirs("results", exist_ok=True)
    # if features & labels were already bundled by a previous run, load them from there instead
    if os.path.isfile(features_path) and os.path.isfile(labels_path):
        X = np.load(features_path)
        y = np.load(labels_path)
        # only trust the cache when it has the requested width and one label per row;
        # otherwise a bundle built with another vector_length would be returned silently
        if X.ndim == 2 and X.shape[1] == vector_length and len(X) == len(y):
            return X, y
        print("Cached features do not match the requested vector length, rebuilding them...")
    # read dataframe
    df = pd.read_csv("balanced-all.csv")
    # get total samples and the per-class counts
    n_samples = len(df)
    n_male_samples = int((df['gender'] == 'male').sum())
    n_female_samples = int((df['gender'] == 'female').sum())
    print("Total samples:", n_samples)
    print("Total male samples:", n_male_samples)
    print("Total female samples:", n_female_samples)
    # pre-allocate the feature matrix and the label column (1 for male and 0 for female)
    X = np.zeros((n_samples, vector_length))
    y = np.zeros((n_samples, 1))
    for i, (filename, gender) in tqdm.tqdm(enumerate(zip(df['filename'], df['gender'])), "Loading data", total=n_samples):
        # each file is expected to hold one feature vector that fits a row of X
        X[i] = np.load(filename)
        y[i] = label2int[gender]
    # save the audio features and labels into files
    # so we won't load each one of them next run
    np.save(features_path, X)
    np.save(labels_path, y)
    return X, y

In [14]:
def split_data(X, y, test_size=0.1, valid_size=0.1, random_state=7):
    """Split features and labels into training, validation and testing sets.

    The test set is carved out of the full data first; the validation set is then
    carved out of what remains. ``valid_size`` is therefore a fraction of the
    post-test remainder, not of the whole dataset.

    Args:
        X: Feature array, one row per sample.
        y: Label array aligned with ``X``.
        test_size: Passed to the first ``train_test_split`` call.
        valid_size: Passed to the second ``train_test_split`` call.
        random_state: Seed used for both shuffles; the default of 7 keeps the
            split that earlier runs produced.

    Returns:
        Dict with keys ``X_train``, ``X_valid``, ``X_test``, ``y_train``,
        ``y_valid`` and ``y_test``.
    """
    # split off the testing set
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # split the remainder into training set and validation set
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=valid_size, random_state=random_state
    )
    # return a dictionary of values
    return {
        "X_train": X_train,
        "X_valid": X_valid,
        "X_test": X_test,
        "y_train": y_train,
        "y_valid": y_valid,
        "y_test": y_test
    }

In [15]:
# Build (or reload from the results/ cache) the feature matrix X and the label column y.
X, y = load_data()
# Carve out the test split first, then a validation split from the remaining rows;
# `data` is a dict keyed by X_train / X_valid / X_test / y_train / y_valid / y_test.
data = split_data(X, y, test_size=0.1, valid_size=0.1)

In [16]:
# MLP
def create_mlp_model(vector_length=128):
    """Build and compile a fully connected binary classifier.

    The stack is Dense 256 -> 256 -> 128 -> 128 -> 64, each followed by Dropout(0.3),
    then a single sigmoid unit (0 means female, 1 means male). Not the best model.

    NOTE(review): the first Dense(256) is created without an activation, unlike the
    other hidden layers. Confirm whether that is intended before changing it, since
    weights already saved from this architecture depend on it.

    Args:
        vector_length: Number of input features per sample.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    dropout_rate = 0.3
    # input layer: the only hidden layer declared without an activation
    stack = [Dense(256, input_shape=(vector_length,)), Dropout(dropout_rate)]
    # remaining hidden layers, each paired with dropout
    for units in (256, 128, 128, 64):
        stack.append(Dense(units, activation="relu"))
        stack.append(Dropout(dropout_rate))
    # one output neuron with sigmoid activation function
    stack.append(Dense(1, activation="sigmoid"))
    model = Sequential(stack)
    # binary crossentropy, since this is a two-class (male/female) problem
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.summary()
    return model

In [17]:
# Build and compile the MLP; create_mlp_model() also prints the layer summary.
model = create_mlp_model()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 256)               33024     
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 256)               65792     
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_8 (Dense)             (None, 128)               32896     
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 128)              

In [18]:
# Callbacks: TensorBoard logging under logs/mlp_logs/, plus early stopping that gives up
# after 7 epochs without improvement in validation loss and restores the best weights.
mlp_tensorboard = TensorBoard(log_dir='logs/mlp_logs/')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=7, restore_best_weights=True)

# Training budget; early stopping normally ends the run before `epochs` is reached.
batch_size = 64
epochs = 100
# Fit on the training split and validate on the validation split after every epoch.
model.fit(
    x=data["X_train"],
    y=data["y_train"],
    validation_data=(data["X_valid"], data["y_valid"]),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[mlp_tensorboard, early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


<keras.callbacks.History at 0x22bb66e3460>

In [19]:
# Persist the trained MLP under results/ (the folder is created by load_data()).
model.save("results/model.h5")

In [20]:
from keras.layers import Conv1D, MaxPooling1D, Flatten

def create_cnn_model(vector_length=128):
    """Build and compile a small 1D convolutional binary classifier.

    Two Conv1D + MaxPooling1D + Dropout stages (64 then 128 filters, kernel size 3)
    feed a flattened Dense 128 -> 64 head and a single sigmoid output unit.

    Args:
        vector_length: Length of the input vector; the model input shape is
            ``(vector_length, 1)``.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    dropout_rate = 0.3
    model = Sequential([
        # convolutional stage 1
        Conv1D(64, kernel_size=3, activation='relu', input_shape=(vector_length, 1)),
        MaxPooling1D(pool_size=2),
        Dropout(dropout_rate),
        # convolutional stage 2
        Conv1D(128, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(dropout_rate),
        # dense head
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(dropout_rate),
        Dense(64, activation="relu"),
        Dropout(dropout_rate),
        # output layer
        Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.summary()
    return model


In [21]:
# Build and compile the 1D CNN; create_cnn_model() also prints the layer summary.
cnn_model = create_cnn_model()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 126, 64)           256       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 63, 64)           0         
 )                                                               
                                                                 
 dropout_10 (Dropout)        (None, 63, 64)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 61, 128)           24704     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 30, 128)          0         
 1D)                                                             
                                                                 
 dropout_11 (Dropout)        (None, 30, 128)          

In [22]:
# Callbacks: TensorBoard logging under logs/cnn_logs/, plus early stopping that gives up
# after 7 epochs without improvement in validation loss and restores the best weights.
cnn_tensorboard = TensorBoard(log_dir='logs/cnn_logs/')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=7, restore_best_weights=True)

batch_size = 64
epochs = 100
# NOTE(review): X_train is 2-D here while the CNN declares a trailing channel axis of
# size 1; the recorded run trained without error, so Keras appears to add it. Confirm.
cnn_model.fit(
    x=data["X_train"],
    y=data["y_train"],
    validation_data=(data["X_valid"], data["y_valid"]),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[cnn_tensorboard, early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


<keras.callbacks.History at 0x22b99aea7c0>

In [23]:
from keras.layers import SimpleRNN

def create_rnn_model(vector_length=128):
    """Build and compile a two-layer SimpleRNN binary classifier.

    A SimpleRNN(128) that returns the full sequence feeds a SimpleRNN(64) that
    returns only its last output, followed by Dense(64) and a single sigmoid unit.
    Every hidden layer is followed by Dropout(0.3).

    Args:
        vector_length: Number of time steps; the model input shape is
            ``(vector_length, 1)``.

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    dropout_rate = 0.3
    model = Sequential([
        # recurrent stack
        SimpleRNN(128, input_shape=(vector_length, 1), return_sequences=True),
        Dropout(dropout_rate),
        SimpleRNN(64, return_sequences=False),
        Dropout(dropout_rate),
        # dense head
        Dense(64, activation="relu"),
        Dropout(dropout_rate),
        # output layer
        Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.summary()
    return model


In [24]:
# Build and compile the SimpleRNN model; create_rnn_model() also prints the layer summary.
rnn_model = create_rnn_model()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 128, 128)          16640     
                                                                 
 dropout_14 (Dropout)        (None, 128, 128)          0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                12352     
                                                                 
 dropout_15 (Dropout)        (None, 64)                0         
                                                                 
 dense_15 (Dense)            (None, 64)                4160      
                                                                 
 dropout_16 (Dropout)        (None, 64)                0         
                                                                 
 dense_16 (Dense)            (None, 1)                

In [25]:
# Callbacks: TensorBoard logging under logs/rnn_logs, plus early stopping that gives up
# after 7 epochs without improvement in validation loss and restores the best weights.
rnn_tensorboard = TensorBoard(log_dir='logs/rnn_logs')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=7, restore_best_weights=True)

batch_size = 64
epochs = 100
# Fit on the training split and validate on the validation split after every epoch.
rnn_model.fit(
    x=data["X_train"],
    y=data["y_train"],
    validation_data=(data["X_valid"], data["y_valid"]),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[rnn_tensorboard, early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

KeyboardInterrupt: 

In [26]:
from keras.layers import MultiHeadAttention, LayerNormalization, Add, Dense, Dropout, Input, Reshape, Flatten
from keras.models import Model

def create_transformer_model(vector_length=128, num_heads=4):
    """Build and compile a small self-attention binary classifier.

    The input vector is projected by a Dense layer, reshaped into ``vector_length``
    tokens of one feature each, passed through multi-head self-attention, flattened
    and projected back to ``vector_length``. That result is added to the first
    projection (residual connection), layer-normalized, and fed to a Dense(64) +
    Dropout(0.3) head with a single sigmoid output unit.

    Args:
        vector_length: Number of input features per sample.
        num_heads: Number of attention heads.

    Returns:
        The compiled functional Model (its summary is printed as a side effect).
    """
    inputs = Input(shape=(vector_length,))

    # Project the raw features, then view them as a sequence for the attention layer
    projected = Dense(vector_length)(inputs)
    tokens = Reshape((vector_length, 1))(projected)

    # Self-attention: the same sequence serves as query and value
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=vector_length)(tokens, tokens)
    attended = Flatten()(attended)
    # Bring the flattened attention output back to the width of `projected`
    attended = Dense(vector_length)(attended)

    # Residual connection followed by layer normalization
    residual = Add()([projected, attended])
    residual = LayerNormalization()(residual)

    # Classification head
    hidden = Dense(64, activation="relu")(residual)
    hidden = Dropout(0.3)(hidden)
    prediction = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=inputs, outputs=prediction)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    model.summary()
    return model


In [27]:
# Build and compile the attention-based model; create_transformer_model() also prints the layer summary.
t_model = create_transformer_model()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 dense_17 (Dense)               (None, 128)          16512       ['input_1[0][0]']                
                                                                                                  
 reshape (Reshape)              (None, 128, 1)       0           ['dense_17[0][0]']               
                                                                                                  
 multi_head_attention (MultiHea  (None, 128, 1)      3585        ['reshape[0][0]',                
 dAttention)                                                      'reshape[0][0]']            

In [28]:
# Callbacks: TensorBoard logging under logs/t_logs, plus early stopping that gives up
# after 7 epochs without improvement in validation loss and restores the best weights.
transformer_tensorboard = TensorBoard(log_dir='logs/t_logs')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=7, restore_best_weights=True)

batch_size = 64
epochs = 100
# Fit on the training split and validate on the validation split after every epoch.
t_model.fit(
    x=data["X_train"],
    y=data["y_train"],
    validation_data=(data["X_valid"], data["y_valid"]),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[transformer_tensorboard, early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


<keras.callbacks.History at 0x22dc4a9f370>

In [None]:
# To inspect the training curves, run this in a terminal: tensorboard --logdir="logs"

In [None]:
# Score the MLP on the testing split. `model` must still be the MLP here;
# later cells rebind that name to other architectures.
test_features, test_labels = data["X_test"], data["y_test"]
print(f"Evaluating the multilayered perceptron model using {len(data['X_test'])} samples...")
loss, accuracy = model.evaluate(test_features, test_labels, verbose=0)
print(f"Loss: {loss:.4f}", f"Accuracy: {accuracy*100:.2f}%", sep="\n")

Evaluating the multilayered perceptron model using 6694 samples...
Loss: 0.2233
Accuracy: 92.01%


In [None]:
# Score the 1D CNN on the testing split.
test_features, test_labels = data["X_test"], data["y_test"]
print(f"Evaluating the cnn model using {len(data['X_test'])} samples...")
loss, accuracy = cnn_model.evaluate(test_features, test_labels, verbose=0)
print(f"Loss: {loss:.4f}", f"Accuracy: {accuracy*100:.2f}%", sep="\n")

Evaluating the cnn model using 6694 samples...
Loss: 0.1261
Accuracy: 95.31%


In [None]:
# Score the SimpleRNN model on the testing split.
test_features, test_labels = data["X_test"], data["y_test"]
print(f"Evaluating the rnn model using {len(data['X_test'])} samples...")
loss, accuracy = rnn_model.evaluate(test_features, test_labels, verbose=0)
print(f"Loss: {loss:.4f}", f"Accuracy: {accuracy*100:.2f}%", sep="\n")

Evaluating the rnn model using 6694 samples...
Loss: 0.6840
Accuracy: 53.20%


In [None]:
# Score the attention-based model on the testing split.
test_features, test_labels = data["X_test"], data["y_test"]
print(f"Evaluating the transformer model using {len(data['X_test'])} samples...")
loss, accuracy = t_model.evaluate(test_features, test_labels, verbose=0)
print(f"Loss: {loss:.4f}", f"Accuracy: {accuracy*100:.2f}%", sep="\n")

Evaluating the transformer model using 6694 samples...
Loss: 0.2122
Accuracy: 92.25%


In [None]:
# Persist the CNN, RNN and attention models under results/, one .h5 file per model.
for trained_model, target_path in (
    (cnn_model, "results/cnn_model.h5"),
    (rnn_model, "results/rnn_model.h5"),
    (t_model, "results/t_model.h5"),
):
    trained_model.save(target_path)


In [None]:
import librosa
import numpy as np

# extracts voice features from a .wav file

def extract_feature(file_name, **kwargs):
    """
    Build one flat feature vector for the audio file `file_name`.

    Each feature is switched on with a truthy keyword argument:
        - mfcc:     MFCC, 40 coefficients
        - chroma:   Chroma, computed from the magnitude STFT
        - mel:      MEL Spectrogram Frequency
        - contrast: Spectral contrast, computed from the magnitude STFT
        - tonnetz:  Tonnetz, computed from the harmonic component of the signal

    Every selected feature is averaged over axis 1 of the librosa output and the
    averages are concatenated in the fixed order listed above, regardless of the
    order of the keyword arguments. With no feature selected, an empty array is
    returned.

    e.g:
    `features = extract_feature(path, mel=True, mfcc=True)`
    """
    wanted = {key: kwargs.get(key) for key in ("mfcc", "chroma", "mel", "contrast", "tonnetz")}
    X, sample_rate = librosa.core.load(file_name)
    # the magnitude STFT is only needed by the chroma and contrast features
    if wanted["chroma"] or wanted["contrast"]:
        stft = np.abs(librosa.stft(X))

    def averaged(feature_matrix):
        # collapse axis 1 of the librosa output into a single mean per feature row
        return np.mean(feature_matrix.T, axis=0)

    # start from an empty array so the result has the same dtype and shape as
    # when the pieces are stacked one after another
    pieces = [np.array([])]
    if wanted["mfcc"]:
        pieces.append(averaged(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)))
    if wanted["chroma"]:
        pieces.append(averaged(librosa.feature.chroma_stft(S=stft, sr=sample_rate)))
    if wanted["mel"]:
        pieces.append(averaged(librosa.feature.melspectrogram(y=X, sr=sample_rate)))
    if wanted["contrast"]:
        pieces.append(averaged(librosa.feature.spectral_contrast(S=stft, sr=sample_rate)))
    if wanted["tonnetz"]:
        harmonic = librosa.effects.harmonic(X)
        pieces.append(averaged(librosa.feature.tonnetz(y=harmonic, sr=sample_rate)))
    return np.hstack(pieces)

In [None]:
import argparse
parser = argparse.ArgumentParser(description="""Gender recognition script, this will load the model you trained, 
                                    and perform inference on a sample you provide (either using your voice or a file)""")
parser.add_argument("-f", "--file", help="The path to the file, preferred to be in WAV format")
# parse_known_args() tolerates arguments this parser does not define (e.g. ones added
# by a notebook kernel launcher) instead of exiting the process like parse_args() does.
args, _unknown_args = parser.parse_known_args()
# NOTE(review): Jupyter kernels are typically started with "-f <connection-file>.json",
# which this parser reads as --file. Ignore such a value so it is never treated as
# audio; confirm for your frontend.
cli_file = args.file
if cli_file and cli_file.lower().endswith(".json"):
    cli_file = None
# honor --file when given, otherwise fall back to the bundled sample recording
file = cli_file or "my_voice/testwav.wav"
# construct the model
model = create_rnn_model()
# load the saved/trained weights
model.load_weights("results/rnn_model.h5")
if not file or not os.path.isfile(file):
    # if file not provided, or it doesn't exist, fall back to a recording named test.wav
    # (nothing is recorded here; the file has to be placed there beforehand)
    print("Please talk")
    # put the file name here
    file = "test.wav"
# extract the mel features and reshape them into a batch of one sample
features = extract_feature(file, mel=True).reshape(1, -1)
# predict the gender! The sigmoid output is the probability of the "male" class
male_prob = model.predict(features)[0][0]
female_prob = 1 - male_prob
gender = "male" if male_prob > female_prob else "female"
# show the result!
print("Result:", gender)
print(f"Probabilities::: Male: {male_prob*100:.2f}%    Female: {female_prob*100:.2f}%")

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_2 (SimpleRNN)    (None, 128, 128)          16640     
                                                                 
 dropout_38 (Dropout)        (None, 128, 128)          0         
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 64)                12352     
                                                                 
 dropout_39 (Dropout)        (None, 64)                0         
                                                                 
 dense_54 (Dense)            (None, 64)                4160      
                                                                 
 dropout_40 (Dropout)        (None, 64)                0         
                                                                 
 dense_55 (Dense)            (None, 1)               

In [None]:
# Rebuild the attention-based architecture so it can be tested on the recorded voices;
# note that this rebinds `model`, replacing whatever model the name held before.
model = create_transformer_model()
# load the saved/trained weights
model.load_weights("results/t_model.h5")
# feature extraction and prediction for the recordings happen in the next cell

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 dense_79 (Dense)               (None, 128)          16512       ['input_8[0][0]']                
                                                                                                  
 reshape_4 (Reshape)            (None, 128, 1)       0           ['dense_79[0][0]']               
                                                                                                  
 multi_head_attention_9 (MultiH  (None, 128, 1)      3585        ['reshape_4[0][0]',              
 eadAttention)                                                    'reshape_4[0][0]']        

In [None]:
male_file = "my_voice/testwav.wav"

female_file = "my_voice/female.wav"

# Run the same extract -> predict -> report steps on each recording in turn,
# using whichever network `model` currently refers to.
for recording in (male_file, female_file):
    # mel features as a batch of one sample
    features = extract_feature(recording, mel=True).reshape(1, -1)
    # the sigmoid output is the probability of the "male" class
    male_prob = model.predict(features)[0][0]
    female_prob = 1 - male_prob
    gender = "male" if male_prob > female_prob else "female"
    # show the result!
    print("Result:", gender)
    print(f"Probabilities::: Male: {male_prob*100:.2f}%    Female: {female_prob*100:.2f}%")


Result: male
Probabilities::: Male: 96.30%    Female: 3.70%
Result: female
Probabilities::: Male: 29.35%    Female: 70.65%
