In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import pickle

## Load the dataset

In [2]:
# Read the feature table from the local CSV (one row per .wav file, judging by the preview below).
df = pd.read_csv('data/CETUC_Features_data.csv')
df.head()

Unnamed: 0,FileName,nobs,mean,skew,kurtosis,median,mode,std,low,peak,q25,q75,iqr,Gender
0,F003-0616.wav,24,179.803922,-0.453233,-1.54009,205.0,115.0,48.533917,110.294118,240.0,115.0,220.0,105.0,0
1,F000-0823.wav,19,341.034577,-0.369143,0.038573,385.0,450.0,164.184087,30.0,695.0,252.5,447.5,195.0,0
2,M009-0399.wav,29,164.397933,0.87021,-0.357093,120.0,15.0,155.589327,7.540057,530.0,15.0,225.0,210.0,1
3,F033-0492.wav,25,199.4,0.323917,-1.207102,180.0,0.0,188.384288,0.0,575.0,0.0,370.0,370.0,0
4,M029-0430.wav,24,196.577381,0.800063,-0.598391,180.0,145.0,56.700662,140.0,320.0,145.0,238.75,93.75,1


## Split the dataset into training and test data
Let's use 20% of the database for testing.

We also need to make sure that the classes (genders) are equally represented in both the training and the test data, and that the two sets contain different speakers.

In [3]:
# Hold out whole speakers for testing so that no speaker appears in both
# partitions: female speakers F041-F050 and male speakers M040-M049.
test_speaker_ids = (
    [f'F{speaker:03d}' for speaker in range(41, 51)]
    + [f'M{speaker:03d}' for speaker in range(40, 50)]
)

# `Series.str.match` anchors the pattern at the start of each string, so a
# single alternation of the speaker ids is a "file name starts with one of
# these ids" test and replaces twenty OR-ed `str.match` calls.
is_test_speaker = df['FileName'].str.match('|'.join(test_speaker_ids))
mydata_test = df[is_test_speaker]

# Anti-join: keep only the rows whose FileName does not occur in the test set.
# The `_merge` indicator column produced by the merge stays on the result.
mydata_train = df.merge(mydata_test[['FileName']], on=['FileName'], how='left', indicator=True)
mydata_train = mydata_train[mydata_train['_merge'] == 'left_only']

# The counts below rely on Gender being coded 0 (feminine) / 1 (masculine),
# so the column sum is the number of masculine recordings.
for split_name, split in (('training', mydata_train), ('test', mydata_test)):
    n_masculine = split['Gender'].sum()
    print(f'Feminine voices in the {split_name} data: {len(split) - n_masculine}')
    print(f'Masculine voices in the {split_name} data: {n_masculine}')


Feminine voices in the training data: 40997
Masculine voices in the training data: 40000
Feminine voices in the test data: 10000
Masculine voices in the test data: 10000


In [4]:
# Preview the training partition; it carries the extra `_merge` indicator column added by the merge in the split cell.
mydata_train.head()

Unnamed: 0,FileName,nobs,mean,skew,kurtosis,median,mode,std,low,peak,q25,q75,iqr,Gender,_merge
0,F003-0616.wav,24,179.803922,-0.453233,-1.54009,205.0,115.0,48.533917,110.294118,240.0,115.0,220.0,105.0,0,left_only
1,F000-0823.wav,19,341.034577,-0.369143,0.038573,385.0,450.0,164.184087,30.0,695.0,252.5,447.5,195.0,0,left_only
2,M009-0399.wav,29,164.397933,0.87021,-0.357093,120.0,15.0,155.589327,7.540057,530.0,15.0,225.0,210.0,1,left_only
3,F033-0492.wav,25,199.4,0.323917,-1.207102,180.0,0.0,188.384288,0.0,575.0,0.0,370.0,370.0,0,left_only
4,M029-0430.wav,24,196.577381,0.800063,-0.598391,180.0,145.0,56.700662,140.0,320.0,145.0,238.75,93.75,1,left_only


In [5]:
# Columns used as model inputs; the same list is applied to both partitions.
feature_columns = ['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode',
                   'std', 'low', 'peak', 'q25', 'q75', 'iqr']

# Independent copies of the feature columns for each partition.
data_x_train = mydata_train.loc[:, feature_columns].copy()
data_x_test = mydata_test.loc[:, feature_columns].copy()

# Labels as flat 1-D arrays, the shape the classifiers' fit/score calls expect.
y_train = mydata_train['Gender'].copy().to_numpy()
y_test = mydata_test['Gender'].copy().to_numpy()


In [6]:
# Fit the standardisation on the training features only, then apply the same
# fitted transform to the test features.
scaler = StandardScaler()
scaler.fit(data_x_train)

# `transform` returns a plain array; wrap it back into a DataFrame so the
# feature names are kept (the result gets a fresh default index).
X_train = pd.DataFrame(scaler.transform(data_x_train), columns=data_x_train.columns)
X_test = pd.DataFrame(scaler.transform(data_x_test), columns=data_x_test.columns)

# Persist the fitted scaler. The context manager guarantees the file handle is
# flushed and closed, which the bare `open(...)` argument never did.
with open('models/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [7]:
# Classifiers compared in this notebook. They keep their original notebook-level
# names because later cells refer to them (e.g. `gbrt`).
tree = DecisionTreeClassifier(random_state=0)
forest = RandomForestClassifier(n_estimators=5, random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
svm = SVC()
mlp = MLPClassifier(random_state=0)

# Every model goes through the same steps: fit on the training data, pickle the
# fitted estimator, report train/test accuracy. One loop replaces five pasted
# copies of that stanza.
for title, model, filename in (
    ("Decision Tree", tree, 'models/CETUC_DecisionTree.sav'),
    ("Random Forests", forest, 'models/CETUC_RandomForest.sav'),
    ("Gradient Boosting", gbrt, 'models/CETUC_GradientBoosting.sav'),
    ("Support Vector Machine", svm, 'models/CETUC_SVM.sav'),
    ("Multilayer Perceptron", mlp, 'models/CETUC_MLP.sav'),
):
    model.fit(X_train, y_train)

    # Close the file deterministically instead of leaking the handle.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    print("\n" + title)
    print("Accuracy on training set: {:.3f}".format(model.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(model.score(X_test, y_test)))


Decision Tree
Accuracy on training set: 1.000
Accuracy on test set: 0.757

Random Forests
Accuracy on training set: 0.986
Accuracy on test set: 0.785

Gradient Boosting
Accuracy on training set: 0.865
Accuracy on test set: 0.824


In [None]:
def plot_feature_importances_mydata(model, c, feature_names=None):
    """Show a bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_`` (one value per
        feature). Its class name is used as the plot title.
    c : colour specification
        Passed through as the ``color`` of the bars.
    feature_names : sequence of str, optional
        Tick labels, one per feature. When omitted, the columns of the
        notebook-level ``X_train`` are used, as before.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if feature_names is None:
        feature_names = X_train.columns
    feature_names = list(feature_names)
    positions = np.arange(len(feature_names))

    # A fresh figure per call: reusing the fixed figure number 1 could draw on
    # top of an earlier plot and silently ignore the requested figsize.
    fig, ax = plt.subplots(figsize=(18, 10))
    ax.bar(positions, model.feature_importances_, align='center', color=c)
    ax.set_xticks(positions)
    ax.set_xticklabels(feature_names)
    ax.set_ylabel("Variable importance")
    ax.set_xlabel("Independent Variable")
    ax.set_title(model.__class__.__name__)
    plt.show()

In [None]:
# Feature importances of the gradient-boosting model, drawn in blue.
plot_feature_importances_mydata(model=gbrt, c='blue')

In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable,
# in line with the random_state=0 used for the scikit-learn models.
tf.random.set_seed(0)

# Derive the layer sizes from the data instead of hard-coding them.
# Labels are integer class ids starting at 0, so the largest id + 1 is the
# number of output units sparse_categorical_crossentropy needs (2 here; the
# previous 10-unit softmax carried eight outputs that no label ever uses).
n_features = X_train.shape[1]
n_classes = int(np.max(y_train)) + 1

NeuralNetwork = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(n_classes, activation='softmax')
])

NeuralNetwork.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# verbose=2 prints one line per epoch; the previous value 4 is not one of the
# documented settings ('auto', 0, 1, 2).
# NOTE(review): per the Keras docs, validation_split takes its slice from the
# end of the training arrays, so unlike the test set the validation data is
# not separated by speaker - confirm this is acceptable.
history = NeuralNetwork.fit(X_train, y_train, validation_split=0.2, epochs=100, verbose=2)

test_loss, test_acc = NeuralNetwork.evaluate(X_test,  y_test, verbose=0)


print('\nTest accuracy:', test_acc)

In [None]:
# Training vs. validation accuracy per epoch, drawn in legend order.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])

plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve is `val_loss` (the validation split), not the test set, so
# it is labelled 'val' - the same label the accuracy plot uses.
plt.legend(['train', 'val'], loc='upper left')
plt.show()