In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import os
import random

# Generate Models

## Download the dataset

In [None]:
features = 'Features_F0'
project_root =  os.path.dirname(os.path.dirname(os.getcwd()))
df = pd.read_csv(f'{project_root}/data/CETUC/{features}_data.csv')
df.head()

## Split the dataset into training and test data
Let's use 20% of the database for testing.

We also need to make sure the classes(Genders) are equally distributed between the classes and separate diferent speakers.

In [None]:
mydata_test = df[df['FileName'].str.match('F050') | df['FileName'].str.match('F049') | df['FileName'].str.match('F048') | df['FileName'].str.match('F047') | df['FileName'].str.match('F046') | 
                df['FileName'].str.match('F045') | df['FileName'].str.match('F044') | df['FileName'].str.match('F043') | df['FileName'].str.match('F042') | df['FileName'].str.match('F041') | 
                df['FileName'].str.match('M049') | df['FileName'].str.match('M048') | df['FileName'].str.match('M047') | df['FileName'].str.match('M046') | df['FileName'].str.match('M045') | 
                df['FileName'].str.match('M044') | df['FileName'].str.match('M043') | df['FileName'].str.match('M042') | df['FileName'].str.match('M041') | df['FileName'].str.match('M040')] 

mydata_train = df.merge(mydata_test[['FileName']], on=['FileName'], how='left', indicator=True)
mydata_train = mydata_train[mydata_train['_merge'] == 'left_only']
mydata_train = mydata_train[~mydata_train['FileName'].str.match('F040')]


print(f'Feminine voices in the training data: {len(mydata_train.Gender)- sum(mydata_train.Gender)}')
print(f'Masculine voices in the training data: {sum(mydata_train.Gender)}')
print(f'Feminine voices in the test data: {len(mydata_test.Gender)- sum(mydata_test.Gender)}')
print(f'Masculine voices in the test data: {sum(mydata_test.Gender)}')


In [None]:
mydata_train.head()

In [None]:
if features == 'MFCCs':
    data_x_train = mydata_train[['MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_train = mydata_train[['Gender']].copy().values.ravel()
    data_x_test = mydata_test[['MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_test = mydata_test[['Gender']].copy().values.ravel()

if features == 'Features':
    data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr']].copy()
    y_train = mydata_train[['Gender']].copy().values.ravel()
    data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr']].copy()
    y_test = mydata_test[['Gender']].copy().values.ravel()

if features == 'Features_MFCCs':
    data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr', 
                        'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_train = mydata_train[['Gender']].copy().values.ravel()
    data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr', 
                        'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_test = mydata_test[['Gender']].copy().values.ravel()

if features == 'F0':
    data_x_train = mydata_train[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
    y_train = mydata_train[['Gender']].copy().values.ravel()
    data_x_test = mydata_test[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
    y_test = mydata_test[['Gender']].copy().values.ravel()

if features == 'F0_MFCCs':
    data_x_train = mydata_train[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                        'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_train = mydata_train[['Gender']].copy().values.ravel()
    data_x_test = mydata_test[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                        'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_test = mydata_test[['Gender']].copy().values.ravel()

if features == 'Features_F0':
    data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
        'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
    y_train = mydata_train[['Gender']].copy().values.ravel()
    data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
        'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
    y_test = mydata_test[['Gender']].copy().values.ravel()

if features == 'Features_F0_MFCCs':
    data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
        'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                        'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_train = mydata_train[['Gender']].copy().values.ravel()
    data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
        'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                        'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                        'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
    y_test = mydata_test[['Gender']].copy().values.ravel()


In [None]:
scaler = StandardScaler()
scaler.fit(data_x_train)
X_train = pd.DataFrame(scaler.transform(data_x_train), columns=data_x_train.columns)
X_test = pd.DataFrame(scaler.transform(data_x_test), columns=data_x_test.columns)
pickle.dump(scaler, open(f'{project_root}/models/CETUC/{features}/scaler.pkl', 'wb'))

In [None]:
#Train decision tree model
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
filename = f'{project_root}/models/CETUC/{features}/DecisionTree.sav'
pickle.dump(tree, open(filename, 'wb'))
print("\n--------------------------Decision Tree--------------------------")
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
cm = confusion_matrix(y_train, tree.predict(X_train), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on training set:  {precision}")
print(f"Recall on training set: {recall}")
print(f"F1-score on training set: {2 * (precision * recall) / (precision + recall)}")
    #Test
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
cm = confusion_matrix(y_test, tree.predict(X_test), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on test set:  {precision}")
print(f"Recall on test set: {recall}")
print(f"F1-score on test set: {2 * (precision * recall) / (precision + recall)}")

#Train random forest model
    #Training
forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
filename = f'{project_root}/models/CETUC/{features}/RandomForest.sav'
pickle.dump(forest, open(filename, 'wb'))
print("\n--------------------------Random Forests--------------------------")
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
cm = confusion_matrix(y_train, forest.predict(X_train), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on training set:  {precision}")
print(f"Recall on training set: {recall}")
print(f"F1-score on training set: {2 * (precision * recall) / (precision + recall)}")
    #Test
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
cm = confusion_matrix(y_test, forest.predict(X_test), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on test set:  {precision}")
print(f"Recall on test set: {recall}")
print(f"F1-score on test set: {2 * (precision * recall) / (precision + recall)}")

#Train gradient boosting model
gbrt = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
filename = f'{project_root}/models/CETUC/{features}/GradientBoosting.sav'
pickle.dump(gbrt, open(filename, 'wb'))
print("\n--------------------------Gradient Boosting--------------------------")
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
cm = confusion_matrix(y_train, gbrt.predict(X_train), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on training set:  {precision}")
print(f"Recall on training set: {recall}")
print(f"F1-score on training set: {2 * (precision * recall) / (precision + recall)}")
    #Test
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
cm = confusion_matrix(y_test, gbrt.predict(X_test), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on test set:  {precision}")
print(f"Recall on test set: {recall}")
print(f"F1-score on test set: {2 * (precision * recall) / (precision + recall)}")

In [None]:
#Train logistic regression model
lgr = LogisticRegression(random_state=0).fit(X_train, y_train)
filename = f'{project_root}/models/CETUC/{features}/LogisticRegression.sav'
pickle.dump(lgr, open(filename, 'wb'))
print("\n--------------------------LogisticRegression--------------------------")
print("Accuracy on training set: {:.3f}".format(lgr.score(X_train, y_train)))
cm = confusion_matrix(y_train, lgr.predict(X_train), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on training set:  {precision}")
print(f"Recall on training set: {recall}")
print(f"F1-score on training set: {2 * (precision * recall) / (precision + recall)}")
    #Test
print("Accuracy on test set: {:.3f}".format(lgr.score(X_test, y_test)))
cm = confusion_matrix(y_test, lgr.predict(X_test), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on test set:  {precision}")
print(f"Recall on test set: {recall}")
print(f"F1-score on test set: {2 * (precision * recall) / (precision + recall)}")

#Train support vector machine model
svm = SVC().fit(X_train, y_train)
filename = f'{project_root}/models/CETUC/{features}/SVM.sav'
pickle.dump(svm, open(filename, 'wb'))
print("\n--------------------------Support Vector Machine--------------------------")
print("Accuracy on training set: {:.3f}".format(svm.score(X_train, y_train)))
cm = confusion_matrix(y_train, svm.predict(X_train), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on training set:  {precision}")
print(f"Recall on training set: {recall}")
print(f"F1-score on training set: {2 * (precision * recall) / (precision + recall)}")
    #Test
print("Accuracy on test set: {:.3f}".format(svm.score(X_test, y_test)))
cm = confusion_matrix(y_test, svm.predict(X_test), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on test set:  {precision}")
print(f"Recall on test set: {recall}")
print(f"F1-score on test set: {2 * (precision * recall) / (precision + recall)}")

#Train neural network model
mlp = MLPClassifier(random_state=0).fit(X_train, y_train)
filename = f'{project_root}/models/CETUC/{features}/MLP.sav'
pickle.dump(mlp, open(filename, 'wb'))
print("\n--------------------------Multilayer Perceptron--------------------------")
print("Accuracy on training set: {:.3f}".format(mlp.score(X_train, y_train)))
cm = confusion_matrix(y_train, mlp.predict(X_train), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on training set:  {precision}")
print(f"Recall on training set: {recall}")
print(f"F1-score on training set: {2 * (precision * recall) / (precision + recall)}")
    #Test
print("Accuracy on test set: {:.3f}".format(mlp.score(X_test, y_test)))
cm = confusion_matrix(y_test, mlp.predict(X_test), labels=[1, 0])
precision = cm[0][0]/(cm[0][0]+cm[1][0])
recall = cm[0][0]/(cm[0][0]+cm[0][1])
# print(f"Confusion Matrix:\n {cm}")
print(f"Precision on test set:  {precision}")
print(f"Recall on test set: {recall}")
print(f"F1-score on test set: {2 * (precision * recall) / (precision + recall)}")

# Cross Validation

## Download Dataset

In [9]:
features = 'Features_F0_MFCCs'
project_root =  os.path.dirname(os.path.dirname(os.getcwd()))
df = pd.read_csv(f'{project_root}/data/CETUC/{features}_data.csv')
df.head()

Unnamed: 0,FileName,nobs,mean,skew,kurtosis,median,mode,std,low,peak,...,skew_pitch,kurtosis_pitch,median_pitch,mode_pitch,std_pitch,low_pitch,peak_pitch,q25_pitch,q75_pitch,iqr_pitch
0,F003-0616.wav,24,179.803922,-0.453233,-1.54009,205.0,115.0,48.533917,110.294118,240.0,...,-0.69946,-1.342911,201.211632,0.0,97.416064,0.0,265.632548,0.0,218.797809,218.797809
1,F000-0823.wav,19,341.034577,-0.369143,0.038573,385.0,450.0,164.184087,30.0,695.0,...,-0.253664,-1.711468,169.02675,0.0,101.613362,0.0,276.846249,0.0,215.105068,215.105068
2,M009-0399.wav,29,164.397933,0.87021,-0.357093,120.0,15.0,155.589327,7.540057,530.0,...,3.03958,14.240777,91.160534,0.0,90.858767,0.0,598.610675,0.0,113.379987,113.379987
3,F033-0492.wav,25,199.4,0.323917,-1.207102,180.0,0.0,188.384288,0.0,575.0,...,-0.044067,-1.851095,150.57059,0.0,94.443665,0.0,272.803872,0.0,183.76448,183.76448
4,M029-0430.wav,24,196.577381,0.800063,-0.598391,180.0,145.0,56.700662,140.0,320.0,...,-0.507444,-1.35232,117.890983,0.0,64.925799,0.0,205.084,0.0,142.727442,142.727442


## Setup final dataframe

In [10]:
def feture_selector(features, mydata_test, mydata_train):
    if features == 'MFCCs':
        data_x_train = mydata_train[['MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_train = mydata_train[['Gender']].copy().values.ravel()
        data_x_test = mydata_test[['MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_test = mydata_test[['Gender']].copy().values.ravel()

    if features == 'Features':
        data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr']].copy()
        y_train = mydata_train[['Gender']].copy().values.ravel()
        data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr']].copy()
        y_test = mydata_test[['Gender']].copy().values.ravel()

    if features == 'Features_MFCCs':
        data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr', 
                            'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_train = mydata_train[['Gender']].copy().values.ravel()
        data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr', 
                            'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_test = mydata_test[['Gender']].copy().values.ravel()

    if features == 'F0':
        data_x_train = mydata_train[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
        y_train = mydata_train[['Gender']].copy().values.ravel()
        data_x_test = mydata_test[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
        y_test = mydata_test[['Gender']].copy().values.ravel()

    if features == 'F0_MFCCs':
        data_x_train = mydata_train[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                            'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_train = mydata_train[['Gender']].copy().values.ravel()
        data_x_test = mydata_test[['nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                            'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_test = mydata_test[['Gender']].copy().values.ravel()

    if features == 'Features_F0':
        data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
            'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
        y_train = mydata_train[['Gender']].copy().values.ravel()
        data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
            'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch']].copy()
        y_test = mydata_test[['Gender']].copy().values.ravel()

    if features == 'Features_F0_MFCCs':
        data_x_train = mydata_train[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
            'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                            'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_train = mydata_train[['Gender']].copy().values.ravel()
        data_x_test = mydata_test[['nobs', 'mean', 'skew', 'kurtosis', 'median', 'mode', 'std', 'low', 'peak', 'q25', 'q75', 'iqr',
            'nobs_pitch', 'mean_pitch', 'skew_pitch', 'kurtosis_pitch', 'median_pitch', 'mode_pitch', 'std_pitch', 'low_pitch', 'peak_pitch', 'q25_pitch', 'q75_pitch', 'iqr_pitch', 
                            'MFCC_1', 'MFCC_2', 'MFCC_3', 'MFCC_4', 'MFCC_5', 'MFCC_6', 'MFCC_7', 'MFCC_8', 'MFCC_9', 'MFCC_10',
                            'MFCC_11', 'MFCC_12', 'MFCC_13', 'MFCC_14', 'MFCC_15', 'MFCC_16', 'MFCC_17', 'MFCC_18', 'MFCC_19', 'MFCC_20']].copy()
        y_test = mydata_test[['Gender']].copy().values.ravel()
    return data_x_train, y_train, data_x_test, y_test


## Generate scaler

In [11]:
scaler = pickle.load(open(os.path.join(project_root, 'models', 'CETUC', features, 'scaler.pkl'), 'rb'))


## CrossValidation

In [12]:
tree_test_score = []
tree_test_recall = []
tree_test_precision = []
tree_test_f1 = []
tree_train_score = []
tree_train_recall = []
tree_train_precision = []
tree_train_f1 = []

forest_test_score = []
forest_test_recall = []
forest_test_precision = []
forest_test_f1 = []
forest_train_score = []
forest_train_recall = []
forest_train_precision = []
forest_train_f1 = []

gbrt_test_score = []
gbrt_test_recall = []
gbrt_test_precision = []
gbrt_test_f1 = []
gbrt_train_score = []
gbrt_train_recall = []
gbrt_train_precision = []
gbrt_train_f1 = []

lgr_test_score = []
lgr_test_recall = []
lgr_test_precision = []
lgr_test_f1 = []
lgr_train_score = []
lgr_train_recall = []
lgr_train_precision = []
lgr_train_f1 = []

svm_test_score = []
svm_test_recall = []
svm_test_precision = []
svm_test_f1 = []
svm_train_score = []
svm_train_recall = []
svm_train_precision = []
svm_train_f1 = []

mlp_test_score = []
mlp_test_recall = []
mlp_test_precision = []
mlp_test_f1 = []
mlp_train_score = []
mlp_train_recall = []
mlp_train_precision = []
mlp_train_f1 = []

In [13]:
n_crossvalidations = 5
def process_numbers(list):
    processed_list = []
    for element in list:
        if element < 10:
            element_str = f'00{str(element)}'
        else:
            element_str = f'0{str(element)}'
        processed_list.append(element_str)
    return processed_list

In [14]:
for i in range(n_crossvalidations):

    rlm = random.sample(range(49), 10)
    rlf = random.sample(range(51), 10)
    while 27 in rlf:
        rlf = random.sample(range(51), 10)

    rlm = process_numbers(rlm)
    rlf = process_numbers(rlf)

    mydata_test = df[df['FileName'].str.match(f'F{rlf[0]}') | df['FileName'].str.match(f'F{rlf[1]}') | df['FileName'].str.match(f'F{rlf[2]}') | df['FileName'].str.match(f'F{rlf[3]}') | df['FileName'].str.match(f'F{rlf[4]}') | 
                    df['FileName'].str.match(f'F{rlf[5]}') | df['FileName'].str.match(f'F{rlf[6]}') | df['FileName'].str.match(f'F{rlf[7]}') | df['FileName'].str.match(f'F{rlf[8]}') | df['FileName'].str.match(f'F{rlf[9]}') | 
                    df['FileName'].str.match(f'M{rlm[0]}') | df['FileName'].str.match(f'M{rlm[1]}') | df['FileName'].str.match(f'M{rlm[2]}') | df['FileName'].str.match(f'M{rlm[3]}') | df['FileName'].str.match(f'M{rlm[4]}') | 
                    df['FileName'].str.match(f'M{rlm[5]}') | df['FileName'].str.match(f'M{rlm[6]}') | df['FileName'].str.match(f'M{rlm[7]}') | df['FileName'].str.match(f'M{rlm[8]}') | df['FileName'].str.match(f'M{rlm[9]}')] 

    mydata_train = df.merge(mydata_test[['FileName']], on=['FileName'], how='left', indicator=True)
    mydata_train = mydata_train[mydata_train['_merge'] == 'left_only']
    # mydata_train = mydata_train[~mydata_train['FileName'].str.match('F040')]

    data_x_train, y_train, data_x_test, y_test = feture_selector(features, mydata_test, mydata_train)
    X_train = pd.DataFrame(scaler.transform(data_x_train), columns=data_x_train.columns)
    X_test = pd.DataFrame(scaler.transform(data_x_test), columns=data_x_test.columns)

    #Train decision tree model
    tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
    score = tree.score(X_train, y_train)
    cm = confusion_matrix(y_train, tree.predict(X_train), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    tree_train_score.append(score)
    tree_train_recall.append(recall)
    tree_train_precision.append(precision)
    tree_train_f1.append(2 * (precision * recall) / (precision + recall))
        #Test
    score = tree.score(X_test, y_test)
    cm = confusion_matrix(y_test, tree.predict(X_test), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    tree_test_score.append(score)
    tree_test_recall.append(recall)
    tree_test_precision.append(precision)
    tree_test_f1.append(2 * (precision * recall) / (precision + recall))

    #Train random forest model
        #Training
    forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X_train, y_train)
    score = forest.score(X_train, y_train)
    cm = confusion_matrix(y_train, forest.predict(X_train), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    forest_train_score.append(score)
    forest_train_recall.append(recall)
    forest_train_precision.append(precision)
    forest_train_f1.append(2 * (precision * recall) / (precision + recall))
        #Test
    score = forest.score(X_test, y_test)
    cm = confusion_matrix(y_test, forest.predict(X_test), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    forest_test_score.append(score)
    forest_test_recall.append(recall)
    forest_test_precision.append(precision)
    forest_test_f1.append(2 * (precision * recall) / (precision + recall))

    #Train gradient boosting model
    gbrt = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
    score = gbrt.score(X_train, y_train)
    cm = confusion_matrix(y_train, gbrt.predict(X_train), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    gbrt_train_score.append(score)
    gbrt_train_recall.append(recall)
    gbrt_train_precision.append(precision)
    gbrt_train_f1.append(2 * (precision * recall) / (precision + recall))
        #Test
    score = gbrt.score(X_test, y_test)
    cm = confusion_matrix(y_test, gbrt.predict(X_test), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    gbrt_test_score.append(score)
    gbrt_test_recall.append(recall)
    gbrt_test_precision.append(precision)
    gbrt_test_f1.append(2 * (precision * recall) / (precision + recall))

    # ------------------------------------------------------------------------------

    #Train logistic regrassion model
    lgr = LogisticRegression(random_state=0).fit(X_train, y_train)
    score = lgr.score(X_train, y_train)
    cm = confusion_matrix(y_train, lgr.predict(X_train), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    lgr_train_score.append(score)
    lgr_train_recall.append(recall)
    lgr_train_precision.append(precision)
    lgr_train_f1.append(2 * (precision * recall) / (precision + recall))
        #Test
    score = lgr.score(X_test, y_test)
    cm = confusion_matrix(y_test, lgr.predict(X_test), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    lgr_test_score.append(score)
    lgr_test_recall.append(recall)
    lgr_test_precision.append(precision)
    lgr_test_f1.append(2 * (precision * recall) / (precision + recall))

    #Train support vector machine model
        #Training
    svm = SVC().fit(X_train, y_train)
    score = svm.score(X_train, y_train)
    cm = confusion_matrix(y_train, svm.predict(X_train), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    svm_train_score.append(score)
    svm_train_recall.append(recall)
    svm_train_precision.append(precision)
    svm_train_f1.append(2 * (precision * recall) / (precision + recall))
        #Test
    score = svm.score(X_test, y_test)
    cm = confusion_matrix(y_test, svm.predict(X_test), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    svm_test_score.append(score)
    svm_test_recall.append(recall)
    svm_test_precision.append(precision)
    svm_test_f1.append(2 * (precision * recall) / (precision + recall))

    #Train Multilayer perceptron model
    mlp = MLPClassifier(random_state=0).fit(X_train, y_train)
    score = mlp.score(X_train, y_train)
    cm = confusion_matrix(y_train, mlp.predict(X_train), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    mlp_train_score.append(score)
    mlp_train_recall.append(recall)
    mlp_train_precision.append(precision)
    mlp_train_f1.append(2 * (precision * recall) / (precision + recall))
        #Test
    score = mlp.score(X_test, y_test)
    cm = confusion_matrix(y_test, mlp.predict(X_test), labels=[1, 0])
    precision = cm[0][0]/(cm[0][0]+cm[1][0])
    recall = cm[0][0]/(cm[0][0]+cm[0][1])
    mlp_test_score.append(score)
    mlp_test_recall.append(recall)
    mlp_test_precision.append(precision)
    mlp_test_f1.append(2 * (precision * recall) / (precision + recall))
    print(f"Iteration = {i}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Iteration = 0
Iteration = 1
Iteration = 2
Iteration = 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Iteration = 4


In [15]:
print("\nDecision Tree Test Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(tree_test_score)))
print("{:.3f}".format(np.mean(tree_test_recall)))
print("{:.3f}".format(np.mean(tree_test_precision)))
print("{:.3f}".format(np.mean(tree_test_f1)))
print("\nDecision Tree Train Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(tree_train_score)))
print("{:.3f}".format(np.mean(tree_train_recall)))
print("{:.3f}".format(np.mean(tree_train_precision)))
print("{:.3f}".format(np.mean(tree_train_f1)))

print("\nDecision Tree Test Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(forest_test_score)))
print("{:.3f}".format(np.mean(forest_test_recall)))
print("{:.3f}".format(np.mean(forest_test_precision)))
print("{:.3f}".format(np.mean(forest_test_f1)))
print("Random Forest Train Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(forest_train_score)))
print("{:.3f}".format(np.mean(forest_train_recall)))
print("{:.3f}".format(np.mean(forest_train_precision)))
print("{:.3f}".format(np.mean(forest_train_f1)))

print("\nGradient Boost Test Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(gbrt_test_score)))
print("{:.3f}".format(np.mean(gbrt_test_recall)))
print("{:.3f}".format(np.mean(gbrt_test_precision)))
print("{:.3f}".format(np.mean(gbrt_test_f1)))
print("Gradient Boost Train Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(gbrt_train_score)))
print("{:.3f}".format(np.mean(gbrt_train_recall)))
print("{:.3f}".format(np.mean(gbrt_train_precision)))
print("{:.3f}".format(np.mean(gbrt_train_f1)))

print("\nLogistic Regression Test Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(lgr_test_score)))
print("{:.3f}".format(np.mean(lgr_test_recall)))
print("{:.3f}".format(np.mean(lgr_test_precision)))
print("{:.3f}".format(np.mean(lgr_test_f1)))
print("Logistic Regression Train Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(lgr_train_score)))
print("{:.3f}".format(np.mean(lgr_train_recall)))
print("{:.3f}".format(np.mean(lgr_train_precision)))
print("{:.3f}".format(np.mean(lgr_train_f1)))

print("\nSVM Test Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(svm_test_score)))
print("{:.3f}".format(np.mean(svm_test_recall)))
print("{:.3f}".format(np.mean(svm_test_precision)))
print("{:.3f}".format(np.mean(svm_test_f1)))
print("SVM Train Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(svm_train_score)))
print("{:.3f}".format(np.mean(svm_train_recall)))
print("{:.3f}".format(np.mean(svm_train_precision)))
print("{:.3f}".format(np.mean(svm_train_f1)))

print("\nMLP Test Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(mlp_test_score)))
print("{:.3f}".format(np.mean(mlp_test_recall)))
print("{:.3f}".format(np.mean(mlp_test_precision)))
print("{:.3f}".format(np.mean(mlp_test_f1)))
print("MLP Train Accuracy, reacall, precision and F1")
print("{:.3f}".format(np.mean(mlp_train_score)))
print("{:.3f}".format(np.mean(mlp_train_recall)))
print("{:.3f}".format(np.mean(mlp_train_precision)))
print("{:.3f}".format(np.mean(mlp_train_f1)))


Decision Tree Test Accuracy, reacall, precision and F1
0.894
0.853
0.930
0.889

Decision Tree Train Accuracy, reacall, precision and F1
1.000
1.000
1.000
1.000

Decision Tree Test Accuracy, reacall, precision and F1
0.926
0.901
0.950
0.924
Random Forest Train Accuracy, reacall, precision and F1
0.998
0.998
0.998
0.998

Gradient Boost Test Accuracy, reacall, precision and F1
0.941
0.916
0.964
0.939
Gradient Boost Train Accuracy, reacall, precision and F1
0.984
0.983
0.983
0.983

Logistic Regression Test Accuracy, reacall, precision and F1
0.954
0.934
0.973
0.952
Logistic Regression Train Accuracy, reacall, precision and F1
0.971
0.968
0.973
0.970

SVM Test Accuracy, reacall, precision and F1
0.941
0.915
0.967
0.939
SVM Train Accuracy, reacall, precision and F1
0.994
0.994
0.994
0.994

MLP Test Accuracy, reacall, precision and F1
0.919
0.898
0.939
0.918
MLP Train Accuracy, reacall, precision and F1
1.000
1.000
1.000
1.000
