# Project - Gender recognition by voice
## EPFL - Statistical learning (MATH-412) 
## Adrien Besson

### 2. An intuitive approach

In [45]:
# Import libraries
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn.discriminant_analysis as lda
import sklearn.linear_model as lm
import sklearn.model_selection as model_selection
import sklearn.preprocessing as preproc
from sklearn import svm, tree, ensemble, neural_network, mixture

In [23]:
# Load the data
input_file = os.path.join(os.getcwd(), 'data', 'voice.csv')
data = pd.read_csv(input_file)
data['label'] = data['label'].astype('category')

In [24]:
# Drop collinear columns
cols_to_drop = ['IQR', 'dfrange', 'centroid']
data = data.drop(cols_to_drop,axis=1)


In [25]:
# Assign quantitative values to the labels and drop them from the data
encoder = preproc.LabelEncoder()
labels = data['label'].values
labels = encoder.fit_transform(labels)
data = data.drop(['label'],axis=1)

## Classification based on the mean frequency / mean fundamental frequency

Here, our intuitive idea is to consider the mean fequency / mean fundamental frequency as a good classifier between male and female. Indeed, it is clear that male's voice have lower frequencies than female's one. Let's try this idea!

In [57]:
# Split the dataset
features = data['meanfun'].values.reshape(-1,1)
features_train, features_test, labels_train, labels_test = model_selection.train_test_split(features, labels, train_size=0.8, test_size=0.2, random_state=10)

## LDA/QDA 

In [58]:
lda_class = lda.LinearDiscriminantAnalysis()
lda_class.fit(X=features_train, y=labels_train)
class_score_lda = lda_class.score(X=features_test, y=labels_test)
print('Classification loss LDA: {0} %'.format((1-class_score_lda)*100))

Classification loss LDA: 4.889589905362779 %


In [59]:
qda_class = lda.QuadraticDiscriminantAnalysis()
qda_class.fit(X=features_train, y=labels_train)
class_score_qda = qda_class.score(X=features_test, y=labels_test)
print('Classification loss QDA: {0} %'.format((1-class_score_qda)*100))

Classification loss QDA: 4.889589905362779 %


## Logistic regression - Ridge

In [60]:
# Logistic regression with L2 regularization
list_C = np.logspace(10^-3, 10^3, 200)
logistic_reg = lm.LogisticRegression(penalty='l2', solver='liblinear', random_state=10)
max_score = 0
for C in list_C:
    logistic_reg.set_params(C=C)
    logistic_reg.fit(features_train, labels_train)
    score = logistic_reg.score(features_test, labels_test)
    if score > max_score:
        best_C = C
        max_score = score

logistic_reg.set_params(C=best_C)
logistic_reg.fit(X=features_train, y=labels_train)
class_score_reg_l2 = logistic_reg.score(X=features_test, y=labels_test)
print('Classification loss Logistic - Ridge: {0}'.format((1-class_score_reg_l2)*100))

Classification loss Logistic - Ridge: 4.731861198738175


## Logistic regression - LASSO

In [61]:
list_C = np.logspace(10^-3, 10^3, 200)
logistic_reg_l1 = lm.LogisticRegression(penalty='l1', solver='liblinear', random_state=10)
max_score = 0
best_C = list_C[0]
for C in list_C:
    logistic_reg_l1.set_params(C=C)
    logistic_reg_l1.fit(features_train, labels_train)
    score = logistic_reg_l1.score(features_test, labels_test)
    if score > max_score:
        best_C = C
        max_score = score
        
logistic_reg_l1.set_params(C=best_C)
logistic_reg_l1.fit(X=features_train, y=labels_train)
class_score_reg_l1 = logistic_reg_l1.score(X=features_test, y=labels_test)
print('Classification loss Logistic - LASSO: {0}'.format((1-class_score_reg_l1)*100))

Classification loss Logistic - LASSO: 4.889589905362779


## Linear SVM 

In [62]:
# SVM classification - Linear kernel
list_C = np.logspace(10^-3, 10^3, 100)
class_svm = svm.LinearSVC(random_state=10)
max_score = 0
best_C = list_C[0]
for C in list_C:
    class_svm.set_params(C=C)
    class_svm.fit(features_train, labels_train)
    score = class_svm.score(features_test, labels_test)
    if score > max_score:
        best_C = C
        max_score = score
class_svm.set_params(C=best_C)
class_svm.fit(X=features_train, y=labels_train)
score_svm = class_svm.score(X=features_test, y=labels_test)
print('Classification loss linear SVM - L2: {0} %'.format((1-score_svm)*100))

Classification loss linear SVM - L2: 4.731861198738175 %


## Kernel SVM - RBF

In [63]:
# SVM classification - Linear kernel
list_C = np.logspace(10^-3, 10^3, 100)
class_svm = svm.SVC(kernel='rbf',random_state=10)
max_score = 0
best_C = list_C[0]
for C in list_C:
    class_svm.set_params(C=C)
    class_svm.fit(features_train, labels_train)
    score = class_svm.score(features_test, labels_test)
    if score > max_score:
        best_C = C
        max_score = score
class_svm.set_params(C=best_C)
class_svm.fit(X=features_train, y=labels_train)
score_svm = class_svm.score(X=features_test, y=labels_test)
print('Classification loss kernel SVM - L2: {0} %'.format((1-score_svm)*100))

Classification loss kernel SVM - L2: 4.731861198738175 %


## Decision Tree

In [64]:
dec_tree = tree.DecisionTreeClassifier(criterion='gini', random_state=10)
dec_tree.fit(X=features_train, y=labels_train)
score_tree = dec_tree.score(X=features_test, y=labels_test)
print('Classification loss - Decision tree: {0} %'.format((1-score_tree)*100))

Classification loss - Decision tree: 5.678233438485803 %


## Random forest

In [65]:
random_forest = ensemble.RandomForestClassifier(criterion='gini', random_state=10)
random_forest.fit(X=features_train, y=labels_train)
score_rf = random_forest.score(X=features_test, y=labels_test)
print('Classification loss - Random forest: {0} %'.format((1-score_rf)*100))

Classification loss - Random forest: 5.835962145110408 %


## AdaBoost 

In [66]:
ada_boost = ensemble.AdaBoostClassifier(random_state=10)
ada_boost.fit(X=features_train, y=labels_train)
score_ab = ada_boost.score(X=features_test, y=labels_test)
print('Classification loss - AdaBoost: {0} %'.format((1-score_ab)*100))

Classification loss - AdaBoost: 5.047318611987384 %


## Gradient Boosting

In [67]:
g_boost = ensemble.GradientBoostingClassifier(random_state=10)
g_boost.fit(X=features_train, y=labels_train)
score_gb = g_boost.score(X=features_test, y=labels_test)
print('Classification loss - Gradient Boosting: {0} %'.format((1-score_gb)*100))

Classification loss - Gradient Boosting: 5.3627760252365935 %


## Bagging

In [68]:
bagging = ensemble.BaggingClassifier(random_state=10)
bagging.fit(X=features_train, y=labels_train)
score_bag = bagging.score(X=features_test, y=labels_test)
print('Classification loss - Bagging: {0} %'.format((1-score_bag)*100))

Classification loss - Bagging: 5.835962145110408 %


## Conclusion
1. The mean frequency leads to a relatively low classification loss, around 35%. It is a good feature for classification. This makes sense due to the difference of the voice frequencies between men and women;
1. The mean fundamental frequency leads to a very low classification loss, around 5%. The underlying physical reason may be that the fundamental frequency of the male's voice cannot be reached by female voice. This is confirmed by publications studies (https://hal.archives-ouvertes.fr/halshs-00999332/document)
1. The classification loss is similar for all the classifier. It is not obvious to identify a best classifier.