# Project - Gender recognition by voice
## EPFL - Statistical learning (MATH-412) 
## Adrien Besson

### 2. An intuitive approach

In [1]:
# Import libraries
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn.discriminant_analysis as lda
import sklearn.linear_model as lm
import sklearn.model_selection as model_selection
import sklearn.preprocessing as preproc
from sklearn import svm, tree, ensemble, neighbors
import os

In [2]:
# Read the voice dataset from <working directory>/data/voice.csv and store the
# target column 'label' with the pandas 'category' dtype
input_file = os.path.join(os.getcwd(), 'data', 'voice.csv')
data = pd.read_csv(input_file).astype({'label': 'category'})

In [3]:
# Remove the columns listed as collinear; `drop` returns a new frame that
# is rebound to `data`
cols_to_drop = ['IQR', 'dfrange', 'centroid']
data = data.drop(labels=cols_to_drop, axis='columns')

In [4]:
# Encode the class labels as integers, then remove the 'label' column from
# the frame so that `data` only holds features
encoder = preproc.LabelEncoder()
labels = encoder.fit_transform(data['label'].values)
data = data.drop(labels=['label'], axis='columns')

## Classification based on the fundamental frequency

Here, our intuitive idea is to use the mean fundamental frequency as the only feature to discriminate between male and female voices. Indeed, most state-of-the-art studies on automatic gender recognition report that the fundamental frequency is a highly discriminative feature. Let's try this idea!

In [5]:
# Build a single-column feature matrix from 'meanfun', then split it 80 % / 20 %
# into train and test sets with a fixed seed so the split is repeatable
features = data[['meanfun']].values
(features_train, features_test,
 labels_train, labels_test) = model_selection.train_test_split(
    features,
    labels,
    train_size=0.8,
    test_size=0.2,
    random_state=10,
)

## k-NN

In [6]:
# k-NN: cross-validated search over the number of neighbours (1 to 49),
# followed by an evaluation on the held-out test set
param_grid = [{'n_neighbors': np.arange(1, 50)}]
knn = neighbors.KNeighborsClassifier()
knn_grid = model_selection.GridSearchCV(estimator=knn, param_grid=param_grid)
knn_grid.fit(features_train, labels_train)
class_score_knn = knn_grid.score(features_test, labels_test)
print('Classification loss kNN: {0} %'.format((1 - class_score_knn) * 100))

Classification loss kNN: 5.205047318611989 %


## LDA/QDA 

In [7]:
# Linear discriminant analysis with default settings: fit on the training
# split and report the misclassification rate on the test split
lda_class = lda.LinearDiscriminantAnalysis()
lda_class.fit(features_train, labels_train)
class_score_lda = lda_class.score(features_test, labels_test)
print('Classification loss - LDA: {0} %'.format((1 - class_score_lda) * 100))

Classification loss - LDA: 4.889589905362779 %


In [8]:
# Quadratic discriminant analysis with default settings: fit on the training
# split and report the misclassification rate on the test split
qda_class = lda.QuadraticDiscriminantAnalysis()
qda_class.fit(features_train, labels_train)
class_score_qda = qda_class.score(features_test, labels_test)
print('Classification loss - QDA: {0} %'.format((1 - class_score_qda) * 100))

Classification loss - QDA: 4.889589905362779 %


## Logistic regression

In [9]:
# Logistic regression without effective regularization.
# C is the inverse regularization strength, so a very large value (1e9) makes
# the L2 penalty negligible. The model is fitted and scored exactly once:
# the earlier duplicate fit/score pair produced the same result and its
# `score` variable was never used.
logistic = lm.LogisticRegression(penalty='l2', C=1e9, solver='liblinear', random_state=10)
logistic.fit(X=features_train, y=labels_train)
class_score_l2 = logistic.score(X=features_test, y=labels_test)
print('Classification loss - Logistic : {0} %'.format((1-class_score_l2)*100))

Classification loss - Logistic : 5.3627760252365935 %


## Logistic regression - Ridge

In [10]:
# Logistic regression with L2 regularization, C selected by cross-validation.
# Candidate values of C: 200 points log-spaced between 1e-3 and 1e3.
# np.logspace takes the *exponents* as arguments. The previous `10^-3` / `10^3`
# used Python's bitwise XOR (evaluating to -9 and 9), and the resulting grid
# was never handed to the estimator, which silently fell back to its default Cs.
list_C = np.logspace(-3, 3, 200)
logistic_reg = lm.LogisticRegressionCV(Cs=list_C, penalty='l2', solver='liblinear', random_state=10)
logistic_reg.fit(X=features_train, y=labels_train)
class_score_reg_l2 = logistic_reg.score(X=features_test, y=labels_test)
print('Classification loss - Logistic Ridge: {0} %'.format((1-class_score_reg_l2)*100))

Classification loss - Logistic Ridge: 5.047318611987384 %


## Logistic regression - LASSO

In [11]:
# Logistic regression with an L1 penalty; the regularization strength is
# chosen by the estimator's built-in cross-validation (default grid)
logistic_reg_l1 = lm.LogisticRegressionCV(
    penalty='l1',
    solver='liblinear',
    random_state=10,
)
logistic_reg_l1.fit(features_train, labels_train)
class_score_reg_l1 = logistic_reg_l1.score(features_test, labels_test)
print('Classification loss - Logistic LASSO: {0}'.format((1 - class_score_reg_l1) * 100))

Classification loss - Logistic LASSO: 5.3627760252365935


## Linear SVM 

In [13]:
# SVM classification - Linear kernel
# Grid over C: 100 points log-spaced between 1e-3 and 1e3.
# np.logspace expects exponents; the previous `10^-3` / `10^3` were bitwise
# XORs (-9 and 9), which made the search range 1e-9 .. 1e9.
param_grid = [
  {'C': np.logspace(-3, 3, 100)}
]
class_svm = svm.LinearSVC(random_state=10)
class_svm_grid = model_selection.GridSearchCV(class_svm, param_grid)
class_svm_grid.fit(X=features_train, y=labels_train)
score_svm = class_svm_grid.score(X=features_test, y=labels_test)
print('Classification loss - linear SVM: {0} %'.format((1-score_svm)*100))

Classification loss - linear SVM: 5.047318611987384 %


## Kernel SVM - RBF

In [14]:
# SVM classification - RBF kernel
# Grid over C: 100 points log-spaced between 1e-3 and 1e3.
# np.logspace expects exponents; the previous `10^-3` / `10^3` were bitwise
# XORs (-9 and 9), which made the search range 1e-9 .. 1e9.
# Note: this cell rebinds `class_svm`, `class_svm_grid` and `score_svm`,
# replacing the linear-SVM objects created in the previous cell.
param_grid = [
  {'C': np.logspace(-3, 3, 100)}
]
class_svm = svm.SVC(kernel='rbf',random_state=10)
class_svm_grid = model_selection.GridSearchCV(class_svm, param_grid)
class_svm_grid.fit(X=features_train, y=labels_train)
score_svm = class_svm_grid.score(X=features_test, y=labels_test)
print('Classification loss - kernel SVM - L2: {0} %'.format((1-score_svm)*100))

Classification loss - kernel SVM - L2: 4.889589905362779 %


## Decision Tree

In [19]:
# Single decision tree (entropy criterion): cross-validated search over the
# minimum number of samples per leaf, then evaluation on the test split
param_grid = [{'min_samples_leaf': [1, 5, 10, 50, 100, 200, 500]}]
dec_tree = tree.DecisionTreeClassifier(criterion='entropy', random_state=10)
dec_tree_grid = model_selection.GridSearchCV(estimator=dec_tree, param_grid=param_grid)
dec_tree_grid.fit(features_train, labels_train)
score_tree = dec_tree_grid.score(features_test, labels_test)
print('Classification loss - Decision tree: {0} %'.format((1 - score_tree) * 100))

Classification loss - Decision tree: 5.047318611987384 %


55

## Random forest

In [16]:
# Random forest (entropy criterion, all CPU cores): cross-validated search over
# the minimum number of samples per leaf, then evaluation on the test split
param_grid = [{'min_samples_leaf': [1, 5, 10, 50, 100, 200, 500]}]
random_forest = ensemble.RandomForestClassifier(
    criterion='entropy',
    random_state=10,
    n_jobs=-1,
)
random_forest_grid = model_selection.GridSearchCV(estimator=random_forest, param_grid=param_grid)
random_forest_grid.fit(features_train, labels_train)
score_rf = random_forest_grid.score(features_test, labels_test)
print('Classification loss - Random forest: {0} %'.format((1 - score_rf) * 100))

Classification loss - Random forest: 5.205047318611989 %


## AdaBoost 

In [17]:
# AdaBoost: cross-validated search over the number of estimators and the
# learning rate, then evaluation on the test split
param_grid = [
    {
        'n_estimators': [1, 5, 10, 50, 100, 200, 500],
        'learning_rate': [1e-3, 1e-2, 1e-1, 1, 10],
    }
]
ada_boost = ensemble.AdaBoostClassifier(random_state=10)
ada_boost_grid = model_selection.GridSearchCV(estimator=ada_boost, param_grid=param_grid)
ada_boost_grid.fit(features_train, labels_train)
score_ab = ada_boost_grid.score(features_test, labels_test)
print('Classification loss - AdaBoost: {0} %'.format((1 - score_ab) * 100))

Classification loss - AdaBoost: 5.047318611987384 %


## Gradient Boosting

In [18]:
# Gradient boosting: cross-validated search over the number of estimators, the
# learning rate and the minimum leaf size (7 x 5 x 7 combinations), then
# evaluation on the test split
param_grid = [
    {
        'n_estimators': [1, 5, 10, 50, 100, 200, 500],
        'learning_rate': [1e-3, 1e-2, 1e-1, 1, 10],
        'min_samples_leaf': [1, 5, 10, 50, 100, 200, 500],
    }
]
g_boost = ensemble.GradientBoostingClassifier(random_state=10)
g_boost_grid = model_selection.GridSearchCV(estimator=g_boost, param_grid=param_grid)
g_boost_grid.fit(features_train, labels_train)
score_gb = g_boost_grid.score(features_test, labels_test)
print('Classification loss - Gradient Boosting: {0} %'.format((1 - score_gb) * 100))

Classification loss - Gradient Boosting: 5.047318611987384 %


## Bagging

In [19]:
# Bagging with the default base estimator: cross-validated search over the
# number of estimators, then evaluation on the test split
param_grid = [{'n_estimators': [1, 5, 10, 50, 100, 200, 500]}]
bagging = ensemble.BaggingClassifier(random_state=10)
bagging_grid = model_selection.GridSearchCV(estimator=bagging, param_grid=param_grid)
bagging_grid.fit(features_train, labels_train)
score_bag = bagging_grid.score(features_test, labels_test)
print('Classification loss - Bagging: {0} %'.format((1 - score_bag) * 100))

Classification loss - Bagging: 5.047318611987384 %
