# Project - Gender recognition by voice
## EPFL - Statistical learning (MATH-412) 
## Adrien Besson

### 3. Classification with 80/20 split of the dataset and different seed numbers

In [1]:
# Import libraries
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn.discriminant_analysis as lda
import sklearn.linear_model as lm
import sklearn.model_selection as model_selection
import sklearn.preprocessing as preproc
from sklearn import svm, tree, ensemble, neighbors
import os

In [2]:
# Load the data
input_file = os.path.join(os.getcwd(), 'data', 'voice.csv')
data = pd.read_csv(input_file)
data['label'] = data['label'].astype('category')

In [3]:
# Drop collinear columns
cols_to_drop = ['IQR', 'dfrange', 'centroid']
data = data.drop(cols_to_drop,axis=1)

In [4]:
# Assign quantitative values to the labels and drop them from the data
encoder = preproc.LabelEncoder()
labels = data['label'].values
labels = encoder.fit_transform(labels)
data = data.drop(['label'],axis=1)

In [19]:
# Scale the features to have mean 0 and variance 1
features = preproc.scale(features)

## Classification based on all the features

In [68]:
# Split the dataset
seed = 2
features = data.values
features_train, features_test, labels_train, labels_test = model_selection.train_test_split(features, labels, train_size=0.8, test_size=0.2, random_state=seed)

## k-NN

In [69]:
# k-NN classification
param_grid = [
  {'n_neighbors': np.arange(1, 20, 1), 'weights':['distance', 'uniform']}
] 
knn_grid = model_selection.GridSearchCV(estimator=neighbors.KNeighborsClassifier(), cv=10, param_grid=param_grid, n_jobs=-1)
knn_grid.fit(X=features_train, y=labels_train)
class_score_knn = knn_grid.score(X=features_test, y=labels_test)
print('Classification loss kNN: {0} %'.format((1-class_score_knn)*100))

Classification loss kNN: 26.813880126182966 %


## LDA/QDA 

In [70]:
lda_class = lda.LinearDiscriminantAnalysis()
lda_class.fit(X=features_train, y=labels_train)
class_score_lda = lda_class.score(X=features_test, y=labels_test)
print('Classification loss - LDA: {0} %'.format((1-class_score_lda)*100))

Classification loss - LDA: 3.785488958990535 %


In [71]:
qda_class = lda.QuadraticDiscriminantAnalysis()
qda_class.fit(X=features_train, y=labels_train)
class_score_qda = qda_class.score(X=features_test, y=labels_test)
print('Classification loss - QDA: {0} %'.format((1-class_score_qda)*100))

Classification loss - QDA: 4.57413249211357 %


## Logistic regression

In [72]:
# Logistic regression
logistic = lm.LogisticRegression(penalty='l2', solver='liblinear', random_state=seed)
logistic.set_params(C=1e9)
logistic.fit(features_train, labels_train)
score = logistic.score(features_test, labels_test)
logistic.fit(X=features_train, y=labels_train)
class_score_l2 = logistic.score(X=features_test, y=labels_test)
print('Classification loss - Logistic : {0} %'.format((1-class_score_l2)*100))

Classification loss - Logistic : 2.8391167192429068 %


## Logistic regression - Ridge

In [73]:
# Logistic regression with L2 regularization
logistic_reg = lm.LogisticRegressionCV(penalty='l2', solver='liblinear', random_state=seed)
logistic_reg.fit(X=features_train, y=labels_train)
class_score_reg_l2 = logistic_reg.score(X=features_test, y=labels_test)
print('Classification loss - Logistic Ridge: {0} %'.format((1-class_score_reg_l2)*100))

Classification loss - Logistic Ridge: 2.8391167192429068 %


## Logistic regression - LASSO

In [74]:
logistic_reg_l1 = lm.LogisticRegressionCV(penalty='l1', solver='liblinear', random_state=seed)
logistic_reg_l1.fit(X=features_train, y=labels_train)
class_score_reg_l1 = logistic_reg_l1.score(X=features_test, y=labels_test)
print('Classification loss - Logistic LASSO: {0}'.format((1-class_score_reg_l1)*100))

Classification loss - Logistic LASSO: 2.681388012618302


## Linear SVM 

In [None]:
# SVM classification - Linear kernel
param_grid = [
  {'C': [8, 9, 10 ,11 ,12], 'gamma':[0.2, 0.21, 0.22, 0.23, 0.24, 0.25]}
]
class_svm = svm.SVC(kernel='linear', random_state=seed)
class_svm_grid = model_selection.GridSearchCV(estimator=class_svm, cv=10, param_grid=param_grid, n_jobs=-1)
class_svm_grid.fit(X=features_train, y=labels_train)
score_svm = class_svm_grid.score(X=features_test, y=labels_test)
print('Classification loss - linear SVM: {0} %'.format((1-score_svm)*100))

## Kernel SVM - RBF

In [None]:
# SVM classification - RBF kernel
param_grid = [
  {'C': 2.0**(np.arange(-5, 15, 2)), 'gamma':[0.2, 0.21, 0.22, 0.23, 0.24, 0.25], 'kernel':['rbf']}
]
class_svm = svm.SVC(random_state=seed)
class_svm_grid = model_selection.GridSearchCV(estimator=class_svm, cv=10, param_grid=param_grid, n_jobs=-1)
class_svm_grid.fit(X=features_train, y=labels_train)
score_svm = class_svm_grid.score(X=features_test, y=labels_test)
print('Classification loss - kernel SVM - L2: {0} %'.format((1-score_svm)*100))

## Decision Tree

In [None]:
param_grid = [
  {'min_samples_leaf': [1, 5,10,50,100,200,500]}
]
dec_tree = tree.DecisionTreeClassifier(criterion='entropy', random_state=seed)
dec_tree_grid = model_selection.GridSearchCV(estimator=dec_tree, cv=10, param_grid=param_grid, n_jobs=-1)
dec_tree_grid.fit(X=features_train, y=labels_train)
score_tree = dec_tree_grid.score(X=features_test, y=labels_test)
print('Classification loss - Decision tree: {0} %'.format((1-score_tree)*100))

## Random Forest

In [None]:
param_grid = [
  {'min_samples_leaf': [1, 5,10,50,100,200,500]}
]
random_forest = ensemble.RandomForestClassifier(criterion='entropy', random_state=seed, n_jobs=-1)
random_forest_grid = model_selection.GridSearchCV(estimator=random_forest, cv=10, param_grid=param_grid, n_jobs=-1)
random_forest_grid.fit(X=features_train, y=labels_train)
score_rf = random_forest_grid.score(X=features_test, y=labels_test)
print('Classification loss - Random forest: {0} %'.format((1-score_rf)*100))

## AdaBoost 

In [None]:
param_grid = [
  {'n_estimators': [1, 5,10,50,100,200,500], 'learning_rate':[1e-3, 1e-2, 1e-1, 1, 10]}
]
ada_boost = ensemble.AdaBoostClassifier(random_state=seed)
ada_boost_grid = model_selection.GridSearchCV(estimator=ada_boost, cv=10, param_grid=param_grid)
ada_boost_grid.fit(X=features_train, y=labels_train)
score_ab = ada_boost_grid.score(X=features_test, y=labels_test)
print('Classification loss - AdaBoost: {0} %'.format((1-score_ab)*100))

## Gradient Boosting

In [None]:
param_grid = [
  {'n_estimators': [1, 5,10,50,100,200,500], 'learning_rate':[1e-3, 1e-2, 1e-1, 1, 10], 'min_samples_leaf': [1, 5,10,50,100,200,500]}
]
g_boost = ensemble.GradientBoostingClassifier(random_state=seed)
g_boost_grid = model_selection.GridSearchCV(estimator=g_boost, cv=10, param_grid=param_grid)
g_boost_grid.fit(X=features_train, y=labels_train)
score_gb = g_boost_grid.score(X=features_test, y=labels_test)
print('Classification loss - Gradient Boosting: {0} %'.format((1-score_gb)*100))

## Bagging

In [None]:
param_grid = [
  {'n_estimators': [1, 5,10,50,100,200,500]}
]
bagging = ensemble.BaggingClassifier(random_state=seed)
bagging_grid = model_selection.GridSearchCV(estimator=bagging, cv=10, param_grid=param_grid)
bagging_grid.fit(X=features_train, y=labels_train)
score_bag = bagging_grid.score(X=features_test, y=labels_test)
print('Classification loss - Bagging: {0} %'.format((1-score_bag)*100))

## Conclusions
 1. Tree-based methods seem to give the best results but the results are unstable with respect to the shift of the training and test set (To see that, just run the script with a different random_state for the split)
 1. The average classification error is significantly decreased compared to the case where only meanfun was used.   