# Algoritmo para fazer o Grid Search com alguns classificadores para o SESA Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
caminhoArquivo = "/home/dimi/Programming/IC2019/ML/jupyter/Outros/resultadosGridsearchSESAAugmentation.txt"

In [3]:
def resultados(modelo, caminhoArquivo):
    
    mediaAcuracia   = modelo.best_score_
    desvPadAcuracia = np.std(modelo.cv_results_['mean_test_score'])
    mediaTempo      = np.mean(modelo.cv_results_['mean_score_time'])
    desvPadTempo    = np.std(modelo.cv_results_['mean_score_time'])
    
    objFile = open(caminhoArquivo, "a")
    
    linha = str(modelo.best_estimator_) + "\n"
    objFile.write(linha) 
    print(linha)
    
    linha = 'Acurácia Média: ' + str(mediaAcuracia) + ' +- ' + str(desvPadAcuracia) + "\n"
    objFile.write(linha) 
    print(linha)
    
    linha = 'Time: ' + str(mediaTempo) + ' +- ' + str(desvPadTempo) + "\n\n"
    objFile.write(linha) 
    print(linha)   
    
    objFile.close() 

#### Abrindo os CSVs de treino e teste e separando os dados em x e y

In [4]:
caminhoCSVTreino = "/home/dimi/Programming/IC2019/ML/datasets/SESA/SESA_Normalizado/train_augmentation/treino_augmentation_normalizado_semPCA.csv"
caminhoCSVTeste  = "/home/dimi/Programming/IC2019/ML/datasets/SESA/SESA_Normalizado/test/teste_normalizado_semPCA.csv"

dataframeTreino  = pd.read_csv(caminhoCSVTreino)
dataframeTeste   = pd.read_csv(caminhoCSVTeste)

print("Tamanho dataframe treino:", len(dataframeTreino))
print("Tamanho dataframe teste:", len(dataframeTeste))
print("Total de dados:", len(dataframeTreino) + len(dataframeTeste))

Tamanho dataframe treino: 59880
Tamanho dataframe teste: 1354
Total de dados: 61234


##### Só rodar a célula abaixo se quiser diminuir o tamanho do dataset para testes rápidos

In [5]:
# percentual = 0.15

# dataframeTreino = dataframeTreino[0:int(percentual * len(dataframeTreino))]
# dataframeTeste  = dataframeTeste[0:int(percentual * len(dataframeTeste))]

# print("Tamanho dataframe treino:", len(dataframeTreino))
# print("Tamanho dataframe teste:", len(dataframeTeste))
# print("Total de dados:", len(dataframeTreino) + len(dataframeTeste))

#### Usando o Group K Fold para garantir que todos os frames de um mesmo áudio fiquem na mesma pasta em um K Fold

Para fazer a validação cruzada, não posso deixar que um mesmo áudio seja usado tanto no treinamento quanto no teste. É preciso garantir que todos os frames de um determinado áudio façam parte ou do treinamento ou do teste.

Fazendo "groups" ser igual ao nomes dos arquivos, é possível usar a classe GroupKFold para fazer o K Fold sem que os áudios se misturem.

In [6]:
data   = np.array(dataframeTreino.iloc[:,1:-1].values.tolist() + dataframeTeste.iloc[:,1:-1].values.tolist())
target = np.array(dataframeTreino.iloc[:,-1].values.tolist() + dataframeTeste.iloc[:,-1].values.tolist())
groups = np.array(dataframeTreino.iloc[:,0].values.tolist() + dataframeTeste.iloc[:,0].values.tolist())

In [7]:
# TESTANDO COMO USAR ESSA CLASSE
# objGroupKFold = GroupKFold(n_splits=3)

# for trainIndex, testIndex in objGroupKFold.split(data, target, groups):
#     xTrain, xTest = data[trainIndex], data[testIndex]
#     yTrain, yTest = target[trainIndex], target[testIndex]
    
#     objKNN = KNeighborsClassifier()
#     objKNN.fit(xTrain, yTrain)
#     yPred = objKNN.predict(xTest)
    
#     print("Acurácia:", accuracy_score(yTest, yPred))

## KNN

In [8]:
grid_params_knn = [
    {
        'n_neighbors': [3,5,7,11,13],
     
        'weights': ['uniform', 
                    'distance'],
     
        'metric': ['euclidean', 
                   'manhattan', 
                   'chebyshev', 
                   'minkowski']
    }
]

gs_knn = GridSearchCV(
    KNeighborsClassifier(),
    grid_params_knn,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [9]:
gs_knn.fit(data, target)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 28.0min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 40.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 68.9min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 86.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 121.4min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 146.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 181.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 204.6min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 228.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 252.6min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 296.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 335.1min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa2b26d4c00>,
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid=[{'metric': ['euclidean', 'manhattan', 'chebyshev',
                                     'minkowski'],
                          'n_neighbors': [3, 5, 7, 11, 13],
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [10]:
resultados(gs_knn, caminhoArquivo)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

Acurácia Média: 0.8491197700623836 +- 0.0482713755700093

Time: 664.0284180402756 +- 149.70311735613862




## SGD

In [11]:
grid_params_sgd = [
    {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'alpha': [0.00001, 0.0001, 0.001],
        'tol': [0.01, 0.001, 0.0001]
    }
]

gs_sgd = GridSearchCV(
    SGDClassifier(),
    grid_params_sgd,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [12]:
gs_sgd.fit(data, target)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 21

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa2b2222480>,
             error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     r...0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'alpha': [1e-05, 0.0001, 0.001],
                          'loss': ['hinge', 'log', 'modified_huber',
                                   'squar

In [13]:
resultados(gs_sgd, caminhoArquivo)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='elasticnet', power_t=0.5, random_state=None,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)

Acurácia Média: 0.7823431426985008 +- 0.03336989435323899

Time: 0.0383923124383997 +- 0.007287723825608822




## Decision Tree

In [14]:
grid_params_tree = [
    {
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2,6,10,16],
        'min_samples_leaf': [1,2,3,4,5],
        'min_impurity_split': [1e-9, 1e-8, 1e-7, 1e-6]
    }
]

gs_tree = GridSearchCV(
    DecisionTreeClassifier(),
    grid_params_tree,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [15]:
gs_tree.fit(data, target)

Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 14

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa2b2222930>,
             error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid=[{'criterion': ['gini', 'entropy'],
             

In [16]:
resultados(gs_tree, caminhoArquivo)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=1e-08,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

Acurácia Média: 0.8042590717575203 +- 0.003410395435843564

Time: 0.02983313649892807 +- 0.006236430319515586




## SVM

In [17]:
grid_params_svm = [
    {
        'C': [0.01, 0.1, 1],
        'kernel': ['linear', 'poly', 'rbf'],
        'degree': [3,4,5],
        'decision_function_shape': ['ovo', 'ovr']
    }
]

gs_svm = GridSearchCV(
    SVC(),
    grid_params_svm,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [18]:
gs_svm.fit(data, target)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 25.9min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 73.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 102.7min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 138.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 173.4min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 220.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 251.2min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 301.0min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 342.2min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 394.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 457.6min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 537.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 608.6min
[Parallel(n_jobs=-1)]: Done 162 out of 162 | 

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa2b2538318>,
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [0.01, 0.1, 1],
                          'decision_function_shape': ['ovo', 'ovr'],
                          'degree': [3, 4, 5],
                          'kernel': ['linear', 'poly', 'rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [19]:
resultados(gs_svm, caminhoArquivo)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

Acurácia Média: 0.8597021262697194 +- 0.12505282248445984

Time: 200.83355978830357 +- 64.98283691136588




## Perceptron

In [20]:
grid_params_perceptron = [
    {
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'alpha': [0.00001, 0.00001, 0.0001, 0.001, 0.01],
        'tol': [0.01, 0.001, 0.0001, 0.00001]
    }
]

gs_perceptron = GridSearchCV(
    Perceptron(),
    grid_params_perceptron,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [21]:
gs_perceptron.fit(data, target)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa2b2538660>,
             error_score='raise-deprecating',
             estimator=Perceptron(alpha=0.0001, class_weight=None,
                                  early_stopping=False, eta0=1.0,
                                  fit_intercept=True, max_iter=1000,
                                  n_iter_no_change=5, n_jobs=None, penalty=None,
                                  random_state=0, shuffle=True, tol=0.001,
                                  validation_fraction=0.1, verbose=0,
                                  warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'alpha': [1e-05, 1e-05, 0.0001, 0.001, 0.01],
                          'penalty': ['none', 'l2', 'l1', 'elasticnet'],
                          'tol': [0.01, 0.001, 0.0001, 1e-05]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [22]:
resultados(gs_perceptron, caminhoArquivo)

Perceptron(alpha=1e-05, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty='l1', random_state=0, shuffle=True, tol=0.01,
           validation_fraction=0.1, verbose=0, warm_start=False)

Acurácia Média: 0.7224091191168305 +- 0.04986855845859998

Time: 0.053537952899932864 +- 0.01618069056734965




## LDA

In [23]:
grid_params_lda = [
    {
        'solver': ['svd', 'lsqr', 'eigen'],
        'store_covariance': [True, False],
        'tol': [1e-5,1e-4,1e-3]
    }
]

gs_lda = GridSearchCV(
    LinearDiscriminantAnalysis(),
    grid_params_lda,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [24]:
gs_lda.fit(data, target)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   56.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.2min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa2b2222660>,
             error_score='raise-deprecating',
             estimator=LinearDiscriminantAnalysis(n_components=None,
                                                  priors=None, shrinkage=None,
                                                  solver='svd',
                                                  store_covariance=False,
                                                  tol=0.0001),
             iid='warn', n_jobs=-1,
             param_grid=[{'solver': ['svd', 'lsqr', 'eigen'],
                          'store_covariance': [True, False],
                          'tol': [1e-05, 0.0001, 0.001]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [25]:
resultados(gs_lda, caminhoArquivo)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=True, tol=1e-05)

Acurácia Média: 0.7431655616160956 +- 0.0

Time: 0.07487501479961253 +- 0.028658504177869436


