# Algoritmo para fazer o Grid Search com alguns classificadores para o SESA Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.ensemble import BaggingClassifier

#### Abrindo os CSVs de treino e teste e separando os dados em x e y

In [2]:
caminhoCSVTreino = "/home/dimi/Programming/IC2019/ML/datasets/SESA/SESA_Normalizado/train/treino_normalizado_semPCA.csv"
caminhoCSVTeste  = "/home/dimi/Programming/IC2019/ML/datasets/SESA/SESA_Normalizado/test/teste_normalizado_semPCA.csv"

dataframeTreino  = pd.read_csv(caminhoCSVTreino)
dataframeTeste   = pd.read_csv(caminhoCSVTeste)

print("Tamanho dataframe treino:", len(dataframeTreino))
print("Tamanho dataframe teste:", len(dataframeTeste))
print("Total de dados:", len(dataframeTreino) + len(dataframeTeste))

Tamanho dataframe treino: 5988
Tamanho dataframe teste: 1354
Total de dados: 7342


#### Usando o Group K Fold para garantir que todos os frames de um mesmo áudio fiquem na mesma pasta em um K Fold

Para fazer a validação cruzada, não posso deixar que um mesmo áudio seja usado tanto no treinamento quanto no teste. É preciso garantir que todos os frames de um determinado áudio façam parte ou do treinamento ou do teste.

Fazendo "groups" ser igual ao nomes dos arquivos, é possível usar a classe GroupKFold para fazer o K Fold sem que os áudios se misturem.

In [3]:
data   = np.array(dataframeTreino.iloc[:,1:-1].values.tolist() + dataframeTeste.iloc[:,1:-1].values.tolist())
target = np.array(dataframeTreino.iloc[:,-1].values.tolist() + dataframeTeste.iloc[:,-1].values.tolist())
groups = np.array(dataframeTreino.iloc[:,0].values.tolist() + dataframeTeste.iloc[:,0].values.tolist())

In [4]:
# TESTANDO COMO USAR ESSA CLASSE
objGroupKFold = GroupKFold(n_splits=3)

for trainIndex, testIndex in objGroupKFold.split(data, target, groups):
    xTrain, xTest = data[trainIndex], data[testIndex]
    yTrain, yTest = target[trainIndex], target[testIndex]
    
    objKNN = KNeighborsClassifier()
    objKNN.fit(xTrain, yTrain)
    yPred = objKNN.predict(xTest)
    
    print("Acurácia:", accuracy_score(yTest, yPred))

Acurácia: 0.8402777777777778
Acurácia: 0.8242746219861055
Acurácia: 0.8929301185124643


## KNN

In [5]:
grid_params_knn = [
    {
        'n_neighbors': [3,5,7,11,13,17],
     
        'weights': ['uniform', 
                    'distance'],
     
        'metric': ['euclidean', 
                   'manhattan', 
                   'chebyshev', 
                   'minkowski']
    }
]

gs_knn = GridSearchCV(
    KNeighborsClassifier(),
    grid_params_knn,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [6]:
gs_knn.fit(data, target)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   47.1s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  1.8min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa46f350a98>,
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid=[{'metric': ['euclidean', 'manhattan', 'chebyshev',
                                     'minkowski'],
                          'n_neighbors': [3, 5, 7, 11, 13, 17],
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [7]:
print(gs_knn.best_estimator_, "\n")
print(gs_knn.best_score_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='distance') 

0.8703350585671479


## SGD

In [8]:
grid_params_sgd = [
    {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'alpha': [0.00001, 0.0001, 0.001],
        'tol': [0.01, 0.001, 0.0001]
    }
]

gs_sgd = GridSearchCV(
    SGDClassifier(),
    grid_params_sgd,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [9]:
gs_sgd.fit(data, target)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   34.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   43.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa46f325a98>,
             error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     r...0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'alpha': [1e-05, 0.0001, 0.001],
                          'loss': ['hinge', 'log', 'modified_huber',
                                   'squar

In [10]:
print(gs_sgd.best_estimator_, "\n")
print(gs_sgd.best_score_)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
              validation_fraction=0.1, verbose=0, warm_start=False) 

0.8387360392263689


## Bagging

In [11]:
grid_params_bagging = [
    {
        'n_estimators': [50, 100, 150, 200],
        'max_features': [0.2, 0.4, 0.6, 0.8, 1.0]
    }
]

gs_bagging = GridSearchCV(
    BaggingClassifier(),
    grid_params_bagging,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [12]:
gs_bagging.fit(data, target)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:  4.2min remaining:   38.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  5.9min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa46e812228>,
             error_score='raise-deprecating',
             estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
                                         bootstrap_features=False,
                                         max_features=1.0, max_samples=1.0,
                                         n_estimators=10, n_jobs=None,
                                         oob_score=False, random_state=None,
                                         verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
                          'n_estimators': [50, 100, 150, 200]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [13]:
print(gs_bagging.best_estimator_, "\n")
print(gs_bagging.best_score_)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=0.4, max_samples=1.0, n_estimators=200,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False) 

0.8975755924816127


## LinearSVC

In [14]:
grid_params_svm = [
    {
        'C': [0.01, 0.1, 1],
        'penalty': ['l2'],
        'multi_class': ['ovr', 'crammer_singer']
    }
]

gs_svm = GridSearchCV(
    LinearSVC(),
    grid_params_svm,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [15]:
gs_svm.fit(data, target)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:    1.1s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   7 out of  18 | elapsed:    3.0s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done   9 out of  18 | elapsed:    3.5s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done  11 out of  18 | elapsed:    4.4s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:    6.0s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:    7.2s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   27.0s finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa46e8127c8>,
             error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [0.01, 0.1, 1],
                          'multi_class': ['ovr', 'crammer_singer'],
                          'penalty': ['l2']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [16]:
print(gs_svm.best_estimator_, "\n")
print(gs_svm.best_score_)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 

0.8481340234268592


## Perceptron

In [17]:
grid_params_perceptron = [
    {
        'penalty': ['none', 'l2', 'l1', 'elasticnet'],
        'alpha': [0.00001, 0.00001, 0.0001, 0.001, 0.01],
        'tol': [0.01, 0.001, 0.0001, 0.00001]
    }
]

gs_perceptron = GridSearchCV(
    Perceptron(),
    grid_params_perceptron,
    verbose=10,
    cv=GroupKFold(n_splits=3).split(data, target, groups),
    n_jobs=-1,
    scoring='accuracy'
)

In [18]:
gs_perceptron.fit(data, target)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1470s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0072s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 107 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 153 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    9.2s
[Parallel(n_jo

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa46eb1dde0>,
             error_score='raise-deprecating',
             estimator=Perceptron(alpha=0.0001, class_weight=None,
                                  early_stopping=False, eta0=1.0,
                                  fit_intercept=True, max_iter=1000,
                                  n_iter_no_change=5, n_jobs=None, penalty=None,
                                  random_state=0, shuffle=True, tol=0.001,
                                  validation_fraction=0.1, verbose=0,
                                  warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'alpha': [1e-05, 1e-05, 0.0001, 0.001, 0.01],
                          'penalty': ['none', 'l2', 'l1', 'elasticnet'],
                          'tol': [0.01, 0.001, 0.0001, 1e-05]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [19]:
print(gs_perceptron.best_estimator_, "\n")
print(gs_perceptron.best_score_)

Perceptron(alpha=1e-05, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty='l2', random_state=0, shuffle=True, tol=0.01,
           validation_fraction=0.1, verbose=0, warm_start=False) 

0.8319259057477526
