# Avaliando algoritmos de aprendizagem

## Grid-search

***

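O Grid-search testa exaustivamente todas as combinações de uma grade de hiperparâmetros, avaliando cada uma por validação cruzada, para encontrar a configuração do modelo que resulta nas melhores previsões segundo a métrica escolhida.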

[Documentação do GridSearchCV (scikit-learn)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

## Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report, f1_score

import warnings
# Suppress every warning for the rest of the session to keep cell outputs clean.
# NOTE(review): this also hides deprecation and data-conversion notices —
# consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the Iris CSV from the course repository
dataset = pd.read_csv("https://raw.githubusercontent.com/Francimaria/monitoria-ml/main/Iris.csv")

# Encode the class labels as integers (used later to draw the decision regions)
species_codes, species_names = pd.factorize(dataset['Species'])
dataset['Species'] = species_codes


### Separando o conjunto de dados

In [3]:
# Use only two features: SepalLengthCm and SepalWidthCm
X = dataset.loc[:, ["SepalLengthCm", "SepalWidthCm"]]
y = dataset.loc[:, ["Species"]]

# Stratified 70/30 train/test split. The fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

## Treinamento do modelo com os parâmetros default 

In [4]:
# Baseline: k-NN classifier with the default hyperparameters (k=5)
model = KNeighborsClassifier().fit(X_train, y_train)

# Evaluate the baseline on the held-out test set
baseline_report = classification_report(y_test, model.predict(X_test))
print(baseline_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.50      0.27      0.35        15
           2       0.50      0.73      0.59        15

    accuracy                           0.67        45
   macro avg       0.67      0.67      0.65        45
weighted avg       0.67      0.67      0.65        45



## Seleção de parâmetros com o Grid-Search 

In [5]:
# Fresh, unfitted k-NN estimator handed to the grid search
model = KNeighborsClassifier()

# Candidate hyperparameters: 6 values of k x 2 distance metrics = 12 combinations
parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
              'metric': ["euclidean", "manhattan"]}

grid = GridSearchCV(estimator=model,          # k-NN
                    param_grid=parameters,    # dictionary with the values to try
                    scoring='f1_macro',       # model-selection metric
                    cv=5)                     # 5-fold cross-validation

grid.fit(X_train, y_train)

# Predict once with the selected model and reuse the result below
# (the original cell predicted twice and never used y_pred).
y_pred = grid.predict(X_test)

print("Melhor parametro:", grid.best_params_)
# Performance of the selected model on the held-out test set
print(classification_report(y_test, y_pred))

Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 11}
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       0.67      0.27      0.38        15
           2       0.52      0.80      0.63        15

    accuracy                           0.69        45
   macro avg       0.71      0.69      0.66        45
weighted avg       0.71      0.69      0.66        45



In [6]:
# Per-candidate cross-validation results: timings, per-split scores, mean/std and rank
cv_results = pd.DataFrame(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007183,0.004045,0.012652,0.008476,euclidean,11,"{'metric': 'euclidean', 'n_neighbors': 11}",0.85641,0.904762,0.904762,0.902778,0.664957,0.846734,0.092749,1
1,0.002482,0.000822,0.004255,0.000534,euclidean,9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.85641,0.904762,0.904762,0.850267,0.664957,0.836232,0.088693,2
2,0.002175,0.00024,0.004389,0.000828,euclidean,7,"{'metric': 'euclidean', 'n_neighbors': 7}",0.760684,0.904762,0.85641,0.792593,0.664957,0.795881,0.082352,6
3,0.002291,0.000461,0.004342,0.000663,euclidean,5,"{'metric': 'euclidean', 'n_neighbors': 5}",0.85641,0.85641,0.85641,0.792593,0.619048,0.796174,0.091948,5
4,0.002373,0.000503,0.004266,0.000886,euclidean,3,"{'metric': 'euclidean', 'n_neighbors': 3}",0.708333,0.805556,0.76801,0.708333,0.664957,0.731038,0.049634,12
5,0.002203,0.000593,0.004177,0.000801,euclidean,1,"{'metric': 'euclidean', 'n_neighbors': 1}",0.760684,0.850267,0.804424,0.750446,0.664957,0.766156,0.061737,9
6,0.002386,0.000575,0.004405,0.000706,manhattan,11,"{'metric': 'manhattan', 'n_neighbors': 11}",0.85641,0.904762,0.904762,0.792593,0.664957,0.824697,0.08986,3
7,0.002271,0.000552,0.004238,0.000466,manhattan,9,"{'metric': 'manhattan', 'n_neighbors': 9}",0.85641,0.904762,0.904762,0.792593,0.664957,0.824697,0.08986,3
8,0.002613,0.00093,0.004823,0.001395,manhattan,7,"{'metric': 'manhattan', 'n_neighbors': 7}",0.754335,0.904762,0.85641,0.792593,0.619048,0.785429,0.097959,7
9,0.002201,0.00036,0.004419,0.001052,manhattan,5,"{'metric': 'manhattan', 'n_neighbors': 5}",0.755556,0.85641,0.85641,0.792593,0.619048,0.776003,0.087475,8


In [7]:
def grid_Search_kfold(skf, features=None, target=None):
    """Evaluate a grid-searched k-NN with an outer cross-validation loop.

    For every outer fold produced by ``skf``, a ``GridSearchCV`` (inner 5-fold
    CV, macro F1) selects ``n_neighbors`` and ``metric`` on the training part;
    the selected model is then scored with macro F1 on the held-out part.
    The best parameters and the score of each fold are printed, followed by
    the mean and standard deviation over all folds.

    Parameters
    ----------
    skf : object exposing ``split(X, y)``
        Outer splitter, e.g. ``StratifiedKFold``.
    features : pandas.DataFrame, optional
        Feature table, indexed positionally with ``.iloc``. Defaults to the
        notebook-level ``X`` so existing calls keep working.
    target : pandas.DataFrame or pandas.Series, optional
        Class labels aligned with ``features``. Defaults to the
        notebook-level ``y``.

    Returns
    -------
    numpy.ndarray
        Macro F1 of each outer fold, in fold order.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if features is None:
        features = X
    if target is None:
        target = y

    model = KNeighborsClassifier()
    parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
                  'metric': ["euclidean", "manhattan"]}

    score_list = []
    for fold, (train_idx, test_idx) in enumerate(skf.split(features, target)):
        X_fit, X_eval = features.iloc[train_idx], features.iloc[test_idx]
        y_fit, y_eval = target.iloc[train_idx], target.iloc[test_idx]

        grid = GridSearchCV(estimator=model,         # k-NN
                            param_grid=parameters,   # dictionary with the values to try
                            scoring='f1_macro',      # model-selection metric
                            cv=5)                    # inner cross-validation
        grid.fit(X_fit, y_fit)

        # Score the fold once and reuse the value for both the log and the list.
        fold_score = f1_score(y_eval, grid.predict(X_eval), average="macro")

        print("Melhor parametro:", grid.best_params_)
        print("Fold %d: %.3f" % (fold, fold_score))
        score_list.append(fold_score)

    score = np.array(score_list)
    print("\n F1-score média (desvio): %.3f +- (%.3f)" % (score.mean(), score.std()))
    return score


# Assign the result so the returned array is not echoed as an extra cell output.
fold_scores = grid_Search_kfold(StratifiedKFold(n_splits=5, random_state=42, shuffle=True))

Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 11}
Fold 0: 0.829
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 7}
Fold 1: 0.733
Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 9}
Fold 2: 0.731
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 5}
Fold 3: 0.800
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 9}
Fold 4: 0.733

 F1-score média (desvio): 0.765 +- (0.041)
