# Avaliando algoritmos de aprendizagem

## Grid-search

***

O Grid-search é usado para encontrar os hiperparâmetros ideais de um modelo que resultem em previsões mais "precisas". 

[Documentação do GridSearchCV (scikit-learn)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

## Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Iris CSV and, in the same chain, replace the textual class labels
# in "Species" with integer codes (integer classes are convenient when
# plotting decision regions).
dataset = pd.read_csv(
    "https://raw.githubusercontent.com/Francimaria/especializacao_DNN/main/datasets/Iris.csv"
).assign(Species=lambda frame: pd.factorize(frame['Species'])[0])


### Separando o conjunto de dados

In [3]:
# Use only two features (sepal length and sepal width).
X = dataset.loc[:, ["SepalLengthCm", "SepalWidthCm"]]
# Select the target as a 1-D Series: scikit-learn estimators expect y with
# shape (n_samples,). A single-column DataFrame only works through an implicit
# conversion whose DataConversionWarning is hidden by the global
# warnings.filterwarnings('ignore').
y = dataset["Species"]

# Stratified 70/30 train/test split. Fixing random_state makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

## Treinamento do modelo com os parâmetros default 

In [4]:
# Baseline: a k-NN classifier with scikit-learn's default hyperparameters,
# trained on the training split (fit returns the fitted estimator itself).
model = KNeighborsClassifier().fit(X_train, y_train)

# Report the baseline's performance on the held-out test split.
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.73      0.53      0.62        15
           2       0.63      0.80      0.71        15

    accuracy                           0.78        45
   macro avg       0.79      0.78      0.77        45
weighted avg       0.79      0.78      0.77        45



## Seleção de parâmetros com o Grid-Search 

In [5]:
# Hyperparameter search for k-NN: every combination of neighbourhood size and
# distance metric listed below is scored with 5-fold cross-validation on the
# training split only.
model = KNeighborsClassifier()

parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
              'metric': ["euclidean", "manhattan"]}

grid = GridSearchCV(estimator=model,          # base estimator (k-NN)
                    param_grid=parameters,    # candidate values to try
                    scoring='f1_macro',       # model-selection metric
                    cv=5)                     # number of cross-validation folds

grid.fit(X_train, y_train)

# Predict once on the test split and reuse the result for the report below
# (the original computed y_pred and then called grid.predict a second time).
y_pred = grid.predict(X_test)

print("Melhor parametro:", grid.best_params_)
# Performance of the selected configuration on the held-out test split.
print(classification_report(y_test, y_pred))

Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 3}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.64      0.47      0.54        15
           2       0.58      0.73      0.65        15

    accuracy                           0.73        45
   macro avg       0.74      0.73      0.73        45
weighted avg       0.74      0.73      0.73        45



In [6]:
# Tabulate the grid-search cross-validation results, one row per candidate
# parameter combination (timings, per-split scores, mean/std score and rank).
pd.DataFrame.from_dict(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005762,0.000672,0.020804,0.00164,euclidean,11,"{'metric': 'euclidean', 'n_neighbors': 11}",0.650624,0.714286,0.760684,0.760684,0.760684,0.729392,0.04329,5
1,0.005149,0.000461,0.016541,0.002982,euclidean,9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.650624,0.714286,0.760684,0.760684,0.760684,0.729392,0.04329,5
2,0.002998,0.000243,0.012304,0.000765,euclidean,7,"{'metric': 'euclidean', 'n_neighbors': 7}",0.760684,0.714286,0.809524,0.585185,0.760684,0.726072,0.076614,7
3,0.003848,0.000585,0.012886,0.001259,euclidean,5,"{'metric': 'euclidean', 'n_neighbors': 5}",0.850267,0.664957,0.85641,0.585185,0.809524,0.753269,0.108869,4
4,0.003866,0.001482,0.013697,0.002684,euclidean,3,"{'metric': 'euclidean', 'n_neighbors': 3}",0.792593,0.760684,0.904762,0.650624,0.814103,0.784553,0.082334,1
5,0.003387,0.000589,0.011221,0.000717,euclidean,1,"{'metric': 'euclidean', 'n_neighbors': 1}",0.650624,0.714286,0.714286,0.508772,0.721154,0.661824,0.080715,11
6,0.003015,0.000214,0.01205,0.000447,manhattan,11,"{'metric': 'manhattan', 'n_neighbors': 11}",0.650624,0.760684,0.760684,0.760684,0.85641,0.757817,0.06517,3
7,0.003623,0.000524,0.011156,0.000795,manhattan,9,"{'metric': 'manhattan', 'n_neighbors': 9}",0.650624,0.714286,0.714286,0.760684,0.760684,0.720113,0.040469,8
8,0.003373,0.000786,0.014075,0.002801,manhattan,7,"{'metric': 'manhattan', 'n_neighbors': 7}",0.760684,0.714286,0.714286,0.585185,0.760684,0.707025,0.064357,9
9,0.003494,0.000461,0.011013,0.000668,manhattan,5,"{'metric': 'manhattan', 'n_neighbors': 5}",0.750446,0.714286,0.714286,0.585185,0.760684,0.704977,0.06276,10


In [7]:
def grid_Search_kfold(skf, features=None, target=None):
    """Evaluate grid-searched k-NN with an outer cross-validation loop.

    For every (train, test) index pair produced by ``skf.split``, a
    ``GridSearchCV`` (inner 5-fold CV, ``f1_macro`` scoring) picks
    ``n_neighbors`` and ``metric`` using the outer training part only. The
    fitted search object then predicts the outer test part and the macro
    F1-score of those predictions is recorded and printed.

    Parameters
    ----------
    skf : splitter object
        Must expose ``split(X, y)`` yielding positional train/test indices,
        e.g. ``StratifiedKFold``.
    features : pandas object, optional
        Feature table indexed with ``.iloc``. Defaults to the notebook-level
        ``X`` so existing ``grid_Search_kfold(skf)`` calls keep working.
    target : pandas object, optional
        Class labels indexed with ``.iloc``. Defaults to the notebook-level
        ``y``.

    Returns
    -------
    numpy.ndarray
        Macro F1-score of each outer fold, in fold order.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    if features is None:
        features = X
    if target is None:
        target = y

    model = KNeighborsClassifier()
    parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
                  'metric': ["euclidean", "manhattan"]}

    score_list = []
    for fold, (train, test) in enumerate(skf.split(features, target)):
        X_fold_train, X_fold_test = features.iloc[train], features.iloc[test]
        y_fold_train, y_fold_test = target.iloc[train], target.iloc[test]

        grid = GridSearchCV(estimator=model,          # base estimator (k-NN)
                            param_grid=parameters,    # candidate values to try
                            scoring='f1_macro',       # model-selection metric
                            cv=5)                     # inner cross-validation folds
        grid.fit(X_fold_train, y_fold_train)

        # Score the outer test part once and reuse the value for both the
        # printed line and the collected list.
        fold_score = f1_score(y_fold_test, grid.predict(X_fold_test), average="macro")

        print("Melhor parametro:", grid.best_params_)
        print("Fold %d: %.3f" % (fold, fold_score))
        score_list.append(fold_score)

    score = np.array(score_list)
    print("\n F1-score média (desvio): %.3f +- (%.3f)" % (score.mean(), score.std()))
    return score


# Keep the per-fold scores instead of letting the cell echo the returned array.
nested_cv_scores = grid_Search_kfold(StratifiedKFold(n_splits=5, random_state=42, shuffle=True))

Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 11}
Fold 0: 0.829
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 7}
Fold 1: 0.733
Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 9}
Fold 2: 0.731
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 5}
Fold 3: 0.800
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 9}
Fold 4: 0.733

 F1-score média (desvio): 0.765 +- (0.041)


In [8]:
# Final model (currently disabled): the whole cell body below is a bare string
# literal, so no model is trained here and the cell only echoes that string as
# its output.
# NOTE(review): the disabled code calls classification_report(y_pred, y_test),
# whereas the other cells of this notebook pass y_test first - confirm the
# argument order before re-enabling it.
""" 
model = KNeighborsClassifier(metric='euclidean', n_neighbors=5)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test)) """

" \nmodel = KNeighborsClassifier(metric='euclidean', n_neighbors=5)\n\nmodel.fit(X_train, y_train)\n\ny_pred = model.predict(X_test)\n\nprint(classification_report(y_pred, y_test)) "