# Openclassrooms activity - Grid Search
The aim is to produce a function for the gridsearch algorithm.
Then it will be used to compare to the gridsearch function by Scikitlearn

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing,neighbors, model_selection
from sklearn.model_selection import KFold, train_test_split
data = pd.read_csv('./TP_Selectionnez_nb_voisins_knn/winequality-white.csv', sep=';')
X = data[data.columns[:-1]].values # Not take into account quality
y = data['quality'].values
y_class = np.where(y<6, 0, 1) # Separate good and bad wines


In [2]:
# Divide set in a training and a test set. 
X_train, X_test, y_train, y_test = \
    train_test_split(X, y_class, test_size=0.3) 

# Standardization
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

# Results

In [3]:
def add_accur_to_result(result, accur, index):
    if index not in result:
        result[index] = accur
    else:
        result[index] += accur
    return result
def find_max(result):
    return max(result, key=result.get)
def my_grid_search_two(param_grid,nb_folds):
    result = {}
    # Create the folds
    kfold = model_selection.StratifiedKFold(n_splits=nb_folds)
    # On each fold
    for k in param_grid['n_neighbors']:
        results =[]
        for train_index, test_index in kfold.split(X_train_std,y_train):
            Xtrain, Xtest = X_train_std[train_index], X_train_std[test_index]
            ytrain, ytest = y_train[train_index], y_train[test_index]
            accuracy = neighbors.KNeighborsClassifier(k).fit(Xtrain,ytrain).score(Xtest, ytest)
            results.append(accuracy)
            # Add the accuracy to the dictionnary
        result = add_accur_to_result(result, accuracy,k)
    best_couple = find_max(result)
    print("Best hyperparameter : {}".format(best_couple))
    return result


param_grid = {'n_neighbors':[3, 5, 7, 9, 11, 13, 15]}
nb_folds = 5
print(my_grid_search_two(param_grid,nb_folds))

Best hyperparameter : 3
{3: 0.7664233576642335, 5: 0.7605839416058394, 7: 0.7474452554744525, 9: 0.7489051094890511, 11: 0.743065693430657, 13: 0.7576642335766424, 15: 0.7562043795620438}


## Code from OC

In [4]:
from sklearn import neighbors, metrics
param_grid = {'n_neighbors':[3, 5, 7, 9, 11, 13, 15]}
score = 'accuracy'

# Model kNN creation for hyperparameter research with crossed validation
clf = model_selection.GridSearchCV(
    neighbors.KNeighborsClassifier(),
    param_grid, # hyperparameters
    cv=5, # Kfolds nb
    scoring=score 
)

# Optimisation of the model
clf.fit(X_train_std, y_train)

print("Meilleur(s) hyperparamètres(s) sur le jeu d'entraînement:")
print(clf.best_params_)

print("Résultats de la validation croisée :")
for mean, std, params in zip(
    clf.cv_results_['mean_test_score'], # Mean
    clf.cv_results_['std_test_score'], # Standard deviation
    clf.cv_results_['params'] #Hyperparameter value
):
    print("{} = {:.3f} (+/-{:.03f}) for {}".format(score, mean, std*2, params))

Meilleur(s) hyperparamètres(s) sur le jeu d'entraînement:
{'n_neighbors': 3}
Résultats de la validation croisée :
accuracy = 0.759 (+/-0.019) for {'n_neighbors': 3}
accuracy = 0.755 (+/-0.021) for {'n_neighbors': 5}
accuracy = 0.753 (+/-0.017) for {'n_neighbors': 7}
accuracy = 0.753 (+/-0.008) for {'n_neighbors': 9}
accuracy = 0.750 (+/-0.014) for {'n_neighbors': 11}
accuracy = 0.757 (+/-0.016) for {'n_neighbors': 13}
accuracy = 0.754 (+/-0.019) for {'n_neighbors': 15}
