<a href="https://colab.research.google.com/github/Bborub/baseball/blob/main/DSC502_050323_hyperparameter_tuning_KNN_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Grid Search using KNN and RF on the diabetes dataset

## Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

## Load dataset and define features and label

In [2]:
df = pd.read_csv('https://storage.googleapis.com/scsu-data-science/diabetes_cleaned.csv')
X = df.loc[:, 'Pregnancies':'Age']
y = df.loc[:, 'Outcome']

## Case I: KNN optimization

### Define KNN model, the grid (search space) and cross validation model

In [3]:
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
metric = ['euclidean', 'manhattan']
weights = ['uniform', 'distance']

# define the grid 
grid = dict(n_neighbors = n_neighbors, metric = metric, weights = weights)

# Stratified sampling ensures that proportion of labels is maintained in test set 
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

### Define grid search and execute it

In [4]:
grid_search = GridSearchCV(estimator=model, 
                           param_grid=grid, 
                           n_jobs=-1,   # Use all CPU cores 
                           cv=cv,
                           scoring='accuracy')

# This may take a while to compute (depending on size of search space)
grid_result = grid_search.fit(X, y)

### Extract results (KNN)

In [5]:
print(f"Best accuracy: {grid_result.best_score_:0.3f} using {grid_result.best_params_}\n")

# Output the accuracy for all combintations in the grid
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best accuracy: 0.749 using {'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}

0.695728 (0.048822) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.695728 (0.048822) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.702671 (0.042343) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.700496 (0.041584) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.712207 (0.044516) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.713078 (0.046241) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.719634 (0.040810) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.714850 (0.046482) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.730463 (0.046045) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.728281 (0.048898) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.

## Case II: Random forest optimization

### Define RF model, the grid (search space) and cross validation model

In [6]:
model = RandomForestClassifier()

# Hyperparameters
n_estimators = [32, 64, 128, 256]
max_features = ['sqrt', 'log2']
criterion = ['gini', 'entropy', 'log_loss']

# define search space
grid = dict(n_estimators = n_estimators, 
            max_features = max_features,
            criterion = criterion)

# Stratified sampling ensures that proportion of labels is maintained in test set 
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

### Define Grid Search and Execute it

In [7]:
grid_search = GridSearchCV(estimator = model, 
                           param_grid = grid, 
                           n_jobs = -1, 
                           cv = cv, 
                           scoring = 'accuracy',
                           error_score = 0) # Assign score of 0 if error

# This takes 2-3 minutes to finish
grid_result = grid_search.fit(X, y)

### Extract results

In [8]:
print(f"Best accuracy: {grid_result.best_score_:0.3f} using {grid_result.best_params_}\n")

# Output the accuracy for all combintations in the grid
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best accuracy: 0.770 using {'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 64}

0.755223 (0.042253) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 32}
0.759552 (0.044441) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 64}
0.766091 (0.048197) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 128}
0.758299 (0.046466) with: {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 256}
0.752153 (0.051277) with: {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 32}
0.762600 (0.046752) with: {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 64}
0.755218 (0.042786) with: {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 128}
0.764781 (0.045063) with: {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 256}
0.750427 (0.046552) with: {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 32}
0.758271 (0.047532) with: {'criterion': 'entropy', 'max_features': '