# Hyperparameter search

In [1]:
from src.data_management import *
from src.plotting import *
import time
from sklearn.model_selection import GridSearchCV

### Model selection

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier


classifiers = [
    # DecisionTreeClassifier(), 
    RandomForestClassifier(n_jobs=-1), # OK
    # BaggingClassifier(n_jobs=-1), 
    LogisticRegression(n_jobs=-1), # SUPER OK
    # SVC(gamma='auto', C=1, cache_size=1900), # LONG (30k : 4min, 10k : 30s, 5k : 4s)
    GaussianNB(), # SUPER OK
    # SGDClassifier(n_jobs=-1),
    KNeighborsClassifier(n_jobs=-1), # SUPER OK
    #GradientBoostingClassifier(),
    MLPClassifier(), # OK
    #AdaBoostClassifier()
]

### Data loading
Data is loaded from the files and joined. Each feature is then processed to ensure that it is in the correct format for the model and then some features are dropped (according to the feature selection process).

In [3]:
nb_pca = 10
data_without_grav, grav, data_PCA = workable_data(nb_pca)

nb_lines = 1000
data_without_grav = data_without_grav[:nb_lines]
grav = grav[:nb_lines]
data_PCA = data_PCA[:nb_lines]

`data_without_grav` contains the 26 selected features, `grav` is the target variable and `data_PCA` contains the 5 principal components of the data.

### Getting the list of hyperparameters
To simplify the hyperparameter search, we use the `get_params` method of the classifier to get the list of hyperparameters that can be tuned.

In [4]:
for classifier in classifiers:
    print("Classifier:", classifier.__class__.__name__)
    print("Parameters:")
    for key in classifier.get_params():
        print("\t", key)
    print("")

Classifier: RandomForestClassifier
Parameters:
	 bootstrap
	 ccp_alpha
	 class_weight
	 criterion
	 max_depth
	 max_features
	 max_leaf_nodes
	 max_samples
	 min_impurity_decrease
	 min_samples_leaf
	 min_samples_split
	 min_weight_fraction_leaf
	 n_estimators
	 n_jobs
	 oob_score
	 random_state
	 verbose
	 warm_start

Classifier: LogisticRegression
Parameters:
	 C
	 class_weight
	 dual
	 fit_intercept
	 intercept_scaling
	 l1_ratio
	 max_iter
	 multi_class
	 n_jobs
	 penalty
	 random_state
	 solver
	 tol
	 verbose
	 warm_start

Classifier: GaussianNB
Parameters:
	 priors
	 var_smoothing

Classifier: KNeighborsClassifier
Parameters:
	 algorithm
	 leaf_size
	 metric
	 metric_params
	 n_jobs
	 n_neighbors
	 p
	 weights

Classifier: MLPClassifier
Parameters:
	 activation
	 alpha
	 batch_size
	 beta_1
	 beta_2
	 early_stopping
	 epsilon
	 hidden_layer_sizes
	 learning_rate
	 learning_rate_init
	 max_fun
	 max_iter
	 momentum
	 n_iter_no_change
	 nesterovs_momentum
	 power_t
	 random_state


### Chosing the hyperparameters to tune
We then need to choose from the list above which hyperparameters we want to tune. We can also choose the range of values to test for each hyperparameter.

The `param_grid` variable is a dictionary where the keys are the names of the hyperparameters and the values are the list of values to test for each hyperparameter.

In [5]:
param_grids = []

# DecisionTreeClassifier
param_grid = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [None, 5, 10, 20, 50, 100],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5, 10],
    "max_features": ["sqrt", "log2"]
}
if "DecisionTreeClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# RandomForestClassifier
param_grid = {
    "n_estimators": [10, 50, 100, 200, 500],
    "criterion": ["gini", "entropy"],
    "min_samples_split": [2, 5, 10],
    "max_features": ["sqrt", "log2"]
}
if "RandomForestClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# BaggingClassifier
param_grid = {
    "n_estimators": [10, 20, 50, 100],
    "max_samples": [0.1, 0.5, 1.0],
    "max_features": [0.1, 0.5, 1.0],
    "bootstrap": [True, False],
    "bootstrap_features": [True, False]
}
if "BaggingClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# LogisticRegression
param_grid = {
    "penalty": ["l1", "l2", "elasticnet"],
    "C": [0.1, 0.5, 2, 5, 10, 20, 50, 100, 200, 500, 1000],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "max_iter": [100, 200, 500]
}
if "LogisticRegression" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# SVC
param_grid = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": [0.1, 0.5, 2, 5, 10, 20, 50, 100, 200, 500, 1000],
    "gamma": ["scale", "auto"]
}
if "SVC" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# GaussianNB
param_grid = {
    "var_smoothing": [1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 0.005, 0.01, 0.02, 0.05, 0.075, 0.1]
}
if "GaussianNB" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# SGDClassifier
param_grid = {
    "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
    "penalty": ["l1", "l2", "elasticnet"],
    "alpha": [0.00001, 0.0001, 0.001, 0.01],
    "max_iter": [1000, 2000, 5000, 10000],
}
if "SGDClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# KNeighborsClassifier
param_grid = {
    "n_neighbors": [1, 2, 5, 10],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [1, 2, 5, 10, 20, 30, 50],
    "p": [1, 2]
}
if "KNeighborsClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# GradientBoostingClassifier
param_grid = {
    "learning_rate": [0.005, 0.01, 0.025, 0.05, 0.1, 0.5],
    "n_estimators": [100, 500], 
    "criterion": ["friedman_mse", "squared_error"],
    "max_depth": [1, 2, 3, 5, 10],
    "min_samples_split": [2, 5, 10, 15, 20],
    "max_features": ["sqrt", "log2"]
}
if "GradientBoostingClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# MLPClassifier
param_grid = {
    "hidden_layer_sizes":  [(20,20,), (100,), (200,), (500,)],
    "activation": ["logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd"],
    "alpha": [0.00001, 0.0001, 0.001, 0.01] ,
    "learning_rate": ["constant", "adaptive"],
    "max_iter": [200, 500]
}
if "MLPClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

# AdaBoostClassifier
param_grid = {
    "n_estimators": [50, 200, 500],
    "learning_rate": [0.001, 0.01, 0.1, 0.5],
    "algorithm": ["SAMME", "SAMME.R"]
}
if "AdaBoostClassifier" in [classifier.__class__.__name__ for classifier in classifiers]:
    param_grids.append(param_grid)

### Fitting the models with all combinations of hyperparameters
We use the `GridSearchCV` class to fit the models with all combinations of hyperparameters and find the best hyperparameters for each model.

This class uses cross-validation to evaluate the performance through an exhaustive search over the hyperparameter values space.

In [6]:
best_params = []
best_scores = []

for classifier, param_grid in zip(classifiers, param_grids):
    print("Classifier:", classifier.__class__.__name__)
    print("Parameters:")
    for key in param_grid:
        print(f"\t{key:12}: {param_grid[key]}")
    print("")
    
    grid_search = GridSearchCV(classifier, param_grid, cv=3, verbose=0, n_jobs=-1)
    grid_search.fit(data_without_grav, grav)
    best_params.append(grid_search.best_params_)
    best_scores.append(grid_search.best_score_)
    print("Best parameters:", best_params[-1])
    print(f"Best score: {best_scores[-1]:.3f}")
    print("\n############################################\n")

Classifier: RandomForestClassifier
Parameters:
	n_estimators: [10, 50, 100, 200, 500]
	criterion   : ['gini', 'entropy']
	max_depth   : [None, 10, 50, 200]
	min_samples_split: [10, 50, 200]
	min_samples_leaf: [5, 20, 50, 200]
	max_features: ['sqrt', 'log2']

Best parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 50}
Best score: 0.610

############################################

Classifier: LogisticRegression
Parameters:
	penalty     : ['l1', 'l2', 'elasticnet']
	C           : [0.1, 0.5, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
	solver      : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
	max_iter    : [100, 200, 500]



792 fits failed out of a total of 1485.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
99 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python310\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
    raise ValueError(
ValueError: Sol

Best parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.557

############################################

Classifier: GaussianNB
Parameters:
	var_smoothing: [1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.005, 0.01, 0.02, 0.05, 0.075, 0.1]

Best parameters: {'var_smoothing': 0.02}
Best score: 0.517

############################################

Classifier: KNeighborsClassifier
Parameters:
	n_neighbors : [1, 2, 5, 10]
	weights     : ['uniform', 'distance']
	algorithm   : ['auto', 'ball_tree', 'kd_tree', 'brute']
	leaf_size   : [1, 2, 5, 10, 20, 30, 50]
	p           : [1, 2]

Best parameters: {'algorithm': 'ball_tree', 'leaf_size': 10, 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
Best score: 0.535

############################################

Classifier: MLPClassifier
Parameters:
	hidden_layer_sizes: [(20, 20), (100,), (200,), (500,)]
	activation  : ['logistic', 'tanh', 'relu']
	solver      : ['lbfgs', 'sgd']
	alpha       : [1e-05, 0.0001, 0.001, 0.01]



### Printing the results

In [7]:
for classifier, best_param, best_scores in zip(classifiers, best_params, best_scores):
    print("Classifier:", classifier.__class__.__name__)
    print("Best parameters:")
    for key in best_param:
        print(f"\t{key:12}: {best_param[key]}")
    print(f"Best score: {best_scores:.3f}")
    print("\n############################################\n")

Classifier: RandomForestClassifier
Best parameters:
	criterion   : gini
	max_depth   : None
	max_features: sqrt
	min_samples_leaf: 5
	min_samples_split: 10
	n_estimators: 50
Best score: 0.610

############################################

Classifier: LogisticRegression
Best parameters:
	C           : 0.1
	max_iter    : 100
	penalty     : l1
	solver      : saga
Best score: 0.557

############################################

Classifier: GaussianNB
Best parameters:
	var_smoothing: 0.02
Best score: 0.517

############################################

Classifier: KNeighborsClassifier
Best parameters:
	algorithm   : ball_tree
	leaf_size   : 10
	n_neighbors : 10
	p           : 1
	weights     : distance
Best score: 0.535

############################################

Classifier: MLPClassifier
Best parameters:
	activation  : logistic
	alpha       : 0.001
	hidden_layer_sizes: (100,)
	learning_rate: constant
	max_iter    : 500
	solver      : sgd
Best score: 0.578

###############################