# Example 1. Integration with _sklearn_ and _optuna_ for hyperparameter optimization


In this notebook, we illustrate a classical pipeline: a model and a hyperparameter search space are specified, and the model is evaluated using the best combination of hyperparameters found. The hyperparameter optimization is conducted through nested cross-validation to mitigate the risk of overfitting the models.

In [1]:
import sys

sys.path.append('..')   # the gojo library is in ../gojo

import optuna
import pandas as pd
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# GOJO libraries
import gojo
from gojo import core

# number of parallel jobs, passed as `n_jobs` to `core.evalCrossVal` below
N_JOBS = 8

Data loading. The problem is a binary classification task built from the classic Wine dataset.

In [2]:
# load the Wine dataset bundled with scikit-learn
wine_dt = datasets.load_wine()

# Create the binary target variable: the class encoded as POSITIVE_CLASS is
# labelled 1 and every other class is labelled 0 (one-vs-rest).
# To see the class names you can use wine_dt['target_names'].
POSITIVE_CLASS = 1
X = wine_dt['data']
y = (wine_dt['target'] == POSITIVE_CLASS).astype(int)

Model definition.

In [3]:
# transformations handed to the evaluation routines together with the model,
# listed in the order StandardScaler -> PCA (5 components)
transforms = [
    core.SKLearnTransformWrapper(StandardScaler),
    core.SKLearnTransformWrapper(PCA, n_components=5),
]

# hyperparameter search space: one entry per SVC parameter to be tuned.
# NOTE(review): each value looks like (optuna `suggest_*` method name,
# arguments for that method) -- confirm against the gojo documentation.
search_space = dict(
    degree=('suggest_int', (1, 10)),
    class_weight=('suggest_categorical', [('balanced', None)]),
    coef0=('suggest_float', (0.0, 100.0)),
)

# baseline model: polynomial-kernel SVC with fixed default parameters
model = core.SklearnModelWrapper(
    SVC,
    kernel='poly',
    degree=1,
    coef0=0.0,
    cache_size=1000,
    class_weight=None,
)

Model evaluation using a simple cross-validation with the default parameters.

In [4]:
# Baseline evaluation: the model keeps its default parameters and is scored
# with a single stratified 5-fold cross-validation.
simple_cv = gojo.util.getCrossValObj(
    cv=5, repeats=1, stratified=True, loocv=False, random_state=1997)

cv_report = core.evalCrossVal(
    X=X,
    y=y,
    model=model,
    cv=simple_cv,
    transforms=transforms,
    verbose=True,
    save_train_preds=True,
    save_models=False,
    save_transforms=False,
    n_jobs=N_JOBS,
)

Performing cross-validation...: 5it [00:00, 205.10it/s]


Inspection of the obtained results.

In [5]:
# score the stored predictions with the default binary-classification metrics
metrics = core.getDefaultMetrics('binary_classification', bin_threshold=0.5)
scores = cv_report.getScores(metrics)

# average every metric over the folds (3 decimals), one column per data split;
# the 'n_fold' row is only a fold identifier, so it is dropped
results = pd.concat(
    [
        scores[split].mean(axis=0).round(decimals=3).rename(label)
        for split, label in (('train', 'Train'), ('test', 'Test'))
    ],
    axis=1,
).drop(index=['n_fold'])
results

Unnamed: 0,Train,Test
accuracy,0.98,0.972
balanced_accuracy,0.977,0.965
precision,0.993,1.0
recall,0.958,0.93
sensitivity,0.958,0.93
specificity,0.995,1.0
negative_predictive_value,0.973,0.956
f1_score,0.975,0.963
auc,0.977,0.965


Model with hyper-parameter optimization using a nested cross-validation approach.

In [6]:
# Perform the HPO to optimize the model hyperparameters over `search_space`
# with nested cross-validation (separate outer and inner stratified 5-fold
# splitters).
cv_report = core.evalCrossValNestedHPO(
    X=X,
    y=y,
    model=model,
    search_space=search_space,
    outer_cv=gojo.util.getCrossValObj(
        cv=5, repeats=1, stratified=True, loocv=False, random_state=1997),
    inner_cv=gojo.util.getCrossValObj(
        cv=5, repeats=1, stratified=True, loocv=False, random_state=1997),
    # The sampler is seeded so the hyperparameter search can be repeated.
    # NOTE(review): exact reproducibility may still depend on how gojo
    # parallelizes the trials -- confirm.
    hpo_sampler=optuna.samplers.TPESampler(n_startup_trials=40, seed=1997),
    hpo_n_trials=80,
    # presumably the objective metric (f1_score) is maximized -- confirm
    minimization=False,
    transforms=transforms,
    metrics=core.getDefaultMetrics('binary_classification', bin_threshold=0.5),
    objective_metric='f1_score',
    verbose=1,
    save_train_preds=True,
    save_models=False,
    # use the notebook-level constant instead of a hard-coded worker count
    n_jobs=N_JOBS
)

Performing cross-validation...: 5it [00:24,  4.90s/it]


In [7]:
# score the nested-CV predictions with the default binary-classification metrics
metrics = core.getDefaultMetrics('binary_classification', bin_threshold=0.5)
scores = cv_report.getScores(metrics)

# average every metric over the folds (3 decimals), one column per data split;
# the 'n_fold' row is only a fold identifier, so it is dropped
results = pd.concat(
    [
        scores[split].mean(axis=0).round(decimals=3).rename(label)
        for split, label in (('train', 'Train'), ('test', 'Test'))
    ],
    axis=1,
).drop(index=['n_fold'])
results

Unnamed: 0,Train,Test
accuracy,0.987,0.96
balanced_accuracy,0.986,0.96
precision,0.986,0.946
recall,0.982,0.958
sensitivity,0.982,0.958
specificity,0.991,0.962
negative_predictive_value,0.988,0.971
f1_score,0.984,0.951
auc,0.986,0.96
