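Evaluation of a scikit-learn model through the gojo sklearn interface: first a plain cross-validation with default hyperparameters, then a nested cross-validation that adds a hyperparameter optimization (HPO) step.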

In [1]:
import sys

# add the parent directory ('..') to the module search path
# (presumably so the local gojo package can be imported — confirm)
sys.path.append('..')

import optuna
import pandas as pd
from sklearn import datasets
from sklearn.svm import SVC
# NOTE(review): LogisticRegression is not referenced in the visible cells — confirm it is still needed
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# GOJO libraries
import gojo
from gojo import core

# number of parallel jobs, passed as `n_jobs` to the cross-validation call
N_JOBS = 8

In [2]:
# Load the Wine dataset bundled with scikit-learn.
wine_dt = datasets.load_wine()

# Feature matrix.
X = wine_dt['data']

# Binary target: class 1 vs. rest (1 where the original label equals 1,
# 0 otherwise). The class names can be inspected via wine_dt['target_names'].
y = (wine_dt['target'] == 1).astype(int)

In [3]:
# Transforms applied before the model: standardization followed by a
# PCA that keeps 5 components.
transforms = [
    core.SKLearnTransformWrapper(StandardScaler),
    core.SKLearnTransformWrapper(PCA, n_components=5),
]

# Hyperparameter search space. Each entry maps a hyperparameter name to a
# (suggestion method name, arguments) pair.
# NOTE(review): the method names look like optuna trial methods — confirm
# against the gojo documentation how the arguments are forwarded.
search_space = dict(
    degree=('suggest_int', (1, 10)),
    class_weight=('suggest_categorical', [('balanced', None)]),
    coef0=('suggest_float', (0.0, 100.00)),
)

# Model with its default (non-optimized) hyperparameters: a polynomial-kernel
# SVC wrapped in the gojo sklearn interface.
model = core.SklearnModelWrapper(
    SVC,
    kernel='poly',
    degree=1,
    coef0=0.0,
    cache_size=1000,
    class_weight=None,
)

# Basic model evaluation

In [4]:
# Baseline: cross-validate the model with its default hyperparameters
# (no optimization step) using a seeded, stratified 5-fold split.
cv_report = core.evalCrossVal(
    X=X,
    y=y,
    model=model,
    cv=gojo.util.getCrossValObj(
        cv=5, repeats=1, stratified=True, loocv=False, random_state=1997
    ),
    transforms=transforms,
    verbose=True,
    save_train_preds=True,
    save_models=False,
    save_transforms=False,
    n_jobs=N_JOBS,
)

# Compute the default binary-classification metrics, average them across
# folds, and show train/test side by side (the fold counter is not a metric,
# so its row is dropped).
scores = cv_report.getScores(
    core.getDefaultMetrics('binary_classification', bin_threshold=0.5)
)
results = pd.concat(
    {
        'Train': scores['train'].mean(axis=0),
        'Test': scores['test'].mean(axis=0),
    },
    axis=1,
).round(decimals=3).drop(index=['n_fold'])
results

Performing cross-validation...: 5it [00:00, 250.92it/s]


Unnamed: 0,Train,Test
accuracy,0.98,0.972
balanced_accuracy,0.977,0.965
precision,0.993,1.0
recall,0.958,0.93
sensitivity,0.958,0.93
specificity,0.995,1.0
negative_predictive_value,0.973,0.956
f1_score,0.975,0.963
auc,0.977,0.965


# Model with HPO

In [5]:
# Nested cross-validation with hyperparameter optimization (HPO): the
# hyperparameters listed in `search_space` are tuned with optuna, using the
# F1-score as the objective (minimization=False).
# NOTE(review): presumably `inner_cv` drives the hyperparameter selection and
# `outer_cv` the final evaluation — confirm against the gojo documentation.
cv_report = core.evalCrossValNestedHPO(
    X=X,
    y=y,
    model=model,
    search_space=search_space,
    outer_cv=gojo.util.getCrossValObj(cv=5, repeats=1, stratified=True, loocv=False, random_state=1997),
    inner_cv=gojo.util.getCrossValObj(cv=5, repeats=1, stratified=True, loocv=False, random_state=1997),
    # The sampler is seeded (like the CV splitters) so that the sampled
    # hyperparameters are repeatable across "Restart & Run All"; scores from
    # earlier, unseeded runs may differ slightly from a fresh execution.
    hpo_sampler=optuna.samplers.TPESampler(n_startup_trials=40, seed=1997),
    hpo_n_trials=80,
    minimization=False,
    transforms=transforms,
    metrics=core.getDefaultMetrics('binary_classification', bin_threshold=0.5),
    objective_metric='f1_score',
    verbose=1,
    save_train_preds=True,
    save_models=False,
    # reuse the notebook-level parallelism setting instead of a hard-coded value
    n_jobs=N_JOBS
)

Performing cross-validation...: 5it [00:21,  4.32s/it]


In [6]:
# Summarize the HPO run: compute the default binary-classification metrics,
# average them across folds, and show train/test side by side (the fold
# counter is not a metric, so its row is dropped).
scores = cv_report.getScores(
    core.getDefaultMetrics('binary_classification', bin_threshold=0.5)
)
results = pd.concat(
    {
        'Train': scores['train'].mean(axis=0),
        'Test': scores['test'].mean(axis=0),
    },
    axis=1,
).round(decimals=3).drop(index=['n_fold'])
results

Unnamed: 0,Train,Test
accuracy,0.989,0.966
balanced_accuracy,0.988,0.967
precision,0.986,0.947
recall,0.986,0.972
sensitivity,0.986,0.972
specificity,0.991,0.962
negative_predictive_value,0.991,0.981
f1_score,0.986,0.959
auc,0.988,0.967
