In [1]:
import sys

sys.path.append('..')

import optuna
import pandas as pd
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# DEID libraries
import gojo
from gojo import core

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Wine dataset bundled with scikit-learn, used here as a small test problem
wine_dt = datasets.load_wine()

# feature matrix
X = wine_dt.data

# binary target: 1 for the samples labelled as class 1, 0 for every other class
# (one-vs-rest); wine_dt.target_names lists the class names
y = (wine_dt.target == 1).astype(int)

In [3]:
# pre-model pipeline: standardization followed by a PCA keeping 5 components
transforms = [
    core.SKLearnTransformWrapper(StandardScaler),
    core.SKLearnTransformWrapper(PCA, n_components=5),
]

# HPO search space: hyperparameter name -> (suggest-method name, arguments)
search_space = dict(
    degree=('suggest_int', (1, 10)),
    class_weight=('suggest_categorical', [('balanced', None)]),
    coef0=('suggest_float', (0.0, 100.0)),
)

# baseline estimator: polynomial-kernel SVC with fixed starting hyperparameters
model = core.SklearnModelWrapper(
    SVC,
    kernel='poly',
    degree=1,
    coef0=0.0,
    cache_size=1000,
    class_weight=None,
)

In [4]:
# Baseline: cross-validate the model with the hyperparameters it was created
# with, using the splitter configured below (cv=5, repeats=5, stratified).
cv_report = core.evalCrossVal(
    X=X,
    y=y,
    model=model,
    cv=gojo.util.getCrossValObj(
        cv=5,
        repeats=5,
        stratified=True,
        loocv=False,
        random_state=1997,
    ),
    transforms=transforms,
    verbose=True,
    save_train_preds=True,
    save_models=False,
    save_transforms=False,
    n_jobs=5,
)

# Column-wise mean of the train and test score tables, rounded to 3 decimals
# and placed side by side; the 'n_fold' entry is dropped from the summary.
scores = cv_report.getScores(
    core.getDefaultMetrics('binary_classification', bin_threshold=0.5)
)
results = pd.concat(
    [
        pd.DataFrame(scores[split].mean(axis=0)).round(decimals=3)
        for split in ('train', 'test')
    ],
    axis=1,
).drop(index=['n_fold'])
results.columns = ['Train', 'Test']
results

Performing cross-validation...: 25it [00:04,  5.40it/s]


Unnamed: 0,Train,Test
accuracy,0.98,0.971
balanced_accuracy,0.976,0.964
precision,0.994,0.995
recall,0.956,0.932
sensitivity,0.956,0.932
specificity,0.996,0.996
negative_predictive_value,0.972,0.959
f1_score,0.975,0.961
auc,0.976,0.964


In [5]:
# Perform the HPO to optimize the model hyperparameters inside a nested
# cross-validation: `outer_cv` and `inner_cv` are the two splitters handed to
# the routine and `search_space` defines the hyperparameters being explored.
cv_report = core.evalCrossValNestedHPO(
    X=X,
    y=y,
    model=model,
    search_space=search_space,
    outer_cv=gojo.util.getCrossValObj(cv=5, repeats=5, stratified=True, loocv=False, random_state=1997),
    inner_cv=gojo.util.getCrossValObj(cv=5, stratified=True, loocv=False, random_state=1997),
    # the sampler is seeded (same seed as the CV splitters) so the search no
    # longer starts from a fresh random state on every run
    hpo_sampler=optuna.samplers.TPESampler(n_startup_trials=100, seed=1997),
    hpo_n_trials=200,
    # the objective (f1_score) is a score, so it is not minimized
    minimization=False,
    transforms=transforms,
    metrics=core.getDefaultMetrics('binary_classification', bin_threshold=0.5),
    objective_metric='f1_score',
    verbose=1,
    save_train_preds=True,
    save_models=False,
    n_jobs=8
)



Performing cross-validation...: 25it [08:17, 19.90s/it]


In [6]:
# Column-wise mean of the train and test score tables of the nested-HPO run,
# rounded to 3 decimals and placed side by side; 'n_fold' is dropped.
scores = cv_report.getScores(
    core.getDefaultMetrics('binary_classification', bin_threshold=0.5)
)
results = pd.concat(
    [
        pd.DataFrame(scores[split].mean(axis=0)).round(decimals=3)
        for split in ('train', 'test')
    ],
    axis=1,
).drop(index=['n_fold'])
results.columns = ['Train', 'Test']
results

Unnamed: 0,Train,Test
accuracy,0.988,0.963
balanced_accuracy,0.987,0.959
precision,0.99,0.965
recall,0.982,0.943
sensitivity,0.982,0.943
specificity,0.993,0.976
negative_predictive_value,0.988,0.965
f1_score,0.985,0.952
auc,0.987,0.959


In [7]:
# custom aggregation function, used below as the optimization objective
def adocMetric(_scores):
    """Penalized F1 objective computed from train/test score tables.

    Parameters
    ----------
    _scores
        Mapping indexed by 'train' and 'test'; each entry must expose an
        'f1_score' column supporting ``.mean()`` and ``.std()``
        (presumably a pandas DataFrame -- confirm against the caller).

    Returns
    -------
    Mean test F1 minus two penalties: twice the absolute gap between the mean
    test and mean train F1, and twice the standard deviation of the test F1.
    """
    test_f1 = _scores['test']['f1_score']
    train_f1 = _scores['train']['f1_score']

    mean_test_f1 = test_f1.mean()
    # penalize the distance between train and test performance
    gap_penalty = 2 * abs(mean_test_f1 - train_f1.mean())
    # penalize unstable test performance
    spread_penalty = 2 * test_f1.std()

    return mean_test_f1 - (gap_penalty + spread_penalty)

# Nested cross-validation with HPO, this time passing the custom aggregation
# function `adocMetric` through `agg_function`.
cv_report2 = core.evalCrossValNestedHPO(
    X=X,
    y=y,
    model=model,
    search_space=search_space,
    outer_cv=gojo.util.getCrossValObj(cv=5, repeats=5, stratified=True, loocv=False, random_state=1997),
    inner_cv=gojo.util.getCrossValObj(cv=5, stratified=True, loocv=False, random_state=1997),
    # the sampler is seeded (same seed as the CV splitters) so the search no
    # longer starts from a fresh random state on every run
    hpo_sampler=optuna.samplers.TPESampler(n_startup_trials=100, seed=1997),
    hpo_n_trials=200,
    # the aggregated objective is a (penalized) score, so it is not minimized
    minimization=False,
    transforms=transforms,
    metrics=core.getDefaultMetrics('binary_classification', bin_threshold=0.5),
    objective_metric='f1_score',
    agg_function=adocMetric,
    verbose=1,
    save_train_preds=True,
    save_models=False,
    n_jobs=8
)

Performing cross-validation...: 25it [13:53, 33.32s/it]


In [8]:
# Column-wise mean of the train and test score tables of the custom-objective
# run, rounded to 3 decimals and placed side by side; 'n_fold' is dropped.
scores = cv_report2.getScores(
    core.getDefaultMetrics('binary_classification', bin_threshold=0.5)
)
results = pd.concat(
    [
        pd.DataFrame(scores[split].mean(axis=0)).round(decimals=3)
        for split in ('train', 'test')
    ],
    axis=1,
).drop(index=['n_fold'])
results.columns = ['Train', 'Test']
results

Unnamed: 0,Train,Test
accuracy,0.985,0.964
balanced_accuracy,0.984,0.96
precision,0.987,0.974
recall,0.976,0.938
sensitivity,0.976,0.938
specificity,0.991,0.981
negative_predictive_value,0.984,0.962
f1_score,0.981,0.953
auc,0.984,0.96


In [9]:
# print the built-in documentation of the report object returned by the
# nested-HPO run (its class description and available methods)
help(cv_report2)

Help on CVReport in module gojo.core.report object:

class CVReport(builtins.object)
 |  CVReport(raw_results: list, X_dataset: gojo.core.base.Dataset, y_dataset: gojo.core.base.Dataset, n_fold_key: str, pred_test_key: str, true_test_key: str, pred_train_key: str, true_train_key: str, test_idx_key: str, train_idx_key: str, trained_model_key: str, fitted_transforms_key: str)
 |  
 |  Object returned by the subroutines defined in 'gojo.core.loops' functions with the results of the
 |  cross validation.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, raw_results: list, X_dataset: gojo.core.base.Dataset, y_dataset: gojo.core.base.Dataset, n_fold_key: str, pred_test_key: str, true_test_key: str, pred_train_key: str, true_train_key: str, test_idx_key: str, train_idx_key: str, trained_model_key: str, fitted_transforms_key: str)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  addMetadata(self, **kwargs)
 |      Function used to add metadata to the report.