In [None]:
import optuna
from optuna.samplers import TPESampler

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    train_test_split,
    cross_val_score,
)
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import mlflow
from sklearn.metrics import log_loss, make_scorer

from datetime import datetime
import time

# Global variables

All the global variables below are meant to be overridden by passing parameters to the notebook. If a parameter is not supplied, its default value is used instead.

In [5]:
def get_study_name(now=None, model_name="LogisticRegression"):
    """Build a timestamped study name.

    Parameters
    ----------
    now : datetime.datetime, optional
        Timestamp to embed in the name. Defaults to ``datetime.now()`` at
        call time; passing it explicitly makes the name deterministic.
    model_name : str, optional
        Suffix identifying the model being tuned.

    Returns
    -------
    str
        A name of the form ``YYYY-MM-DD_HH:MM:SS_<model_name>``.
    """
    if now is None:
        now = datetime.now()
    # strftime zero-pads month, day, hour, minute and second for us.
    return f"{now.strftime('%Y-%m-%d_%H:%M:%S')}_{model_name}"
    
# --- Reproducibility ---------------------------------------------------
seed = 1010

# --- K-fold construction -----------------------------------------------
n_splits = 5
shuffle = True

# --- Optuna study ------------------------------------------------------
study_name = get_study_name()
n_startup_trials = 3
n_trials = 5
n_jobs = 5
callbacks = False

# --- Model -------------------------------------------------------------
max_iter = 100
penalty = "elasticnet"


# X and y are meant to be supplied as notebook arguments; the iris dataset
# is only a stand-in.
# TODO: add an if clause to collect X and y
X, y = load_iris(return_X_y=True)

# Hold out 30% of the samples, keeping the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=seed,
    stratify=y,
)

# Setup the dataset

## Create the stratified K-fold split indices

In [8]:
# Stratified K-fold splitter used for cross-validation.
# scikit-learn raises a ValueError when random_state is set while
# shuffle is False, so the seed is only forwarded when shuffling is on.
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=shuffle,
    random_state=seed if shuffle else None,
)
# Generator of (train indices, test indices) pairs over the training set.
skf_generator = skf.split(X_train, y_train)

# Model structure

In [9]:
# Hyperparameters scope
def get_model_trial(trial=None):
    """Assemble the keyword arguments for ``LogisticRegression``.

    Parameters
    ----------
    trial : optuna.trial.Trial, optional
        When given, ``C`` (and ``l1_ratio`` for an elasticnet penalty) are
        sampled from the trial. When ``None``, only the fixed parameters
        are returned.

    Returns
    -------
    dict
        Fixed parameters (``penalty``, ``solver``, ``max_iter``,
        ``random_state``) merged with the sampled hyperparameters, if any.
        Reads the notebook globals ``penalty``, ``max_iter`` and ``seed``.
    """
    fixed_params = {
        "penalty": penalty,
        # lbfgs cannot fit an l1 penalty; saga handles both l1 and elasticnet.
        "solver": "saga" if penalty in ("elasticnet", "l1") else "lbfgs",
        "max_iter": max_iter,
        # saga shuffles the data, so seed it to keep fits reproducible.
        "random_state": seed,
    }

    HP_params = dict()
    # Explicit None check: do not rely on the truthiness of the trial object.
    if trial is not None:
        # Hyperparameters scope
        HP_params = {
            "C": trial.suggest_float(name="C", low=1, high=10),
            # l1_ratio is only searched for an elasticnet penalty.
            "l1_ratio": (
                trial.suggest_float(name="l1_ratio", low=0, high=1)
                if penalty == "elasticnet"
                else None
            ),
        }
    return fixed_params | HP_params

# The study

In [10]:
# Set Optuna's logging verbosity to the WARNING level.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [11]:
# Reuse the `study_name` defined with the other global variables instead of
# regenerating it here, so a name supplied as a notebook parameter is kept.
# The sampler is seeded with the global `seed`; note that running trials in
# parallel (n_jobs > 1) can still make results differ between runs.
study = optuna.create_study(
    study_name=study_name,
    sampler=TPESampler(n_startup_trials=n_startup_trials, seed=seed),
    direction="maximize",
)

# ConvergenceWarning is silenced for everything executed inside the objective.
@ignore_warnings(category=ConvergenceWarning)
def objective(trial):
    """Score one trial's hyperparameters by cross-validation.

    Builds a ``LogisticRegression`` from ``get_model_trial(trial)``,
    evaluates it on ``X_train`` / ``y_train`` with the ``skf`` splitter
    using the ``neg_log_loss`` scorer, and returns the mean of the fold
    scores minus their standard deviation.
    """
    model = LogisticRegression(**get_model_trial(trial))
    fold_scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring="neg_log_loss",
        cv=skf,
    )
    spread = fold_scores.std()
    return fold_scores.mean() - spread

# Run the search: `n_trials` evaluations of `objective` with `n_jobs` parallel jobs.
study.optimize(
    func=objective,
    n_jobs=n_jobs,
    n_trials=n_trials,
)

# experiment_id = mlflow.create_experiment(study_name)
# with mlflow.start_run(experiment_id=experiment_id, nested=True ):
    # mlflow.log_artifact() --> To store crossvalidation result
    # pass

