# Introduction to Optuna

## Install required packages

In [None]:
!pip install --upgrade lightgbm scikit-learn optuna plotly dask dask_optuna joblib gitpython

## Set up Azure ML MLflow tracking

In [None]:
import mlflow
from azureml.core import Workspace

# Load the Azure ML workspace. from_config() is called without arguments, so
# the workspace details must come from a configuration it can discover on its
# own (NOTE(review): presumably a config.json near this notebook — confirm).
ws = Workspace.from_config()
# Point MLflow at the tracking URI reported by the workspace.
# NOTE(review): this configures the current kernel process; confirm that the
# Dask workers used later to run `objective` log to the same tracking URI.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
# Select the experiment under which subsequent MLflow runs are recorded.
mlflow.set_experiment("intro-to-optuna-tutorial")

In [None]:
import git
from pathlib import Path

# Repository root: search upward from the current directory for the git repo
# and take its working tree directory.
prefix = Path(git.Repo(".", search_parent_directories=True).working_tree_dir)

# Iris CSV, addressed relative to the repository root.
data_path = prefix / "data" / "raw" / "iris" / "iris.csv"

## Define an objective function

In [None]:
def objective(trial):
    """Train one LightGBM model for an Optuna trial and return its test log loss.

    The iris CSV at the module-level ``data_path`` is loaded, split 80/20
    (fixed ``random_state=42``), and a multiclass LightGBM model is trained
    with hyperparameters sampled from ``trial``. Training is wrapped in an
    MLflow run with LightGBM autologging enabled; the held-out log loss and
    accuracy are additionally logged as metrics.

    Args:
        trial: The Optuna trial used to sample ``boosting``,
            ``num_iterations``, ``num_leaves``, ``learning_rate`` and
            ``seed``.

    Returns:
        The multiclass log loss on the held-out test split (lower is better).
    """
    # Imports are local to the function (presumably so it is self-contained
    # when shipped to parallel workers — confirm before hoisting them).
    import mlflow
    import mlflow.lightgbm
    import pandas as pd
    import lightgbm as lgb
    from sklearn.metrics import accuracy_score, log_loss
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split

    # prepare train and test data
    iris = pd.read_csv(data_path)
    enc = LabelEncoder()
    X = iris.drop("species", axis=1)
    y = enc.fit_transform(iris.species)
    # LabelEncoder maps the species to 0 .. n_classes - 1; keep the full list
    # so the class count and the metric below do not rely on a hard-coded 3.
    class_ids = list(range(len(enc.classes_)))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    train_set = lgb.Dataset(X_train, label=y_train)

    # set training parameters
    params = {
        "objective": "multiclass",
        "num_class": len(class_ids),
        "boosting": trial.suggest_categorical(
            "boosting", ["gbdt", "dart", "goss"]
        ),
        # Number of boosting rounds. LightGBM gives this entry precedence over
        # the `num_boost_round` argument of `lgb.train`, so it is the only
        # place the round count is sampled.
        "num_iterations": trial.suggest_int("num_iterations", 10, 200),
        "num_leaves": trial.suggest_int("num_leaves", 8, 128),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform distribution.
        "learning_rate": trial.suggest_float(
            "learning_rate", 0.001, 0.9, log=True
        ),
        "metric": "multi_logloss",
        "colsample_bytree": 1.0,
        "subsample": 1.0,
        "seed": trial.suggest_int("seed", 0, 42),
    }

    # start run
    with mlflow.start_run():

        # enable automatic logging
        mlflow.lightgbm.autolog()

        # train the lightgbm model; the round count comes from
        # params["num_iterations"], so no separate `num_boost_round` is
        # sampled (it would be ignored and only add a dead search dimension)
        model = lgb.train(
            params,
            train_set,
            valid_sets=[train_set],
        )

        # evaluate model
        y_proba = model.predict(X_test)
        y_pred = y_proba.argmax(axis=1)
        # Pass the full label set so the loss is well-defined even if a class
        # happens to be absent from the test split.
        loss = log_loss(y_test, y_proba, labels=class_ids)
        acc = accuracy_score(y_test, y_pred)

        # log metrics from evaluation
        mlflow.log_metric("log_loss", loss)
        mlflow.log_metric("accuracy", acc)

        return loss

## Define and run study

In [None]:
from dask.distributed import Client

# Create a Dask distributed client. No scheduler address is passed
# (NOTE(review): this should start a local cluster — confirm against the
# Dask documentation for the installed version).
c = Client()
# Bare expression so the notebook renders the client as the cell output.
c

In [None]:
import joblib
import optuna
from optuna.samplers import TPESampler
from dask_optuna import DaskStorage

# Create the study that drives the hyperparameter search.
study = optuna.create_study(
    study_name="intro-to-optuna-tutorial",
    # `objective` returns the test log loss, so lower values are better.
    direction="minimize",
    sampler=TPESampler(),
    # Trial storage provided by dask_optuna (NOTE(review): presumably bound to
    # the Dask client created in the previous cell — confirm).
    storage=DaskStorage(),
)

# Run 20 trials of `objective` inside joblib's "dask" backend context, with
# n_jobs=-1 requesting as much parallelism as available.
# NOTE(review): whether study.optimize honours the joblib backend depends on
# the installed Optuna version — confirm the trials actually run on Dask.
with joblib.parallel_backend("dask"):
    study.optimize(objective, n_trials=20, n_jobs=-1)

## View results

In [None]:
# Display the study's trials in tabular (DataFrame) form.
study.trials_dataframe()

In [None]:
# Best objective value found; the study minimizes the log loss returned by `objective`.
study.best_value

In [None]:
# Display the trial that achieved the best objective value.
study.best_trial

In [None]:
# Parallel-coordinate plot of the study's trials.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Plot the estimated importance of each hyperparameter for the objective.
optuna.visualization.plot_param_importances(study)

In [None]:
# Plot how the objective value evolved over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Contour plot of the study's hyperparameters against the objective value.
optuna.visualization.plot_contour(study)

In [None]:
# Plot the empirical distribution function (EDF) of the study's objective values.
optuna.visualization.plot_edf(study)