# Hyperparameter Optimization (HPO) with Dask and Optuna, locally

In [None]:
!pip install --upgrade lightgbm dask_optuna optuna

In [None]:
from azureml.core import Workspace

# Load the Azure ML workspace handle; `objective` later asks it for the MLflow
# tracking URI.
# NOTE(review): from_config() is called with no arguments, so it presumably
# looks for a local workspace config file — confirm one is present.
ws = Workspace.from_config()
# Bare expression so the notebook displays the workspace object.
ws

In [None]:
import git
from pathlib import Path

# get root of git repo: search upward from the current directory for the
# enclosing repository and keep its working-tree directory as a Path, so the
# data path built from `prefix` does not depend on where the notebook is run
prefix = Path(git.Repo(".", search_parent_directories=True).working_tree_dir)
# bare expression so the notebook displays the resolved path
prefix

In [None]:
# imports
import os
import time
import mlflow
import argparse

import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# define functions
def preprocess_data(df):
    """Turn the raw data frame into encoded train/test splits.

    The ``species`` column is the target; every other column is a feature.
    Targets are integer-encoded with a ``LabelEncoder`` and the data is split
    80/20 with a fixed random seed.

    Args:
        df: DataFrame containing a ``species`` column plus feature columns.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, enc)`` where ``enc`` is the
        fitted ``LabelEncoder`` used to encode the targets.
    """
    enc = LabelEncoder()
    features = df.drop(columns=["species"])
    targets = enc.fit_transform(df["species"])

    # fixed seed keeps the split identical across trials
    splits = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )
    X_train, X_test, y_train, y_test = splits

    return X_train, X_test, y_train, y_test, enc


def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):
    """Train a LightGBM booster and measure how long training took.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``.
            NOTE: if it contains ``num_iterations`` (an alias of
            ``num_boost_round``), LightGBM uses that value instead of the
            ``num_boost_round`` argument.
        num_boost_round: Number of boosting rounds.
        X_train: Training features.
        X_test: Held-out features, used as the validation set named "test".
        y_train: Training labels.
        y_test: Held-out labels.

    Returns:
        Tuple ``(model, elapsed_seconds)``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[test_data],
        valid_names=["test"],
    )
    elapsed = time.perf_counter() - start

    return model, elapsed


def evaluate_model(model, X_test, y_test):
    """Compute log loss and accuracy of a multiclass model on held-out data.

    Args:
        model: Trained model whose ``predict`` returns one row of class
            probabilities per sample (2-D, one column per class).
        X_test: Held-out features.
        y_test: Held-out labels, integer-encoded as ``0 .. n_classes - 1``
            (the column index of the matching probability).

    Returns:
        Tuple ``(loss, acc)``.
    """
    y_proba = model.predict(X_test)
    y_pred = y_proba.argmax(axis=1)

    # Give log_loss the full label set explicitly. Otherwise it infers the
    # labels from y_test and fails when a class is absent from the test split,
    # because the probability matrix then has more columns than inferred labels.
    labels = list(range(y_proba.shape[1]))
    loss = log_loss(y_test, y_proba, labels=labels)
    acc = accuracy_score(y_test, y_pred)

    return loss, acc

In [None]:
# define an objective for optuna to optimize
def objective(trial):
    """Run one Optuna trial: sample parameters, train, evaluate, log to MLflow.

    Relies on the notebook globals ``ws`` (Azure ML workspace, for the MLflow
    tracking URI) and ``prefix`` (repository root, for locating the dataset).

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        The log loss on the held-out split, or ``None`` if the trial raised an
        exception. The failure is logged, and returning ``None`` lets the study
        carry on with the remaining trials instead of aborting.
    """
    import logging

    try:
        mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
        mlflow.set_experiment("using-dask-hpo-with-optuna-local-tutorial")
        # start mlflow run
        with mlflow.start_run():
            # enable autologging
            mlflow.lightgbm.autolog()

            # generate parameters
            # The number of boosting rounds is tuned only through
            # `num_boost_round`. Also putting its alias `num_iterations` in
            # `params` would make LightGBM ignore `num_boost_round`, leaving
            # Optuna to tune a parameter that has no effect.
            num_boost_round = trial.suggest_int("num_boost_round", 1, 100)
            params = {
                "objective": "multiclass",
                "num_class": 3,
                "boosting": trial.suggest_categorical(
                    "boosting", ["gbdt", "dart", "goss"]
                ),
                "num_leaves": trial.suggest_int("num_leaves", 15, 63),
                # "num_threads": trial.suggest_categorical("num_threads", [1, 2, 4]),
                # log-uniform search; suggest_float(log=True) replaces the
                # deprecated suggest_loguniform
                "learning_rate": trial.suggest_float(
                    "learning_rate", 1e-4, 0.1, log=True
                ),
                "metric": "multi_logloss",
                # "seed": trial.suggest_categorical("seed", [1, 3, 5, 7, 11, 13, 42]),
                "verbose": 0,
            }

            # read in dataset
            df = pd.read_csv(
                prefix.joinpath("data", "raw", "iris", "iris.csv")
            )

            # preprocess data
            X_train, X_test, y_train, y_test, enc = preprocess_data(df)

            # train model
            model, train_time = train_model(
                params, num_boost_round, X_train, X_test, y_train, y_test
            )
            mlflow.log_metric("training_time", train_time)

            # evaluate model
            loss, acc = evaluate_model(model, X_test, y_test)
            mlflow.log_metrics({"loss": loss, "accuracy": acc})

            return loss
    except Exception:
        # Best effort: one failing trial must not stop the whole study, but the
        # cause is recorded instead of being swallowed silently.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        logging.getLogger(__name__).exception(
            "Trial %s failed", getattr(trial, "number", "?")
        )
        return None

In [None]:
%%time

import optuna

# Baseline run without Dask: a study that minimizes the value returned by
# `objective` (the log loss) over 8 trials, created without an explicit
# sampler or storage.
# NOTE(review): n_jobs=-1 should let Optuna size the parallelism from the CPU
# count — confirm against the installed Optuna version.
study = optuna.create_study(direction="minimize", study_name="test")
study.optimize(objective, n_trials=8, n_jobs=-1)

In [None]:
# Hyperparameter values of the best trial in the current `study`.
study.best_params

In [None]:
%%time

import joblib
import optuna
import dask_optuna
from dask.distributed import Client

# Create a Dask client with default arguments and print it together with the
# URL of its dashboard.
# NOTE(review): with no arguments this presumably starts a local cluster on
# this machine — confirm.
c = Client()
print(c)
print(c.dashboard_link)

# Build a new study (rebinding `study`) with an explicit TPE sampler and a
# Dask-backed storage.
# NOTE(review): DaskStorage() is given no client, so it presumably picks up the
# one created above — keep the client creation before this line.
sampler = optuna.samplers.TPESampler()
storage = dask_optuna.DaskStorage()
study = optuna.create_study(
    direction="minimize",
    study_name="aml-tutorial",
    sampler=sampler,
    storage=storage,
)
# Run 500 trials while joblib's "dask" backend is active.
# NOTE(review): this presumably dispatches Optuna's parallel jobs to the Dask
# cluster — confirm.
with joblib.parallel_backend("dask"):
    study.optimize(objective, n_trials=500, n_jobs=-1)

In [None]:
# Hyperparameter values of the best trial in the current `study`
# (the Dask-backed one, assuming the cells were run in order).
study.best_params

In [None]:
# Objective value of the best trial, i.e. the lowest log loss returned by
# `objective` (the study direction is "minimize").
study.best_value

In [None]:
# Plot how the objective value evolved over the trials of `study`.
# NOTE(review): Optuna's default visualization backend presumably needs plotly,
# which the install cell does not list — confirm it is available.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Plot the estimated importance of each sampled hyperparameter for the
# objective value of `study`.
optuna.visualization.plot_param_importances(study)

In [None]:
# Parallel-coordinate plot of the sampled hyperparameters against the
# objective value, one line per trial of `study`.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Collect the trials of `study` into a DataFrame (bound to the notebook-level
# name `df`) and preview the first rows.
df = study.trials_dataframe()
df.head()