# Introduction to hyperparameter optimization (HPO) with Optuna

In [None]:
!pip install --upgrade "dask-cloudprovider[azure]" lightgbm optuna dask_optuna

In [1]:
from azureml.core import Workspace

# Obtain the Azure ML workspace handle via Workspace.from_config() and display
# it as the cell output. `ws` is used by later cells (datastore, cluster, MLflow).
ws = Workspace.from_config()
ws

Workspace.create(name='AzureML', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-eastus-rg')

In [2]:
import git
from pathlib import Path

# Repository root: search upward from the current directory for the git repo
# and take its working-tree directory.
prefix = Path(
    git.Repo(".", search_parent_directories=True).working_tree_dir
)

# Local location of the raw iris CSV inside the repository.
data_path = prefix / "data" / "raw" / "iris" / "iris.csv"

In [3]:
# Default datastore of the workspace; later cells read its container/account info.
ds = ws.get_default_datastore()

# Upload the local CSV under datasets/iris, replacing any existing copy.
# The call is the last expression so its return value is shown as cell output.
ds.upload_files(
    [str(data_path)],
    overwrite=True,
    target_path="datasets/iris",
)

Uploading an estimated of 1 files
Uploading /Users/cody/code/azureml-examples/data/raw/iris/iris.csv
Uploaded /Users/cody/code/azureml-examples/data/raw/iris/iris.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_c6c3d0982e134df2a236151e8d9f8d4c

In [4]:
# Container name of the datastore; used to build the az:// URL of the dataset.
container_name = ds.container_name

# Options handed to dask's read_csv as `storage_options`.
# NOTE(review): this dict holds the storage account key in plain text; avoid
# displaying it in a cell output.
storage_options = dict(
    account_name=ds.account_name,
    account_key=ds.account_key,
)

In [5]:
# Point data_path at the copy uploaded to the datastore (az:// URL) and display it.
# NOTE: this rebinds data_path from the local Path defined earlier to a URL
# string, so re-running the upload cell after this one would pass the URL
# (not the local file) to upload_files.
data_path = f"az://{container_name}/datasets/iris/iris.csv"
data_path

'az://azureml-blobstore-b0304e68-b406-492f-90b6-ae4897ecdb33/datasets/iris/iris.csv'

In [6]:
import dask.dataframe as dd

# Read the CSV from the az:// URL with Dask, authenticating via storage_options,
# and call .compute() to materialize the result.
df = dd.read_csv(data_path, storage_options=storage_options).compute()

In [7]:
from azureml.core import Environment
from dask.distributed import Client
from dask_cloudprovider import AzureMLCluster

# Azure ML environment named "dask-tutorial", built from the conda
# specification at environments/dask.yml under the repository root.
env = Environment.from_conda_specification(
    "dask-tutorial",
    prefix / "environments" / "dask.yml",
)

# Dask cluster provisioned through Azure ML in workspace `ws`.
# NOTE(review): scheduler_idle_timeout is presumably in seconds; confirm.
cluster = AzureMLCluster(
    ws,
    environment_definition=env,
    vm_size="STANDARD_DS5_V2",
    initial_node_count=10,
    scheduler_idle_timeout=1200,
)

# Connect a distributed client to the cluster and display it as the cell output.
c = Client(cluster)
c

....................................................



0,1
Client  Scheduler: tcp://localhost:9002  Dashboard: http://localhost:9001,Cluster  Workers: 1  Cores: 16  Memory: 59.10 GB


In [8]:
# imports
import os
import time
import mlflow
import argparse

import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [9]:
# define functions
def preprocess_data(df):
    """Split a labelled frame into train/test features and encoded targets.

    The ``"species"`` column is the target: it is removed from the feature
    frame and converted with a freshly fitted ``LabelEncoder``. The data is
    then split with ``test_size=0.2`` and ``random_state=42``.

    Parameters
    ----------
    df : frame containing a ``"species"`` column; all other columns are
        used as features.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, enc)`` where ``enc`` is the
        fitted ``LabelEncoder``.
    """
    label_encoder = LabelEncoder()
    features = df.drop(columns=["species"])
    targets = label_encoder.fit_transform(df["species"])

    # train_test_split yields X_train, X_test, y_train, y_test in that order.
    splits = train_test_split(features, targets, test_size=0.2, random_state=42)

    return (*splits, label_encoder)


def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):
    """Train a LightGBM model and measure how long training took.

    Parameters
    ----------
    params : dict
        LightGBM parameters, passed unchanged to ``lgb.train``.
    num_boost_round : int
        Passed to ``lgb.train`` as ``num_boost_round``.
    X_train, y_train
        Training features and labels, wrapped in an ``lgb.Dataset``.
    X_test, y_test
        Held-out features and labels, wrapped in an ``lgb.Dataset`` and
        supplied as the single validation set named ``"test"``.

    Returns
    -------
    tuple
        ``(model, elapsed_seconds)``: the object returned by ``lgb.train``
        and the elapsed time covering dataset creation and training.
    """
    # perf_counter is a monotonic clock intended for measuring durations;
    # wall-clock time.time() can jump when the system clock is adjusted,
    # which could yield a wrong (even negative) training time.
    started = time.perf_counter()

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[test_data],
        valid_names=["test"],
    )

    elapsed = time.perf_counter() - started

    return model, elapsed


def evaluate_model(model, X_test, y_test):
    """Compute log loss and accuracy of a multiclass model on held-out data.

    Parameters
    ----------
    model
        Object whose ``predict(X_test)`` returns a 2-D array of per-class
        scores (one row per sample, one column per class).
    X_test
        Held-out features passed to ``model.predict``.
    y_test
        True labels. Assumed to be integer class ids ``0..K-1`` matching the
        column order of the prediction matrix (as produced by the
        ``LabelEncoder`` in ``preprocess_data``).

    Returns
    -------
    tuple
        ``(loss, acc)``: the ``log_loss`` and ``accuracy_score`` values.
    """
    y_proba = model.predict(X_test)
    y_pred = y_proba.argmax(axis=1)

    # Tell log_loss the full label set explicitly. Without it, a test split
    # that happens to miss one class makes log_loss raise because y_test and
    # y_proba disagree on the number of classes.
    class_labels = list(range(y_proba.shape[1]))
    loss = log_loss(y_test, y_proba, labels=class_labels)
    acc = accuracy_score(y_test, y_pred)

    return loss, acc

  and should_run_async(code)


In [10]:
# define an objective for optuna to optimize
def objective(trial):
    """Optuna objective: train and score one LightGBM configuration.

    Samples hyperparameters from ``trial``, reads the dataset from the
    notebook-level ``data_path`` (using ``storage_options``), trains a
    multiclass LightGBM model inside an MLflow run and returns its loss on
    the held-out split.

    Relies on notebook globals: ``ws``, ``mlflow``, ``dd``, ``data_path``,
    ``storage_options``, ``preprocess_data``, ``train_model`` and
    ``evaluate_model``.

    Parameters
    ----------
    trial
        Optuna trial used to sample ``num_iterations``, ``boosting``,
        ``num_leaves`` and ``learning_rate``.

    Returns
    -------
    float or None
        The log loss to minimize, or ``None`` when any step raised an
        ``Exception`` (the error is logged with its traceback so the failure
        is visible instead of being silently discarded).
    """
    import logging

    try:
        mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
        mlflow.set_experiment("hpo-with-optuna-tutorial")

        with mlflow.start_run():
            mlflow.lightgbm.autolog()

            # LightGBM treats `num_boost_round` as an alias of
            # `num_iterations`, and a value found in `params` overrides the
            # train() argument. Sampling both therefore added a search
            # dimension that never affected training. Sample the number of
            # rounds once (keeping the range that was actually in effect)
            # and pass it only as the train() argument.
            num_boost_round = trial.suggest_int("num_iterations", 10, 100)
            params = {
                "objective": "multiclass",
                "num_class": 3,
                "boosting": trial.suggest_categorical(
                    "boosting", ["gbdt", "dart", "goss"]
                ),
                "num_leaves": trial.suggest_int("num_leaves", 15, 63),
                # Log-uniform over [1e-4, 0.1]; suggest_float(log=True)
                # replaces the deprecated suggest_loguniform.
                "learning_rate": trial.suggest_float(
                    "learning_rate", 1e-4, 0.1, log=True
                ),
                "metric": "multi_logloss",
                "verbose": 0,
            }

            # read in dataset
            df = dd.read_csv(
                data_path, storage_options=storage_options
            ).compute()

            # preprocess data
            X_train, X_test, y_train, y_test, enc = preprocess_data(df)

            # train model
            model, train_time = train_model(
                params, num_boost_round, X_train, X_test, y_train, y_test
            )
            mlflow.log_metric("training_time", train_time)

            # evaluate model
            loss, acc = evaluate_model(model, X_test, y_test)
            mlflow.log_metrics({"loss": loss, "accuracy": acc})

            return loss
    except Exception:
        # Keep the best-effort contract (a failed trial yields None) but
        # record why it failed; KeyboardInterrupt/SystemExit now propagate.
        logging.getLogger(__name__).exception(
            "Optuna trial %s failed", getattr(trial, "number", "?")
        )
        return None

In [13]:
%%time
import joblib
import optuna
import dask_optuna

# Dask-backed Optuna storage (from dask_optuna) and a TPE sampler for the study.
storage = dask_optuna.DaskStorage()
sampler = optuna.samplers.TPESampler()

# The study minimizes the value returned by `objective`.
study = optuna.create_study(
    storage=storage,
    sampler=sampler,
    study_name="aml-tutorial",
    direction="minimize",
)

# Run 16 trials through joblib's "dask" backend; n_jobs=-1 leaves the degree
# of parallelism to the backend.
with joblib.parallel_backend("dask"):
    study.optimize(objective, n_jobs=-1, n_trials=16)

26b4407a28e4cfc174cc1dc'], 'restrictions': {}, 'loose_restrictions': [], 'priority': {'batch_of__reseed_and_optimize_sequential_1_calls-78a93d1c726b4407a28e4cfc174cc1dc': 0}, 'user_priority': 0, 'resources': None, 'submitting_task': None, 'retries': None, 'fifo_timeout': '100 ms', 'actors': False}]
distributed.comm.utils - ERROR - ('Could not serialize object of type tuple.', "(<function apply at 0x7fab70a99ca0>, batch_of__reseed_and_optimize_sequential_1_calls, [], {'tasks': [(<bound method Study._reseed_and_optimize_sequential of <optuna.study.Study object at 0x7fab1d1d7370>>, [<function objective at 0x7fab1cd29430>, 1, None, (), None, False, datetime.datetime(2020, 10, 11, 10, 57, 15, 148619)], {})]})")
Traceback (most recent call last):
  File "/Users/cody/miniconda3/envs/dkdc/lib/python3.8/site-packages/distributed/comm/utils.py", line 34, in _to_frames
    protocol.dumps(
  File "/Users/cody/miniconda3/envs/dkdc/lib/python3.8/site-packages/distributed/protocol/core.py", line 50, 

TypeError: 'CancelledError' object is not iterable