# Train and hyperparameter tune with RAPIDS

description: train and hyperparameter tune with RAPIDS, cuML, and hyperdrive

In [None]:
from azureml.core import Workspace

# Load the workspace via Workspace.from_config(); the bare `ws` on the last
# line makes the notebook display the resulting object.
# NOTE(review): from_config() presumably reads a local workspace config file —
# confirm one is available where this notebook runs.
ws = Workspace.from_config()
ws

In [None]:
import git
from pathlib import Path

# repository root: search upward from the current directory for the git repo
prefix = Path(
    git.Repo(".", search_parent_directories=True).working_tree_dir
)

# azure ml settings (environment / experiment names and the compute target)
environment_name = "rapids-airline-example"
experiment_name = "rapids-airline-hyperdrive-example"
compute_target = "gpu-V100-1"

# training script: the directory that is submitted and the entry script in it
script_dir = prefix / "code" / "models" / "rapids"
script_name = "train.py"

# dockerfile used to define the run environment
environment_file = prefix / "environments" / "rapids-example.dockerfile"

In [None]:
# Show the training script. Path.read_text() opens, reads and closes the file,
# so no file handle is left open (the bare open(...).read() never closed it).
print(script_dir.joinpath(script_name).read_text())

In [None]:
from azureml.core import ScriptRunConfig, Experiment, Environment, Dataset

# File dataset created from the wildcard URL below (everything under the
# airline-20m container path).
ds = Dataset.File.from_files(
    "https://airlinedataset.blob.core.windows.net/airline-20m/*"
)

# Command-line arguments handed to the training script via ScriptRunConfig.
# NOTE(review): "--n_bins" is spelled with an underscore while "--cv-folds"
# uses a hyphen — confirm both match the argument parser in train.py.
arguments = [
    "--data_dir",
    ds.as_mount(),  # the dataset is passed in mount mode, not as a literal path
    "--n_bins",
    32,
    "--compute",
    "single-GPU",
    "--cv-folds",
    1,
]

# Environment defined by the repository's dockerfile: Docker is enabled, the
# base image is cleared and the dockerfile is set in its place.
# NOTE(review): user_managed_dependencies = True presumably tells Azure ML to
# use the Python environment already inside the image — confirm.
env = Environment(environment_name)
env.docker.enabled = True
env.docker.base_image = None
env.docker.base_dockerfile = environment_file
env.python.user_managed_dependencies = True

# Run configuration tying together the script, its arguments, the environment
# and the compute target; it is reused as `run_config` for the HyperDrive sweep.
src = ScriptRunConfig(
    source_directory=script_dir,
    script=script_name,
    arguments=arguments,
    environment=env,
    compute_target=compute_target,
)

# Optional: uncomment to submit a single training run with the fixed arguments
# above, without hyperparameter tuning.
# run = Experiment(ws, experiment_name).submit(src)
# run

## Tune model hyperparameters

Let's define the hyperparameter space to sweep over. We will tune the `n_estimators`, `max_depth`, and `max_features` parameters. In this example, we use random sampling to try different hyperparameter configurations, with the goal of maximizing `Accuracy`.

In [None]:
!pip install --upgrade azureml-train

In [None]:
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import (
    choice,
    loguniform,
    uniform,
)

# Random-sampling search space. The keys are spelled like command-line flags
# of the training script; the values are the expressions to sample from
# (a discrete choice over an integer range, or a uniform float interval).
param_sampling = RandomParameterSampling(
    parameter_space={
        "--n_estimators": choice(range(50, 500)),
        "--max_depth": choice(range(5, 19)),
        "--max_features": uniform(0.2, 1.0),
    }
)

# Sweep configuration: reuse the single-run config `src`, maximize the
# "Accuracy" metric, and allow 5 runs in total with up to 5 at the same time.
hdc = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    primary_metric_name="Accuracy",
    max_concurrent_runs=5,
    max_total_runs=5,
)

# Submit the sweep to the experiment; the bare `run` displays it in the notebook.
run = Experiment(workspace=ws, name=experiment_name).submit(config=hdc)
run

In [None]:
from azureml.widgets import RunDetails

# Display the notebook widget for the submitted run.
RunDetails(run).show()

In [None]:
# Wait for the submitted run to complete; show_output=True asks the SDK to
# print the run's output while waiting.
run.wait_for_completion(show_output=True)

### Find the best model

In [None]:
# Pick the child run with the best value of the primary metric.
best_run = run.get_best_run_by_primary_metric()

# get_best_run_by_primary_metric() returns None when no child run has logged
# the primary metric (for example, when every child run failed). Fail with a
# clear message instead of an AttributeError on None further down.
if best_run is None:
    raise RuntimeError(
        "No child run reported the primary metric; "
        "check the child run logs before looking for a best run."
    )

# Show the arguments the best run was launched with, read from its run
# definition.
best_run.get_details()["runDefinition"]["arguments"]

List the model files uploaded during the run:

In [None]:
# Display the names of the files stored with the best run.
best_run.get_file_names()