# Basic Multi-Step Training Pipeline with Hyperparameter Tuning


This notebook covers the process of setting up a basic multi-step training pipeline that utilizes hyperparameter tuning within an Azure Machine Learning workspace.

Steps in this notebook include:

- Training Step (Hyperdrive step used for hyperparameter tuning)
- Evaluation Step
- Registration Step

This notebook is meant to serve as a guide or template for users who wish to do hyperparameter tuning within more complex multi-step pipelines.

## Import Dependencies

In [None]:
#Load dotenv extension

%load_ext dotenv
%dotenv

import os
import sys

sys.path.append("./")

from azureml.core import Environment, Workspace, Experiment, ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter, TrainingOutput
from azureml.pipeline.steps import PythonScriptStep, HyperDriveStep
from azureml.core.runconfig import RunConfiguration
from azureml.core.authentication import InteractiveLoginAuthentication

from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice

from src.common.attach_compute import get_compute
from src.common.get_datastores import get_blob_datastore

## Configure Workspace and Set Compute Target

In [None]:
# Authenticate interactively against Azure.
# NOTE(review): force=True presumably re-prompts even when cached credentials
# are still valid -- confirm against the SDK documentation.
interactive_auth = InteractiveLoginAuthentication(
    force=True,
)

Create a workspace object from the `config.json` file in the running directory, which you can [download from the Azure ML portal](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-environment#workspace):

In [None]:
# Restore AML workspace from config.json file (can be downloaded through the portal).
# The credentials obtained by the interactive login are passed explicitly so the
# workspace handle is bound to them rather than to whatever authentication the
# SDK would pick when no `auth` object is supplied.
ws = Workspace.from_config(auth=interactive_auth)

# Echo the workspace coordinates, one per line, to confirm the expected workspace was loaded
print(
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
    sep='\n',
)

Create a compute target for our training job to run on, using the script helper `get_compute` imported from the `attach_compute.py` file and the information provided within the `.env` file:

In [None]:
# Cluster settings are read from the environment (loaded from .env by the dotenv
# extension). The cluster name and VM size have no fallback here; the remaining
# settings fall back to the defaults given below when the variable is not set.
cluster_settings = {
    "compute_name": os.getenv("AML_CLUSTER_NAME"),
    "vm_size": os.getenv("AML_CLUSTER_CPU_SKU"),
    "vm_priority": os.getenv("AML_CLUSTER_PRIORITY", 'lowpriority'),
    "min_nodes": int(os.getenv("AML_CLUSTER_MIN_NODES", 0)),
    "max_nodes": int(os.getenv("AML_CLUSTER_MAX_NODES", 4)),
    "scale_down": int(os.getenv("AML_CLUSTER_SCALE_DOWN", 600)),
}

# Set compute target through the get_compute helper
compute_target = get_compute(workspace=ws, **cluster_settings)

## Configure Datastores 

Create storage containers where data will be accessed, using the script helper `get_blob_datastore` imported from the `get_datastores.py` file and the information provided within the `.env` file:

In [None]:
# Root datastore: the blob storage connection settings are read from the environment
root_datastore = get_blob_datastore(
    ws,
    os.getenv("BLOB_DATASTORE_NAME"),
    os.getenv("STORAGE_NAME"),
    os.getenv("STORAGE_KEY"),
    os.getenv("STORAGE_CONTAINER"),
)

# Data reference on the root datastore, mounted for the steps that take it as input
root_dir = DataReference(
    data_reference_name="data_reference",
    datastore=root_datastore,
    mode="mount",
)

# Pipeline data that holds model info passed between intermediate pipeline steps
model_info_dir = PipelineData(
    name="model_info_dir",
    datastore=root_datastore,
    output_mode='mount',
    output_overwrite=True,
)

# Hyperdrive-specific output (one file only): the model info file taken from
# the training output of type "Model"
best_model_output = TrainingOutput("Model", model_file="outputs/model/model_info.json")
model_info_best = PipelineData(
    name="model_info_best",
    datastore=root_datastore,
    output_mode='mount',
    training_output=best_model_output,
)

## Define Pipeline Parameters

In [None]:
# Example of a pipeline parameter: the value can be overridden per pipeline
# submission, and "basic_model" is used when no value is supplied.
model_name = PipelineParameter(
    name="model_name",
    default_value="basic_model",
)

## Define Custom Environment 

Create the Azure ML environment that encapsulates our training script's dependencies:

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Pinned pip packages required by the pipeline scripts
pinned_pip_packages = [
    "azureml==0.2.7",
    "azureml-core==1.15.0",
    "click==7.0",
    "numpy==1.18.5",
    "pandas==1.1.3",
    "Pillow==7.2.0",
    "tqdm==4.61.0",
]

# Build the dependency specification and pin the Python version it will use
batch_conda_deps = CondaDependencies.create(pip_packages=pinned_pip_packages)
batch_conda_deps.set_python_version('3.8.6')

# Create the environment, enable Docker and attach the dependency specification
batch_env = Environment(name="train-env")
batch_env.docker.enabled = True
batch_env.python.conda_dependencies = batch_conda_deps

# Run configuration shared by the script steps; it carries the environment above
run_config = RunConfiguration()
run_config.environment = batch_env

## Configure and instantiate pipeline steps

Create and configure Training pipeline steps:

In [None]:
# Directory handed to every step as its source_directory
source_directory = "./"

# Configuration of a single training run: which script to execute, where to
# run it, and in which environment
train_step_config = ScriptRunConfig(
    source_directory=source_directory,
    script="src/pipeline/train.py",
    compute_target=compute_target,
    environment=batch_env,
)

# Search space for the sweep: in this basic case only the --initial_lr
# argument varies, over three candidate values
search_space = {"--initial_lr": choice(3e-5, 1e-5, 1e-4)}

# Grid sampling over the search space
ps = GridParameterSampling(search_space)

# Hyperdrive configuration: wraps the training run configuration with the
# sampling method and selects runs by the highest 'accuracy' metric.
# NOTE(review): the grid above has three values while max_total_runs is 4 --
# confirm whether the extra run budget is intentional.
hd_config = HyperDriveConfig(
    run_config=train_step_config,
    hyperparameter_sampling=ps,
    primary_metric_name='accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=4,
)

# Training step: a HyperDriveStep that runs the hyperparameter sweep defined by
# hd_config. The script receives the data root plus the two output locations
# as command-line arguments.
train_step = HyperDriveStep(
    name="train step",
    hyperdrive_config=hd_config,
    inputs=[root_dir],
    outputs=[model_info_dir, model_info_best],
    estimator_entry_script_arguments=[
        "--root_dir", root_dir,
        "--model_info_dir", model_info_dir,
        "--model_info_best", model_info_best,
    ],
    allow_reuse=False,
)

# Evaluation step: consumes the data root and both outputs of the training step
eval_step = PythonScriptStep(
    name="eval step",
    source_directory=source_directory,
    script_name="src/pipeline/evaluate.py",
    compute_target=compute_target,
    runconfig=run_config,
    inputs=[root_dir, model_info_dir, model_info_best],
    outputs=[],
    arguments=[
        "--root_dir", root_dir,
        "--model_info_dir", model_info_dir,
        "--model_info_best", model_info_best,
    ],
    allow_reuse=False,
)

# Explicitly order evaluation after training
eval_step.run_after(train_step)

# Registration step: receives the data root, the model_name pipeline parameter
# and the model info directory
register_step = PythonScriptStep(
    name="register step",
    source_directory=source_directory,
    script_name="src/pipeline/register.py",
    compute_target=compute_target,
    runconfig=run_config,
    inputs=[root_dir, model_info_dir],
    outputs=[],
    arguments=[
        "--root_dir", root_dir,
        "--model_name", model_name,
        "--model_info_dir", model_info_dir,
    ],
    allow_reuse=False,
)

# Explicitly order registration after evaluation
register_step.run_after(eval_step)

## Configure and publish pipeline to AML

In [None]:
# Assemble the pipeline from the three steps defined above
pipeline_steps = [train_step, eval_step, register_step]
training_pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

# Check that the pipeline is consistent before publishing it
training_pipeline.validate()

# Publish the pipeline to the workspace
published_pipeline = training_pipeline.publish(
    name="hyperdrive_training_pipeline",
    description="Hyperdrive training pipeline experiment",
)

## Submit and run pipeline in AML

In [None]:
# Submit the pipeline under the named experiment
experiment = Experiment(workspace=ws, name='Reza-hyperdrive-training')
pipeline_run = experiment.submit(training_pipeline)

# Block until the pipeline run has finished
pipeline_run.wait_for_completion()