## Set up your development environment

Make sure you have installed the azureml SDK by running "install.bat".


### Import packages

Import Python packages you need in this session. Also display the Azure Machine Learning SDK version.

In [1]:
import os
import textwrap
import urllib.parse

from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.compute import ComputeTarget
from azureml.core.container_registry import ContainerRegistry
from azureml.core.runconfig import MpiConfiguration
from azureml.train.estimator import Estimator
import azureml.contrib.core

### azureml-sdk version

- Check the SDK version; it should be something like 0.1.0.*
- The AzureML SDK is installed from the private wheel for ITP, and its version starts from 0.1.
- It will be merged to master later on.

In [2]:
import azureml.core

# Report which azureml-sdk build this kernel has imported.
sdk_version = azureml.core.VERSION
print("The azureml-sdk version is {}".format(sdk_version))

The azureml-sdk version is 1.5.0


### entry.py
The entry script to be executed

In [3]:
# Source text of the generic entry script (entry.py). It is kept indented here
# for readability; the cell that writes it to disk dedents and strips it first.
#
# What the script does when it runs:
#   * every "--env_<NAME> <value>" argument becomes environment variable <NAME>
#     (the value is URL-decoded first);
#   * the URL-encoded "--command" argument is decoded and executed through the shell;
#   * the script exits with the command's exit code, or with 128 + <signal number>
#     when the command was terminated by a signal.
entry_script_content = """
    # -*- coding: utf-8 -*-
    #
    # Generic entry script for submitting a run in AML.
    #
    # Generated by pacodegen-0.4.2 on 2019-10-10 05:38:04.306427+00:00
    #

    import os
    import subprocess
    import sys
    import argparse
    import urllib.parse

    #

    parser = argparse.ArgumentParser(description="AML Generic Entry Script")
    parser.add_argument("--command", required=True, help="the command to run in url encoding")

    # Register every --env_* flag found on the command line so that the second
    # parse accepts it and exposes its value.
    _, unknown = parser.parse_known_args()
    for x in unknown:
        if x.startswith("--env_"):
            parser.add_argument(x)
    args = parser.parse_args()

    # Export --env_<NAME> values as environment variable <NAME> for the command.
    for key, value in vars(args).items():
        if key.startswith("env_"):
            os.environ[key[len("env_"):]] = urllib.parse.unquote(value)

    # subprocess.call() returns the real exit code on every platform and a
    # negative number (-N) when the command was killed by signal N. Shifting an
    # os.system() status right by 8 bits discarded the signal and turned a
    # crashed command into exit code 0, so report 128 + N instead.
    return_code = subprocess.call(urllib.parse.unquote(args.command), shell=True)
    sys.exit(return_code if return_code >= 0 else 128 - return_code)

    #
    # (END)
"""

In [4]:
# Write the entry script into a local "_source" folder under the current
# working directory; the estimator later uses this folder as its
# source_directory and "entry.py" as its entry_script.
source_directory = os.path.join(os.path.abspath('.'), "_source")
entry_script_file = "entry.py"

# exist_ok=True creates the folder only when it is missing, without the
# race between a separate existence check and the creation call.
os.makedirs(source_directory, exist_ok=True)

# The script declares "coding: utf-8" in its header, so write it as UTF-8
# rather than in whatever encoding the local platform defaults to.
with open(os.path.join(source_directory, entry_script_file), "w", encoding="utf-8") as f:
    f.write(textwrap.dedent(entry_script_content).strip() + "\n")

### Load the workspace and compute targets
- Load the workspace and read the container registry settings from its key vault.
- List all ITP compute targets in the workspace (the listing cell below is currently commented out).

In [5]:
# Load the Azure ML workspace; every later cell (secrets, datastore, compute
# target, experiment) goes through this object.
ws = Workspace.from_config()

def get_config(ws):
    """Fetch the container-registry secrets from the workspace's default key vault.

    Args:
        ws: Workspace object; only its ``get_default_keyvault()`` method is used.

    Returns:
        The result of the key vault's ``get_secrets`` call for the names
        'registry-address', 'registry-username' and 'registry-password'
        (later cells index it by those names).
    """
    secret_names = ['registry-address', 'registry-username', 'registry-password']
    return ws.get_default_keyvault().get_secrets(secret_names)
# Registry address and credentials, fetched once here and used later to fill in
# the ContainerRegistry object.
config = get_config(ws)

In [6]:
# for key, target in ws.compute_targets.items():
#     if type(target) is ItpCompute:
#         print('Found compute target:{}\ttype:{}\tprovisioning_state:{}\tlocation:{}'.format(target.name, target.type, target.provisioning_state, target.location))

### Create an estimator
- Load the container registry and create an estimator associated with it.

In [7]:
from azureml.train.dnn import PyTorch
from azureml.train.dnn import Mpi,Gloo,Nccl

compute_target = "cmaks0518" # this can be replaced by other compute target name.

# Registry that hosts the custom Docker image; the address and credentials come
# from the key vault secrets held in `config`.
container_registry = ContainerRegistry()
container_registry.address = config["registry-address"] # "philly2aml.azurecr.io"
container_registry.username = config["registry-username"] # "<name>"
container_registry.password = config["registry-password"] # "<secret>"

# The --backend flag inside `command` and the backend object handed to the
# estimator must name the same backend, so each command is paired with its
# `distributed_backend`. Uncomment exactly one pair.
# Using Nccl backend
command = "python $CODE/pytorch_mnist_with_metric_and_outputs.py --epochs 10 --save-model --backend nccl --outputs-dir /workspaceblobstore/azureml/outputs/"
distributed_backend = Nccl()
# Using Gloo backend
#command = "python $CODE/pytorch_mnist_with_metric_and_outputs.py --epochs 3 --save-model --backend gloo --outputs-dir /workspaceblobstore/azureml/outputs/"
#distributed_backend = Gloo()
# Using Gloo backend without cuda
#command = "python $CODE/pytorch_mnist_with_metric_and_outputs.py --epochs 3 --save-model --no-cuda --backend gloo --outputs-dir /workspaceblobstore/azureml/outputs/"
#distributed_backend = Gloo()

# entry.py exports every "--env_<NAME>" argument as environment variable <NAME>
# and runs the URL-decoded "--command"; $CODE in `command` is therefore whatever
# value the mounted datastore path is passed as (presumably its mount point).
script_params = {
    "--env_CODE": ws.datastores["ds"].path("hello-world-jobs/dlts-multinode-1gpu/horovod/examples").as_mount(),
    "--command": urllib.parse.quote(command)
}


est = PyTorch(
        compute_target=ComputeTarget(workspace=ws, name=compute_target),
        use_gpu=True,
        node_count=1,
        # Previously hard-coded to Gloo() while the command requested nccl.
        distributed_training=distributed_backend,
        image_registry_details=container_registry,
        custom_docker_image="pytorch:cmk8s",
        user_managed=True,
        source_directory=source_directory,
        entry_script=entry_script_file,
        script_params=script_params
    )

### Submit the run
Submit the run and wait for the result.

In [None]:
# Submit the estimator under the "cmk8s-pytorch" experiment, then wait for the
# run to complete with output display enabled.
experiment = Experiment(workspace=ws, name="cmk8s-pytorch")
run = experiment.submit(est)
run.wait_for_completion(show_output=True)

RunId: cmk8s-pytorch_1590549957_e9f15d84
Web View: https://ml.azure.com/experiments/cmk8s-pytorch/runs/cmk8s-pytorch_1590549957_e9f15d84?wsid=/subscriptions/e9b2ec51-5c94-4fa8-809a-dc1e695e4896/resourcegroups/itp/workspaces/cmaks-test-ws-master
