# Import packages

In [1]:
import os
import urllib
import shutil
import azureml

from azureml.core import Experiment
from azureml.core import Workspace, Run
from azureml.core import Environment

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Initialize a workspace

In [2]:
ws = Workspace.from_config()

# Create a file dataset

A FileDataset object references one or multiple files in your workspace datastore or public urls. The files can be of any format, and the class provides you with the ability to download or mount the files to your compute. By creating a FileDataset, you create a reference to the data source location. If you applied any transformations to the data set, they will be stored in the data set as well. The data remains in its existing location, so no extra storage cost is incurred.

In [3]:
from azureml.core.dataset import Dataset

web_paths = [
            'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
            'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
            'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
            'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
            ]
dataset = Dataset.File.from_files(path=web_paths)

Use the register() method to register the data set to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script

In [4]:
dataset = dataset.register(workspace=ws,
                           name='mnist-dataset',
                           description='training and test dataset',
                           create_new_version=True)

# list the files referenced by dataset
dataset.to_path()

['/http%3A/%2Fyann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
 '/http%3A/%2Fyann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
 '/http%3A/%2Fyann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
 '/http%3A/%2Fyann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz']

# Create a compute target

Create a compute target for your TensorFlow job to run on. In this example, create a GPU-enabled Azure Machine Learning compute cluster.

In [6]:
cluster_name = "AKTCompute"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
                                                           max_nodes=4)

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Found existing compute target


# Create a custom environment
You can also create your own Azure ML environment that encapsulates your training script's dependencies.

First, define your conda dependencies in a YAML file; in this example the file is named conda_dependencies.yml.

YAML code

channels:
- conda-forge
dependencies:
- python=3.6.2
- pip:
  - azureml-defaults
  - tensorflow-gpu==2.2.0

Create an Azure ML environment from this conda environment specification. The environment will be packaged into a Docker container at runtime.

By default if no base image is specified, Azure ML will use a CPU image azureml.core.environment.DEFAULT_CPU_IMAGE as the base image. Since this example runs training on a GPU cluster, you will need to specify a GPU base image that has the necessary GPU drivers and dependencies. Azure ML maintains a set of base images published on Microsoft Container Registry (MCR) that you can use

In [11]:
tf_env = Environment.from_conda_specification(name='tensorflow-2.2-gpu', file_path='./conda_dependencies.yml')

# Specify a GPU base image
tf_env.docker.enabled = True
tf_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


# Configure and submit your training run
# Create a ScriptRunConfig
Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on. Any arguments to your training script will be passed via command line if specified in the arguments parameter.

In [13]:
from azureml.core import ScriptRunConfig

args = ['--data-folder', dataset.as_mount(),
        '--batch-size', 64,
        '--first-layer-neurons', 256,
        '--second-layer-neurons', 128,
        '--learning-rate', 0.01]

src = ScriptRunConfig(source_directory="script_folder",
                      script='tf_mnist.py',
                      arguments=args,
                      compute_target=compute_target,
                      environment=tf_env)

# Submit a run
The Run object provides the interface to the run history while the job is running and after it has completed.

In [14]:
run = Experiment(workspace=ws, name='Tutorial-TF-Mnist').submit(src)
run.wait_for_completion(show_output=True)

RunId: Tutorial-TF-Mnist_1631365495_e5327f86
Web View: https://ml.azure.com/runs/Tutorial-TF-Mnist_1631365495_e5327f86?wsid=/subscriptions/5f142bd4-c7c4-4488-b9b3-3c141b8b11ec/resourcegroups/aktresources/workspaces/aktmlws&tid=0f8a5db0-6b60-4ca8-9fc9-8b2ae1cb809e

Streaming azureml-logs/20_image_build_log.txt

2021/09/11 13:05:08 Downloading source code...
2021/09/11 13:05:10 Finished downloading source code
2021/09/11 13:05:10 Creating Docker network: acb_default_network, driver: 'bridge'
2021/09/11 13:05:11 Successfully set up Docker network: acb_default_network
2021/09/11 13:05:11 Setting up Docker configuration...
2021/09/11 13:05:11 Successfully set up Docker configuration
2021/09/11 13:05:11 Logging in to registry: 836545afbeff4614814992f6794057dc.azurecr.io
2021/09/11 13:05:13 Successfully logged into 836545afbeff4614814992f6794057dc.azurecr.io
2021/09/11 13:05:13 Executing step ID: acb_step_0. Timeout(sec): 5400, Working directory: '', Network: 'acb_default_network'
2021/09/11 

{'runId': 'Tutorial-TF-Mnist_1631365495_e5327f86',
 'target': 'AKTCompute',
 'status': 'Completed',
 'startTimeUtc': '2021-09-11T13:20:33.945698Z',
 'endTimeUtc': '2021-09-11T13:30:30.684209Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '7fb9d847-52e3-48a5-97c0-b82b3f3c4dda',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '8be8af44-176d-4a38-a714-31d0e23975ac'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'input__8be8af44', 'mechanism': 'Mount'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'tf_mnist.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--data-folder',
   'DatasetConsumptionConfig:input__8be8af44',
   '--batch-size',
   '64',
   '--first-layer-neurons',
   '256',
   '--second-layer-neurons',
   '128',
   '--learning-rate',
   '0.01'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python

# Register a model
Once you've trained the model, you can register it to your workspace.

In [15]:
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

model = run.register_model(model_name='tf-mnist', 
                           model_path='outputs/model',
                           model_framework=Model.Framework.TENSORFLOW,
                           model_framework_version='2.0',
                           resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5))