From 227786a2bfb3b6f870148e708bad04d86dc15668 Mon Sep 17 00:00:00 2001 From: Roope Astala Date: Wed, 19 Sep 2018 11:04:45 -0400 Subject: [PATCH] Update notebooks Update notebooks --- .../04.train-on-remote-vm.ipynb | 5 +- .../05.train-in-spark/05.train-in-spark.ipynb | 288 +----- .../07.hyperdrive-with-sklearn.ipynb | 4 +- automl/00.configuration.ipynb | 2 + automl/03b.auto-ml-remote-batchai.ipynb | 4 +- automl/13.auto-ml-dataprep.ipynb | 4 +- .../14a.auto-ml-classification-ensemble.ipynb | 412 --------- automl/14b.auto-ml-regression-ensemble.ipynb | 437 --------- automl/README.md | 4 +- automl/automl_setup.cmd | 3 +- automl/automl_setup_linux.sh | 2 +- automl/automl_setup_mac.sh | 2 +- onnx/onnx-inference-emotion-recognition.ipynb | 729 +++++++++++++++ onnx/onnx-inference-mnist.ipynb | 854 ++++++++++++++++++ ...ing.ipynb => pipeline-batch-scoring.ipynb} | 6 +- .../01.train-tune-deploy-pytorch.ipynb | 641 +++++++++++++ .../pytorch_score.py | 59 ++ .../pytorch_train.py | 169 ++++ .../02.distributed-pytorch-with-horovod.ipynb | 289 ++++++ .../pytorch_horovod_mnist.py | 157 ++++ ....distributed-tensorflow-with-horovod.ipynb | 360 ++++++++ .../tf_horovod_word2vec.py | 259 ++++++ ...ted-tensorflow-with-parameter-server.ipynb | 286 ++++++ .../tf_mnist_replica.py | 271 ++++++ ....distributed-cntk-with-custom-docker.ipynb | 283 ++++++ .../cntk_mnist.py | 321 +++++++ .../07.tensorboard.ipynb} | 4 +- ...8.export-run-history-to-tensorboard.ipynb} | 0 ....distributed-tensorflow-with-horovod.ipynb | 500 ---------- ...ted-tensorflow-with-parameter-server.ipynb | 473 ---------- .../52.distributed-cntk.ipynb | 509 ----------- .../53.distributed-pytorch-with-horovod.ipynb | 376 -------- tutorials/01.train-models.ipynb | 4 +- 33 files changed, 4752 insertions(+), 2965 deletions(-) delete mode 100644 automl/14a.auto-ml-classification-ensemble.ipynb delete mode 100644 automl/14b.auto-ml-regression-ensemble.ipynb create mode 100644 onnx/onnx-inference-emotion-recognition.ipynb create 
mode 100644 onnx/onnx-inference-mnist.ipynb rename pipeline/{06.pipeline-batch-scoring.ipynb => pipeline-batch-scoring.ipynb} (99%) create mode 100644 training/01.train-tune-deploy-pytorch/01.train-tune-deploy-pytorch.ipynb create mode 100644 training/01.train-tune-deploy-pytorch/pytorch_score.py create mode 100644 training/01.train-tune-deploy-pytorch/pytorch_train.py create mode 100644 training/02.distributed-pytorch-with-horovod/02.distributed-pytorch-with-horovod.ipynb create mode 100644 training/02.distributed-pytorch-with-horovod/pytorch_horovod_mnist.py create mode 100644 training/04.distributed-tensorflow-with-horovod/04.distributed-tensorflow-with-horovod.ipynb create mode 100644 training/04.distributed-tensorflow-with-horovod/tf_horovod_word2vec.py create mode 100644 training/05.distributed-tensorflow-with-parameter-server/05.distributed-tensorflow-with-parameter-server.ipynb create mode 100644 training/05.distributed-tensorflow-with-parameter-server/tf_mnist_replica.py create mode 100644 training/06.distributed-cntk-with-custom-docker/06.distributed-cntk-with-custom-docker.ipynb create mode 100644 training/06.distributed-cntk-with-custom-docker/cntk_mnist.py rename training/{40.tensorboard/40.tensorboard.ipynb => 07.tensorboard/07.tensorboard.ipynb} (99%) rename training/{41.export-run-history-to-tensorboard/41.export-run-history-to-tensorboard.ipynb => 08.export-run-history-to-tensorboard/08.export-run-history-to-tensorboard.ipynb} (100%) delete mode 100644 training/50.distributed-tensorflow-with-horovod/50.distributed-tensorflow-with-horovod.ipynb delete mode 100644 training/51.distributed-tensorflow-with-parameter-server/51.distributed-tensorflow-with-parameter-server.ipynb delete mode 100644 training/52.distributed-cntk/52.distributed-cntk.ipynb delete mode 100644 training/53.distributed-pytorch-with-horovod/53.distributed-pytorch-with-horovod.ipynb diff --git a/01.getting-started/04.train-on-remote-vm/04.train-on-remote-vm.ipynb 
b/01.getting-started/04.train-on-remote-vm/04.train-on-remote-vm.ipynb index 9713264cd..4576d155d 100644 --- a/01.getting-started/04.train-on-remote-vm/04.train-on-remote-vm.ipynb +++ b/01.getting-started/04.train-on-remote-vm/04.train-on-remote-vm.ipynb @@ -195,9 +195,10 @@ "metadata": {}, "outputs": [], "source": [ - " '''\n", + "'''\n", " from azureml.core.compute import RemoteCompute \n", - " dsvm_compute = RemoteCompute.attach(ws,name=\"attach-from-sdk6\",username=,address=,ssh_port=22,password=)\n", + " # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase \n", + " dsvm_compute = RemoteCompute.attach(ws,name=\"attach-from-sdk6\",username=,address=,ssh_port=22,password=)\n", "'''" ] }, diff --git a/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb b/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb index 83ad121e4..6ba366a45 100644 --- a/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb +++ b/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb @@ -15,11 +15,9 @@ "source": [ "# 05. Train in Spark\n", "* Create Workspace\n", - "* Create Project\n", - "* Create `train-spark.py` file in the project folder\n", - "* Execute a PySpark script in ACI.\n", - "* Execute a PySpark script in a Docker container on remote DSVM\n", - "* Execute a PySpark script in HDI" + "* Create Experiment\n", + "* Copy relevant files to the script folder\n", + "* Configure and Run" ] }, { @@ -67,8 +65,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Project and Associate with Run History\n", - "**Project** is a local folder that contains files for your Azure ML experiments. It is associated with a **run history**, a cloud container of run metrics and output artifacts from your experiments. You can either attach a local folder as a new project, or load a local folder as a project if it has been attached before." 
+ "## Create Experiment\n" ] }, { @@ -77,27 +74,15 @@ "metadata": {}, "outputs": [], "source": [ - "# choose a name for the run history container in the workspace\n", - "experiment_name = 'train-on-spark'\n", + "experiment_name = 'train-on-remote-vm'\n", + "script_folder = './samples/train-on-remote-vm'\n", "\n", - "# project folder\n", - "project_folder = './sample_projects/train-on-spark'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ "import os\n", - "from azureml.project.project import Project\n", + "os.makedirs(script_folder, exist_ok = True)\n", "\n", - "project = Project.attach(workspace_object = ws,\n", - " experiment_name = experiment_name,\n", - " directory = project_folder)\n", + "from azureml.core import Experiment\n", "\n", - "print(project.project_directory, project.history.name, sep = '\\n')" + "exp = Experiment(workspace = ws, name = experiment_name)" ] }, { @@ -119,11 +104,11 @@ "from shutil import copyfile\n", "\n", "# copy iris dataset in to project folder\n", - "copyfile('./iris.csv', os.path.join(project_folder, 'iris.csv'))\n", + "copyfile('iris.csv', os.path.join(script_folder, 'iris.csv'))\n", "\n", "# copy train-spark.py file into project folder\n", "# train-spark.py trains a simple LogisticRegression model using Spark.ML algorithm\n", - "copyfile('./train-spark.py', os.path.join(project_folder, 'train-spark.py'))" + "copyfile('train-spark.py', os.path.join(script_folder, 'train-spark.py'))" ] }, { @@ -154,117 +139,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Configure ACI target" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import RunConfiguration\n", - "from azureml.core.conda_dependencies import CondaDependencies\n", - "\n", - "# create a new runconfig object\n", - "run_config = RunConfiguration()\n", - "\n", - "# signal that you want to use ACI to execute 
script.\n", - "run_config.target = \"containerinstance\"\n", - "\n", - "# ACI container group is only supported in certain regions, which can be different than the region the Workspace is in.\n", - "run_config.container_instance.region = 'eastus'\n", - "\n", - "# set the ACI CPU and Memory \n", - "run_config.container_instance.cpu_cores = 1\n", - "run_config.container_instance.memory_gb = 2\n", - "\n", - "# enable Docker \n", - "run_config.environment.docker.enabled = True\n", - "\n", - "# set Docker base image to the default CPU-based image\n", - "run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n", - "print('base image is', run_config.environment.docker.base_image)\n", - "#run_config.environment.docker.base_image = 'microsoft/mmlspark:plus-0.9.9'\n", - "\n", - "# use conda_dependencies.yml to create a conda environment in the Docker image for execution\n", - "# please update this file if you need additional packages.\n", - "run_config.environment.python.user_managed_dependencies = False\n", - "\n", - "# auto-prepare the Docker image when used for execution (if it is not already prepared)\n", - "run_config.auto_prepare_environment = True\n", - "\n", - "cd = CondaDependencies()\n", - "# add numpy as a dependency\n", - "cd.add_conda_package('numpy')\n", - "# overwrite the default conda_dependencies.yml file\n", - "cd.save_to_file(base_directory = project_folder, conda_file_path='aml_config/conda_dependencies.yml')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run Spark job in ACI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time \n", - "from azureml.core.experiment import Experiment\n", - "from azureml.core.script_run_config import ScriptRunConfig\n", - "\n", - "experiment = Experiment(project_object.workspace_object, project_object.history.name)\n", - "script_run_config = ScriptRunConfig(source_directory = 
project.project_directory,\n", - " script= 'train-spark.py',\n", - " run_config = run_config)\n", - "run = experiment.submit(script_run_config)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Show the run in the web UI\n", - "**IMPORTANT**: Please use Chrome to navigate to the URL." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import helpers.py\n", - "import helpers\n", - "\n", - "# get the URL of the run history web page\n", - "print(helpers.get_run_history_url(run))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Attach a remote Linux VM\n", - "To use remote docker commpute target:\n", - " 1. Create a Linux DSVM in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n", - " 2. Enter the IP address, username and password below\n", - " \n", - "**Note**: the below example use port 5022. By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you switch to a different port (such as 5022), you can append the port number to the address like the example below. [Read more](../../documentation/sdk/ssh-issue.md) on this." + "### Attach an HDI cluster\n", + "To use HDI compute target:\n", + " 1. Create a Spark for HDI cluster in Azure. Here are some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n", + " 2. 
Enter the IP address, username and password below" ] }, { @@ -273,25 +151,30 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.core.compute import RemoteCompute\n", + "from azureml.core.compute import HDInsightCompute\n", "\n", "try:\n", - " # Attaches a remote docker on a remote vm as a compute target.\n", - " RemoteCompute.attach(workspace,name = \"cpu-dsvm\", username = \"ninghai\", \n", - " address = \"hai2.eastus2.cloudapp.azure.com:5022\", \n", - " ssh-port=22\n", - " password = \"\"))\n", + " # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase\n", + " hdi_compute_new = HDInsightCompute.attach(ws, \n", + " name=\"hdi-attach\", \n", + " address=\"hdi-ignite-demo-ssh.azurehdinsight.net\", \n", + " ssh_port=22, \n", + " username='', \n", + " password='')\n", + "\n", "except UserErrorException as e:\n", " print(\"Caught = {}\".format(e.message))\n", - " print(\"Compute config already attached.\")" + " print(\"Compute config already attached.\")\n", + " \n", + " \n", + "hdi_compute_new.wait_for_completion(show_output=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Configure a Spark Docker run on the VM\n", - "Execute in the Spark engine in a Docker container in the VM. 
" + "### Configure HDI run" ] }, { @@ -300,107 +183,32 @@ "metadata": {}, "outputs": [], "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "\n", "# Load the \"cpu-dsvm.runconfig\" file (created by the above attach operation) in memory\n", - "run_config = RunConfiguration.load(path = project_folder, name = \"cpu-dsvm\")\n", + "run_config = RunConfiguration(framework = \"python\")\n", "\n", - "# set framework to PySpark\n", - "run_config.framework = \"PySpark\"\n", + "# Set compute target to the Linux DSVM\n", + "run_config.target = hdi_compute.name\n", "\n", "# Use Docker in the remote VM\n", - "run_config.environment.docker.enabled = True\n", + "# run_config.environment.docker.enabled = True\n", "\n", - "# Use the MMLSpark CPU based image.\n", - "# https://hub.docker.com/r/microsoft/mmlspark/\n", - "run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n", - "print('base image is:', run_config.environment.docker.base_image)\n", + "# Use CPU base image from DockerHub\n", + "# run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n", + "# print('Base Docker image is:', run_config.environment.docker.base_image)\n", "\n", - "# signal use the user-managed environment\n", - "# do NOT provision a new one based on the conda.yml file\n", + "# Ask system to provision a new one based on the conda_dependencies.yml file\n", "run_config.environment.python.user_managed_dependencies = False\n", "\n", - "# Prepare the Docker and conda environment automatically when execute for the first time.\n", - "run_config.auto_prepare_environment = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Submit the Experiment\n", - "Submit script to run in the Spark engine in the Docker container in the remote VM." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n", - " script= 'train-spark.py',\n", - " run_config = run_config)\n", - "run = experiment.submit(script_run_config)\n", - "\n", - "run.wait_for_completion(show_output = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the URL of the run history web page\n", - "print(helpers.get_run_history_url(run))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Attach an HDI cluster\n", - "To use HDI commpute target:\n", - " 1. Create an Spark for HDI cluster in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n", - " 2. Enter the IP address, username and password below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.compute import HDInsightCompute\n", - "\n", - "try:\n", - " # Attaches a HDI cluster as a compute target.\n", - " HDInsightCompute.attach(ws, name = \"myhdi\",\n", - " username = \"ninghai\", \n", - " address = \"sparkhai-ssh.azurehdinsight.net\", \n", - " password = \"\"))\n", - "except UserErrorException as e:\n", - " print(\"Caught = {}\".format(e.message))\n", - " print(\"Compute config already attached.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Configure HDI run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operaton above.\n", - "run_config = RunConfiguration.load(path = project_folder, name = 'myhdi')\n", + "# Prepare the Docker and conda 
environment automatically when executing for the first time.\n", + "# run_config.prepare_environment = True\n", "\n", - "# ask system to prepare the conda environment automatically when executed for the first time\n", - "run_config.auto_prepare_environment = True" + "# specify CondaDependencies obj\n", + "# run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n", + "# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operation above." ] }, { @@ -448,7 +256,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [default]", "language": "python", "name": "python3" }, @@ -462,7 +270,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4, diff --git a/01.getting-started/07.hyperdrive-with-sklearn/07.hyperdrive-with-sklearn.ipynb b/01.getting-started/07.hyperdrive-with-sklearn/07.hyperdrive-with-sklearn.ipynb index beec8dc59..26e80c05c 100644 --- a/01.getting-started/07.hyperdrive-with-sklearn/07.hyperdrive-with-sklearn.ipynb +++ b/01.getting-started/07.hyperdrive-with-sklearn/07.hyperdrive-with-sklearn.ipynb @@ -109,7 +109,9 @@ "metadata": {}, "source": [ "## Provision New Cluster\n", - "Create a new Batch AI cluster using the following Python code." + "Create a new Batch AI cluster using the following Python code.\n", + "\n", + "**Note**: As with other Azure services, there are limits on certain resources (for eg. BatchAI cluster size) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota." 
] }, { diff --git a/automl/00.configuration.ipynb b/automl/00.configuration.ipynb index f056c79f1..a499ddb15 100644 --- a/automl/00.configuration.ipynb +++ b/automl/00.configuration.ipynb @@ -181,6 +181,8 @@ "metadata": {}, "outputs": [], "source": [ + "from azureml.core import Workspace\n", + "\n", "ws = Workspace(workspace_name = workspace_name,\n", " subscription_id = subscription_id,\n", " resource_group = resource_group)\n", diff --git a/automl/03b.auto-ml-remote-batchai.ipynb b/automl/03b.auto-ml-remote-batchai.ipynb index 8802127e7..8fc93fca9 100644 --- a/automl/03b.auto-ml-remote-batchai.ipynb +++ b/automl/03b.auto-ml-remote-batchai.ipynb @@ -120,7 +120,9 @@ "## Create Batch AI Cluster\n", "The cluster is created as Machine Learning Compute and will appear under your workspace.\n", "\n", - "Note: The cluster creation can take over 10 minutes, be patient." + "Note: The cluster creation can take over 10 minutes, please be patient.\n", + "\n", + "As with other Azure services, there are limits on certain resources (for eg. BatchAI cluster size) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota." 
] }, { diff --git a/automl/13.auto-ml-dataprep.ipynb b/automl/13.auto-ml-dataprep.ipynb index 5da708f83..1d8bea42b 100644 --- a/automl/13.auto-ml-dataprep.ipynb +++ b/automl/13.auto-ml-dataprep.ipynb @@ -46,7 +46,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install --upgrade --extra-index-url https://dataprepdownloads.azureedge.net/pypi/monthly-AE98437A2C8F6F45842C/latest azureml-dataprep --no-cache-dir --force-reinstall\n", + "!pip install --upgrade --extra-index-url https://dataprepdownloads.azureedge.net/pypi/autoML-BD0E9CABED27C837/0.1.1809.11043 azureml-dataprep --no-cache-dir --force-reinstall\n", "!pip install tornado==4.5.1" ] }, @@ -279,7 +279,7 @@ "source": [ "cd = CondaDependencies()\n", "cd.set_pip_index_url(index_url=\"--index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/master/588E708E0DF342C4A80BD954289657CF\")\n", - "cd.set_pip_index_url(index_url=\"--extra-index-url https://dataprepdownloads.azureedge.net/pypi/monthly-AE98437A2C8F6F45842C/latest --extra-index-url https://pypi.python.org/simple\")\n", + "cd.set_pip_index_url(index_url=\"--extra-index-url https://dataprepdownloads.azureedge.net/pypi/autoML-BD0E9CABED27C837/0.1.1809.11043 --extra-index-url https://pypi.python.org/simple\")\n", "cd.remove_pip_package(pip_package=\"azureml-defaults\")\n", "cd.add_pip_package(pip_package='azureml-core')\n", "cd.add_pip_package(pip_package='azureml-telemetry')\n", diff --git a/automl/14a.auto-ml-classification-ensemble.ipynb b/automl/14a.auto-ml-classification-ensemble.ipynb deleted file mode 100644 index 87d633dff..000000000 --- a/automl/14a.auto-ml-classification-ensemble.ipynb +++ /dev/null @@ -1,412 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# AutoML 01: Classification with ensembling on local compute\n", - "\n", - "In this example we use the scikit learn's [digit dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) to showcase how you can use the AutoML Classifier for a simple classification problem.\n", - "\n", - "Make sure you have executed the [00.configuration](00.configuration.ipynb) before running this notebook.\n", - "\n", - "In this notebook you would see\n", - "1. Creating an Experiment in an existing Workspace\n", - "2. Instantiating AutoMLConfig\n", - "3. Training the Model using local compute\n", - "4. Exploring the results\n", - "5. Testing the fitted model\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Experiment\n", - "\n", - "As part of the setup you have already created a Workspace. For AutoML you would need to create an Experiment. An Experiment is a named object in a Workspace, which is used to run experiments." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "import random\n", - "\n", - "from matplotlib import pyplot as plt\n", - "from matplotlib.pyplot import imshow\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn import datasets\n", - "\n", - "import azureml.core\n", - "from azureml.core.experiment import Experiment\n", - "from azureml.core.workspace import Workspace\n", - "from azureml.train.automl import AutoMLConfig\n", - "from azureml.train.automl.run import AutoMLRun" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ws = Workspace.from_config()\n", - "\n", - "# choose a name for experiment\n", - "experiment_name = 'automl-local-classification'\n", - "# project folder\n", - "project_folder = './sample_projects/automl-local-classification'\n", - "\n", - "experiment=Experiment(ws, experiment_name)\n", - "\n", - "output = {}\n", - "output['SDK version'] = azureml.core.VERSION\n", - "output['Subscription ID'] = ws.subscription_id\n", - "output['Workspace Name'] = ws.name\n", - "output['Resource Group'] = ws.resource_group\n", - "output['Location'] = ws.location\n", - "output['Project Directory'] = project_folder\n", - "output['Experiment Name'] = experiment.name\n", - "pd.set_option('display.max_colwidth', -1)\n", - "pd.DataFrame(data = output, index = ['']).T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Diagnostics\n", - "\n", - "Opt-in diagnostics for better experience, quality, and security of future releases" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.telemetry import set_diagnostics_collection\n", - "set_diagnostics_collection(send_diagnostics=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Digits Dataset" - ] - }, 
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import datasets\n", - "\n", - "digits = datasets.load_digits()\n", - "\n", - "X_digits = digits.data[100:,:]\n", - "y_digits = digits.target[100:]\n", - "X_valid = digits.data[0:100]\n", - "y_valid = digits.target[0:100]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instantiate Auto ML Config\n", - "\n", - "Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n", - "\n", - "|Property|Description|\n", - "|-|-|\n", - "|**task**|classification or regression|\n", - "|**primary_metric**|This is the metric that you want to optimize.
Classification supports the following primary metrics
accuracy
AUC_weighted
balanced_accuracy
average_precision_score_weighted
precision_score_weighted|\n", - "|**max_time_sec**|Time limit in seconds for each iterations|\n", - "|**iterations**|Number of iterations. In each iteration Auto ML trains the data with a specific pipeline|\n", - "|**n_cross_validations**|Number of cross validation splits|\n", - "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", - "|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers. |\n", - "|**X_valid**|(sparse) array-like, shape = [n_samples, n_features]|\n", - "|**y_valid**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers. |\n", - "|**enable_ensembling**|Flag to enable an ensembling iteration after all the other iterations complete|\n", - "|**ensemble_iterations**|Number of iterations during which we choose a fitted model to be part of the final ensemble|" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "automl_config = AutoMLConfig(task = 'classification',\n", - " debug_log = 'automl_errors.log',\n", - " primary_metric = 'AUC_weighted',\n", - " max_time_sec = 12000,\n", - " iterations = 10,\n", - " verbosity = logging.INFO,\n", - " X = X_digits, \n", - " y = y_digits,\n", - " X_valid = X_valid,\n", - " y_valid = y_valid,\n", - " enable_ensembling = True,\n", - " ensemble_iterations = 5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the Model\n", - "\n", - "You can call the submit method on the experiment object and pass the run configuration. For Local runs the execution is synchronous. Depending on the data and number of iterations this can run for while.\n", - "You will see the currently running iterations printing to the console." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_run = experiment.submit(automl_config, show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Optionally, you can continue an interrupted local run by calling continue_experiment without the iterations parameter, or run more iterations to a completed run by specifying the iterations parameter:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_run = local_run.continue_experiment(X = X_digits, \n", - " y = y_digits, \n", - " show_output = True,\n", - " iterations = 5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exploring the results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Widget for monitoring runs\n", - "\n", - "The widget will sit on \"loading\" until the first iteration completed, then you will see an auto-updating graph and table show up. It refreshed once per minute, so you should see the graph update as child runs complete.\n", - "\n", - "NOTE: The widget displays a link at the bottom. This links to a web-ui to explore the individual run details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.widgets import RunDetails\n", - "RunDetails(local_run).show() " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### Retrieve All Child Runs\n", - "You can also use sdk methods to fetch all the child runs and see individual metrics that we log. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "children = list(local_run.get_children())\n", - "metricslist = {}\n", - "for run in children:\n", - " properties = run.get_properties()\n", - " metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n", - " metricslist[int(properties['iteration'])] = metrics\n", - "\n", - "rundata = pd.DataFrame(metricslist).sort_index(1)\n", - "rundata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Retrieve the Best Model\n", - "\n", - "Below we select the best pipeline from our iterations. The *get_output* method on automl_classifier returns the best run and the fitted model for the last *fit* invocation. There are overloads on *get_output* that allow you to retrieve the best run and fitted model for *any* logged metric or a particular *iteration*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_run, fitted_model = local_run.get_output()\n", - "print(best_run)\n", - "print(fitted_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Best Model based on any other metric\n", - "Give me the run and the model that has the smallest `log_loss`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lookup_metric = \"log_loss\"\n", - "best_run, fitted_model = local_run.get_output(metric = lookup_metric)\n", - "print(best_run)\n", - "print(fitted_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Best Model based on any iteration\n", - "Give me the run and the model from the 3rd iteration:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "iteration = 3\n", - "best_run, fitted_model = local_run.get_output(iteration = iteration)\n", - 
"print(best_run)\n", - "print(fitted_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Testing the Fitted Model \n", - "\n", - "#### Load Test Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "digits = datasets.load_digits()\n", - "X_digits = digits.data[:10, :]\n", - "y_digits = digits.target[:10]\n", - "images = digits.images[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Testing our best pipeline\n", - "We will try to predict 2 digits and see how our model works." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Randomly select digits and test\n", - "for index in np.random.choice(len(y_digits), 2):\n", - " print(index)\n", - " predicted = fitted_model.predict(X_digits[index:index + 1])[0]\n", - " label = y_digits[index]\n", - " title = \"Label value = %d Predicted value = %d \" % ( label,predicted)\n", - " fig = plt.figure(1, figsize=(3,3))\n", - " ax1 = fig.add_axes((0,0,.8,.8))\n", - " ax1.set_title(title)\n", - " plt.imshow(images[index], cmap=plt.cm.gray_r, interpolation='nearest')\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [default]", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/automl/14b.auto-ml-regression-ensemble.ipynb b/automl/14b.auto-ml-regression-ensemble.ipynb deleted file mode 100644 index 921c2a4d5..000000000 --- a/automl/14b.auto-ml-regression-ensemble.ipynb +++ 
/dev/null @@ -1,437 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# AutoML 02: Regression with ensembling on local compute\n", - "\n", - "In this example we use the scikit learn's [diabetes dataset](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) to showcase how you can use the AutoML for a simple regression problem.\n", - "\n", - "Make sure you have executed the [00.configuration](00.configuration.ipynb) before running this notebook.\n", - "\n", - "In this notebook you would see\n", - "1. Creating an Experiment using an existing Workspace\n", - "2. Instantiating AutoMLConfig\n", - "3. Training the Model using local compute\n", - "4. Exploring the results\n", - "5. Testing the fitted model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Experiment\n", - "\n", - "As part of the setup you have already created a Workspace. For AutoML you would need to create an Experiment. An Experiment is a named object in a Workspace, which is used to run experiments." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "import random\n", - "\n", - "from matplotlib import pyplot as plt\n", - "from matplotlib.pyplot import imshow\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn import datasets\n", - "\n", - "import azureml.core\n", - "from azureml.core.experiment import Experiment\n", - "from azureml.core.workspace import Workspace\n", - "from azureml.train.automl import AutoMLConfig\n", - "from azureml.train.automl.run import AutoMLRun" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ws = Workspace.from_config()\n", - "\n", - "# choose a name for the experiment\n", - "experiment_name = 'automl-local-regression'\n", - "# project folder\n", - "project_folder = './sample_projects/automl-local-regression'\n", - "\n", - "experiment = Experiment(ws, experiment_name)\n", - "\n", - "output = {}\n", - "output['SDK version'] = azureml.core.VERSION\n", - "output['Subscription ID'] = ws.subscription_id\n", - "output['Workspace Name'] = ws.name\n", - "output['Resource Group'] = ws.resource_group\n", - "output['Location'] = ws.location\n", - "output['Project Directory'] = project_folder\n", - "output['Experiment Name'] = experiment.name\n", - "pd.set_option('display.max_colwidth', -1)\n", - "pd.DataFrame(data = output, index = ['']).T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Diagnostics\n", - "\n", - "Opt-in diagnostics for better experience, quality, and security of future releases" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.telemetry import set_diagnostics_collection\n", - "set_diagnostics_collection(send_diagnostics=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read Data" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load diabetes dataset, a well-known built-in small dataset that comes with scikit-learn\n", - "from sklearn.datasets import load_diabetes\n", - "from sklearn.linear_model import Ridge\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "X, y = load_diabetes(return_X_y = True)\n", - "\n", - "columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']\n", - "\n", - "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instantiate Auto ML Config\n", - "\n", - "Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n", - "\n", - "|Property|Description|\n", - "|-|-|\n", - "|**task**|classification or regression|\n", - "|**primary_metric**|This is the metric that you want to optimize.
Regression supports the following primary metrics
spearman_correlation
normalized_root_mean_squared_error
r2_score
normalized_mean_absolute_error
normalized_root_mean_squared_log_error|\n", - "|**max_time_sec**|Time limit in seconds for each iterations|\n", - "|**iterations**|Number of iterations. In each iteration Auto ML Classifier trains the data with a specific pipeline|\n", - "|**n_cross_validations**|Number of cross validation splits|\n", - "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", - "|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers. |\n", - "|**enable_ensembling**|Flag to enable an ensembling iteration after all the other iterations complete|\n", - "|**ensemble_iterations**|Number of iterations during which we choose a fitted model to be part of the final ensemble|" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "automl_config = AutoMLConfig(task='regression',\n", - " max_time_sec = 600,\n", - " iterations = 10,\n", - " primary_metric = 'spearman_correlation', \n", - " debug_log = 'automl.log',\n", - " verbosity = logging.INFO,\n", - " X = x_train, \n", - " y = y_train,\n", - " X_valid = x_test,\n", - " y_valid = y_test,\n", - " enable_ensembling = True,\n", - " ensemble_iterations = 5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the Model\n", - "\n", - "You can call the submit method on the experiment object and pass the run configuration. For Local runs the execution is synchronous. Depending on the data and number of iterations this can run for while.\n", - "You will see the currently running iterations printing to the console." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_run = experiment.submit(automl_config, show_output=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_run" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exploring the results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Widget for monitoring runs\n", - "\n", - "The widget will sit on \"loading\" until the first iteration completed, then you will see an auto-updating graph and table show up. 
It refreshed once per minute, so you should see the graph update as child runs complete.\n", - "\n", - "NOTE: The widget displays a link at the bottom. This links to a web-ui to explore the individual run details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.widgets import RunDetails\n", - "RunDetails(local_run).show() " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "#### Retrieve All Child Runs\n", - "You can also use sdk methods to fetch all the child runs and see individual metrics that we log. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "children = list(local_run.get_children())\n", - "metricslist = {}\n", - "for run in children:\n", - " properties = run.get_properties()\n", - " metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n", - " metricslist[int(properties['iteration'])] = metrics\n", - " \n", - "rundata = pd.DataFrame(metricslist).sort_index(1)\n", - "rundata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Retrieve the Best Model\n", - "\n", - "Below we select the best pipeline from our iterations. The *get_output* method on automl_classifier returns the best run and the fitted model for the last *fit* invocation. There are overloads on *get_output* that allow you to retrieve the best run and fitted model for *any* logged metric or a particular *iteration*." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_run, fitted_model = local_run.get_output()\n", - "print(best_run)\n", - "print(fitted_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Best Model based on any other metric\n", - "Show the run and model that has the smallest `root_mean_squared_error` (which turned out to be the same as the one with largest `spearman_correlation` value):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lookup_metric = \"root_mean_squared_error\"\n", - "best_run, fitted_model = local_run.get_output(metric=lookup_metric)\n", - "print(best_run)\n", - "print(fitted_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Best Model based on any iteration\n", - "Simply show the run and model from the 3rd iteration:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "iteration = 3\n", - "third_run, third_model = local_run.get_output(iteration = iteration)\n", - "print(third_run)\n", - "print(third_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Register fitted model for deployment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "description = 'AutoML Model'\n", - "tags = None\n", - "local_run.register_model(description = description, tags = tags)\n", - "print(local_run.model_id) # Use this id to deploy the model as a web service in Azure" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Testing the Fitted Model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Predict on training and test set, and calculate residual values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred_train = fitted_model.predict(x_train)\n", - "y_residual_train = y_train - y_pred_train\n", - "\n", - "y_pred_test = fitted_model.predict(x_test)\n", - "y_residual_test = y_test - y_pred_test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "from sklearn import datasets\n", - "from sklearn.metrics import mean_squared_error, r2_score\n", - "\n", - "# set up a multi-plot chart\n", - "f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n", - "f.suptitle('Regression Residual Values', fontsize = 18)\n", - "f.set_figheight(6)\n", - "f.set_figwidth(16)\n", - "\n", - "# plot residual values of training set\n", - "a0.axis([0, 360, -200, 200])\n", - "a0.plot(y_residual_train, 'bo', alpha = 0.5)\n", - "a0.plot([-10,360],[0,0], 'r-', lw = 3)\n", - "a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n", - "a0.text(16,140,'Variance = {0:.2f}'.format(r2_score(y_train, y_pred_train)), fontsize = 12)\n", - "a0.set_xlabel('Training samples', fontsize = 12)\n", - "a0.set_ylabel('Residual Values', fontsize = 12)\n", - "# plot histogram\n", - "a0.hist(y_residual_train, orientation = 'horizontal', color = 'b', bins = 10, histtype = 'step');\n", - "a0.hist(y_residual_train, orientation = 'horizontal', color = 'b', alpha = 0.2, bins = 10);\n", - "\n", - "# plot residual values of test set\n", - "a1.axis([0, 90, -200, 200])\n", - "a1.plot(y_residual_test, 'bo', alpha = 0.5)\n", - "a1.plot([-10,360],[0,0], 'r-', lw = 3)\n", - "a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n", - "a1.text(5,140,'Variance = {0:.2f}'.format(r2_score(y_test, y_pred_test)), fontsize = 
12)\n", - "a1.set_xlabel('Test samples', fontsize = 12)\n", - "a1.set_yticklabels([])\n", - "# plot histogram\n", - "a1.hist(y_residual_test, orientation = 'horizontal', color = 'b', bins = 10, histtype = 'step');\n", - "a1.hist(y_residual_test, orientation = 'horizontal', color = 'b', alpha = 0.2, bins = 10);\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [default]", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/automl/README.md b/automl/README.md index 7ad8581c2..e8e93f8c7 100644 --- a/automl/README.md +++ b/automl/README.md @@ -138,7 +138,7 @@ cd to the "automl" folder where the sample notebooks were extracted and then run |-|-|-| |**primary_metric**|This is the metric that you want to optimize.

Classification supports the following primary metrics
accuracy
AUC_weighted
balanced_accuracy
average_precision_score_weighted
precision_score_weighted

Regression supports the following primary metrics
spearman_correlation
normalized_root_mean_squared_error
r2_score
normalized_mean_absolute_error
normalized_root_mean_squared_log_error| Classification: accuracy

Regression: spearman_correlation |**max_time_sec**|Time limit in seconds for each iterations|None| -|**iterations**|Number of iterations. In each iteration trains the data with a specific pipeline|25| +|**iterations**|Number of iterations. In each iteration trains the data with a specific pipeline. To get the best result, use at least 100. |25| |**n_cross_validations**|Number of cross validation splits|None| |**validation_size**|Size of validation set as percentage of all training samples|None| |**concurrent_iterations**|Max number of iterations that would be executed in parallel|1| @@ -186,7 +186,7 @@ The main code of the file must be indented so that it is under this condition. # Troubleshooting ## Iterations fail and the log contains "MemoryError" This can be caused by insufficient memory on the DSVM. AutoML loads all training data into memory. So, the available memory should be more than the training data size. -If you are using a remote DSVM, memory is needed for each concurrent iteration. The concurrent_iterations setting specifies the maximum concurrent iterations. For example, if the trinaing data size is 8Gb and concurrent_iterations is set to 10, the minimum memory required is at least 80Gb. +If you are using a remote DSVM, memory is needed for each concurrent iteration. The concurrent_iterations setting specifies the maximum concurrent iterations. For example, if the training data size is 8Gb and concurrent_iterations is set to 10, the minimum memory required is at least 80Gb. To resolve this issue, allocate a DSVM with more memory or reduce the value specified for concurrent_iterations. ## Iterations show as "Not Responding" in the RunDetails widget. 
diff --git a/automl/automl_setup.cmd b/automl/automl_setup.cmd index 6d82a9072..201a06fe6 100644 --- a/automl/automl_setup.cmd +++ b/automl/automl_setup.cmd @@ -6,11 +6,10 @@ IF "%conda_env_name%"=="" SET conda_env_name="azure_automl" call conda activate %conda_env_name% 2>nul: if not errorlevel 1 ( - call conda env update -f automl_env.yml -n %conda_env_name% + call conda env update --file automl_env.yml -n %conda_env_name% if errorlevel 1 goto ErrorExit ) else ( call conda env create -f automl_env.yml -n %conda_env_name% - if errorlevel 1 goto ErrorExit ) call conda activate %conda_env_name% 2>nul: diff --git a/automl/automl_setup_linux.sh b/automl/automl_setup_linux.sh index 288e09cb7..6e0300549 100644 --- a/automl/automl_setup_linux.sh +++ b/automl/automl_setup_linux.sh @@ -9,7 +9,7 @@ fi if source activate $CONDA_ENV_NAME 2> /dev/null then - conda env update -f automl_env.yml -n $CONDA_ENV_NAME + conda env update --file automl_env.yml -n $CONDA_ENV_NAME else conda env create -f automl_env.yml -n $CONDA_ENV_NAME && source activate $CONDA_ENV_NAME && diff --git a/automl/automl_setup_mac.sh b/automl/automl_setup_mac.sh index 6d0049020..789f143fa 100644 --- a/automl/automl_setup_mac.sh +++ b/automl/automl_setup_mac.sh @@ -9,7 +9,7 @@ fi if source activate $CONDA_ENV_NAME 2> /dev/null then - conda env update -f automl_env.yml -n $CONDA_ENV_NAME + conda env update --file automl_env.yml -n $CONDA_ENV_NAME else conda env create -f automl_env.yml -n $CONDA_ENV_NAME && source activate $CONDA_ENV_NAME && diff --git a/onnx/onnx-inference-emotion-recognition.ipynb b/onnx/onnx-inference-emotion-recognition.ipynb new file mode 100644 index 000000000..e7b039b5d --- /dev/null +++ b/onnx/onnx-inference-emotion-recognition.ipynb @@ -0,0 +1,729 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "Licensed under the MIT License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 02. Facial Expression Recognition using ONNX Runtime GPU on AzureML\n", + "\n", + "This example shows how to deploy an image classification neural network using the Facial Expression Recognition ([FER](https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data)) dataset and Open Neural Network eXchange format ([ONNX](http://aka.ms/onnxdocarticle)) on the Azure Machine Learning platform. This tutorial will show you how to deploy a FER+ model from the [ONNX model zoo](https://github.com/onnx/models), use it to make predictions using ONNX Runtime Inference, and deploy it as a web service in Azure.\n", + "\n", + "Throughout this tutorial, we will be referring to ONNX, a neural network exchange format used to represent deep learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools (CNTK, PyTorch, Caffe, MXNet, TensorFlow) and choose the combination that is best for them. ONNX is developed and supported by a community of partners including Microsoft AI, Facebook, and Amazon. For more information, explore the [ONNX website](http://onnx.ai) and [open source files](https://github.com/onnx).\n", + "\n", + "[ONNX Runtime](https://aka.ms/onnxruntime) is the runtime engine that enables evaluation of trained machine learning (Traditional ML and Deep Learning) models with high performance and low resource utilization.\n", + "\n", + "#### Tutorial Objectives:\n", + "\n", + "1. Describe the FER+ dataset and pretrained Convolutional Neural Net ONNX model for Emotion Recognition, stored in the ONNX model zoo.\n", + "2. Deploy and run the pretrained FER+ ONNX model on an Azure Machine Learning instance\n", + "3. Predict labels for test set data points in the cloud using ONNX Runtime and Azure ML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### 1. 
Install Azure ML SDK and create a new workspace\n", + "Please follow [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook.\n", + "\n", + "\n", + "### 2. Install additional packages needed for this Notebook\n", + "You need to install the popular plotting library `matplotlib` and the `onnx` library in the conda environment where Azure Machine Learning SDK is installed.\n", + "\n", + "```sh\n", + "(myenv) $ pip install matplotlib onnx\n", + "```\n", + "\n", + "### 3. Download sample data and pre-trained ONNX model from ONNX Model Zoo.\n", + "\n", + "[Download the ONNX Emotion FER+ model and corresponding test data](https://www.cntk.ai/OnnxModels/emotion_ferplus/opset_7/emotion_ferplus.tar.gz) and place them in the same folder as this tutorial notebook. You can unzip the file through the following line of code.\n", + "\n", + "```sh\n", + "(myenv) $ tar xvzf emotion_ferplus.tar.gz\n", + "```\n", + "\n", + "More information can be found about the ONNX FER+ model on [GitHub](https://github.com/onnx/models/tree/master/emotion_ferplus). For more information about the FER+ dataset, please visit Microsoft Researcher Emad Barsoum's [FER+ source data repository](https://github.com/ebarsoum/FERPlus)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Azure ML workspace\n", + "\n", + "We begin by instantiating a workspace object from the existing workspace created earlier in the configuration notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.location, ws.resource_group, ws.location, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Registering your model with Azure ML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_dir = \"emotion_ferplus\" # replace this with the location of your model files\n", + "\n", + "# leave as is if it's in the same folder as this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.model import Model\n", + "\n", + "model = Model.register(model_path = model_dir + \"/\" + \"model.onnx\",\n", + " model_name = \"onnx_emotion\",\n", + " tags = {\"onnx\": \"demo\"},\n", + " description = \"FER+ emotion recognition CNN from ONNX Model Zoo\",\n", + " workspace = ws)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Optional: Displaying your registered models\n", + "\n", + "This step is not required, so feel free to skip it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "models = ws.models()\n", + "for m in models:\n", + " print(\"Name:\", m.name,\"\\tVersion:\", m.version, \"\\tDescription:\", m.description, m.tags)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ONNX FER+ Model Methodology\n", + "\n", + "The image classification model we are using is pre-trained using Microsoft's deep learning cognitive toolkit, [CNTK](https://github.com/Microsoft/CNTK), from the [ONNX model zoo](http://github.com/onnx/models). The model zoo has many other models that can be deployed on cloud providers like AzureML without any additional training. To ensure that our cloud deployed model works, we use testing data from the famous FER+ data set, provided as part of the [trained Emotion Recognition model](https://github.com/onnx/models/tree/master/emotion_ferplus) in the ONNX model zoo.\n", + "\n", + "The original Facial Emotion Recognition (FER) Dataset was released in 2013, but some of the labels are not entirely appropriate for the expression. In the FER+ Dataset, each photo was evaluated by at least 10 crowd-sourced reviewers, creating a better basis for ground truth. \n", + "\n", + "You can see the difference in label quality in the sample model input below. 
The FER labels are the first word below each image, and the FER+ labels are the second word below each image.\n", + "\n", + "![](https://raw.githubusercontent.com/Microsoft/FERPlus/master/FER+vsFER.png)\n", + "\n", + "***Input: Photos of cropped faces from FER+ Dataset***\n", + "\n", + "***Task: Classify each facial image into its appropriate emotions in the emotion table***\n", + "\n", + "``` emotion_table = {'neutral':0, 'happiness':1, 'surprise':2, 'sadness':3, 'anger':4, 'disgust':5, 'fear':6, 'contempt':7} ```\n", + "\n", + "***Output: Emotion prediction for input image***\n", + "\n", + "\n", + "Remember, once the application is deployed in Azure ML, you can use your own images as input for the model to classify." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for images and plots in this notebook\n", + "import matplotlib.pyplot as plt \n", + "from IPython.display import Image\n", + "\n", + "# display images inline\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "The FER+ model from the ONNX Model Zoo is summarized by the graphic below. You can see the entire workflow of our pre-trained model in the following image from Barsoum et al.'s paper [\"Training Deep Networks for Facial Expression Recognition\n", + "with Crowd-Sourced Label Distribution\"](https://arxiv.org/pdf/1608.01041.pdf), with our (64,64) input images and our output probabilities for each of the labels." + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![image.png](attachment:image.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy our model on Azure ML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are now going to deploy our ONNX Model on AML with inference in ONNX Runtime. 
We begin by writing a score.py file, which will help us run the model in our Azure ML virtual machine (VM), and then specify our environment by writing a yml file.\n", + "\n", + "You will also notice that we import the onnxruntime library to do runtime inference on our ONNX models (passing in input and evaluating our model's predicted output). More information on the API and commands can be found in the [ONNX Runtime documentation](https://aka.ms/onnxruntime).\n", + "\n", + "### Write Score File\n", + "\n", + "A score file is what tells our Azure cloud service what to do. After initializing our model using azureml.core.model, we start an ONNX Runtime GPU inference session to evaluate the data passed in on our function calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile score.py\n", + "import json\n", + "import numpy as np\n", + "import onnxruntime\n", + "import sys\n", + "import os\n", + "from azureml.core.model import Model\n", + "import time\n", + "\n", + "def init():\n", + " global session\n", + " model = Model.get_model_path(model_name = 'onnx_emotion')\n", + " session = onnxruntime.InferenceSession(model, None)\n", + " \n", + "def run(input_data):\n", + " '''Purpose: evaluate test input in Azure Cloud using onnxruntime.\n", + " We will call the run function later from our Jupyter Notebook \n", + " so our azure service can evaluate our model input in the cloud. 
'''\n", + "\n", + " try:\n", + " # load in our data, convert to readable format\n", + " start = time.time()\n", + " data = np.array(json.loads(input_data)['data']).astype('float32')\n", + "\n", + " r = session.run([\"Plus214_Output_0\"], {\"Input3\": data})[0]\n", + " result = emotion_map(postprocess(r[0]))\n", + " end = time.time()\n", + " result_dict = {\"result\": np.array(result).tolist(),\n", + " \"time\": np.array(end - start).tolist()}\n", + " except Exception as e:\n", + " result_dict = {\"error\": str(e)}\n", + " \n", + " return json.dumps(result_dict)\n", + "\n", + "def emotion_map(classes, N=1):\n", + " \"\"\"Take the most probable labels (output of postprocess) and returns the top N emotional labels that fit the picture.\"\"\"\n", + " \n", + " emotion_table = {'neutral':0, 'happiness':1, 'surprise':2, 'sadness':3, 'anger':4, 'disgust':5, 'fear':6, 'contempt':7}\n", + " emotion_keys = list(emotion_table.keys())\n", + " emotions = []\n", + " for i in range(N):\n", + " emotions.append(emotion_keys[classes[i]])\n", + " return emotions\n", + "\n", + "def softmax(x):\n", + " \"\"\"Compute softmax values (probabilities from 0 to 1) for each possible label.\"\"\"\n", + " x = x.reshape(-1)\n", + " e_x = np.exp(x - np.max(x))\n", + " return e_x / e_x.sum(axis=0)\n", + "\n", + "def postprocess(scores):\n", + " \"\"\"This function takes the scores generated by the network and returns the class IDs in decreasing \n", + " order of probability.\"\"\"\n", + " prob = softmax(scores)\n", + " prob = np.squeeze(prob)\n", + " classes = np.argsort(prob)[::-1]\n", + " return classes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write Environment File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.conda_dependencies import CondaDependencies \n", + "\n", + "myenv = CondaDependencies()\n", + "myenv.add_pip_package(\"numpy\")\n", + 
"myenv.add_pip_package(\"azureml-core\")\n", + "myenv.add_pip_package(\"onnxruntime-gpu\")\n", + "\n", + "\n", + "with open(\"myenv.yml\",\"w\") as f:\n", + " f.write(myenv.serialize_to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the Container Image\n", + "\n", + "This step will likely take a few minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.image import ContainerImage\n", + "\n", + "# enable_gpu = True to install CUDA 9.1 and cuDNN 7.0\n", + "\n", + "image_config = ContainerImage.image_configuration(execution_script = \"score.py\",\n", + " runtime = \"python\",\n", + " conda_file = \"myenv.yml\",\n", + " description = \"test\",\n", + " tags = {\"demo\": \"onnx\"},\n", + " enable_gpu = True\n", + " )\n", + "\n", + "\n", + "image = ContainerImage.create(name = \"onnxtest\",\n", + " # this is the model object\n", + " models = [model],\n", + " image_config = image_config,\n", + " workspace = ws)\n", + "\n", + "image.wait_for_creation(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Debugging\n", + "\n", + "In case you need to debug your code, the next line of code accesses the log file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(image.image_build_log_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're all set! 
Let's get our model chugging.\n", + "\n", + "## Deploy the container image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.webservice import AciWebservice\n", + "\n", + "aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n", + " memory_gb = 1, \n", + " tags = {'demo': 'onnx'}, \n", + " description = 'ONNX for facial emotion recognition model')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell will likely take a few minutes to run as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.webservice import Webservice\n", + "\n", + "aci_service_name = 'onnx-emotion-demo'\n", + "print(\"Service\", aci_service_name)\n", + "\n", + "aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n", + " image = image,\n", + " name = aci_service_name,\n", + " workspace = ws)\n", + "\n", + "aci_service.wait_for_deployment(True)\n", + "print(aci_service.state)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if aci_service.state != 'Healthy':\n", + " # run this command for debugging.\n", + " print(aci_service.get_logs())\n", + "\n", + " # If your deployment fails, make sure to delete your aci_service before trying again!\n", + " # aci_service.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Success!\n", + "\n", + "If you've made it this far, you've deployed a working VM with a facial emotion recognition model running in the cloud using Azure ML. Congratulations!\n", + "\n", + "Let's see how well our model deals with our test images." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Testing and Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Useful Helper Functions\n", + "\n", + "We preprocess and postprocess our data (see score.py file) using the helper functions specified in the [ONNX FER+ Model page in the Model Zoo repository](https://github.com/onnx/models/tree/master/emotion_ferplus)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(img):\n", + " \"\"\"Convert image to the right format to be passed into the model\"\"\"\n", + " input_shape = (1, 64, 64)\n", + " img = np.reshape(img, input_shape)\n", + " img = np.expand_dims(img, axis=0)\n", + " return img" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# to manipulate our arrays\n", + "import numpy as np \n", + "\n", + "# read in test data protobuf files included with the model\n", + "import onnx\n", + "from onnx import numpy_helper\n", + "\n", + "# to use parsers to read in our model/data\n", + "import json\n", + "import os\n", + "\n", + "test_inputs = []\n", + "test_outputs = []\n", + "\n", + "# read in 3 testing images from .pb files\n", + "test_data_size = 3\n", + "\n", + "for i in np.arange(test_data_size):\n", + " input_test_data = os.path.join(model_dir, 'test_data_set_{0}'.format(i), 'input_0.pb')\n", + " output_test_data = os.path.join(model_dir, 'test_data_set_{0}'.format(i), 'output_0.pb')\n", + " \n", + " # convert protobuf tensors to np arrays using the TensorProto reader from ONNX\n", + " tensor = onnx.TensorProto()\n", + " with open(input_test_data, 'rb') as f:\n", + " tensor.ParseFromString(f.read())\n", + " \n", + " input_data = preprocess(numpy_helper.to_array(tensor))\n", + " test_inputs.append(input_data)\n", + " \n", + " with open(output_test_data, 'rb') as f:\n", + "
tensor.ParseFromString(f.read())\n", + " \n", + " output_data = numpy_helper.to_array(tensor)\n", + " test_outputs.append(output_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "c3f2f57c-7454-4d3e-b38d-b0946cf066ea" + } + }, + "source": [ + "### Show some sample images\n", + "We use `matplotlib` to plot 3 images from the dataset with their labels over them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbpresent": { + "id": "396d478b-34aa-4afa-9898-cdce8222a516" + } + }, + "outputs": [], + "source": [ + "plt.figure(figsize = (20, 20))\n", + "for test_image in np.arange(3):\n", + " test_inputs[test_image].reshape(1, 64, 64)\n", + " plt.subplot(1, 8, test_image+1)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + " plt.text(x = 10, y = -10, s = test_outputs[test_image][0], fontsize = 18)\n", + " plt.imshow(test_inputs[test_image].reshape(64, 64))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run evaluation / prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize = (16, 6), frameon=False)\n", + "plt.subplot(1, 8, 1)\n", + "\n", + "plt.text(x = 0, y = -30, s = \"True Label: \", fontsize = 13, color = 'black')\n", + "plt.text(x = 0, y = -20, s = \"Result: \", fontsize = 13, color = 'black')\n", + "plt.text(x = 0, y = -10, s = \"Inference Time: \", fontsize = 13, color = 'black')\n", + "plt.text(x = 3, y = 14, s = \"Model Input\", fontsize = 12, color = 'black')\n", + "plt.text(x = 6, y = 18, s = \"(64 x 64)\", fontsize = 12, color = 'black')\n", + "plt.imshow(np.ones((28,28)), cmap=plt.cm.Greys) \n", + "\n", + "\n", + "for i in np.arange(test_data_size):\n", + " \n", + " input_data = json.dumps({'data': test_inputs[i].tolist()})\n", + "\n", + " # predict using the deployed model\n", + " r = json.loads(aci_service.run(input_data))\n", + " \n", + " if 
len(r) == 1:\n", + " print(r['error'])\n", + " break\n", + " \n", + " result = r['result']\n", + " time_ms = np.round(r['time'] * 1000, 2)\n", + " \n", + " ground_truth = int(np.argmax(test_outputs[i]))\n", + " \n", + " # compare actual value vs. the predicted values:\n", + " plt.subplot(1, 8, i+2)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + "\n", + " # use different color for misclassified sample\n", + " font_color = 'red' if ground_truth != result else 'black'\n", + " clr_map = plt.cm.gray if ground_truth != result else plt.cm.Greys\n", + "\n", + " # ground truth labels are in blue\n", + " plt.text(x = 10, y = -30, s = ground_truth, fontsize = 18, color = 'blue')\n", + " \n", + " # predictions are in black if correct, red if incorrect\n", + " plt.text(x = 10, y = -20, s = result, fontsize = 18, color = font_color)\n", + " plt.text(x = 5, y = -10, s = str(time_ms) + ' ms', fontsize = 14, color = font_color)\n", + "\n", + " \n", + " plt.imshow(test_inputs[i].reshape(64, 64), cmap = clr_map)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Try classifying your own images!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the following string with your own path/test image\n", + "# Make sure the dimensions are 64 * 64 pixels\n", + "\n", + "# Any PNG or JPG image file should work\n", + "# Make sure to include the entire path with // instead of /\n", + "\n", + "# e.g.
your_test_image = \"C://Users//vinitra.swamy//Pictures//emotion_test_images//img_1.jpg\"\n", + "\n", + "your_test_image = \"\"\n", + "\n", + "import matplotlib.image as mpimg\n", + "\n", + "if your_test_image != \"\":\n", + " img = mpimg.imread(your_test_image)\n", + " plt.subplot(1,3,1)\n", + " plt.imshow(img, cmap = plt.cm.Greys)\n", + " img = img.reshape(1, 1, 64, 64)\n", + "else:\n", + " img = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if img is None:\n", + " print(\"Add the path for your image data.\")\n", + "else:\n", + " input_data = json.dumps({'data': img.tolist()})\n", + "\n", + " try:\n", + " r = json.loads(aci_service.run(input_data))\n", + " result = r['result']\n", + " time_ms = np.round(r['time'] * 1000, 2)\n", + " except Exception as e:\n", + " print(json.loads(r)['error'])\n", + "\n", + " plt.figure(figsize = (16, 6))\n", + " plt.subplot(1, 15,1)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + " plt.text(x = -100, y = -20, s = \"Model prediction: \", fontsize = 14)\n", + " plt.text(x = -100, y = -10, s = \"Inference time: \", fontsize = 14)\n", + " plt.text(x = 0, y = -20, s = str(result), fontsize = 14)\n", + " plt.text(x = 0, y = -10, s = str(time_ms) + \" ms\", fontsize = 14)\n", + " plt.text(x = -100, y = 14, s = \"Input image: \", fontsize = 14)\n", + " plt.imshow(img.reshape(28, 28), cmap = plt.cm.Greys) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remember to delete your service after you are done using it!\n", + "\n", + "# aci_service.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "Congratulations!\n", + "\n", + "In this tutorial, you have managed to:\n", + "- familiarize yourself with the ONNX standard, ONNX Runtime inference, and the pretrained models in the ONNX model zoo\n", + "- understand a state-of-the-art 
convolutional neural net image classification model (FER+ in ONNX) and deploy it in the Azure ML cloud\n", + "- ensure that your deep learning model is working correctly (in the cloud) on test data, and check it against some of your own!\n", + "\n", + "Next steps:\n", + "- If you have not already, check out another interesting ONNX/AML application that lets you set up a state-of-the-art [handwritten image classification model (MNIST)](https://github.com/Azure/MachineLearningNotebooks/tree/master/onnx/onnx-inference-mnist.ipynb) in the cloud! This tutorial deploys a pre-trained ONNX Computer Vision model for handwritten digit classification in an Azure ML virtual machine.\n", + "- Contribute to our [open source ONNX repository on github](http://github.com/onnx/onnx) and/or add to our [ONNX model zoo](http://github.com/onnx/models)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:finaldemo]", + "language": "python", + "name": "conda-env-finaldemo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "msauthor": "vinitra.swamy" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/onnx/onnx-inference-mnist.ipynb b/onnx/onnx-inference-mnist.ipynb new file mode 100644 index 000000000..8514984e9 --- /dev/null +++ b/onnx/onnx-inference-mnist.ipynb @@ -0,0 +1,854 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 01. 
Handwritten Digit Classification (MNIST) using ONNX Runtime on AzureML\n", + "\n", + "This example shows how to deploy an image classification neural network using the Modified National Institute of Standards and Technology ([MNIST](http://yann.lecun.com/exdb/mnist/)) dataset and Open Neural Network eXchange format ([ONNX](http://aka.ms/onnxdocarticle)) on the Azure Machine Learning platform. MNIST is a popular dataset consisting of 70,000 grayscale images. Each image is a handwritten digit of 28x28 pixels, representing a number from 0 to 9. This tutorial will show you how to deploy a MNIST model from the [ONNX model zoo](https://github.com/onnx/models), use it to make predictions using ONNX Runtime Inference, and deploy it as a web service in Azure.\n", + "\n", + "Throughout this tutorial, we will be referring to ONNX, a neural network exchange format used to represent deep learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools (CNTK, PyTorch, Caffe, MXNet, TensorFlow) and choose the combination that is best for them. ONNX is developed and supported by a community of partners including Microsoft AI, Facebook, and Amazon. For more information, explore the [ONNX website](http://onnx.ai) and [open source files](https://github.com/onnx).\n", + "\n", + "[ONNX Runtime](https://aka.ms/onnxruntime) is the runtime engine that enables evaluation of trained machine learning (Traditional ML and Deep Learning) models with high performance and low resource utilization.\n", + "\n", + "#### Tutorial Objectives:\n", + "\n", + "1. Describe the MNIST dataset and pretrained Convolutional Neural Net ONNX model, stored in the ONNX model zoo.\n", + "2. Deploy and run the pretrained MNIST ONNX model on an Azure Machine Learning instance\n", + "3. Predict labels for test set data points in the cloud using ONNX Runtime and Azure ML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "### 1.
Install Azure ML SDK and create a new workspace\n", + "Please follow [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook.\n", + "\n", + "\n", + "### 2. Install additional packages needed for this Notebook\n", + "You need to install the popular plotting library `matplotlib` and the `onnx` library in the conda environment where Azure Machine Learning SDK is installed.\n", + "\n", + "```sh\n", + "(myenv) $ pip install matplotlib onnx\n", + "```\n", + "\n", + "### 3. Download sample data and pre-trained ONNX model from ONNX Model Zoo.\n", + "\n", + "[Download the ONNX MNIST model and corresponding test data](https://www.cntk.ai/OnnxModels/mnist/opset_7/mnist.tar.gz) and place them in the same folder as this tutorial notebook. You can unzip the file through the following line of code.\n", + "\n", + "```sh\n", + "(myenv) $ tar xvzf mnist.tar.gz\n", + "```\n", + "\n", + "More information can be found about the ONNX MNIST model on [github](https://github.com/onnx/models/tree/master/mnist). For more information about the MNIST dataset, please visit [Yann LeCun's website](http://yann.lecun.com/exdb/mnist/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Azure ML workspace\n", + "\n", + "We begin by instantiating a workspace object from the existing workspace created earlier in the configuration notebook."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.resource_group, ws.location, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Registering your model with Azure ML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_dir = \"mnist\" # replace this with the location of your model files\n", + "\n", + "# leave as is if it's in the same folder as this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.model import Model\n", + "\n", + "model = Model.register(model_path = model_dir + \"//model.onnx\",\n", + " model_name = \"mnist_1\",\n", + " tags = {\"onnx\": \"demo\"},\n", + " description = \"MNIST image classification CNN from ONNX Model Zoo\",\n", + " workspace = ws)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Optional: Displaying your registered models\n", + "\n", + "This step is not required, so feel free to skip it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "models = ws.models()\n", + "for m in models:\n", + " print(\"Name:\", m.name,\"\\tVersion:\", m.version, \"\\tDescription:\", m.description, m.tags)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "c3f2f57c-7454-4d3e-b38d-b0946cf066ea" + } + }, + "source": [ + "### ONNX MNIST Model Methodology\n", + "\n", + "The image classification model we are using is pre-trained using Microsoft's deep learning cognitive toolkit, [CNTK](https://github.com/Microsoft/CNTK), from the [ONNX model zoo](http://github.com/onnx/models). The model zoo has many other models that can be deployed on cloud providers like AzureML without any additional training. To ensure that our cloud deployed model works, we use testing data from the famous MNIST data set, provided as part of the [trained MNIST model](https://github.com/onnx/models/tree/master/mnist) in the ONNX model zoo.\n", + "\n", + "***Input: Handwritten Images from MNIST Dataset***\n", + "\n", + "***Task: Classify each MNIST image into an appropriate digit***\n", + "\n", + "***Output: Digit prediction for input image***\n", + "\n", + "Run the cell below to look at some of the sample images from the MNIST dataset that we used to train this ONNX model. Remember, once the application is deployed in Azure ML, you can use your own images as input for the model to classify!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for images and plots in this notebook\n", + "import matplotlib.pyplot as plt \n", + "from IPython.display import Image\n", + "\n", + "# display images inline\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Image(url=\"http://3.bp.blogspot.com/_UpN7DfJA0j4/TJtUBWPk0SI/AAAAAAAAABY/oWPMtmqJn3k/s1600/mnist_originals.png\", width=200, height=200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy our model on Azure ML" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are now going to deploy our ONNX Model on AML with inference in ONNX Runtime. We begin by writing a score.py file, which will help us run the model in our Azure ML virtual machine (VM), and then specify our environment by writing a yml file.\n", + "\n", + "You will also notice that we import the onnxruntime library to do runtime inference on our ONNX models (passing in input and evaluating our model's predicted output). More information on the API and commands can be found in the [ONNX Runtime documentation](https://aka.ms/onnxruntime).\n", + "\n", + "### Write Score File\n", + "\n", + "A score file is what tells our Azure cloud service what to do. After initializing our model using azureml.core.model, we start an ONNX Runtime inference session to evaluate the data passed in on our function calls."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile score.py\n", + "import json\n", + "import numpy as np\n", + "import onnxruntime\n", + "import sys\n", + "import os\n", + "from azureml.core.model import Model\n", + "import time\n", + "\n", + "\n", + "def init():\n", + " global session\n", + " model = Model.get_model_path(model_name = 'mnist_1')\n", + " session = onnxruntime.InferenceSession(model, None)\n", + " \n", + "def run(input_data):\n", + " '''Purpose: evaluate test input in Azure Cloud using onnxruntime.\n", + " We will call the run function later from our Jupyter Notebook \n", + " so our azure service can evaluate our model input in the cloud. '''\n", + "\n", + " try:\n", + " # load in our data, convert to readable format\n", + " start = time.time()\n", + " data = np.array(json.loads(input_data)['data']).astype('float32')\n", + "\n", + " r = session.run([\"Plus214_Output_0\"], {\"Input3\": data})[0]\n", + " result = choose_class(r[0])\n", + " end = time.time()\n", + " result_dict = {\"result\": np.array(result).tolist(),\n", + " \"time\": np.array(end - start).tolist()}\n", + " except Exception as e:\n", + " result_dict = {\"error\": str(e)}\n", + " \n", + " return json.dumps(result_dict)\n", + "\n", + "def choose_class(result_prob):\n", + " \"\"\"We use argmax to determine the right label to choose from our output, after calling softmax on the 10 numbers we receive\"\"\"\n", + " return int(np.argmax(result_prob, axis=0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write Environment File" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This step creates a YAML file that specifies which dependencies we would like to see in our Linux Virtual Machine." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.conda_dependencies import CondaDependencies \n", + "\n", + "myenv = CondaDependencies()\n", + "myenv.add_pip_package(\"numpy\")\n", + "myenv.add_pip_package(\"azureml-core\")\n", + "myenv.add_pip_package(\"onnxruntime\")\n", + "\n", + "\n", + "with open(\"myenv.yml\",\"w\") as f:\n", + " f.write(myenv.serialize_to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the Container Image\n", + "\n", + "This step will likely take a few minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.image import ContainerImage\n", + "\n", + "image_config = ContainerImage.image_configuration(execution_script = \"score.py\",\n", + " runtime = \"python\",\n", + " conda_file = \"myenv.yml\",\n", + " description = \"test\",\n", + " tags = {\"demo\": \"onnx\"} \n", + " )\n", + "\n", + "\n", + "image = ContainerImage.create(name = \"onnxtest\",\n", + " # this is the model object\n", + " models = [model],\n", + " image_config = image_config,\n", + " workspace = ws)\n", + "\n", + "image.wait_for_creation(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Debugging\n", + "\n", + "In case you need to debug your code, the next line of code accesses the log file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(image.image_build_log_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're all set! 
Let's get our model chugging.\n", + "\n", + "## Deploy the container image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.webservice import AciWebservice\n", + "\n", + "aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n", + " memory_gb = 1, \n", + " tags = {'demo': 'onnx'}, \n", + " description = 'ONNX for mnist model')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell will likely take a few minutes to run as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.webservice import Webservice\n", + "\n", + "aci_service_name = 'onnx-demo-mnist'\n", + "print(\"Service\", aci_service_name)\n", + "\n", + "aci_service = Webservice.deploy_from_image(deployment_config = aciconfig,\n", + " image = image,\n", + " name = aci_service_name,\n", + " workspace = ws)\n", + "\n", + "aci_service.wait_for_deployment(True)\n", + "print(aci_service.state)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if aci_service.state != 'Healthy':\n", + " # run this command for debugging.\n", + " print(aci_service.get_logs())\n", + "\n", + " # If your deployment fails, make sure to delete your aci_service or rename your service before trying again!\n", + " # aci_service.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Success!\n", + "\n", + "If you've made it this far, you've deployed a working VM with a handwritten digit classifier running in the cloud using Azure ML. Congratulations!\n", + "\n", + "Let's see how well our model deals with our test images." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Testing and Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Test Data\n", + "\n", + "These are already in your directory from your ONNX model download (from the model zoo). If you didn't place your model and test data in the same directory as this notebook, edit the \"model_dir\" filename below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# to manipulate our arrays\n", + "import numpy as np \n", + "\n", + "# read in test data protobuf files included with the model\n", + "import onnx\n", + "from onnx import numpy_helper\n", + "\n", + "# to use parsers to read in our model/data\n", + "import json\n", + "import os\n", + "\n", + "test_inputs = []\n", + "test_outputs = []\n", + "\n", + "# read in 3 testing images from .pb files\n", + "test_data_size = 3\n", + "\n", + "for i in np.arange(test_data_size):\n", + " input_test_data = os.path.join(model_dir, 'test_data_set_{0}'.format(i), 'input_0.pb')\n", + " output_test_data = os.path.join(model_dir, 'test_data_set_{0}'.format(i), 'output_0.pb')\n", + " \n", + " # convert protobuf tensors to np arrays using the TensorProto reader from ONNX\n", + " tensor = onnx.TensorProto()\n", + " with open(input_test_data, 'rb') as f:\n", + " tensor.ParseFromString(f.read())\n", + " \n", + " input_data = numpy_helper.to_array(tensor)\n", + " test_inputs.append(input_data)\n", + " \n", + " with open(output_test_data, 'rb') as f:\n", + " tensor.ParseFromString(f.read())\n", + " \n", + " output_data = numpy_helper.to_array(tensor)\n", + " test_outputs.append(output_data)\n", + " \n", + "if len(test_inputs) == test_data_size:\n", + " print('Test data loaded successfully.')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbpresent": { + "id": "c3f2f57c-7454-4d3e-b38d-b0946cf066ea" + } + }, + "source": [ + "### Show some sample images\n", 
+ "We use `matplotlib` to plot 3 test images from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbpresent": { + "id": "396d478b-34aa-4afa-9898-cdce8222a516" + } + }, + "outputs": [], + "source": [ + "plt.figure(figsize = (16, 6))\n", + "for test_image in np.arange(3):\n", + " plt.subplot(1, 15, test_image+1)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + " plt.imshow(test_inputs[test_image].reshape(28, 28), cmap = plt.cm.Greys)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run evaluation / prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize = (16, 6), frameon=False)\n", + "plt.subplot(1, 8, 1)\n", + "\n", + "plt.text(x = 0, y = -30, s = \"True Label: \", fontsize = 13, color = 'black')\n", + "plt.text(x = 0, y = -20, s = \"Result: \", fontsize = 13, color = 'black')\n", + "plt.text(x = 0, y = -10, s = \"Inference Time: \", fontsize = 13, color = 'black')\n", + "plt.text(x = 3, y = 14, s = \"Model Input\", fontsize = 12, color = 'black')\n", + "plt.text(x = 6, y = 18, s = \"(28 x 28)\", fontsize = 12, color = 'black')\n", + "plt.imshow(np.ones((28,28)), cmap=plt.cm.Greys) \n", + "\n", + "\n", + "for i in np.arange(test_data_size):\n", + " \n", + " input_data = json.dumps({'data': test_inputs[i].tolist()})\n", + " \n", + " # predict using the deployed model\n", + " r = json.loads(aci_service.run(input_data))\n", + " \n", + " if len(r) == 1:\n", + " print(r['error'])\n", + " break\n", + " \n", + " result = r['result']\n", + " time_ms = np.round(r['time'] * 1000, 2)\n", + " \n", + " ground_truth = int(np.argmax(test_outputs[i]))\n", + " \n", + " # compare actual value vs. 
the predicted values:\n", + " plt.subplot(1, 8, i+2)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + "\n", + " # use different color for misclassified sample\n", + " font_color = 'red' if ground_truth != result else 'black'\n", + " clr_map = plt.cm.gray if ground_truth != result else plt.cm.Greys\n", + "\n", + " # ground truth labels are in blue\n", + " plt.text(x = 10, y = -30, s = ground_truth, fontsize = 18, color = 'blue')\n", + " \n", + " # predictions are in black if correct, red if incorrect\n", + " plt.text(x = 10, y = -20, s = result, fontsize = 18, color = font_color)\n", + " plt.text(x = 5, y = -10, s = str(time_ms) + ' ms', fontsize = 14, color = font_color)\n", + "\n", + " \n", + " plt.imshow(test_inputs[i].reshape(28, 28), cmap = clr_map)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Try classifying your own images!\n", + "\n", + "Create your own 28 pixel by 28 pixel handwritten image and pass it into the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing functions\n", + "\n", + "def rgb2gray(rgb):\n", + " \"\"\"Convert the input image into grayscale\"\"\"\n", + " return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])\n", + "\n", + "def preprocess(img):\n", + " \"\"\"Resize input images and convert them to grayscale.\"\"\"\n", + " if img.shape[0] != 28:\n", + " print(\"Input image size is not 28 * 28 pixels. Please resize and try again.\")\n", + " grayscale = rgb2gray(img)\n", + " grayscale.resize((1, 1, 28, 28))\n", + " return grayscale" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace this string with your own path/test image\n", + "# Make sure the dimensions are 28 * 28 pixels\n", + "\n", + "# Any PNG or JPG image file should work\n", + "# Make sure to include the entire path with // instead of /\n", + "\n", + "# e.g. 
your_test_image = \"C://Users//vinitra.swamy//Pictures//digit.png\"\n", + "\n", + "your_test_image = \"\"\n", + "\n", + "import matplotlib.image as mpimg\n", + "\n", + "if your_test_image != \"\":\n", + " img = mpimg.imread(your_test_image)\n", + " plt.subplot(1,3,1)\n", + " plt.imshow(img, cmap = plt.cm.Greys)\n", + " print(\"Old Dimensions: \", img.shape)\n", + " img = preprocess(img)\n", + " print(\"New Dimensions: \", img.shape)\n", + "else:\n", + " img = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if img is None:\n", + " print(\"Add the path for your image data.\")\n", + "else:\n", + " input_data = json.dumps({'data': img.tolist()})\n", + "\n", + " try:\n", + " r = json.loads(aci_service.run(input_data))\n", + " result = r['result']\n", + " time_ms = np.round(r['time'] * 1000, 2)\n", + " except Exception as e:\n", + " print(str(e), r['error'])\n", + "\n", + " plt.figure(figsize = (16, 6))\n", + " plt.subplot(1, 15,1)\n", + " plt.axhline('')\n", + " plt.axvline('')\n", + " plt.text(x = -100, y = -20, s = \"Model prediction: \", fontsize = 14)\n", + " plt.text(x = -100, y = -10, s = \"Inference time: \", fontsize = 14)\n", + " plt.text(x = 0, y = -20, s = str(result), fontsize = 14)\n", + " plt.text(x = 0, y = -10, s = str(time_ms) + \" ms\", fontsize = 14)\n", + " plt.text(x = -100, y = 14, s = \"Input image: \", fontsize = 14)\n", + " plt.imshow(img.reshape(28, 28), cmap = plt.cm.gray) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Optional: How does our MNIST model work? \n", + "#### A brief explanation of Convolutional Neural Networks\n", + "\n", + "A [convolutional neural network](https://en.wikipedia.org/wiki/Convolutional_neural_network) (CNN, or ConvNet) is a type of [feed-forward](https://en.wikipedia.org/wiki/Feedforward_neural_network) artificial neural network made up of neurons that have learnable weights and biases. 
The CNNs take advantage of the spatial nature of the data. In nature, we perceive different objects by their shapes, size and colors. For example, objects in a natural scene are typically edges, corners/vertices (defined by two or more edges), color patches etc. These primitives are often identified using different detectors (e.g., edge detection, color detector) or combination of detectors interacting to facilitate image interpretation (object classification, region of interest detection, scene description etc.) in real world vision related tasks. These detectors are also known as filters. Convolution is a mathematical operator that takes an image and a filter as input and produces a filtered output (representing say edges, corners, or colors in the input image). \n", + "\n", + "Historically, these filters are a set of weights that were often hand crafted or modeled with mathematical functions (e.g., [Gaussian](https://en.wikipedia.org/wiki/Gaussian_filter) / [Laplacian](http://homepages.inf.ed.ac.uk/rbf/HIPR2/log.htm) / [Canny](https://en.wikipedia.org/wiki/Canny_edge_detector) filter). The filter outputs are mapped through non-linear activation functions mimicking human brain cells called [neurons](https://en.wikipedia.org/wiki/Neuron). Popular deep CNNs or ConvNets (such as [AlexNet](https://en.wikipedia.org/wiki/AlexNet), [VGG](https://arxiv.org/abs/1409.1556), [Inception](http://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Szegedy_Going_Deeper_With_2015_CVPR_paper.pdf), [ResNet](https://arxiv.org/pdf/1512.03385v1.pdf)) that are used for various [computer vision](https://en.wikipedia.org/wiki/Computer_vision) tasks have many of these architectural primitives (inspired from biology). \n", + "\n", + "### Convolution Layer\n", + "\n", + "A convolution layer is a set of filters.
Each filter is defined by a weight (**W**) matrix, and bias ($b$).\n", + "\n", + "![](https://www.cntk.ai/jup/cntk103d_filterset_v2.png)\n", + "\n", + "These filters are scanned across the image performing the dot product between the weights and corresponding input value ($x$). The bias value is added to the output of the dot product and the resulting sum is optionally mapped through an activation function. This process is illustrated in the following animation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Image(url=\"https://www.cntk.ai/jup/cntk103d_conv2d_final.gif\", width= 200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Description\n", + "\n", + "The MNIST model from the ONNX Model Zoo uses max pooling to downsample the feature maps produced by its convolution layers, as summarized by the graphic below. You can see the entire workflow of our pre-trained model in the following image, with our input images and our output probabilities of each of our 10 labels. If you're interested in exploring the logic behind creating a Deep Learning model further, please look at the [training tutorial for our ONNX MNIST Convolutional Neural Network](https://github.com/Microsoft/CNTK/blob/master/Tutorials/CNTK_103D_MNIST_ConvolutionalNeuralNetwork.ipynb). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Max-Pooling for Convolutional Neural Nets\n", + "\n", + "![](http://www.cntk.ai/jup/c103d_max_pooling.gif)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Pre-Trained Model Architecture\n", + "\n", + "![](http://www.cntk.ai/jup/conv103d_mnist-conv-mp.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Try classifying your own images!\n", + "\n", + "Create your own 28 pixel by 28 pixel handwritten image and pass it into the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing functions\n", + "\n", + "def rgb2gray(rgb):\n", + " \"\"\"Convert the input image into grayscale\"\"\"\n", + " return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])\n", + "\n", + "def preprocess(img):\n", + " \"\"\"Resize input images and convert them to grayscale.\"\"\"\n", + " if img.shape[0] != 28:\n", + " print(\"Input image size is not 28 * 28 pixels. Please resize and try again.\")\n", + " grayscale = rgb2gray(img)\n", + " grayscale.resize((1, 1, 28, 28))\n", + " return grayscale" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace this string with your own path/test image\n", + "# Make sure the dimensions are 28 * 28 pixels\n", + "\n", + "# Any PNG or JPG image file should work\n", + "# Make sure to include the entire path with // instead of /\n", + "\n", + "# e.g. your_test_image = \"C://Users//vinitra.swamy//Pictures//digit.png\"\n", + "\n", + "your_test_image = \"\"\n", + "\n", + "import matplotlib.image as mpimg\n", + "\n", + "if your_test_image != \"\":\n", + " img = mpimg.imread(your_test_image)\n", + " plt.subplot(1,3,1)\n", + " plt.imshow(img, cmap = plt.cm.Greys)\n", + " print(\"Old Dimensions: \", img.shape)\n", + " img = preprocess(img)\n", + " print(\"New Dimensions: \", img.shape)\n", + "else:\n", + " img = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if img is None:\n", + " print(\"Add the path for your image data.\")\n", + "else:\n", + " input_data = json.dumps({'data': img.tolist()})\n", + "\n", + " try:\n", + " r = json.loads(aci_service.run(input_data))\n", + " result = r['result']\n", + " time_ms = np.round(r['time'] * 1000, 2)\n", + " except Exception as e:\n", + " print(str(e), r['error'])\n", + "\n", + " plt.figure(figsize = (16, 6))\n", + " plt.subplot(1, 15,1)\n", 
+ " plt.axhline('')\n", + " plt.axvline('')\n", + " plt.text(x = -100, y = -20, s = \"Model prediction: \", fontsize = 14)\n", + " plt.text(x = -100, y = -10, s = \"Inference time: \", fontsize = 14)\n", + " plt.text(x = 0, y = -20, s = str(result), fontsize = 14)\n", + " plt.text(x = 0, y = -10, s = str(time_ms) + \" ms\", fontsize = 14)\n", + " plt.text(x = -100, y = 14, s = \"Input image: \", fontsize = 14)\n", + " plt.imshow(img.reshape(28, 28), cmap = plt.cm.gray) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remember to delete your service after you are done using it!\n", + "# uncomment the following line of code to delete your service\n", + "\n", + "# aci_service.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "Congratulations!\n", + "\n", + "In this tutorial, you have managed to:\n", + "- familiarize yourself with the ONNX model format, ONNX Runtime inference, and the pretrained models in the ONNX model zoo\n", + "- understand a state-of-the-art convolutional neural net image classification model (MNIST in ONNX) and deploy it in the Azure ML cloud\n", + "- ensure that your deep learning model is working perfectly (in the cloud) on test data, and check it against some of your own!\n", + "\n", + "Next steps:\n", + "- Check out another interesting application based on a Microsoft Research computer vision paper that lets you set up a [facial emotion recognition model](https://github.com/Azure/MachineLearningNotebooks/tree/master/onnx/onnx-inference-emotion-recognition.ipynb) in the cloud! 
This tutorial deploys a pre-trained ONNX Computer Vision model in an Azure ML virtual machine with GPU support.\n", + "- Contribute to our [open source ONNX repository on github](http://github.com/onnx/onnx) and/or add to our [ONNX model zoo](http://github.com/onnx/models)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:finaldemo]", + "language": "python", + "name": "conda-env-finaldemo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "msauthor": "vinitra.swamy" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pipeline/06.pipeline-batch-scoring.ipynb b/pipeline/pipeline-batch-scoring.ipynb similarity index 99% rename from pipeline/06.pipeline-batch-scoring.ipynb rename to pipeline/pipeline-batch-scoring.ipynb index bf53e4d95..d2b334350 100644 --- a/pipeline/06.pipeline-batch-scoring.ipynb +++ b/pipeline/pipeline-batch-scoring.ipynb @@ -375,7 +375,7 @@ "metadata": {}, "outputs": [], "source": [ - "node_run = list(pipeline_run.get_children())[0]" + "step_run = list(pipeline_run.get_children())[0]" ] }, { @@ -384,7 +384,7 @@ "metadata": {}, "outputs": [], "source": [ - "node_run.download_file(\"./outputs/result-labels.txt\")" + "step_run.download_file(\"./outputs/result-labels.txt\")" ] }, { @@ -522,7 +522,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.3" } }, "nbformat": 4, diff --git a/training/01.train-tune-deploy-pytorch/01.train-tune-deploy-pytorch.ipynb b/training/01.train-tune-deploy-pytorch/01.train-tune-deploy-pytorch.ipynb new file mode 100644 index 000000000..1e6a6e16c --- /dev/null +++ b/training/01.train-tune-deploy-pytorch/01.train-tune-deploy-pytorch.ipynb @@ -0,0 +1,641 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 01. Train and deploy with PyTorch\n", + "\n", + "In this tutorial, you will train, hyperparameter tune, and deploy a PyTorch model using the Azure Machine Learning (AML) Python SDK.\n", + "\n", + "This tutorial will train an image classification model using transfer learning, based on PyTorch's [Transfer Learning tutorial](https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html). The model is trained to classify ants and bees by first using a pretrained ResNet18 model that has been trained on the [ImageNet](http://image-net.org/index) dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning\n", + "* Go through the [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook to:\n", + " * install the AML SDK\n", + " * create a workspace and its configuration file (`config.json`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize workspace\n", + "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a remote compute target\n", + "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to execute your training script on. In this tutorial, you create an [Azure Batch AI](https://docs.microsoft.com/azure/batch-ai/overview) cluster as your training compute resource. This code creates a cluster for you if it does not already exist in your workspace.\n", + "\n", + "**Creation of the cluster takes approximately 5 minutes.** If the cluster is already in your workspace this code will skip the cluster creation process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, BatchAiCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "# choose a name for your cluster\n", + "cluster_name = \"gpucluster\"\n", + "\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing compute target.')\n", + "except ComputeTargetException:\n", + " print('Creating a new compute target...')\n", + " compute_config = BatchAiCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n", + " autoscale_enabled=True,\n", + " cluster_min_nodes=0, \n", + " cluster_max_nodes=4)\n", + "\n", + " # create the cluster\n", + " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + " compute_target.wait_for_completion(show_output=True)\n", + "\n", + " # Use the 'status' property to get a detailed status for the current cluster. \n", + " print(compute_target.status.serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload training data\n", + "The dataset we will use consists of about 120 training images each for ants and bees, with 75 validation images for each class." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, download the dataset (located [here](https://download.pytorch.org/tutorial/hymenoptera_data.zip) as a zip file) locally to your current directory and extract the files. This will create a folder called `hymenoptera_data` with two subfolders `train` and `val` that contain the training and validation images, respectively. 
[Hymenoptera](https://en.wikipedia.org/wiki/Hymenoptera) is the order of insects that includes ants and bees." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import urllib\n", + "from zipfile import ZipFile\n", + "\n", + "# download data\n", + "download_url = 'https://download.pytorch.org/tutorial/hymenoptera_data.zip'\n", + "data_file = './hymenoptera_data.zip'\n", + "urllib.request.urlretrieve(download_url, filename=data_file)\n", + "\n", + "# extract files\n", + "with ZipFile(data_file, 'r') as zip:\n", + " print('extracting files...')\n", + " zip.extractall()\n", + " print('done')\n", + " \n", + "# delete zip file\n", + "os.remove(data_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make the data accessible for remote training, you will need to upload the data from your local machine to the cloud. AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data, and interact with it from your remote compute targets. \n", + "\n", + "**Note: If your data is already stored in Azure, or you download the data as part of your training script, you will not need to do this step.**\n", + "\n", + "Each workspace is associated with a default datastore. In this tutorial, we will upload the training data to this default datastore." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = ws.get_default_datastore()\n", + "print(ds.datastore_type, ds.account_name, ds.container_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following code will upload the training data to the path `./hymenoptera_data` on the default datastore." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds.upload(src_dir='./hymenoptera_data', target_path='hymenoptera_data')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's get a reference to the path on the datastore with the training data. We can do so using the `path` method. In the next section, we can then pass this reference to our training script's `--data_dir` argument. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "path_on_datastore = 'hymenoptera_data'\n", + "ds_data = ds.path(path_on_datastore)\n", + "print(ds_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train model on the remote compute\n", + "Now that you have your data and training script prepared, you are ready to train on your remote compute cluster. You can take advantage of Azure compute to leverage GPUs to cut down your training time. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a project directory\n", + "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "project_folder = './pytorch-hymenoptera'\n", + "os.makedirs(project_folder, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare training script\n", + "Now you will need to create your training script. In this tutorial, the training script is already provided for you at `pytorch_train.py`. 
In practice, you should be able to take any custom training script as is and run it with AML without having to modify your code.\n", + "\n", + "However, if you would like to use AML's [tracking and metrics](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#metrics) capabilities, you will have to add a small amount of AML code inside your training script. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy the training script `pytorch_train.py` into your project directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "shutil.copy('pytorch_train.py', project_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an experiment\n", + "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this transfer learning PyTorch tutorial. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "\n", + "experiment_name = 'pytorch-hymenoptera'\n", + "experiment = Experiment(ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a PyTorch estimator\n", + "The AML SDK's PyTorch estimator enables you to easily submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch). The following code will define a single-node PyTorch job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.dnn import PyTorch\n", + "\n", + "script_params = {\n", + " '--data_dir': ds_data,\n", + " '--num_epochs': 25,\n", + " '--output_dir': './outputs'\n", + "}\n", + "\n", + "estimator = PyTorch(source_directory=project_folder, \n", + " script_params=script_params,\n", + " compute_target=compute_target,\n", + " entry_script='pytorch_train.py',\n", + " use_gpu=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `script_params` parameter is a dictionary containing the command-line arguments to your training script `entry_script`. Please note the following:\n", + "- We passed our training data reference `ds_data` to our script's `--data_dir` argument. This will 1) mount our datastore on the remote compute and 2) provide the path to the training data `hymenoptera_data` on our datastore.\n", + "- We specified the output directory as `./outputs`. The `outputs` directory is specially treated by AML in that all the content in this directory gets uploaded to your workspace as part of your run history. The files written to this directory are therefore accessible even once your remote run is over. In this tutorial, we will save our trained model to this output directory.\n", + "\n", + "To leverage the Azure VM's GPU for training, we set `use_gpu=True`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit job\n", + "Run your experiment by submitting your estimator object. Note that this call is asynchronous." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = experiment.submit(estimator)\n", + "print(run.get_details())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor your run\n", + "You can monitor the progress of the run with a Jupyter widget. 
Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.widgets import RunDetails\n", + "RunDetails(run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register the trained model\n", + "Finally, register the trained model from your run to your workspace. The `model_path` parameter takes in the relative path on the remote VM to the model in your `outputs` directory. In the next section, we will deploy this registered model as a web service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = run.register_model(model_name = 'pytorch-hymenoptera', model_path = 'outputs/model.pt')\n", + "print(model.name, model.id, model.version, sep = '\\t')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy model as web service\n", + "Once you have your trained model, you can deploy the model on Azure. In this tutorial, we will deploy the model as a web service in [Azure Container Instances](https://docs.microsoft.com/en-us/azure/container-instances/) (ACI). For more information on deploying models using Azure ML, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-deploy-and-where)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create scoring script\n", + "\n", + "First, we will create a scoring script that will be invoked by the web service call. Note that the scoring script must have two required functions:\n", + "* `init()`: In this function, you typically load the model into a `global` object. This function is executed only once when the Docker container is started. \n", + "* `run(input_data)`: In this function, the model is used to predict a value based on the input data. 
The input and output typically use JSON as serialization and deserialization format, but you are not limited to that.\n", + "\n", + "Refer to the scoring script `pytorch_score.py` for this tutorial. Our web service will use this file to predict whether an image is an ant or a bee. When writing your own scoring script, don't forget to test it locally first before you go and deploy the web service." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create environment file\n", + "Then, we will need to create an environment file (`myenv.yml`) that specifies all of the scoring script's package dependencies. This file is used to ensure that all of those dependencies are installed in the Docker image by AML. In this case, we need to specify `torch`, `torchvision`, `pillow`, and `azureml-core`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile myenv.yml\n", + "name: myenv\n", + "channels:\n", + " - defaults\n", + "dependencies:\n", + " - pip:\n", + " - torch\n", + " - torchvision\n", + " - pillow\n", + " # Required packages for AzureML execution, history, and data preparation.\n", + " - --extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/Preview/E7501C02541B433786111FE8E140CAA1\n", + " - azureml-core" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the container image\n", + "Now configure the Docker image that you will use to build your ACI container." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.image import ContainerImage\n", + "\n", + "image_config = ContainerImage.image_configuration(execution_script='pytorch_score.py', \n", + " runtime='python', \n", + " conda_file='myenv.yml',\n", + " description='Image with hymenoptera model')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the ACI container\n", + "We are almost ready to deploy. Create a deployment configuration file to specify the number of CPUs and gigabytes of RAM needed for your ACI container. While it depends on your model, the default of `1` core and `1` gigabyte of RAM is usually sufficient for many models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.webservice import AciWebservice\n", + "\n", + "aciconfig = AciWebservice.deploy_configuration(cpu_cores=1, \n", + " memory_gb=1, \n", + " tags={'data': 'hymenoptera', 'method':'transfer learning', 'framework':'pytorch'},\n", + " description='Classify ants/bees using transfer learning with PyTorch')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deploy the registered model\n", + "Finally, let's deploy a web service from our registered model. First, retrieve the model from your workspace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.model import Model\n", + "\n", + "model = Model(ws, name='pytorch-hymenoptera')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, deploy the web service using the ACI config and image config files created in the previous steps. We pass the `model` object in a list to the `models` parameter. If you would like to deploy more than one registered model, append the additional models to this list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "from azureml.core.webservice import Webservice\n", + "\n", + "service_name = 'aci-hymenoptera'\n", + "service = Webservice.deploy_from_model(workspace=ws,\n", + " name=service_name,\n", + " models=[model],\n", + " image_config=image_config,\n", + " deployment_config=aciconfig,)\n", + "\n", + "service.wait_for_deployment(show_output=True)\n", + "print(service.state)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If your deployment fails for any reason and you need to redeploy, make sure to delete the service before you do so: `service.delete()`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To get the logs from the deployment process, run the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "service.get_logs()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the web service's HTTP endpoint, which accepts REST client calls. This endpoint can be shared with anyone who wants to test the web service or integrate it into an application." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(service.scoring_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the web service\n", + "Finally, let's test our deployed web service. We will send the data as a JSON string to the web service hosted in ACI and use the SDK's `run` API to invoke the service. Here we will take an arbitrary image from our validation data to predict on." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, json, base64\n", + "from io import BytesIO\n", + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def imgToBase64(img):\n", + " \"\"\"Convert pillow image to base64-encoded image\"\"\"\n", + " imgio = BytesIO()\n", + " img.save(imgio, 'JPEG')\n", + " img_str = base64.b64encode(imgio.getvalue())\n", + " return img_str.decode('utf-8')\n", + "\n", + "test_img = os.path.join('hymenoptera_data', 'val', 'bees', '10870992_eebeeb3a12.jpg') # arbitrary image from val dataset\n", + "plt.imshow(Image.open(test_img))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "base64Img = imgToBase64(Image.open(test_img))\n", + "\n", + "result = service.run(input_data=json.dumps({'data': base64Img}))\n", + "print(json.loads(result))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete web service\n", + "Once you no longer need the web service, you should delete it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "service.delete()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:amlsdk]", + "language": "python", + "name": "conda-env-amlsdk-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "msauthor": "minxia" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/training/01.train-tune-deploy-pytorch/pytorch_score.py b/training/01.train-tune-deploy-pytorch/pytorch_score.py new file mode 100644 index 000000000..7bed01a8a --- /dev/null +++ b/training/01.train-tune-deploy-pytorch/pytorch_score.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. + +import torch +import torch.nn as nn +import torchvision +from torchvision import transforms +import os +import json +import base64 +from io import BytesIO +from PIL import Image + +from azureml.core.model import Model + + +def preprocess_image(image_file): + """Preprocess the input image.""" + data_transforms = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + image = Image.open(image_file) + image = data_transforms(image).float() + image = torch.tensor(image) + image = image.unsqueeze(0) + return image + + +def base64ToImg(base64ImgString): + base64Img = base64ImgString.encode('utf-8') + decoded_img = base64.b64decode(base64Img) + return BytesIO(decoded_img) + + +def init(): + global model + model_path = Model.get_model_path('pytorch-hymenoptera') + model = torch.load(model_path, map_location=lambda storage, loc: storage) + model.eval() + + +def run(input_data): + img = 
def load_data(data_dir):
    """Build the train/val data loaders for an image-folder dataset.

    Args:
        data_dir: Directory containing ``train`` and ``val`` sub-directories,
            each readable by ``torchvision.datasets.ImageFolder``.

    Returns:
        A ``(dataloaders, dataset_sizes, class_names)`` tuple. The first two
        are dicts keyed by ``'train'`` / ``'val'``; ``class_names`` is the
        class list reported by the training dataset.
    """
    phases = ['train', 'val']

    # Data augmentation and normalization for training,
    # just normalization for validation.
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val': transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }

    image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                              data_transforms[x])
                      for x in phases}
    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                                  shuffle=True, num_workers=0)
                   for x in phases}
    dataset_sizes = {x: len(image_datasets[x]) for x in phases}
    class_names = image_datasets['train'].classes

    return dataloaders, dataset_sizes, class_names


def train_model(model, criterion, optimizer, scheduler, num_epochs, data_dir):
    """Train ``model`` and return it loaded with its best validation weights.

    Args:
        model: The ``nn.Module`` to train; batches are moved to the device
            its parameters live on.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of train+val epochs to run.
        data_dir: Dataset root passed to ``load_data``.

    Returns:
        ``model`` with the state dict of the epoch that achieved the highest
        validation accuracy.
    """
    # load training/validation data
    dataloaders, dataset_sizes, _ = load_data(data_dir)

    # Follow the model's own placement instead of guessing it again from
    # torch.cuda.is_available(); the two can disagree for a CPU model on a
    # GPU machine.
    device = next(model.parameters()).device
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train mode for 'train', eval mode for 'val'

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only in the training phase
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics (criterion returns a batch mean, so re-weight it)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

            if is_train:
                # Step the schedule AFTER this epoch's optimizer updates.
                # Stepping before them starts the decay one epoch early.
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = float(running_corrects) / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


def fine_tune_model(num_epochs, data_dir):
    """Load a pretrained ResNet-18, reset its final layer, and fine-tune it.

    Args:
        num_epochs: Number of epochs to train for.
        data_dir: Dataset root passed through to ``train_model``.

    Returns:
        The fine-tuned model, as returned by ``train_model``.
    """
    model_ft = models.resnet18(pretrained=True)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Linear(num_ftrs, 2)  # only 2 classes to predict

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model_ft = model_ft.to(device)

    criterion = nn.CrossEntropyLoss()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

    return train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs, data_dir)


def main():
    """Parse command-line arguments, fine-tune the model, and save it."""
    # Diagnostic listing of the working-directory tree, kept from the
    # original script.
    for root, dirs, _files in os.walk("."):
        print(root)
        print(dirs)

    # get command-line arguments
    parser = argparse.ArgumentParser()
    # Both directories are needed below; make argparse report a missing one
    # instead of failing later on a None value.
    parser.add_argument('--data_dir', type=str, required=True,
                        help='directory of training data')
    parser.add_argument('--num_epochs', type=int, default=25,
                        help='number of epochs to train')
    parser.add_argument('--output_dir', type=str, required=True,
                        help='output directory')
    args = parser.parse_args()

    print("data directory is: " + args.data_dir)
    model = fine_tune_model(args.num_epochs, args.data_dir)
    os.makedirs(args.output_dir, exist_ok=True)
    # Saves the whole module object (not just its state dict).
    torch.save(model, os.path.join(args.output_dir, 'model.pt'))


if __name__ == "__main__":
    main()
"markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 02. Distributed PyTorch with Horovod\n", + "In this tutorial, you will train a PyTorch model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using distributed training via [Horovod](https://github.com/uber/horovod)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n", + "* Go through the [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook to:\n", + " * install the AML SDK\n", + " * create a workspace and its configuration file (`config.json`)\n", + "* Review the [tutorial](https://aka.ms/aml-notebook-pytorch) on single-node PyTorch training using the SDK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize workspace\n", + "\n", + "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a remote compute target\n", + "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to execute your training script on. In this tutorial, you create an [Azure Batch AI](https://docs.microsoft.com/azure/batch-ai/overview) cluster as your training compute resource. This code creates a cluster for you if it does not already exist in your workspace.\n", + "\n", + "**Creation of the cluster takes approximately 5 minutes.** If the cluster is already in your workspace this code will skip the cluster creation process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, BatchAiCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "# choose a name for your cluster\n", + "cluster_name = \"gpucluster\"\n", + "\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing compute target.')\n", + "except ComputeTargetException:\n", + " print('Creating a new compute target...')\n", + " compute_config = BatchAiCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n", + " autoscale_enabled=True,\n", + " cluster_min_nodes=0, \n", + " cluster_max_nodes=4)\n", + "\n", + " # create the cluster\n", + " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + " compute_target.wait_for_completion(show_output=True)\n", + "\n", + " # Use the 'status' property to get a detailed status for the current cluster. \n", + " print(compute_target.status.serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train model on the remote compute\n", + "Now that we have the cluster ready to go, let's run our distributed training job." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a project directory\n", + "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "project_folder = './pytorch-distr-hvd'\n", + "os.makedirs(project_folder, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy the training script `pytorch_horovod_mnist.py` into this project directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "shutil.copy('pytorch_horovod_mnist.py', project_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an experiment\n", + "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed PyTorch tutorial. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "\n", + "experiment_name = 'pytorch-distr-hvd'\n", + "experiment = Experiment(ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a PyTorch estimator\n", + "The AML SDK's PyTorch estimator enables you to easily submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.dnn import PyTorch\n", + "\n", + "estimator = PyTorch(source_directory=project_folder,\n", + " compute_target=compute_target,\n", + " entry_script='pytorch_horovod_mnist.py',\n", + " node_count=2,\n", + " process_count_per_node=1,\n", + " distributed_backend='mpi',\n", + " use_gpu=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI/Horovod, you must provide the argument `distributed_backend='mpi'`. Using this estimator with these settings, PyTorch, Horovod and their dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `PyTorch` constructor's `pip_packages` or `conda_packages` parameters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit job\n", + "Run your experiment by submitting your estimator object. Note that this call is asynchronous." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = experiment.submit(estimator)\n", + "print(run.get_details())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor your run\n", + "You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.widgets import RunDetails\n", + "RunDetails(run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, you can block until the script has completed training before running more code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True) # this provides a verbose log" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "msauthor": "minxia" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/training/02.distributed-pytorch-with-horovod/pytorch_horovod_mnist.py b/training/02.distributed-pytorch-with-horovod/pytorch_horovod_mnist.py new file mode 100644 index 000000000..a513cff97 --- /dev/null +++ b/training/02.distributed-pytorch-with-horovod/pytorch_horovod_mnist.py @@ -0,0 +1,157 @@ +# Copyright 2017 Uber Technologies, Inc. 
+# Licensed under the Apache License, Version 2.0 +# Script from horovod/examples: https://github.com/uber/horovod/blob/master/examples/pytorch_mnist.py + +from __future__ import print_function +import argparse +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +from torch.autograd import Variable +import torch.utils.data.distributed +import horovod.torch as hvd + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() + +hvd.init() +torch.manual_seed(args.seed) + +if args.cuda: + # Horovod: pin GPU to local rank. 
class Net(nn.Module):
    """Small convolutional classifier producing 10-way log-probabilities.

    Two 5x5 convolutions (each followed by 2x2 max-pooling and ReLU) feed two
    fully connected layers. The flatten size of 320 matches single-channel
    28x28 inputs such as the MNIST images this script loads.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: Batch of images; for 1x28x28 inputs the result has one row of
                10 log-probabilities per image.
        """
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # 20 channels * 4 * 4 positions remain after the two conv/pool stages
        # when the input is 28x28.
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Normalise over the class dimension explicitly; omitting `dim`
        # relies on a deprecated implicit choice and emits a warning.
        return F.log_softmax(x, dim=1)
def train(epoch):
    """Run one training epoch over this worker's shard of the training set.

    Args:
        epoch: Epoch number; passed to the distributed sampler and used in
            the progress log line.
    """
    model.train()
    train_sampler.set_epoch(epoch)
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # loss is a 0-dim tensor: read it with .item(); indexing it with
            # [0] raises IndexError on current PyTorch.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_sampler),
                100. * batch_idx / len(train_loader), loss.item()))


def metric_average(val, name):
    """Combine a scalar metric across Horovod workers and return it as a float.

    Args:
        val: This worker's value of the metric.
        name: Name passed to ``hvd.allreduce`` for the reduction.

    Returns:
        The reduced value (presumably the mean, Horovod's default for
        ``allreduce`` -- confirm against the installed version).
    """
    tensor = torch.FloatTensor([val])
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor[0].item()


def test():
    """Evaluate the model on this worker's test shard and report the result.

    Loss and accuracy are averaged per sample locally, combined across
    workers with ``metric_average``, and printed by rank 0 only.
    """
    model.eval()
    test_loss = 0.
    test_accuracy = 0.
    # Disable autograd for evaluation (Variable(..., volatile=True) no longer
    # has that effect).
    with torch.no_grad():
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(target.view_as(pred)).cpu().float().sum().item()

    test_loss /= len(test_sampler)
    test_accuracy /= len(test_sampler)

    test_loss = metric_average(test_loss, 'avg_loss')
    test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

    if hvd.rank() == 0:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
            test_loss, 100. * test_accuracy))
[Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a remote compute target\n", + "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to execute your training script on. In this tutorial, you create an [Azure Batch AI](https://docs.microsoft.com/azure/batch-ai/overview) cluster as your training compute resource. This code creates a cluster for you if it does not already exist in your workspace.\n", + "\n", + "**Creation of the cluster takes approximately 5 minutes.** If the cluster is already in your workspace this code will skip the cluster creation process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, BatchAiCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "# choose a name for your cluster\n", + "cluster_name = \"gpucluster\"\n", + "\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing compute target')\n", + "except ComputeTargetException:\n", + " print('Creating a new compute target...')\n", + " compute_config = BatchAiCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n", + " autoscale_enabled=True,\n", + " cluster_min_nodes=0, \n", + " cluster_max_nodes=4)\n", + "\n", + " # create the cluster\n", + " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + " compute_target.wait_for_completion(show_output=True)\n", + "\n", + " # Use the 'status' property to get a detailed status for the current cluster. \n", + " print(compute_target.status.serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload data to datastore\n", + "To make data accessible for remote training, AML provides a convenient way to do so via a [Datastore](https://docs.microsoft.com/azure/machine-learning/service/how-to-access-data). The datastore provides a mechanism for you to upload/download data to Azure Storage, and interact with it from your remote compute targets. \n", + "\n", + "If your data is already stored in Azure, or you download the data as part of your training script, you will not need to do this step. 
For this tutorial, although you can download the data in your training script, we will demonstrate how to upload the training data to a datastore and access it during training to illustrate the datastore functionality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, download the training data from [here](http://mattmahoney.net/dc/text8.zip) to your local machine:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import urllib\n", + "\n", + "os.makedirs('./data', exist_ok=True)\n", + "download_url = 'http://mattmahoney.net/dc/text8.zip'\n", + "urllib.request.urlretrieve(download_url, filename='./data/text8.zip')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each workspace is associated with a default datastore. In this tutorial, we will upload the training data to this default datastore. The below code will upload the contents of the data directory to the path `./data` on the default datastore." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = ws.get_default_datastore()\n", + "print(ds.datastore_type, ds.account_name, ds.container_name)\n", + "\n", + "ds.upload(src_dir='data', target_path='data', overwrite=True, show_progress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For convenience, let's get a reference to the path on the datastore with the zip file of training data. We can do so using the `path` method. In the next section, we can then pass this reference to our training script's `--input_data` argument. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "path_on_datastore = 'data/text8.zip'\n", + "ds_data = ds.path(path_on_datastore)\n", + "print(ds_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train model on the remote compute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a project directory\n", + "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "project_folder = './tf-distr-hvd'\n", + "os.makedirs(project_folder, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy the training script `tf_horovod_word2vec.py` into this project directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "shutil.copy('tf_horovod_word2vec.py', project_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an experiment\n", + "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed TensorFlow tutorial. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "\n", + "experiment_name = 'tf-distr-hvd'\n", + "experiment = Experiment(ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a TensorFlow estimator\n", + "The AML SDK's TensorFlow estimator enables you to easily submit TensorFlow training jobs for both single-node and distributed runs. For more information on the TensorFlow estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-tensorflow)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.dnn import TensorFlow\n", + "\n", + "script_params={\n", + " '--input_data': ds_data\n", + "}\n", + "\n", + "estimator= TensorFlow(source_directory=project_folder,\n", + " compute_target=compute_target,\n", + " script_params=script_params,\n", + " entry_script='tf_horovod_word2vec.py',\n", + " node_count=2,\n", + " process_count_per_node=1,\n", + " distributed_backend='mpi',\n", + " use_gpu=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to execute a distributed run using MPI/Horovod, you must provide the argument `distributed_backend='mpi'`. Using this estimator with these settings, TensorFlow, Horovod and their dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `TensorFlow` constructor's `pip_packages` or `conda_packages` parameters.\n", + "\n", + "Note that we passed our training data reference `ds_data` to our script's `--input_data` argument. This will 1) mount our datastore on the remote compute and 2) provide the path to the data zip file on our datastore." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit job\n", + "Run your experiment by submitting your estimator object. Note that this call is asynchronous." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = experiment.submit(estimator)\n", + "print(run)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor your run\n", + "You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.widgets import RunDetails\n", + "RunDetails(run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, you can block until the script has completed training before running more code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "msauthor": "minxia" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/training/04.distributed-tensorflow-with-horovod/tf_horovod_word2vec.py b/training/04.distributed-tensorflow-with-horovod/tf_horovod_word2vec.py new file mode 100644 index 000000000..98c1e5ee7 --- /dev/null +++ b/training/04.distributed-tensorflow-with-horovod/tf_horovod_word2vec.py @@ -0,0 +1,259 @@ +# Copyright 2015 The TensorFlow Authors. 
# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/text8.zip'


def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Path of the data file. A falsy value falls back to
            ``"text8.zip"`` in the current directory.
        expected_bytes: Exact size in bytes the file must have.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the file's size differs from ``expected_bytes``.
    """
    if not filename:
        filename = "text8.zip"
    if not os.path.exists(filename):
        print("Downloading the data from " + url)
        filename, _ = urllib.request.urlretrieve(url, filename)
    else:
        # Report the path actually used, not the module-level CLI argument
        # (which is None when the default above was applied).
        print("Use the data from %s" % filename)
    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify %s: expected %d bytes, found %d. If it was '
            'downloaded from %s, can you get to it with a browser?'
            % (filename, expected_bytes, actual_bytes, url))
    print('Found and verified', filename)
    return filename
+def read_data(filename): + """Extract the first file enclosed in a zip file as a list of words.""" + with zipfile.ZipFile(filename) as f: + data = tf.compat.as_str(f.read(f.namelist()[0])).split() + return data + + +vocabulary = read_data(filename) +print('Data size', len(vocabulary)) + +# Step 2: Build the dictionary and replace rare words with UNK token. +vocabulary_size = 50000 + + +def build_dataset(words, n_words): + """Process raw inputs into a dataset.""" + count = [['UNK', -1]] + count.extend(collections.Counter(words).most_common(n_words - 1)) + dictionary = dict() + for word, _ in count: + dictionary[word] = len(dictionary) + data = list() + unk_count = 0 + for word in words: + if word in dictionary: + index = dictionary[word] + else: + index = 0 # dictionary['UNK'] + unk_count += 1 + data.append(index) + count[0][1] = unk_count + reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) + return data, count, dictionary, reversed_dictionary + + +data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, + vocabulary_size) +del vocabulary # Hint to reduce memory. +print('Most common words (+UNK)', count[:5]) +print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) + + +# Step 3: Function to generate a training batch for the skip-gram model. 
+def generate_batch(batch_size, num_skips, skip_window): + assert num_skips <= 2 * skip_window + # Adjust batch_size to match num_skips + batch_size = batch_size // num_skips * num_skips + span = 2 * skip_window + 1 # [ skip_window target skip_window ] + # Backtrack a little bit to avoid skipping words in the end of a batch + data_index = random.randint(0, len(data) - span - 1) + batch = np.ndarray(shape=(batch_size), dtype=np.int32) + labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) + buffer = collections.deque(maxlen=span) + for _ in range(span): + buffer.append(data[data_index]) + data_index = (data_index + 1) % len(data) + for i in range(batch_size // num_skips): + target = skip_window # target label at the center of the buffer + targets_to_avoid = [skip_window] + for j in range(num_skips): + while target in targets_to_avoid: + target = random.randint(0, span - 1) + targets_to_avoid.append(target) + batch[i * num_skips + j] = buffer[skip_window] + labels[i * num_skips + j, 0] = buffer[target] + buffer.append(data[data_index]) + data_index = (data_index + 1) % len(data) + return batch, labels + + +batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1) +for i in range(8): + print(batch[i], reverse_dictionary[batch[i]], + '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) + +# Step 4: Build and train a skip-gram model. + +max_batch_size = 128 +embedding_size = 128 # Dimension of the embedding vector. +skip_window = 1 # How many words to consider left and right. +num_skips = 2 # How many times to reuse an input to generate a label. + +# We pick a random validation set to sample nearest neighbors. Here we limit the +# validation samples to the words that have a low numeric ID, which by +# construction are also the most frequent. +valid_size = 16 # Random set of words to evaluate similarity on. +valid_window = 100 # Only pick dev samples in the head of the distribution. 
+valid_examples = np.random.choice(valid_window, valid_size, replace=False) +num_sampled = 64 # Number of negative examples to sample. + +graph = tf.Graph() + +with graph.as_default(): + + # Input data. + train_inputs = tf.placeholder(tf.int32, shape=[None]) + train_labels = tf.placeholder(tf.int32, shape=[None, 1]) + valid_dataset = tf.constant(valid_examples, dtype=tf.int32) + + # Look up embeddings for inputs. + embeddings = tf.Variable( + tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) + embed = tf.nn.embedding_lookup(embeddings, train_inputs) + + # Construct the variables for the NCE loss + nce_weights = tf.Variable( + tf.truncated_normal([vocabulary_size, embedding_size], + stddev=1.0 / math.sqrt(embedding_size))) + nce_biases = tf.Variable(tf.zeros([vocabulary_size])) + + # Compute the average NCE loss for the batch. + # tf.nce_loss automatically draws a new sample of the negative labels each + # time we evaluate the loss. + loss = tf.reduce_mean( + tf.nn.nce_loss(weights=nce_weights, + biases=nce_biases, + labels=train_labels, + inputs=embed, + num_sampled=num_sampled, + num_classes=vocabulary_size)) + + # Horovod: adjust learning rate based on number of GPUs. + optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size()) + + # Horovod: add Horovod Distributed Optimizer. + optimizer = hvd.DistributedOptimizer(optimizer) + + train_op = optimizer.minimize(loss) + + # Compute the cosine similarity between minibatch examples and all embeddings. + norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) + normalized_embeddings = embeddings / norm + valid_embeddings = tf.nn.embedding_lookup( + normalized_embeddings, valid_dataset) + similarity = tf.matmul( + valid_embeddings, normalized_embeddings, transpose_b=True) + + # Add variable initializer. + init = tf.global_variables_initializer() + + # Horovod: broadcast initial variable states from rank 0 to all other processes. 
+ # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + bcast = hvd.broadcast_global_variables(0) + +# Step 5: Begin training. + +# Horovod: adjust number of steps based on number of GPUs. +num_steps = 4000 // hvd.size() + 1 + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +config.gpu_options.visible_device_list = str(hvd.local_rank()) + +with tf.Session(graph=graph, config=config) as session: + # We must initialize all variables before we use them. + init.run() + bcast.run() + print('Initialized') + run = Run.get_submitted_run() + average_loss = 0 + for step in xrange(num_steps): + # simulate various sentence length by randomization + batch_size = random.randint(max_batch_size // 2, max_batch_size) + batch_inputs, batch_labels = generate_batch( + batch_size, num_skips, skip_window) + feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels} + + # We perform one update step by evaluating the optimizer op (including it + # in the list of returned values for session.run() + _, loss_val = session.run([train_op, loss], feed_dict=feed_dict) + average_loss += loss_val + + if step % 2000 == 0: + if step > 0: + average_loss /= 2000 + # The average loss is an estimate of the loss over the last 2000 batches. + print('Average loss at step ', step, ': ', average_loss) + run.log("Loss", average_loss) + average_loss = 0 + final_embeddings = normalized_embeddings.eval() + + # Evaluate similarity in the end on worker 0. 
+ if hvd.rank() == 0: + sim = similarity.eval() + for i in xrange(valid_size): + valid_word = reverse_dictionary[valid_examples[i]] + top_k = 8 # number of nearest neighbors + nearest = (-sim[i, :]).argsort()[1:top_k + 1] + log_str = 'Nearest to %s:' % valid_word + for k in xrange(top_k): + close_word = reverse_dictionary[nearest[k]] + log_str = '%s %s,' % (log_str, close_word) + print(log_str) diff --git a/training/05.distributed-tensorflow-with-parameter-server/05.distributed-tensorflow-with-parameter-server.ipynb b/training/05.distributed-tensorflow-with-parameter-server/05.distributed-tensorflow-with-parameter-server.ipynb new file mode 100644 index 000000000..92daf0938 --- /dev/null +++ b/training/05.distributed-tensorflow-with-parameter-server/05.distributed-tensorflow-with-parameter-server.ipynb @@ -0,0 +1,286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 05. Distributed TensorFlow with parameter server\n", + "In this tutorial, you will train a TensorFlow model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using native [distributed TensorFlow](https://www.tensorflow.org/deploy/distributed)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n", + "* Go through the [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook to:\n", + " * install the AML SDK\n", + " * create a workspace and its configuration file (`config.json`)\n", + "* Review the [tutorial](https://aka.ms/aml-notebook-hyperdrive) on single-node TensorFlow training using the SDK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize workspace\n", + "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a remote compute target\n", + "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to execute your training script on. In this tutorial, you create an [Azure Batch AI](https://docs.microsoft.com/azure/batch-ai/overview) cluster as your training compute resource. This code creates a cluster for you if it does not already exist in your workspace.\n", + "\n", + "**Creation of the cluster takes approximately 5 minutes.** If the cluster is already in your workspace this code will skip the cluster creation process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, BatchAiCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "# choose a name for your cluster\n", + "cluster_name = \"gpucluster\"\n", + "\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing compute target.')\n", + "except ComputeTargetException:\n", + " print('Creating a new compute target...')\n", + " compute_config = BatchAiCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n", + " autoscale_enabled=True,\n", + " cluster_min_nodes=0, \n", + " cluster_max_nodes=4)\n", + "\n", + " # create the cluster\n", + " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + " compute_target.wait_for_completion(show_output=True)\n", + "\n", + " # Use the 'status' property to get a detailed status for the current cluster. \n", + " print(compute_target.status.serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train model on the remote compute\n", + "Now that we have the cluster ready to go, let's run our distributed training job." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a project directory\n", + "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "project_folder = './tf-distr-ps'\n", + "os.makedirs(project_folder, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy the training script `tf_mnist_replica.py` into this project directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "shutil.copy('tf_mnist_replica.py', project_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an experiment\n", + "Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed TensorFlow tutorial. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "\n", + "experiment_name = 'tf-distr-ps'\n", + "experiment = Experiment(ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a TensorFlow estimator\n", + "The AML SDK's TensorFlow estimator enables you to easily submit TensorFlow training jobs for both single-node and distributed runs. For more information on the TensorFlow estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-tensorflow)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.dnn import TensorFlow\n", + "\n", + "script_params={\n", + " '--num_gpus': 1\n", + "}\n", + "\n", + "estimator = TensorFlow(source_directory=project_folder,\n", + " compute_target=compute_target,\n", + " script_params=script_params,\n", + " entry_script='tf_mnist_replica.py',\n", + " node_count=2,\n", + " worker_count=2,\n", + " parameter_server_count=1, \n", + " distributed_backend='ps',\n", + " use_gpu=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code specifies that we will run our training script on `2` nodes, with two workers and one parameter server. 
In order to execute a native distributed TensorFlow run, you must provide the argument `distributed_backend='ps'`. Using this estimator with these settings, TensorFlow and its dependencies will be installed for you. However, if your script also uses other packages, make sure to install them via the `TensorFlow` constructor's `pip_packages` or `conda_packages` parameters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit job\n", + "Run your experiment by submitting your estimator object. Note that this call is asynchronous." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = experiment.submit(estimator)\n", + "print(run.get_details())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor your run\n", + "You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.widgets import RunDetails\n", + "RunDetails(run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, you can block until the script has completed training before running more code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True) # this provides a verbose log" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + }, + "msauthor": "minxia" + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/training/05.distributed-tensorflow-with-parameter-server/tf_mnist_replica.py b/training/05.distributed-tensorflow-with-parameter-server/tf_mnist_replica.py new file mode 100644 index 000000000..1476dd5bc --- /dev/null +++ b/training/05.distributed-tensorflow-with-parameter-server/tf_mnist_replica.py @@ -0,0 +1,271 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 +# Script adapted from: +# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py +# ============================================================================== +"""Distributed MNIST training and validation, with model replicas. +A simple softmax model with one hidden layer is defined. The parameters +(weights and biases) are located on one parameter server (ps), while the ops +are executed on two worker nodes by default. The TF sessions also run on the +worker node. +Multiple invocations of this script can be done in parallel, with different +values for --task_index. There should be exactly one invocation with +--task_index, which will create a master session that carries out variable +initialization. The other, non-master, sessions will wait for the master +session to finish the initialization before proceeding to the training stage. 
+The coordination between the multiple worker invocations occurs due to +the definition of the parameters on the same ps devices. The parameter updates +from one worker are visible to all other workers. As such, the workers can +perform forward computation and gradient calculation in parallel, which +should lead to increased training speed for the simple model. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import math +import sys +import tempfile +import time +import json + +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data +from azureml.core.run import Run + +flags = tf.app.flags +flags.DEFINE_string("data_dir", "/tmp/mnist-data", + "Directory for storing mnist data") +flags.DEFINE_boolean("download_only", False, + "Only perform downloading of data; Do not proceed to " + "session preparation, model definition or training") +flags.DEFINE_integer("num_gpus", 0, "Total number of gpus for each machine." + "If you don't use GPU, please set it to '0'") +flags.DEFINE_integer("replicas_to_aggregate", None, + "Number of replicas to aggregate before parameter update " + "is applied (For sync_replicas mode only; default: " + "num_workers)") +flags.DEFINE_integer("hidden_units", 100, + "Number of units in the hidden layer of the NN") +flags.DEFINE_integer("train_steps", 200, + "Number of (global) training steps to perform") +flags.DEFINE_integer("batch_size", 100, "Training batch size") +flags.DEFINE_float("learning_rate", 0.01, "Learning rate") +flags.DEFINE_boolean( + "sync_replicas", False, + "Use the sync_replicas (synchronized replicas) mode, " + "wherein the parameter updates from workers are aggregated " + "before applied to avoid stale gradients") +flags.DEFINE_boolean( + "existing_servers", False, "Whether servers already exists. If True, " + "will use the worker hosts via their GRPC URLs (one client process " + "per worker host). 
Otherwise, will create an in-process TensorFlow " + "server.") + +FLAGS = flags.FLAGS + +IMAGE_PIXELS = 28 + + +def main(unused_argv): + data_root = os.path.join("outputs", "MNIST") + mnist = None + tf_config = os.environ.get("TF_CONFIG") + if not tf_config or tf_config == "": + raise ValueError("TF_CONFIG not found.") + tf_config_json = json.loads(tf_config) + cluster = tf_config_json.get('cluster') + job_name = tf_config_json.get('task', {}).get('type') + task_index = tf_config_json.get('task', {}).get('index') + job_name = "worker" if job_name == "master" else job_name + sentinel_path = os.path.join(data_root, "complete.txt") + if job_name == "worker" and task_index == 0: + mnist = input_data.read_data_sets(data_root, one_hot=True) + with open(sentinel_path, 'w+') as f: + f.write("download complete") + else: + while not os.path.exists(sentinel_path): + time.sleep(0.01) + mnist = input_data.read_data_sets(data_root, one_hot=True) + + if FLAGS.download_only: + sys.exit(0) + + print("job name = %s" % job_name) + print("task index = %d" % task_index) + print("number of GPUs = %d" % FLAGS.num_gpus) + + # Construct the cluster and start the server + cluster_spec = tf.train.ClusterSpec(cluster) + + # Get the number of workers. + num_workers = len(cluster_spec.task_indices("worker")) + + if not FLAGS.existing_servers: + # Not using existing servers. Create an in-process server. 
+ server = tf.train.Server( + cluster_spec, job_name=job_name, task_index=task_index) + if job_name == "ps": + server.join() + + is_chief = (task_index == 0) + if FLAGS.num_gpus > 0: + # Avoid gpu allocation conflict: now allocate task_num -> #gpu + # for each worker in the corresponding machine + gpu = (task_index % FLAGS.num_gpus) + worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu) + elif FLAGS.num_gpus == 0: + # Just allocate the CPU to worker server + cpu = 0 + worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu) + # The device setter will automatically place Variables ops on separate + # parameter servers (ps). The non-Variable ops will be placed on the workers. + # The ps use CPU and workers use corresponding GPU + with tf.device( + tf.train.replica_device_setter( + worker_device=worker_device, + ps_device="/job:ps/cpu:0", + cluster=cluster)): + global_step = tf.Variable(0, name="global_step", trainable=False) + + # Variables of the hidden layer + hid_w = tf.Variable( + tf.truncated_normal( + [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units], + stddev=1.0 / IMAGE_PIXELS), + name="hid_w") + hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b") + + # Variables of the softmax layer + sm_w = tf.Variable( + tf.truncated_normal( + [FLAGS.hidden_units, 10], + stddev=1.0 / math.sqrt(FLAGS.hidden_units)), + name="sm_w") + sm_b = tf.Variable(tf.zeros([10]), name="sm_b") + + # Ops: located on the worker specified with task_index + x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS]) + y_ = tf.placeholder(tf.float32, [None, 10]) + + hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) + hid = tf.nn.relu(hid_lin) + + y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b)) + cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0))) + + opt = tf.train.AdamOptimizer(FLAGS.learning_rate) + + if FLAGS.sync_replicas: + if FLAGS.replicas_to_aggregate is None: + replicas_to_aggregate = num_workers + else: + 
replicas_to_aggregate = FLAGS.replicas_to_aggregate + + opt = tf.train.SyncReplicasOptimizer( + opt, + replicas_to_aggregate=replicas_to_aggregate, + total_num_replicas=num_workers, + name="mnist_sync_replicas") + + train_step = opt.minimize(cross_entropy, global_step=global_step) + + if FLAGS.sync_replicas: + local_init_op = opt.local_step_init_op + if is_chief: + local_init_op = opt.chief_init_op + + ready_for_local_init_op = opt.ready_for_local_init_op + + # Initial token and chief queue runners required by the sync_replicas mode + chief_queue_runner = opt.get_chief_queue_runner() + sync_init_op = opt.get_init_tokens_op() + + init_op = tf.global_variables_initializer() + train_dir = tempfile.mkdtemp() + + if FLAGS.sync_replicas: + sv = tf.train.Supervisor( + is_chief=is_chief, + logdir=train_dir, + init_op=init_op, + local_init_op=local_init_op, + ready_for_local_init_op=ready_for_local_init_op, + recovery_wait_secs=1, + global_step=global_step) + else: + sv = tf.train.Supervisor( + is_chief=is_chief, + logdir=train_dir, + init_op=init_op, + recovery_wait_secs=1, + global_step=global_step) + + sess_config = tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=False, + device_filters=["/job:ps", + "/job:worker/task:%d" % task_index]) + + # The chief worker (task_index==0) session will prepare the session, + # while the remaining workers will wait for the preparation to complete. + if is_chief: + print("Worker %d: Initializing session..." % task_index) + else: + print("Worker %d: Waiting for session to be initialized..." % + task_index) + + if FLAGS.existing_servers: + server_grpc_url = "grpc://" + task_index + print("Using existing server at: %s" % server_grpc_url) + + sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config) + else: + sess = sv.prepare_or_wait_for_session(server.target, config=sess_config) + + print("Worker %d: Session initialization complete." 
% task_index) + + if FLAGS.sync_replicas and is_chief: + # Chief worker will start the chief queue runner and call the init op. + sess.run(sync_init_op) + sv.start_queue_runners(sess, [chief_queue_runner]) + + # Perform training + time_begin = time.time() + print("Training begins @ %f" % time_begin) + + local_step = 0 + while True: + # Training feed + batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size) + train_feed = {x: batch_xs, y_: batch_ys} + + _, step = sess.run([train_step, global_step], feed_dict=train_feed) + local_step += 1 + + now = time.time() + print("%f: Worker %d: training step %d done (global step: %d)" % + (now, task_index, local_step, step)) + + if step >= FLAGS.train_steps: + break + + time_end = time.time() + print("Training ends @ %f" % time_end) + training_time = time_end - time_begin + print("Training elapsed time: %f s" % training_time) + + # Validation feed + val_feed = {x: mnist.validation.images, y_: mnist.validation.labels} + val_xent = sess.run(cross_entropy, feed_dict=val_feed) + print("After %d training step(s), validation cross entropy = %g" % + (FLAGS.train_steps, val_xent)) + if job_name == "worker" and task_index == 0: + run = Run.get_submitted_run() + run.log("CrossEntropy", val_xent) + + +if __name__ == "__main__": + tf.app.run() diff --git a/training/06.distributed-cntk-with-custom-docker/06.distributed-cntk-with-custom-docker.ipynb b/training/06.distributed-cntk-with-custom-docker/06.distributed-cntk-with-custom-docker.ipynb new file mode 100644 index 000000000..de9a0d409 --- /dev/null +++ b/training/06.distributed-cntk-with-custom-docker/06.distributed-cntk-with-custom-docker.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 06. 
Distributed CNTK using custom docker images\n", + "In this tutorial, you will train a CNTK model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using a custom docker image and distributed training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning services\n", + "* Go through the [00.configuration.ipynb](https://github.com/Azure/MachineLearningNotebooks/blob/master/00.configuration.ipynb) notebook to:\n", + " * install the AML SDK\n", + " * create a workspace and its configuration file (`config.json`)\n", + "* Review the [tutorial](../01.train-tune-deploy-pytorch/01.train-tune-deploy-pytorch.ipynb) on single-node PyTorch training using the SDK" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize workspace\n", + "\n", + "Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a remote compute target\n", + "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to execute your training script on. In this tutorial, you create an [Azure Batch AI](https://docs.microsoft.com/azure/batch-ai/overview) cluster as your training compute resource. This code creates a cluster for you if it does not already exist in your workspace.\n", + "\n", + "**Creation of the cluster takes approximately 5 minutes.** If the cluster is already in your workspace this code will skip the cluster creation process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import ComputeTarget, BatchAiCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", + "\n", + "# choose a name for your cluster\n", + "cluster_name = \"gpucluster\"\n", + "\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n", + " print('Found existing compute target.')\n", + "except ComputeTargetException:\n", + " print('Creating a new compute target...')\n", + " compute_config = BatchAiCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n", + " autoscale_enabled=True,\n", + " cluster_min_nodes=0, \n", + " cluster_max_nodes=4)\n", + "\n", + " # create the cluster\n", + " compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n", + "\n", + " compute_target.wait_for_completion(show_output=True)\n", + "\n", + " # Use the 'status' property to get a detailed status for the current cluster. \n", + " print(compute_target.status.serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train model on the remote compute\n", + "Now that we have the cluster ready to go, let's run our distributed training job." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a project directory\n", + "Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "project_folder = './cntk-distr'\n", + "os.makedirs(project_folder, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy the training script `cntk_mnist.py` into this project directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "shutil.copy('cntk_mnist.py', project_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an experiment\n", + "Create an [experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed CNTK tutorial. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "\n", + "experiment_name = 'cntk-distr'\n", + "experiment = Experiment(ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an Estimator\n", + "The AML SDK's base Estimator enables you to easily submit custom scripts for both single-node and distributed runs. You should use this generic estimator for training code using frameworks such as sklearn or CNTK that don't have corresponding custom estimators. For more information on using the generic estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-ml-models)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.estimator import *\n", + "\n", + "estimator = Estimator(source_directory=project_folder,\n", + " compute_target=compute_target,\n", + " entry_script='cntk_mnist.py',\n", + " node_count=2,\n", + " process_count_per_node=1,\n", + " distributed_backend='mpi', \n", + " pip_packages=['cntk==2.5.1'],\n", + " custom_docker_base_image='microsoft/mmlspark:0.12')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We would like to train our model using a [pre-built Docker container](https://hub.docker.com/r/microsoft/mmlspark/). 
To do so, we specify the name of the docker image to the argument `custom_docker_base_image`. You can only provide images available in public docker repositories such as Docker Hub using this argument. To use an image from a private docker repository, use the constructor's `environment_definition` parameter instead. Finally, we provide the `cntk` package to `pip_packages` to install CNTK 2.5.1 on our custom image.\n", + "\n", + "The above code specifies that we will run our training script on `2` nodes, with one worker per node. In order to run distributed CNTK, which uses MPI, you must provide the argument `distributed_backend='mpi'`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit job\n", + "Run your experiment by submitting your estimator object. Note that this call is asynchronous." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run = experiment.submit(estimator)\n", + "print(run.get_details())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor your run\n", + "You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.widgets import RunDetails\n", + "RunDetails(run).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, you can block until the script has completed training before running more code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/training/06.distributed-cntk-with-custom-docker/cntk_mnist.py b/training/06.distributed-cntk-with-custom-docker/cntk_mnist.py new file mode 100644 index 000000000..41ea88b2b --- /dev/null +++ b/training/06.distributed-cntk-with-custom-docker/cntk_mnist.py @@ -0,0 +1,321 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# Script adapted from: +# 1. https://github.com/Microsoft/CNTK/blob/v2.0/Tutorials/CNTK_103A_MNIST_DataLoader.ipynb +# 2. 
https://github.com/Microsoft/CNTK/blob/v2.0/Tutorials/CNTK_103C_MNIST_MultiLayerPerceptron.ipynb +# =================================================================================================== +"""Train a CNTK multi-layer perceptron on the MNIST dataset.""" + +from __future__ import print_function +import gzip +import numpy as np +import os +import shutil +import struct +import sys +import time + +import cntk as C +from azureml.core.run import Run +import argparse + +run = Run.get_submitted_run() + +parser = argparse.ArgumentParser() + +parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate') +parser.add_argument('--num_hidden_layers', type=int, default=2, help='number of hidden layers') +parser.add_argument('--minibatch_size', type=int, default=64, help='minibatchsize') + +args = parser.parse_args() + +# Functions to load MNIST images and unpack into train and test set. +# - loadData reads image data and formats into a 28x28 long array +# - loadLabels reads the corresponding labels data, 1 for each image +# - load packs the downloaded image and labels data into a combined format to be read later by +# CNTK text reader + + +def loadData(src, cimg): + print('Downloading ' + src) + gzfname, h = urlretrieve(src, './delete.me') + print('Done.') + try: + with gzip.open(gzfname) as gz: + n = struct.unpack('I', gz.read(4)) + # Read magic number. + if n[0] != 0x3080000: + raise Exception('Invalid file: unexpected magic number.') + # Read number of entries. + n = struct.unpack('>I', gz.read(4))[0] + if n != cimg: + raise Exception('Invalid file: expected {0} entries.'.format(cimg)) + crow = struct.unpack('>I', gz.read(4))[0] + ccol = struct.unpack('>I', gz.read(4))[0] + if crow != 28 or ccol != 28: + raise Exception('Invalid file: expected 28 rows/cols per image.') + # Read data. 
+ res = np.fromstring(gz.read(cimg * crow * ccol), dtype=np.uint8) + finally: + os.remove(gzfname) + return res.reshape((cimg, crow * ccol)) + + +def loadLabels(src, cimg): + print('Downloading ' + src) + gzfname, h = urlretrieve(src, './delete.me') + print('Done.') + try: + with gzip.open(gzfname) as gz: + n = struct.unpack('I', gz.read(4)) + # Read magic number. + if n[0] != 0x1080000: + raise Exception('Invalid file: unexpected magic number.') + # Read number of entries. + n = struct.unpack('>I', gz.read(4)) + if n[0] != cimg: + raise Exception('Invalid file: expected {0} rows.'.format(cimg)) + # Read labels. + res = np.fromstring(gz.read(cimg), dtype=np.uint8) + finally: + os.remove(gzfname) + return res.reshape((cimg, 1)) + + +def try_download(dataSrc, labelsSrc, cimg): + data = loadData(dataSrc, cimg) + labels = loadLabels(labelsSrc, cimg) + return np.hstack((data, labels)) + +# Save the data files into a format compatible with CNTK text reader + + +def savetxt(filename, ndarray): + dir = os.path.dirname(filename) + + if not os.path.exists(dir): + os.makedirs(dir) + + if not os.path.isfile(filename): + print("Saving", filename) + with open(filename, 'w') as f: + labels = list(map(' '.join, np.eye(10, dtype=np.uint).astype(str))) + for row in ndarray: + row_str = row.astype(str) + label_str = labels[row[-1]] + feature_str = ' '.join(row_str[:-1]) + f.write('|labels {} |features {}\n'.format(label_str, feature_str)) + else: + print("File already exists", filename) + +# Read a CTF formatted text (as mentioned above) using the CTF deserializer from a file + + +def create_reader(path, is_training, input_dim, num_label_classes): + return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs( + labels=C.io.StreamDef(field='labels', shape=num_label_classes, is_sparse=False), + features=C.io.StreamDef(field='features', shape=input_dim, is_sparse=False) + )), randomize=is_training, max_sweeps=C.io.INFINITELY_REPEAT if is_training else 1) + +# Defines a 
utility that prints the training progress + + +def print_training_progress(trainer, mb, frequency, verbose=1): + training_loss = "NA" + eval_error = "NA" + + if mb % frequency == 0: + training_loss = trainer.previous_minibatch_loss_average + eval_error = trainer.previous_minibatch_evaluation_average + if verbose: + print("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%".format(mb, training_loss, eval_error * 100)) + + return mb, training_loss, eval_error + +# Create the network architecture + + +def create_model(features): + with C.layers.default_options(init=C.layers.glorot_uniform(), activation=C.ops.relu): + h = features + for _ in range(num_hidden_layers): + h = C.layers.Dense(hidden_layers_dim)(h) + r = C.layers.Dense(num_output_classes, activation=None)(h) + return r + + +if __name__ == '__main__': + run = Run.get_submitted_run() + + try: + from urllib.request import urlretrieve + except ImportError: + from urllib import urlretrieve + + # Select the right target device when this script is being used: + if 'TEST_DEVICE' in os.environ: + if os.environ['TEST_DEVICE'] == 'cpu': + C.device.try_set_default_device(C.device.cpu()) + else: + C.device.try_set_default_device(C.device.gpu(0)) + + # URLs for the train image and labels data + url_train_image = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz' + url_train_labels = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz' + num_train_samples = 60000 + + print("Downloading train data") + train = try_download(url_train_image, url_train_labels, num_train_samples) + + url_test_image = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz' + url_test_labels = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz' + num_test_samples = 10000 + + print("Downloading test data") + test = try_download(url_test_image, url_test_labels, num_test_samples) + + # Save the train and test files (prefer our default path for the data + rank = os.environ.get("OMPI_COMM_WORLD_RANK") + data_dir = 
os.path.join("outputs", "MNIST") + sentinel_path = os.path.join(data_dir, "complete.txt") + if rank == '0': + print('Writing train text file...') + savetxt(os.path.join(data_dir, "Train-28x28_cntk_text.txt"), train) + + print('Writing test text file...') + savetxt(os.path.join(data_dir, "Test-28x28_cntk_text.txt"), test) + with open(sentinel_path, 'w+') as f: + f.write("download complete") + + print('Done with downloading data.') + else: + while not os.path.exists(sentinel_path): + time.sleep(0.01) + + # Ensure we always get the same amount of randomness + np.random.seed(0) + + # Define the data dimensions + input_dim = 784 + num_output_classes = 10 + + # Ensure the training and test data is generated and available for this tutorial. + # We search in two locations in the toolkit for the cached MNIST data set. + data_found = False + for data_dir in [os.path.join("..", "Examples", "Image", "DataSets", "MNIST"), + os.path.join("data_" + str(rank), "MNIST"), + os.path.join("outputs", "MNIST")]: + train_file = os.path.join(data_dir, "Train-28x28_cntk_text.txt") + test_file = os.path.join(data_dir, "Test-28x28_cntk_text.txt") + if os.path.isfile(train_file) and os.path.isfile(test_file): + data_found = True + break + if not data_found: + raise ValueError("Please generate the data by completing CNTK 103 Part A") + print("Data directory is {0}".format(data_dir)) + + num_hidden_layers = args.num_hidden_layers + hidden_layers_dim = 400 + + input = C.input_variable(input_dim) + label = C.input_variable(num_output_classes) + + z = create_model(input) + # Scale the input to 0-1 range by dividing each pixel by 255. 
+ z = create_model(input / 255.0) + + loss = C.cross_entropy_with_softmax(z, label) + label_error = C.classification_error(z, label) + + # Instantiate the trainer object to drive the model training + learning_rate = args.learning_rate + lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch) + learner = C.sgd(z.parameters, lr_schedule) + trainer = C.Trainer(z, (loss, label_error), [learner]) + + # Initialize the parameters for the trainer + minibatch_size = args.minibatch_size + num_samples_per_sweep = 60000 + num_sweeps_to_train_with = 10 + num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size + + # Create the reader to training data set + reader_train = create_reader(train_file, True, input_dim, num_output_classes) + + # Map the data streams to the input and labels. + input_map = { + label: reader_train.streams.labels, + input: reader_train.streams.features + } + + # Run the trainer on and perform model training + training_progress_output_freq = 500 + + errors = [] + losses = [] + for i in range(0, int(num_minibatches_to_train)): + # Read a mini batch from the training data file + data = reader_train.next_minibatch(minibatch_size, input_map=input_map) + + trainer.train_minibatch(data) + batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1) + if (error != 'NA') and (loss != 'NA'): + errors.append(float(error)) + losses.append(float(loss)) + + # log the losses + if rank == '0': + run.log_list("Loss", losses) + run.log_list("Error", errors) + + # Read the training data + reader_test = create_reader(test_file, False, input_dim, num_output_classes) + + test_input_map = { + label: reader_test.streams.labels, + input: reader_test.streams.features, + } + + # Test data for trained model + test_minibatch_size = 512 + num_samples = 10000 + num_minibatches_to_test = num_samples // test_minibatch_size + test_result = 0.0 + + for i in range(num_minibatches_to_test): + 
# We are loading test data in batches specified by test_minibatch_size + # Each data point in the minibatch is a MNIST digit image of 784 dimensions + # with one pixel per dimension that we will encode / decode with the + # trained model. + data = reader_test.next_minibatch(test_minibatch_size, + input_map=test_input_map) + + eval_error = trainer.test_minibatch(data) + test_result = test_result + eval_error + + # Average of evaluation errors of all test minibatches + print("Average test error: {0:.2f}%".format((test_result * 100) / num_minibatches_to_test)) + + out = C.softmax(z) + + # Read the data for evaluation + reader_eval = create_reader(test_file, False, input_dim, num_output_classes) + + eval_minibatch_size = 25 + eval_input_map = {input: reader_eval.streams.features} + + data = reader_test.next_minibatch(eval_minibatch_size, input_map=test_input_map) + + img_label = data[label].asarray() + img_data = data[input].asarray() + predicted_label_prob = [out.eval(img_data[i]) for i in range(len(img_data))] + + # Find the index with the maximum value for both predicted as well as the ground truth + pred = [np.argmax(predicted_label_prob[i]) for i in range(len(predicted_label_prob))] + gtlabel = [np.argmax(img_label[i]) for i in range(len(img_label))] + + print("Label :", gtlabel[:25]) + print("Predicted:", pred) + + # save model to outputs folder + z.save('outputs/cntk.model') diff --git a/training/40.tensorboard/40.tensorboard.ipynb b/training/07.tensorboard/07.tensorboard.ipynb similarity index 99% rename from training/40.tensorboard/40.tensorboard.ipynb rename to training/07.tensorboard/07.tensorboard.ipynb index 97b64db9f..f34b5e652 100644 --- a/training/40.tensorboard/40.tensorboard.ipynb +++ b/training/07.tensorboard/07.tensorboard.ipynb @@ -480,7 +480,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python [default]", "language": "python", "name": "python3" }, @@ -494,7 +494,7 @@ "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4, diff --git a/training/41.export-run-history-to-tensorboard/41.export-run-history-to-tensorboard.ipynb b/training/08.export-run-history-to-tensorboard/08.export-run-history-to-tensorboard.ipynb similarity index 100% rename from training/41.export-run-history-to-tensorboard/41.export-run-history-to-tensorboard.ipynb rename to training/08.export-run-history-to-tensorboard/08.export-run-history-to-tensorboard.ipynb diff --git a/training/50.distributed-tensorflow-with-horovod/50.distributed-tensorflow-with-horovod.ipynb b/training/50.distributed-tensorflow-with-horovod/50.distributed-tensorflow-with-horovod.ipynb deleted file mode 100644 index a53acd5b3..000000000 --- a/training/50.distributed-tensorflow-with-horovod/50.distributed-tensorflow-with-horovod.ipynb +++ /dev/null @@ -1,500 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 50. Distributed Tensorflow Horovod\n", - "\n", - "In this tutorial we demonstrate how to use the Azure ML Training SDK to train Tensorflow model in a distributed manner using Horovod framework.\n", - "\n", - "# Prerequisites\n", - "\n", - "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check core SDK version number\n", - "import azureml.core\n", - "\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.workspace import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "import os\n", - "from azureml.core.experiment import Experiment\n", - "\n", - "username = getpass.getuser().replace('-','')\n", - "\n", - "# choose a name for the run history container in the workspace\n", - "experiment = Experiment(ws, username + '-horovod')\n", - "\n", - "# project folder name\n", - "project_folder = './samples/distributed-tensorflow-horovod'\n", - "os.makedirs(project_folder, exist_ok = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This recipe is using a MLC-managed Batch AI cluster. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.compute import BatchAiCompute\n", - "from azureml.core.compute import ComputeTarget\n", - "\n", - "batchai_cluster_name='gpucluster'\n", - "\n", - "\n", - "try:\n", - " # Check for existing cluster\n", - " compute_target = ComputeTarget(ws,batchai_cluster_name)\n", - " print('Found existing compute target')\n", - "except:\n", - " # Else, create new one\n", - " print('Creating a new compute target...')\n", - " provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n", - " #vm_priority = 'lowpriority', # optional\n", - " autoscale_enabled = True,\n", - " cluster_min_nodes = 0, \n", - " cluster_max_nodes = 4)\n", - " compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n", - " # can poll for a minimum number of nodes and for a specific timeout. \n", - " # if no min node count is provided it will use the scale settings for the cluster\n", - " compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", - "\n", - " # For a more detailed view of current BatchAI cluster status, use the 'status' property \n", - "print(compute_target.status.serialize())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile {project_folder}/word2vec.py\n", - "\n", - "# Copyright 2015 The TensorFlow Authors. 
All Rights Reserved.\n", - "# Modifications copyright (C) 2017 Uber Technologies, Inc.\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# http://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License.\n", - "# ==============================================================================\n", - "\"\"\"Basic word2vec example.\"\"\"\n", - "\n", - "from __future__ import absolute_import\n", - "from __future__ import division\n", - "from __future__ import print_function\n", - "\n", - "import collections\n", - "import math\n", - "import os\n", - "import random\n", - "import zipfile\n", - "import argparse\n", - "\n", - "import numpy as np\n", - "from six.moves import urllib\n", - "from six.moves import xrange # pylint: disable=redefined-builtin\n", - "import tensorflow as tf\n", - "import horovod.tensorflow as hvd\n", - "from azureml.core.run import Run\n", - "\n", - "# Horovod: initialize Horovod.\n", - "hvd.init()\n", - "\n", - "parser = argparse.ArgumentParser()\n", - "parser.add_argument('--data_dir', type=str, help='input directory')\n", - "\n", - "args = parser.parse_args()\n", - "\n", - "data_dir = args.data_dir\n", - "print(\"the input data_dir is %s\" % data_dir)\n", - "\n", - "# Step 1: Download the data.\n", - "url = 'http://mattmahoney.net/dc/text8.zip'\n", - "\n", - "\n", - "def maybe_download(filename, expected_bytes):\n", - " \"\"\"Download a file if not present, and make sure it's the right size.\"\"\"\n", - " if not filename:\n", - " filename = 
\"text8.zip\"\n", - " if not os.path.exists(filename):\n", - " print(\"Downloading the data from http://mattmahoney.net/dc/text8.zip\")\n", - " filename, _ = urllib.request.urlretrieve(url, filename)\n", - " else:\n", - " print(\"Use the data from the input data_dir %s\" % data_dir)\n", - " statinfo = os.stat(filename)\n", - " if statinfo.st_size == expected_bytes:\n", - " print('Found and verified', filename)\n", - " else:\n", - " print(statinfo.st_size)\n", - " raise Exception(\n", - " 'Failed to verify ' + url + '. Can you get to it with a browser?')\n", - " return filename\n", - "\n", - "filename = maybe_download(data_dir, 31344016)\n", - "\n", - "\n", - "# Read the data into a list of strings.\n", - "def read_data(filename):\n", - " \"\"\"Extract the first file enclosed in a zip file as a list of words.\"\"\"\n", - " with zipfile.ZipFile(filename) as f:\n", - " data = tf.compat.as_str(f.read(f.namelist()[0])).split()\n", - " return data\n", - "\n", - "vocabulary = read_data(filename)\n", - "print('Data size', len(vocabulary))\n", - "\n", - "# Step 2: Build the dictionary and replace rare words with UNK token.\n", - "vocabulary_size = 50000\n", - "\n", - "\n", - "def build_dataset(words, n_words):\n", - " \"\"\"Process raw inputs into a dataset.\"\"\"\n", - " count = [['UNK', -1]]\n", - " count.extend(collections.Counter(words).most_common(n_words - 1))\n", - " dictionary = dict()\n", - " for word, _ in count:\n", - " dictionary[word] = len(dictionary)\n", - " data = list()\n", - " unk_count = 0\n", - " for word in words:\n", - " if word in dictionary:\n", - " index = dictionary[word]\n", - " else:\n", - " index = 0 # dictionary['UNK']\n", - " unk_count += 1\n", - " data.append(index)\n", - " count[0][1] = unk_count\n", - " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n", - " return data, count, dictionary, reversed_dictionary\n", - "\n", - "data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,\n", - " 
vocabulary_size)\n", - "del vocabulary # Hint to reduce memory.\n", - "print('Most common words (+UNK)', count[:5])\n", - "print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])\n", - "\n", - "\n", - "# Step 3: Function to generate a training batch for the skip-gram model.\n", - "def generate_batch(batch_size, num_skips, skip_window):\n", - " assert num_skips <= 2 * skip_window\n", - " # Adjust batch_size to match num_skips\n", - " batch_size = batch_size // num_skips * num_skips\n", - " span = 2 * skip_window + 1 # [ skip_window target skip_window ]\n", - " # Backtrack a little bit to avoid skipping words in the end of a batch\n", - " data_index = random.randint(0, len(data) - span - 1)\n", - " batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n", - " labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n", - " buffer = collections.deque(maxlen=span)\n", - " for _ in range(span):\n", - " buffer.append(data[data_index])\n", - " data_index = (data_index + 1) % len(data)\n", - " for i in range(batch_size // num_skips):\n", - " target = skip_window # target label at the center of the buffer\n", - " targets_to_avoid = [skip_window]\n", - " for j in range(num_skips):\n", - " while target in targets_to_avoid:\n", - " target = random.randint(0, span - 1)\n", - " targets_to_avoid.append(target)\n", - " batch[i * num_skips + j] = buffer[skip_window]\n", - " labels[i * num_skips + j, 0] = buffer[target]\n", - " buffer.append(data[data_index])\n", - " data_index = (data_index + 1) % len(data)\n", - " return batch, labels\n", - "\n", - "batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n", - "for i in range(8):\n", - " print(batch[i], reverse_dictionary[batch[i]],\n", - " '->', labels[i, 0], reverse_dictionary[labels[i, 0]])\n", - "\n", - "# Step 4: Build and train a skip-gram model.\n", - "\n", - "max_batch_size = 128\n", - "embedding_size = 128 # Dimension of the embedding vector.\n", - "skip_window = 1 # How many words 
to consider left and right.\n", - "num_skips = 2 # How many times to reuse an input to generate a label.\n", - "\n", - "# We pick a random validation set to sample nearest neighbors. Here we limit the\n", - "# validation samples to the words that have a low numeric ID, which by\n", - "# construction are also the most frequent.\n", - "valid_size = 16 # Random set of words to evaluate similarity on.\n", - "valid_window = 100 # Only pick dev samples in the head of the distribution.\n", - "valid_examples = np.random.choice(valid_window, valid_size, replace=False)\n", - "num_sampled = 64 # Number of negative examples to sample.\n", - "\n", - "graph = tf.Graph()\n", - "\n", - "with graph.as_default():\n", - "\n", - " # Input data.\n", - " train_inputs = tf.placeholder(tf.int32, shape=[None])\n", - " train_labels = tf.placeholder(tf.int32, shape=[None, 1])\n", - " valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n", - "\n", - " # Look up embeddings for inputs.\n", - " embeddings = tf.Variable(\n", - " tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n", - " embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n", - "\n", - " # Construct the variables for the NCE loss\n", - " nce_weights = tf.Variable(\n", - " tf.truncated_normal([vocabulary_size, embedding_size],\n", - " stddev=1.0 / math.sqrt(embedding_size)))\n", - " nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n", - "\n", - " # Compute the average NCE loss for the batch.\n", - " # tf.nce_loss automatically draws a new sample of the negative labels each\n", - " # time we evaluate the loss.\n", - " loss = tf.reduce_mean(\n", - " tf.nn.nce_loss(weights=nce_weights,\n", - " biases=nce_biases,\n", - " labels=train_labels,\n", - " inputs=embed,\n", - " num_sampled=num_sampled,\n", - " num_classes=vocabulary_size))\n", - "\n", - " # Horovod: adjust learning rate based on number of GPUs.\n", - " optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size())\n", - "\n", - " # Horovod: 
add Horovod Distributed Optimizer.\n", - " optimizer = hvd.DistributedOptimizer(optimizer)\n", - "\n", - " train_op = optimizer.minimize(loss)\n", - "\n", - " # Compute the cosine similarity between minibatch examples and all embeddings.\n", - " norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n", - " normalized_embeddings = embeddings / norm\n", - " valid_embeddings = tf.nn.embedding_lookup(\n", - " normalized_embeddings, valid_dataset)\n", - " similarity = tf.matmul(\n", - " valid_embeddings, normalized_embeddings, transpose_b=True)\n", - "\n", - " # Add variable initializer.\n", - " init = tf.global_variables_initializer()\n", - "\n", - " # Horovod: broadcast initial variable states from rank 0 to all other processes.\n", - " # This is necessary to ensure consistent initialization of all workers when\n", - " # training is started with random weights or restored from a checkpoint.\n", - " bcast = hvd.broadcast_global_variables(0)\n", - "\n", - "# Step 5: Begin training.\n", - "\n", - "# Horovod: adjust number of steps based on number of GPUs.\n", - "num_steps = 4000 // hvd.size() + 1\n", - "\n", - "# Horovod: pin GPU to be used to process local rank (one GPU per process)\n", - "config = tf.ConfigProto()\n", - "config.gpu_options.allow_growth = True\n", - "config.gpu_options.visible_device_list = str(hvd.local_rank())\n", - "\n", - "with tf.Session(graph=graph, config=config) as session:\n", - " # We must initialize all variables before we use them.\n", - " init.run()\n", - " bcast.run()\n", - " print('Initialized')\n", - " run = Run.get_submitted_run()\n", - " average_loss = 0\n", - " for step in xrange(num_steps):\n", - " # simulate various sentence length by randomization\n", - " batch_size = random.randint(max_batch_size // 2, max_batch_size)\n", - " batch_inputs, batch_labels = generate_batch(\n", - " batch_size, num_skips, skip_window)\n", - " feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}\n", - "\n", - " # We 
perform one update step by evaluating the optimizer op (including it\n", - " # in the list of returned values for session.run()\n", - " _, loss_val = session.run([train_op, loss], feed_dict=feed_dict)\n", - " average_loss += loss_val\n", - "\n", - " if step % 2000 == 0:\n", - " if step > 0:\n", - " average_loss /= 2000\n", - " # The average loss is an estimate of the loss over the last 2000 batches.\n", - " print('Average loss at step ', step, ': ', average_loss)\n", - " run.log(\"Loss\", average_loss)\n", - " average_loss = 0\n", - " final_embeddings = normalized_embeddings.eval()\n", - "\n", - " # Evaluate similarity in the end on worker 0.\n", - " if hvd.rank() == 0:\n", - " sim = similarity.eval()\n", - " for i in xrange(valid_size):\n", - " valid_word = reverse_dictionary[valid_examples[i]]\n", - " top_k = 8 # number of nearest neighbors\n", - " nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n", - " log_str = 'Nearest to %s:' % valid_word\n", - " for k in xrange(top_k):\n", - " close_word = reverse_dictionary[nearest[k]]\n", - " log_str = '%s %s,' % (log_str, close_word)\n", - " print(log_str)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Upload http://mattmahoney.net/dc/text8.zip to the azure blob storage." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ds = ws.get_default_datastore()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import urllib\n", - "\n", - "os.makedirs('./data', exist_ok = True)\n", - "\n", - "urllib.request.urlretrieve('http://mattmahoney.net/dc/text8.zip', filename = './data/text8.zip')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ds.upload(src_dir = 'data', target_path = 'data', overwrite=True, show_progress = True)\n", - "\n", - "path_on_datastore = \"/data/text8.zip\"\n", - "ds_data = ds.path(path_on_datastore)\n", - "print(ds_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.dnn import *\n", - "script_params={\n", - " \"--data_dir\": ds_data\n", - "}\n", - "tf_estimator = TensorFlow(source_directory=project_folder,\n", - " compute_target=compute_target,\n", - " entry_script='word2vec.py',\n", - " script_params=script_params,\n", - " node_count=2,\n", - " process_count_per_node=1,\n", - " distributed_backend=\"mpi\",\n", - " use_gpu=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = experiment.submit(tf_estimator)\n", - "print(run)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.widgets import RunDetails\n", - "RunDetails(run).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [default]", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - 
"name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/training/51.distributed-tensorflow-with-parameter-server/51.distributed-tensorflow-with-parameter-server.ipynb b/training/51.distributed-tensorflow-with-parameter-server/51.distributed-tensorflow-with-parameter-server.ipynb deleted file mode 100644 index 55decdf03..000000000 --- a/training/51.distributed-tensorflow-with-parameter-server/51.distributed-tensorflow-with-parameter-server.ipynb +++ /dev/null @@ -1,473 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 51. Distributed TensorFlow using Parameter Server\n", - "In this tutorial we demonstrate how to use the Azure ML Training SDK to train Tensorflow model in a distributed manner using Parameter Server.\n", - "\n", - "# Prerequisites\n", - "\n", - "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check core SDK version number\n", - "import azureml.core\n", - "\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.workspace import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "import os\n", - "from azureml.core.experiment import Experiment\n", - "\n", - "username = getpass.getuser().replace('-','')\n", - "\n", - "# choose a name for the run history container in the workspace\n", - "run_history_name = username + '-tf_ps'\n", - "\n", - "experiment = Experiment(ws, run_history_name)\n", - "\n", - "# project folder name\n", - "project_folder = './' + run_history_name\n", - "\n", - "print(project_folder)\n", - "os.makedirs(project_folder, exist_ok = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This recipe is using a MLC-managed Batch AI cluster. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.compute import BatchAiCompute\n", - "from azureml.core.compute import ComputeTarget\n", - "\n", - "batchai_cluster_name='gpucluster'\n", - "\n", - "\n", - "try:\n", - " # Check for existing cluster\n", - " compute_target = ComputeTarget(ws,batchai_cluster_name)\n", - " print('Found existing compute target')\n", - "except:\n", - " # Else, create new one\n", - " print('Creating a new compute target...')\n", - " provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n", - " #vm_priority = 'lowpriority', # optional\n", - " autoscale_enabled = True,\n", - " cluster_min_nodes = 0, \n", - " cluster_max_nodes = 4)\n", - " compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n", - " # can poll for a minimum number of nodes and for a specific timeout. \n", - " # if no min node count is provided it will use the scale settings for the cluster\n", - " compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", - "\n", - " # For a more detailed view of current BatchAI cluster status, use the 'status' property \n", - "print(compute_target.status.serialize())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile {project_folder}/mnist_replica.py\n", - "\n", - "# Copyright 2016 The TensorFlow Authors. 
All Rights Reserved.\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# http://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License.\n", - "# ==============================================================================\n", - "\"\"\"Distributed MNIST training and validation, with model replicas.\n", - "A simple softmax model with one hidden layer is defined. The parameters\n", - "(weights and biases) are located on one parameter server (ps), while the ops\n", - "are executed on two worker nodes by default. The TF sessions also run on the\n", - "worker node.\n", - "Multiple invocations of this script can be done in parallel, with different\n", - "values for --task_index. There should be exactly one invocation with\n", - "--task_index, which will create a master session that carries out variable\n", - "initialization. The other, non-master, sessions will wait for the master\n", - "session to finish the initialization before proceeding to the training stage.\n", - "The coordination between the multiple worker invocations occurs due to\n", - "the definition of the parameters on the same ps devices. The parameter updates\n", - "from one worker is visible to all other workers. 
As such, the workers can\n", - "perform forward computation and gradient calculation in parallel, which\n", - "should lead to increased training speed for the simple model.\n", - "\"\"\"\n", - "\n", - "from __future__ import absolute_import\n", - "from __future__ import division\n", - "from __future__ import print_function\n", - "\n", - "import os\n", - "import math\n", - "import sys\n", - "import tempfile\n", - "import time\n", - "import json\n", - "\n", - "import tensorflow as tf\n", - "from tensorflow.examples.tutorials.mnist import input_data\n", - "from azureml.core.run import Run\n", - "\n", - "flags = tf.app.flags\n", - "flags.DEFINE_string(\"data_dir\", \"/tmp/mnist-data\",\n", - " \"Directory for storing mnist data\")\n", - "flags.DEFINE_boolean(\"download_only\", False,\n", - " \"Only perform downloading of data; Do not proceed to \"\n", - " \"session preparation, model definition or training\")\n", - "flags.DEFINE_integer(\"num_gpus\", 0, \"Total number of gpus for each machine.\"\n", - " \"If you don't use GPU, please set it to '0'\")\n", - "flags.DEFINE_integer(\"replicas_to_aggregate\", None,\n", - " \"Number of replicas to aggregate before parameter update \"\n", - " \"is applied (For sync_replicas mode only; default: \"\n", - " \"num_workers)\")\n", - "flags.DEFINE_integer(\"hidden_units\", 100,\n", - " \"Number of units in the hidden layer of the NN\")\n", - "flags.DEFINE_integer(\"train_steps\", 200,\n", - " \"Number of (global) training steps to perform\")\n", - "flags.DEFINE_integer(\"batch_size\", 100, \"Training batch size\")\n", - "flags.DEFINE_float(\"learning_rate\", 0.01, \"Learning rate\")\n", - "flags.DEFINE_boolean(\n", - " \"sync_replicas\", False,\n", - " \"Use the sync_replicas (synchronized replicas) mode, \"\n", - " \"wherein the parameter updates from workers are aggregated \"\n", - " \"before applied to avoid stale gradients\")\n", - "flags.DEFINE_boolean(\n", - " \"existing_servers\", False, \"Whether servers already exists. 
If True, \"\n", - " \"will use the worker hosts via their GRPC URLs (one client process \"\n", - " \"per worker host). Otherwise, will create an in-process TensorFlow \"\n", - " \"server.\")\n", - "\n", - "FLAGS = flags.FLAGS\n", - "\n", - "IMAGE_PIXELS = 28\n", - "\n", - "\n", - "def main(unused_argv):\n", - " data_root = os.path.join(\"outputs\", \"MNIST\")\n", - " mnist = None\n", - " tf_config = os.environ.get(\"TF_CONFIG\")\n", - " if not tf_config or tf_config == \"\":\n", - " raise ValueError(\"TF_CONFIG not found.\")\n", - " tf_config_json = json.loads(tf_config)\n", - " cluster = tf_config_json.get('cluster')\n", - " job_name = tf_config_json.get('task', {}).get('type')\n", - " task_index = tf_config_json.get('task', {}).get('index')\n", - " job_name = \"worker\" if job_name == \"master\" else job_name\n", - " sentinel_path = os.path.join(data_root, \"complete.txt\") \n", - " if job_name==\"worker\" and task_index==0:\n", - " mnist = input_data.read_data_sets(data_root, one_hot=True)\n", - " path = os.path.join(data_root, \"complete.txt\") \n", - " with open(sentinel_path, 'w+') as f:\n", - " f.write(\"download complete\")\n", - " else:\n", - " while not os.path.exists(sentinel_path):\n", - " time.sleep(0.01)\n", - " mnist = input_data.read_data_sets(data_root, one_hot=True)\n", - " \n", - " if FLAGS.download_only:\n", - " sys.exit(0)\n", - "\n", - " print(\"job name = %s\" % job_name)\n", - " print(\"task index = %d\" % task_index)\n", - " print(\"number of GPUs = %d\" % FLAGS.num_gpus)\n", - "\n", - " #Construct the cluster and start the server\n", - " cluster_spec = tf.train.ClusterSpec(cluster)\n", - " \n", - " # Get the number of workers.\n", - " num_workers = len(cluster_spec.task_indices(\"worker\"))\n", - "\n", - " if not FLAGS.existing_servers:\n", - " # Not using existing servers. 
Create an in-process server.\n", - " server = tf.train.Server(\n", - " cluster_spec, job_name=job_name, task_index=task_index)\n", - " if job_name == \"ps\":\n", - " server.join()\n", - "\n", - " is_chief = (task_index == 0)\n", - " if FLAGS.num_gpus > 0:\n", - " # Avoid gpu allocation conflict: now allocate task_num -> #gpu\n", - " # for each worker in the corresponding machine\n", - " gpu = (task_index % FLAGS.num_gpus)\n", - " worker_device = \"/job:worker/task:%d/gpu:%d\" % (task_index, gpu)\n", - " elif FLAGS.num_gpus == 0:\n", - " # Just allocate the CPU to worker server\n", - " cpu = 0\n", - " worker_device = \"/job:worker/task:%d/cpu:%d\" % (task_index, cpu)\n", - " # The device setter will automatically place Variables ops on separate\n", - " # parameter servers (ps). The non-Variable ops will be placed on the workers.\n", - " # The ps use CPU and workers use corresponding GPU\n", - " with tf.device(\n", - " tf.train.replica_device_setter(\n", - " worker_device=worker_device,\n", - " ps_device=\"/job:ps/cpu:0\",\n", - " cluster=cluster)):\n", - " global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n", - "\n", - " # Variables of the hidden layer\n", - " hid_w = tf.Variable(\n", - " tf.truncated_normal(\n", - " [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],\n", - " stddev=1.0 / IMAGE_PIXELS),\n", - " name=\"hid_w\")\n", - " hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name=\"hid_b\")\n", - "\n", - " # Variables of the softmax layer\n", - " sm_w = tf.Variable(\n", - " tf.truncated_normal(\n", - " [FLAGS.hidden_units, 10],\n", - " stddev=1.0 / math.sqrt(FLAGS.hidden_units)),\n", - " name=\"sm_w\")\n", - " sm_b = tf.Variable(tf.zeros([10]), name=\"sm_b\")\n", - "\n", - " # Ops: located on the worker specified with task_index\n", - " x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])\n", - " y_ = tf.placeholder(tf.float32, [None, 10])\n", - "\n", - " hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)\n", - " hid = 
tf.nn.relu(hid_lin)\n", - "\n", - " y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))\n", - " cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))\n", - "\n", - " opt = tf.train.AdamOptimizer(FLAGS.learning_rate)\n", - "\n", - " if FLAGS.sync_replicas:\n", - " if FLAGS.replicas_to_aggregate is None:\n", - " replicas_to_aggregate = num_workers\n", - " else:\n", - " replicas_to_aggregate = FLAGS.replicas_to_aggregate\n", - "\n", - " opt = tf.train.SyncReplicasOptimizer(\n", - " opt,\n", - " replicas_to_aggregate=replicas_to_aggregate,\n", - " total_num_replicas=num_workers,\n", - " name=\"mnist_sync_replicas\")\n", - "\n", - " train_step = opt.minimize(cross_entropy, global_step=global_step)\n", - "\n", - " if FLAGS.sync_replicas:\n", - " local_init_op = opt.local_step_init_op\n", - " if is_chief:\n", - " local_init_op = opt.chief_init_op\n", - "\n", - " ready_for_local_init_op = opt.ready_for_local_init_op\n", - "\n", - " # Initial token and chief queue runners required by the sync_replicas mode\n", - " chief_queue_runner = opt.get_chief_queue_runner()\n", - " sync_init_op = opt.get_init_tokens_op()\n", - "\n", - " init_op = tf.global_variables_initializer()\n", - " train_dir = tempfile.mkdtemp()\n", - "\n", - " if FLAGS.sync_replicas:\n", - " sv = tf.train.Supervisor(\n", - " is_chief=is_chief,\n", - " logdir=train_dir,\n", - " init_op=init_op,\n", - " local_init_op=local_init_op,\n", - " ready_for_local_init_op=ready_for_local_init_op,\n", - " recovery_wait_secs=1,\n", - " global_step=global_step)\n", - " else:\n", - " sv = tf.train.Supervisor(\n", - " is_chief=is_chief,\n", - " logdir=train_dir,\n", - " init_op=init_op,\n", - " recovery_wait_secs=1,\n", - " global_step=global_step)\n", - "\n", - " sess_config = tf.ConfigProto(\n", - " allow_soft_placement=True,\n", - " log_device_placement=False,\n", - " device_filters=[\"/job:ps\",\n", - " \"/job:worker/task:%d\" % task_index])\n", - "\n", - " # The chief worker (task_index==0) session 
will prepare the session,\n", - " # while the remaining workers will wait for the preparation to complete.\n", - " if is_chief:\n", - " print(\"Worker %d: Initializing session...\" % task_index)\n", - " else:\n", - " print(\"Worker %d: Waiting for session to be initialized...\" %\n", - " task_index)\n", - "\n", - " if FLAGS.existing_servers:\n", - " server_grpc_url = \"grpc://\" + worker_spec[task_index]\n", - " print(\"Using existing server at: %s\" % server_grpc_url)\n", - "\n", - " sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)\n", - " else:\n", - " sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)\n", - "\n", - " print(\"Worker %d: Session initialization complete.\" % task_index)\n", - "\n", - " if FLAGS.sync_replicas and is_chief:\n", - " # Chief worker will start the chief queue runner and call the init op.\n", - " sess.run(sync_init_op)\n", - " sv.start_queue_runners(sess, [chief_queue_runner])\n", - "\n", - " # Perform training\n", - " time_begin = time.time()\n", - " print(\"Training begins @ %f\" % time_begin)\n", - "\n", - " local_step = 0\n", - " while True:\n", - " # Training feed\n", - " batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)\n", - " train_feed = {x: batch_xs, y_: batch_ys}\n", - "\n", - " _, step = sess.run([train_step, global_step], feed_dict=train_feed)\n", - " local_step += 1\n", - "\n", - " now = time.time()\n", - " print(\"%f: Worker %d: training step %d done (global step: %d)\" %\n", - " (now, task_index, local_step, step))\n", - "\n", - " if step >= FLAGS.train_steps:\n", - " break\n", - "\n", - " time_end = time.time()\n", - " print(\"Training ends @ %f\" % time_end)\n", - " training_time = time_end - time_begin\n", - " print(\"Training elapsed time: %f s\" % training_time)\n", - "\n", - " # Validation feed\n", - " val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}\n", - " val_xent = sess.run(cross_entropy, feed_dict=val_feed)\n", - " print(\"After %d 
training step(s), validation cross entropy = %g\" %\n", - " (FLAGS.train_steps, val_xent))\n", - " if job_name==\"worker\" and task_index==0:\n", - " run = Run.get_submitted_run()\n", - " run.log(\"CrossEntropy\", val_xent)\n", - "\n", - "if __name__ == \"__main__\":\n", - " tf.app.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.dnn import *\n", - "tf_estimator = TensorFlow(source_directory=project_folder,\n", - " compute_target=compute_target,\n", - " entry_script='mnist_replica.py',\n", - " node_count=2,\n", - " worker_count=2,\n", - " parameter_server_count=1, \n", - " distributed_backend=\"ps\",\n", - " use_gpu=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = experiment.submit(tf_estimator)\n", - "print(run)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.widgets import RunDetails\n", - "RunDetails(run).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/training/52.distributed-cntk/52.distributed-cntk.ipynb b/training/52.distributed-cntk/52.distributed-cntk.ipynb deleted file mode 100644 index 38c566875..000000000 --- a/training/52.distributed-cntk/52.distributed-cntk.ipynb +++ /dev/null @@ -1,509 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 52. Distributed CNTK\n", - "In this tutorial we demonstrate how to use the Azure ML Training SDK to train CNTK model in a distributed manner.\n", - "\n", - "# Prerequisites\n", - "\n", - "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check core SDK version number\n", - "import azureml.core\n", - "\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.workspace import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import getpass\n", - "import os\n", - "from azureml.core.experiment import Experiment\n", - "\n", - "username = getpass.getuser().replace('-','')\n", - "\n", - "# choose a name for the run history container in the workspace\n", - "run_history_name = username + '-cntk-distrib'\n", - "\n", - "experiment = Experiment(ws, run_history_name)\n", - "\n", - "# project folder name\n", - "project_folder = './' + run_history_name\n", - "\n", - "print(project_folder)\n", - "os.makedirs(project_folder, exist_ok = True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This recipe is using a MLC-managed Batch AI cluster. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.compute import BatchAiCompute\n", - "from azureml.core.compute import ComputeTarget\n", - "\n", - "batchai_cluster_name='gpucluster'\n", - "\n", - "\n", - "try:\n", - " # Check for existing cluster\n", - " compute_target = ComputeTarget(ws,batchai_cluster_name)\n", - " print('Found existing compute target')\n", - "except:\n", - " # Else, create new one\n", - " print('Creating a new compute target...')\n", - " provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n", - " #vm_priority = 'lowpriority', # optional\n", - " autoscale_enabled = True,\n", - " cluster_min_nodes = 0, \n", - " cluster_max_nodes = 4)\n", - " compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n", - " # can poll for a minimum number of nodes and for a specific timeout. \n", - " # if no min node count is provided it will use the scale settings for the cluster\n", - " compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", - "\n", - " # For a more detailed view of current BatchAI cluster status, use the 'status' property \n", - "print(compute_target.status.serialize())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile {project_folder}/cntk_mnist.py\n", - "\n", - "# This code is adapted from CNTK MNIST tutorials: \n", - "# 1. https://github.com/Microsoft/CNTK/blob/v2.0/Tutorials/CNTK_103A_MNIST_DataLoader.ipynb\n", - "# 2. 
https://github.com/Microsoft/CNTK/blob/v2.0/Tutorials/CNTK_103C_MNIST_MultiLayerPerceptron.ipynb\n", - "\n", - "# Import the relevant modules to be used later\n", - "from __future__ import print_function\n", - "import gzip\n", - "import numpy as np\n", - "import os\n", - "import shutil\n", - "import struct\n", - "import sys\n", - "import time\n", - "import pandas \n", - "\n", - "import cntk as C\n", - "from azureml.core.run import Run\n", - "import argparse\n", - "\n", - "run = Run.get_submitted_run()\n", - "\n", - "parser=argparse.ArgumentParser()\n", - "\n", - "parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate')\n", - "parser.add_argument('--num_hidden_layers', type=int, default=2, help='number of hidden layers')\n", - "parser.add_argument('--minibatch_size', type=int, default=64, help='minibatchsize')\n", - "\n", - "args=parser.parse_args() \n", - "\n", - "# Functions to load MNIST images and unpack into train and test set.\n", - "# - loadData reads image data and formats into a 28x28 long array\n", - "# - loadLabels reads the corresponding labels data, 1 for each image\n", - "# - load packs the downloaded image and labels data into a combined format to be read later by \n", - "# CNTK text reader \n", - "def loadData(src, cimg):\n", - " print ('Downloading ' + src)\n", - " gzfname, h = urlretrieve(src, './delete.me')\n", - " print ('Done.')\n", - " try:\n", - " with gzip.open(gzfname) as gz:\n", - " n = struct.unpack('I', gz.read(4))\n", - " # Read magic number.\n", - " if n[0] != 0x3080000:\n", - " raise Exception('Invalid file: unexpected magic number.')\n", - " # Read number of entries.\n", - " n = struct.unpack('>I', gz.read(4))[0]\n", - " if n != cimg:\n", - " raise Exception('Invalid file: expected {0} entries.'.format(cimg))\n", - " crow = struct.unpack('>I', gz.read(4))[0]\n", - " ccol = struct.unpack('>I', gz.read(4))[0]\n", - " if crow != 28 or ccol != 28:\n", - " raise Exception('Invalid file: expected 28 rows/cols 
per image.')\n", - " # Read data.\n", - " res = np.fromstring(gz.read(cimg * crow * ccol), dtype = np.uint8)\n", - " finally:\n", - " os.remove(gzfname)\n", - " return res.reshape((cimg, crow * ccol))\n", - "\n", - "def loadLabels(src, cimg):\n", - " print ('Downloading ' + src)\n", - " gzfname, h = urlretrieve(src, './delete.me')\n", - " print ('Done.')\n", - " try:\n", - " with gzip.open(gzfname) as gz:\n", - " n = struct.unpack('I', gz.read(4))\n", - " # Read magic number.\n", - " if n[0] != 0x1080000:\n", - " raise Exception('Invalid file: unexpected magic number.')\n", - " # Read number of entries.\n", - " n = struct.unpack('>I', gz.read(4))\n", - " if n[0] != cimg:\n", - " raise Exception('Invalid file: expected {0} rows.'.format(cimg))\n", - " # Read labels.\n", - " res = np.fromstring(gz.read(cimg), dtype = np.uint8)\n", - " finally:\n", - " os.remove(gzfname)\n", - " return res.reshape((cimg, 1))\n", - "\n", - "def try_download(dataSrc, labelsSrc, cimg):\n", - " data = loadData(dataSrc, cimg)\n", - " labels = loadLabels(labelsSrc, cimg)\n", - " return np.hstack((data, labels))\n", - "\n", - "# Save the data files into a format compatible with CNTK text reader\n", - "def savetxt(filename, ndarray):\n", - " dir = os.path.dirname(filename)\n", - "\n", - " if not os.path.exists(dir):\n", - " os.makedirs(dir)\n", - "\n", - " if not os.path.isfile(filename):\n", - " print(\"Saving\", filename )\n", - " with open(filename, 'w') as f:\n", - " labels = list(map(' '.join, np.eye(10, dtype=np.uint).astype(str)))\n", - " for row in ndarray:\n", - " row_str = row.astype(str)\n", - " label_str = labels[row[-1]]\n", - " feature_str = ' '.join(row_str[:-1])\n", - " f.write('|labels {} |features {}\\n'.format(label_str, feature_str))\n", - " else:\n", - " print(\"File already exists\", filename)\n", - "\n", - "# Read a CTF formatted text (as mentioned above) using the CTF deserializer from a file\n", - "def create_reader(path, is_training, input_dim, 
num_label_classes):\n", - " return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(\n", - " labels = C.io.StreamDef(field='labels', shape=num_label_classes, is_sparse=False),\n", - " features = C.io.StreamDef(field='features', shape=input_dim, is_sparse=False)\n", - " )), randomize = is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)\n", - "\n", - "# Defines a utility that prints the training progress\n", - "def print_training_progress(trainer, mb, frequency, verbose=1):\n", - " training_loss = \"NA\"\n", - " eval_error = \"NA\"\n", - "\n", - " if mb%frequency == 0:\n", - " training_loss = trainer.previous_minibatch_loss_average\n", - " eval_error = trainer.previous_minibatch_evaluation_average\n", - " if verbose: \n", - " print (\"Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%\".format(mb, training_loss, eval_error*100))\n", - " \n", - " return mb, training_loss, eval_error\n", - "\n", - "# Create the network architecture\n", - "def create_model(features):\n", - " with C.layers.default_options(init = C.layers.glorot_uniform(), activation = C.ops.relu):\n", - " h = features\n", - " for _ in range(num_hidden_layers):\n", - " h = C.layers.Dense(hidden_layers_dim)(h)\n", - " r = C.layers.Dense(num_output_classes, activation = None)(h)\n", - " return r\n", - "\n", - "\n", - "if __name__ == '__main__':\n", - " run = Run.get_submitted_run()\n", - "\n", - " try: \n", - " from urllib.request import urlretrieve \n", - " except ImportError: \n", - " from urllib import urlretrieve\n", - "\n", - " # Select the right target device when this script is being used:\n", - " if 'TEST_DEVICE' in os.environ:\n", - " if os.environ['TEST_DEVICE'] == 'cpu':\n", - " C.device.try_set_default_device(C.device.cpu())\n", - " else:\n", - " C.device.try_set_default_device(C.device.gpu(0))\n", - "\n", - " # URLs for the train image and labels data\n", - " url_train_image = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'\n", - " url_train_labels 
= 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'\n", - " num_train_samples = 60000\n", - "\n", - " print(\"Downloading train data\")\n", - " train = try_download(url_train_image, url_train_labels, num_train_samples)\n", - "\n", - " url_test_image = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'\n", - " url_test_labels = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'\n", - " num_test_samples = 10000\n", - "\n", - " print(\"Downloading test data\")\n", - " test = try_download(url_test_image, url_test_labels, num_test_samples)\n", - "\n", - "\n", - " # Save the train and test files (prefer our default path for the data\n", - " rank = os.environ.get(\"OMPI_COMM_WORLD_RANK\") \n", - " data_dir = os.path.join(\"outputs\", \"MNIST\")\n", - " sentinel_path = os.path.join(data_dir, \"complete.txt\") \n", - " if rank == '0': \n", - " print ('Writing train text file...')\n", - " savetxt(os.path.join(data_dir, \"Train-28x28_cntk_text.txt\"), train)\n", - "\n", - " print ('Writing test text file...')\n", - " savetxt(os.path.join(data_dir, \"Test-28x28_cntk_text.txt\"), test)\n", - " with open(sentinel_path, 'w+') as f:\n", - " f.write(\"download complete\")\n", - "\n", - " print('Done with downloading data.')\n", - " else:\n", - " while not os.path.exists(sentinel_path):\n", - " time.sleep(0.01)\n", - " \n", - "\n", - " # Ensure we always get the same amount of randomness\n", - " np.random.seed(0)\n", - "\n", - " # Define the data dimensions\n", - " input_dim = 784\n", - " num_output_classes = 10\n", - "\n", - " # Ensure the training and test data is generated and available for this tutorial.\n", - " # We search in two locations in the toolkit for the cached MNIST data set.\n", - " data_found = False\n", - " for data_dir in [os.path.join(\"..\", \"Examples\", \"Image\", \"DataSets\", \"MNIST\"),\n", - " os.path.join(\"data_\" + str(rank), \"MNIST\"),\n", - " os.path.join(\"outputs\", \"MNIST\")]:\n", - " train_file = 
os.path.join(data_dir, \"Train-28x28_cntk_text.txt\")\n", - " test_file = os.path.join(data_dir, \"Test-28x28_cntk_text.txt\")\n", - " if os.path.isfile(train_file) and os.path.isfile(test_file):\n", - " data_found = True\n", - " break\n", - " if not data_found:\n", - " raise ValueError(\"Please generate the data by completing CNTK 103 Part A\")\n", - " print(\"Data directory is {0}\".format(data_dir))\n", - "\n", - " num_hidden_layers = args.num_hidden_layers\n", - " hidden_layers_dim = 400\n", - "\n", - " input = C.input_variable(input_dim)\n", - " label = C.input_variable(num_output_classes)\n", - "\n", - " \n", - " z = create_model(input)\n", - " # Scale the input to 0-1 range by dividing each pixel by 255.\n", - " z = create_model(input/255.0)\n", - "\n", - " loss = C.cross_entropy_with_softmax(z, label)\n", - " label_error = C.classification_error(z, label)\n", - "\n", - "\n", - " # Instantiate the trainer object to drive the model training\n", - " learning_rate = args.learning_rate\n", - " lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)\n", - " learner = C.sgd(z.parameters, lr_schedule)\n", - " trainer = C.Trainer(z, (loss, label_error), [learner])\n", - "\n", - "\n", - " # Initialize the parameters for the trainer\n", - " minibatch_size = args.minibatch_size\n", - " num_samples_per_sweep = 60000\n", - " num_sweeps_to_train_with = 10\n", - " num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size\n", - "\n", - " # Create the reader to training data set\n", - " reader_train = create_reader(train_file, True, input_dim, num_output_classes)\n", - "\n", - " # Map the data streams to the input and labels.\n", - " input_map = {\n", - " label : reader_train.streams.labels,\n", - " input : reader_train.streams.features\n", - " } \n", - "\n", - " # Run the trainer on and perform model training\n", - " training_progress_output_freq = 500\n", - " \n", - " errors = []\n", - " losses = []\n", - " for i in 
range(0, int(num_minibatches_to_train)): \n", - " # Read a mini batch from the training data file\n", - " data = reader_train.next_minibatch(minibatch_size, input_map = input_map)\n", - " \n", - " trainer.train_minibatch(data)\n", - " batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1)\n", - " if (error != 'NA') and (loss != 'NA'):\n", - " errors.append(float(error))\n", - " losses.append(float(loss))\n", - " \n", - " # log the losses\n", - " if rank == '0': \n", - " run.log_list(\"Loss\", losses)\n", - " run.log_list(\"Error\",errors)\n", - "\n", - " # Read the training data\n", - " reader_test = create_reader(test_file, False, input_dim, num_output_classes)\n", - "\n", - " test_input_map = {\n", - " label : reader_test.streams.labels,\n", - " input : reader_test.streams.features,\n", - " }\n", - "\n", - " # Test data for trained model\n", - " test_minibatch_size = 512\n", - " num_samples = 10000\n", - " num_minibatches_to_test = num_samples // test_minibatch_size\n", - " test_result = 0.0\n", - "\n", - " \n", - " for i in range(num_minibatches_to_test): \n", - " # We are loading test data in batches specified by test_minibatch_size\n", - " # Each data point in the minibatch is a MNIST digit image of 784 dimensions \n", - " # with one pixel per dimension that we will encode / decode with the \n", - " # trained model.\n", - " data = reader_test.next_minibatch(test_minibatch_size,\n", - " input_map = test_input_map)\n", - "\n", - " eval_error = trainer.test_minibatch(data)\n", - " test_result = test_result + eval_error\n", - " \n", - "\n", - " # Average of evaluation errors of all test minibatches\n", - " print(\"Average test error: {0:.2f}%\".format(test_result*100 / num_minibatches_to_test))\n", - "\n", - " out = C.softmax(z)\n", - "\n", - " # Read the data for evaluation\n", - " reader_eval = create_reader(test_file, False, input_dim, num_output_classes)\n", - "\n", - " eval_minibatch_size = 25\n", - " 
eval_input_map = {input: reader_eval.streams.features} \n", - "\n", - " data = reader_test.next_minibatch(eval_minibatch_size, input_map = test_input_map)\n", - "\n", - " img_label = data[label].asarray()\n", - " img_data = data[input].asarray()\n", - " predicted_label_prob = [out.eval(img_data[i]) for i in range(len(img_data))]\n", - "\n", - " # Find the index with the maximum value for both predicted as well as the ground truth\n", - " pred = [np.argmax(predicted_label_prob[i]) for i in range(len(predicted_label_prob))]\n", - " gtlabel = [np.argmax(img_label[i]) for i in range(len(img_label))]\n", - "\n", - " print(\"Label :\", gtlabel[:25])\n", - " print(\"Predicted:\", pred)\n", - " \n", - " # save model to outputs folder\n", - " z.save('outputs/cntk.model')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.estimator import *\n", - "pip_packages=['cntk==2.5.1', 'pandas==0.23.4']\n", - "cntk_estimator = Estimator(source_directory=project_folder,\n", - " compute_target=compute_target,\n", - " entry_script='cntk_mnist.py',\n", - " node_count=2,\n", - " process_count_per_node=1,\n", - " distributed_backend=\"mpi\", \n", - " pip_packages=pip_packages,\n", - " custom_docker_base_image=\"microsoft/mmlspark:0.12\",\n", - " use_gpu=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run = experiment.submit(cntk_estimator)\n", - "print(run)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.widgets import RunDetails\n", - "RunDetails(run).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/training/53.distributed-pytorch-with-horovod/53.distributed-pytorch-with-horovod.ipynb b/training/53.distributed-pytorch-with-horovod/53.distributed-pytorch-with-horovod.ipynb deleted file mode 100644 index 46db6dab0..000000000 --- a/training/53.distributed-pytorch-with-horovod/53.distributed-pytorch-with-horovod.ipynb +++ /dev/null @@ -1,376 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PyTorch Distributed Demo\n", - "\n", - "In this demo, we will run a sample PyTorch job using Horovod on a multi-node Batch AI cluster." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check core SDK version number\n", - "import azureml.core\n", - "\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Workspace\n", - "\n", - "Initialize a workspace object from persisted configuration." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.workspace import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set experiment name and create project\n", - "Choose a name for your run history container in the workspace, and create a folder for the project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "experiment_name = 'pytorch-dist-hvd'\n", - "\n", - "# project folder\n", - "project_folder = './sample_projects/pytorch-dist-hvd'\n", - "os.makedirs(project_folder, exist_ok = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Write demo PyTorch code\n", - "\n", - "We will use a distributed PyTorch implementation of the classic MNIST problem. The following cell writes the main implementation to the project folder." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile {project_folder}/pytorch_horovod_mnist.py\n", - "\n", - "from __future__ import print_function\n", - "import argparse\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "from torchvision import datasets, transforms\n", - "from torch.autograd import Variable\n", - "import torch.utils.data.distributed\n", - "import horovod.torch as hvd\n", - "\n", - "# Training settings\n", - "parser = argparse.ArgumentParser(description='PyTorch MNIST Example')\n", - "parser.add_argument('--batch-size', type=int, default=64, metavar='N',\n", - " help='input batch size for training (default: 64)')\n", - "parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',\n", - " help='input batch size for testing (default: 1000)')\n", - "parser.add_argument('--epochs', type=int, default=10, metavar='N',\n", - " help='number of epochs to train (default: 10)')\n", - "parser.add_argument('--lr', type=float, default=0.01, metavar='LR',\n", - " help='learning rate (default: 0.01)')\n", - "parser.add_argument('--momentum', type=float, default=0.5, metavar='M',\n", - " help='SGD momentum (default: 0.5)')\n", - "parser.add_argument('--no-cuda', action='store_true', default=False,\n", - " help='disables CUDA training')\n", - "parser.add_argument('--seed', type=int, default=42, metavar='S',\n", - " help='random seed (default: 42)')\n", - "parser.add_argument('--log-interval', type=int, default=10, metavar='N',\n", - " help='how many batches to wait before logging training status')\n", - "args = parser.parse_args()\n", - "args.cuda = not args.no_cuda and torch.cuda.is_available()\n", - "\n", - "hvd.init()\n", - "torch.manual_seed(args.seed)\n", - "\n", - "if args.cuda:\n", - " # Horovod: pin GPU to local rank.\n", - " torch.cuda.set_device(hvd.local_rank())\n", - " torch.cuda.manual_seed(args.seed)\n", 
- "\n", - "\n", - "kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}\n", - "train_dataset = \\\n", - " datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,\n", - " transform=transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ]))\n", - "train_sampler = torch.utils.data.distributed.DistributedSampler(\n", - " train_dataset, num_replicas=hvd.size(), rank=hvd.rank())\n", - "train_loader = torch.utils.data.DataLoader(\n", - " train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)\n", - "\n", - "test_dataset = \\\n", - " datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ]))\n", - "test_sampler = torch.utils.data.distributed.DistributedSampler(\n", - " test_dataset, num_replicas=hvd.size(), rank=hvd.rank())\n", - "test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size,\n", - " sampler=test_sampler, **kwargs)\n", - "\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n", - " self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n", - " self.conv2_drop = nn.Dropout2d()\n", - " self.fc1 = nn.Linear(320, 50)\n", - " self.fc2 = nn.Linear(50, 10)\n", - "\n", - " def forward(self, x):\n", - " x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", - " x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n", - " x = x.view(-1, 320)\n", - " x = F.relu(self.fc1(x))\n", - " x = F.dropout(x, training=self.training)\n", - " x = self.fc2(x)\n", - " return F.log_softmax(x)\n", - "\n", - "\n", - "model = Net()\n", - "\n", - "if args.cuda:\n", - " # Move model to GPU.\n", - " model.cuda()\n", - "\n", - "# Horovod: broadcast parameters.\n", - "hvd.broadcast_parameters(model.state_dict(), root_rank=0)\n", - "\n", - "# Horovod: scale 
learning rate by the number of GPUs.\n", - "optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),\n", - " momentum=args.momentum)\n", - "\n", - "# Horovod: wrap optimizer with DistributedOptimizer.\n", - "optimizer = hvd.DistributedOptimizer(\n", - " optimizer, named_parameters=model.named_parameters())\n", - "\n", - "\n", - "def train(epoch):\n", - " model.train()\n", - " train_sampler.set_epoch(epoch)\n", - " for batch_idx, (data, target) in enumerate(train_loader):\n", - " if args.cuda:\n", - " data, target = data.cuda(), target.cuda()\n", - " data, target = Variable(data), Variable(target)\n", - " optimizer.zero_grad()\n", - " output = model(data)\n", - " loss = F.nll_loss(output, target)\n", - " loss.backward()\n", - " optimizer.step()\n", - " if batch_idx % args.log_interval == 0:\n", - " print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n", - " epoch, batch_idx * len(data), len(train_sampler),\n", - " 100. * batch_idx / len(train_loader), loss.data[0]))\n", - "\n", - "\n", - "def metric_average(val, name):\n", - " tensor = torch.FloatTensor([val])\n", - " avg_tensor = hvd.allreduce(tensor, name=name)\n", - " return avg_tensor[0]\n", - "\n", - "\n", - "def test():\n", - " model.eval()\n", - " test_loss = 0.\n", - " test_accuracy = 0.\n", - " for data, target in test_loader:\n", - " if args.cuda:\n", - " data, target = data.cuda(), target.cuda()\n", - " data, target = Variable(data, volatile=True), Variable(target)\n", - " output = model(data)\n", - " # sum up batch loss\n", - " test_loss += F.nll_loss(output, target, size_average=False).data[0]\n", - " # get the index of the max log-probability\n", - " pred = output.data.max(1, keepdim=True)[1]\n", - " test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum()\n", - "\n", - " test_loss /= len(test_sampler)\n", - " test_accuracy /= len(test_sampler)\n", - "\n", - " test_loss = metric_average(test_loss, 'avg_loss')\n", - " test_accuracy = metric_average(test_accuracy, 
'avg_accuracy')\n", - "\n", - " if hvd.rank() == 0:\n", - " print('\\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\\n'.format(\n", - " test_loss, 100. * test_accuracy))\n", - "\n", - "\n", - "for epoch in range(1, args.epochs + 1):\n", - " train(epoch)\n", - " test()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy Batch AI cluster\n", - "\n", - "To run this in a distributed context, we'll need a Batch AI cluster with at least two nodes.\n", - "\n", - "Here, we use exactly two CPU nodes, to conserve resources. If you want to try it with some other number or SKU, just change the relevant values in the following code block." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.compute import BatchAiCompute\n", - "from azureml.core.compute import ComputeTarget\n", - "\n", - "batchai_cluster_name='gpucluster'\n", - "\n", - "\n", - "try:\n", - " # Check for existing cluster\n", - " compute_target = ComputeTarget(ws,batchai_cluster_name)\n", - " print('Found existing compute target')\n", - "except:\n", - " # Else, create new one\n", - " print('Creating a new compute target...')\n", - " provisioning_config = BatchAiCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # NC6 is GPU-enabled\n", - " #vm_priority = 'lowpriority', # optional\n", - " autoscale_enabled = True,\n", - " cluster_min_nodes = 0, \n", - " cluster_max_nodes = 4)\n", - " compute_target = ComputeTarget.create(ws, batchai_cluster_name, provisioning_config)\n", - " # can poll for a minimum number of nodes and for a specific timeout. 
\n", - " # if no min node count is provided it will use the scale settings for the cluster\n", - " compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n", - "\n", - " # For a more detailed view of current BatchAI cluster status, use the 'status' property \n", - "print(compute_target.status.serialize())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Submit job\n", - "\n", - "Now that we have a cluster ready to go, let's submit our job.\n", - "\n", - "We need to use a custom estimator here, and specify that we want the `pytorch`, `horovod` and `torchvision` packages installed to our image." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.dnn import PyTorch\n", - "\n", - "estimator = PyTorch(source_directory=project_folder,\n", - " compute_target=compute_target,\n", - " entry_script='pytorch_horovod_mnist.py',\n", - " node_count=2,\n", - " process_count_per_node=1,\n", - " distributed_backend=\"mpi\",\n", - " use_gpu=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.experiment import Experiment\n", - "\n", - "experiment = Experiment(workspace=ws, name=experiment_name)\n", - "run = experiment.submit(estimator)\n", - "print(run)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.train.widgets import RunDetails\n", - "RunDetails(run).show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - } - }, - "nbformat": 4, - 
"nbformat_minor": 2 -} diff --git a/tutorials/01.train-models.ipynb b/tutorials/01.train-models.ipynb index e8a1e896b..916145955 100644 --- a/tutorials/01.train-models.ipynb +++ b/tutorials/01.train-models.ipynb @@ -121,7 +121,9 @@ "\n", "Azure Azure ML Managed Compute is a managed service that enables data scientists to train machine learning models on clusters of Azure virtual machines, including VMs with GPU support. In this tutorial, you create an Azure Managed Compute cluster as your training environment. This code creates a cluster for you if it does not already exist in your workspace. \n", "\n", - " **Creation of the cluster takes approximately 5 minutes.** If the cluster is already in the workspace this code uses it and skips the creation process." + " **Creation of the cluster takes approximately 5 minutes.** If the cluster is already in the workspace this code uses it and skips the creation process.\n", + "\n", + "**Note**: As with other Azure services, there are limits on certain resources (for eg. BatchAI cluster size) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota." ] }, {