@@ -195,9 +195,10 @@
"metadata": {},
"outputs": [],
"source": [
" '''\n",
"'''\n",
" from azureml.core.compute import RemoteCompute \n",
" dsvm_compute = RemoteCompute.attach(ws,name=\"attach-from-sdk6\",username=<username>,address=<ipaddress>,ssh_port=22,password=<password>)\n",
" # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase \n",
" dsvm_compute = RemoteCompute.attach(ws,name=\"attach-from-sdk6\",username=<username>,address=<ipaddress>,ssh_port=22,password=<password>)\n",
"'''"
]
},
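The commented-out snippet above notes that `private_key_file` and `private_key_passphrase` can be used in place of the username/password pair. A minimal sketch of that key-based variant (the key path and the `<...>` placeholders are illustrative, not values from this repo):

```python
# Sketch only: attach a remote Linux VM over SSH with a key file instead of a password.
# ws is the Workspace created earlier; the key path and <...> placeholders are assumptions.
from azureml.core.compute import RemoteCompute

dsvm_compute = RemoteCompute.attach(ws,
                                    name="attach-from-sdk6",
                                    username="<username>",
                                    address="<ipaddress>",
                                    ssh_port=22,
                                    private_key_file="~/.ssh/id_rsa",
                                    private_key_passphrase="<passphrase>")
dsvm_compute.wait_for_completion(show_output=True)
```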
288 changes: 48 additions & 240 deletions 01.getting-started/05.train-in-spark/05.train-in-spark.ipynb
@@ -15,11 +15,9 @@
"source": [
"# 05. Train in Spark\n",
"* Create Workspace\n",
"* Create Project\n",
"* Create `train-spark.py` file in the project folder\n",
"* Execute a PySpark script in ACI.\n",
"* Execute a PySpark script in a Docker container on remote DSVM\n",
"* Execute a PySpark script in HDI"
"* Create Experiment\n",
"* Copy relevant files to the script folder\n",
"* Configure and Run"
]
},
{
@@ -67,8 +65,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Project and Associate with Run History\n",
"**Project** is a local folder that contains files for your Azure ML experiments. It is associated with a **run history**, a cloud container of run metrics and output artifacts from your experiments. You can either attach a local folder as a new project, or load a local folder as a project if it has been attached before."
"## Create Experiment\n"
]
},
{
@@ -77,27 +74,15 @@
"metadata": {},
"outputs": [],
"source": [
"# choose a name for the run history container in the workspace\n",
"experiment_name = 'train-on-spark'\n",
"experiment_name = 'train-on-remote-vm'\n",
"script_folder = './samples/train-on-remote-vm'\n",
"\n",
"# project folder\n",
"project_folder = './sample_projects/train-on-spark'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.project.project import Project\n",
"os.makedirs(script_folder, exist_ok = True)\n",
"\n",
"project = Project.attach(workspace_object = ws,\n",
" experiment_name = experiment_name,\n",
" directory = project_folder)\n",
"from azureml.core import Experiment\n",
"\n",
"print(project.project_directory, project.history.name, sep = '\\n')"
"exp = Experiment(workspace = ws, name = experiment_name)"
]
},
{
@@ -119,11 +104,11 @@
"from shutil import copyfile\n",
"\n",
"# copy iris dataset in to project folder\n",
"copyfile('./iris.csv', os.path.join(project_folder, 'iris.csv'))\n",
"copyfile('iris.csv', os.path.join(script_folder, 'iris.csv'))\n",
"\n",
"# copy train-spark.py file into project folder\n",
"# train-spark.py trains a simple LogisticRegression model using Spark.ML algorithm\n",
"copyfile('./train-spark.py', os.path.join(project_folder, 'train-spark.py'))"
"copyfile('train-spark.py', os.path.join(script_folder, 'train-spark.py'))"
]
},
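The train-spark.py script itself is not part of this diff; purely as an illustration of the kind of Spark ML LogisticRegression training script the comment above describes (column names, file layout and hyperparameters are assumptions), it might look roughly like:

```python
# Illustrative sketch only -- not the repository's train-spark.py.
# Assumes iris.csv sits next to the script, with four numeric feature columns and a string label.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName("train-spark").getOrCreate()
df = spark.read.csv("iris.csv", header=True, inferSchema=True)

# Assemble the assumed feature columns into a single vector and index the label column.
features = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features").transform(df)
data = StringIndexer(inputCol="species", outputCol="label").fit(features).transform(features)

model = LogisticRegression(maxIter=10, regParam=0.01).fit(data)
print("training accuracy:", model.summary.accuracy)
spark.stop()
```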
{
@@ -154,117 +139,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure ACI target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new runconfig object\n",
"run_config = RunConfiguration()\n",
"\n",
"# signal that you want to use ACI to execute script.\n",
"run_config.target = \"containerinstance\"\n",
"\n",
"# ACI container group is only supported in certain regions, which can be different than the region the Workspace is in.\n",
"run_config.container_instance.region = 'eastus'\n",
"\n",
"# set the ACI CPU and Memory \n",
"run_config.container_instance.cpu_cores = 1\n",
"run_config.container_instance.memory_gb = 2\n",
"\n",
"# enable Docker \n",
"run_config.environment.docker.enabled = True\n",
"\n",
"# set Docker base image to the default CPU-based image\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n",
"print('base image is', run_config.environment.docker.base_image)\n",
"#run_config.environment.docker.base_image = 'microsoft/mmlspark:plus-0.9.9'\n",
"\n",
"# use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
"# please update this file if you need additional packages.\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# auto-prepare the Docker image when used for execution (if it is not already prepared)\n",
"run_config.auto_prepare_environment = True\n",
"\n",
"cd = CondaDependencies()\n",
"# add numpy as a dependency\n",
"cd.add_conda_package('numpy')\n",
"# overwrite the default conda_dependencies.yml file\n",
"cd.save_to_file(base_directory = project_folder, conda_file_path='aml_config/conda_dependencies.yml')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run Spark job in ACI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time \n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.script_run_config import ScriptRunConfig\n",
"\n",
"experiment = Experiment(project_object.workspace_object, project_object.history.name)\n",
"script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Show the run in the web UI\n",
"**IMPORTANT**: Please use Chrome to navigate to the URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import helpers.py\n",
"import helpers\n",
"\n",
"# get the URL of the run history web page\n",
"print(helpers.get_run_history_url(run))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach a remote Linux VM\n",
"To use remote docker commpute target:\n",
" 1. Create a Linux DSVM in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n",
" 2. Enter the IP address, username and password below\n",
" \n",
"**Note**: the below example use port 5022. By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you switch to a different port (such as 5022), you can append the port number to the address like the example below. [Read more](../../documentation/sdk/ssh-issue.md) on this."
"### Attach an HDI cluster\n",
"To use HDI commpute target:\n",
" 1. Create an Spark for HDI cluster in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n",
" 2. Enter the IP address, username and password below"
]
},
{
@@ -273,25 +151,30 @@
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import RemoteCompute\n",
"from azureml.core.compute import HDInsightCompute\n",
"\n",
"try:\n",
" # Attaches a remote docker on a remote vm as a compute target.\n",
" RemoteCompute.attach(workspace,name = \"cpu-dsvm\", username = \"ninghai\", \n",
" address = \"hai2.eastus2.cloudapp.azure.com:5022\", \n",
" ssh-port=22\n",
" password = \"<password>\"))\n",
" # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase\n",
" hdi_compute_new = HDInsightCompute.attach(ws, \n",
" name=\"hdi-attach\", \n",
" address=\"hdi-ignite-demo-ssh.azurehdinsight.net\", \n",
" ssh_port=22, \n",
" username='<username>', \n",
" password='<password>')\n",
"\n",
"except UserErrorException as e:\n",
" print(\"Caught = {}\".format(e.message))\n",
" print(\"Compute config already attached.\")"
" print(\"Compute config already attached.\")\n",
" \n",
" \n",
"hdi_compute_new.wait_for_completion(show_output=True)"
]
},
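Note that the try/except in the cell above references `UserErrorException` without importing it; a self-contained sketch of the same attach step, with the import made explicit (the import path is an assumption about this SDK version), could look like:

```python
# Sketch only: same HDI attach step as above, with the exception import spelled out.
from azureml.core.compute import HDInsightCompute
from azureml.exceptions import UserErrorException  # assumed import path; not shown in the diff

try:
    hdi_compute_new = HDInsightCompute.attach(ws,
                                              name="hdi-attach",
                                              address="hdi-ignite-demo-ssh.azurehdinsight.net",
                                              ssh_port=22,
                                              username="<username>",
                                              password="<password>")
    hdi_compute_new.wait_for_completion(show_output=True)
except UserErrorException as e:
    print("Caught = {}".format(e.message))
    print("Compute config already attached.")
```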
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure a Spark Docker run on the VM\n",
"Execute in the Spark engine in a Docker container in the VM. "
"### Configure HDI run"
]
},
{
@@ -300,107 +183,32 @@
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"\n",
"# Load the \"cpu-dsvm.runconfig\" file (created by the above attach operation) in memory\n",
"run_config = RunConfiguration.load(path = project_folder, name = \"cpu-dsvm\")\n",
"run_config = RunConfiguration(framework = \"python\")\n",
"\n",
"# set framework to PySpark\n",
"run_config.framework = \"PySpark\"\n",
"# Set compute target to the Linux DSVM\n",
"run_config.target = hdi_compute.name\n",
"\n",
"# Use Docker in the remote VM\n",
"run_config.environment.docker.enabled = True\n",
"# run_config.environment.docker.enabled = True\n",
"\n",
"# Use the MMLSpark CPU based image.\n",
"# https://hub.docker.com/r/microsoft/mmlspark/\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n",
"print('base image is:', run_config.environment.docker.base_image)\n",
"# Use CPU base image from DockerHub\n",
"# run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"# print('Base Docker image is:', run_config.environment.docker.base_image)\n",
"\n",
"# signal use the user-managed environment\n",
"# do NOT provision a new one based on the conda.yml file\n",
"# Ask system to provision a new one based on the conda_dependencies.yml file\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# Prepare the Docker and conda environment automatically when execute for the first time.\n",
"run_config.auto_prepare_environment = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit the Experiment\n",
"Submit script to run in the Spark engine in the Docker container in the remote VM."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)\n",
"\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the URL of the run history web page\n",
"print(helpers.get_run_history_url(run))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach an HDI cluster\n",
"To use HDI commpute target:\n",
" 1. Create an Spark for HDI cluster in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n",
" 2. Enter the IP address, username and password below"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import HDInsightCompute\n",
"\n",
"try:\n",
" # Attaches a HDI cluster as a compute target.\n",
" HDInsightCompute.attach(ws, name = \"myhdi\",\n",
" username = \"ninghai\", \n",
" address = \"sparkhai-ssh.azurehdinsight.net\", \n",
" password = \"<pwd>\"))\n",
"except UserErrorException as e:\n",
" print(\"Caught = {}\".format(e.message))\n",
" print(\"Compute config already attached.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure HDI run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operaton above.\n",
"run_config = RunConfiguration.load(path = project_folder, name = 'myhdi')\n",
"# Prepare the Docker and conda environment automatically when executingfor the first time.\n",
"# run_config.prepare_environment = True\n",
"\n",
"# ask system to prepare the conda environment automatically when executed for the first time\n",
"run_config.auto_prepare_environment = True"
"# specify CondaDependencies obj\n",
"# run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n",
"# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operaton above."
]
},
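The submission cell for the new HDI flow falls in the collapsed part of this diff. Mirroring the ScriptRunConfig pattern that appears earlier in the notebook, a sketch of how the configured run_config would typically be submitted (assuming the exp, script_folder and run_config objects created above):

```python
# Sketch only: submit train-spark.py against the HDI run configuration built above.
from azureml.core.script_run_config import ScriptRunConfig

src = ScriptRunConfig(source_directory=script_folder,
                      script="train-spark.py",
                      run_config=run_config)
run = exp.submit(src)
run.wait_for_completion(show_output=True)
```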
{
@@ -448,7 +256,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
@@ -462,7 +270,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.6"
}
},
"nbformat": 4,