In [20]:
import azureml.core

# Report which Azure ML SDK build this kernel is running against.
sdk_version = azureml.core.VERSION
print("Azure SDK version:", sdk_version)

Azure SDK version: 1.0.76


In [21]:
from azureml.core import Workspace

# Load the workspace via Workspace.from_config().
ws = Workspace.from_config()

# Echo the identifying workspace properties, one per line.
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

cesardl-automl-northcentralus-ws
automlpmdemo
northcentralus
102a16c3-37d3-48a8-9237-4c9b1e8e80e0


In [22]:
from azureml.core import Environment

# Environments registered in the workspace; iterating yields the environment names.
envs = Environment.list(workspace=ws)

# Show only the environments whose name starts with "AzureML".
azureml_env_names = [env_name for env_name in envs if env_name.startswith("AzureML")]
for env_name in azureml_env_names:
    print("Name", env_name)
    # To also dump the package list of each one:
    # print("packages", envs[env_name].python.conda_dependencies.serialize_to_string())

# Use curated environment for Spark
curated_environment = Environment.get(
    workspace=ws,
    name="AzureML-PySpark-MmlSpark-0.15",
)

# Save curated environment definition to folder
# (two files: conda_dependencies.yml and azureml_environment.json)
curated_environment.save_to_directory(
    path="./curated_environment_definition",
    overwrite=True,
)

In [23]:
# Derive the custom environment from the curated one.
# NOTE: this binds a second name to the SAME object rather than making a copy,
# so the rename below is also visible through `curated_environment`.
custom_environment_name = "Custom-AzureML-PySpark-Environment"
spark_environment = curated_environment
spark_environment.name = custom_environment_name

# Alternative: create the base Environment from the saved Conda specification
# spark_environment = Environment.from_conda_specification(name="Custom-AzureML-PySpark-Environment", file_path="./curated_environment_definition/conda_dependencies.yml")


In [24]:
from azureml.core import ContainerRegistry

# Registry that hosts the custom base image.
container_registry = ContainerRegistry()
container_registry.address = "mcr.microsoft.com"
# For a private Docker registry such as ACR, also set credentials:
# container_registry.username = ""
# container_registry.password = ""

# Enable Docker and point the environment at the custom base image and registry
# (skip these overrides if the defaults are fine).
docker_settings = spark_environment.docker
docker_settings.enabled = True
docker_settings.base_image = "mcr.microsoft.com/mmlspark/release"
docker_settings.base_image_registry = container_registry

# Save the resulting environment definition to a local folder.
spark_environment.save_to_directory(
    path="./spark_environment_definition",
    overwrite=True,
)

In [25]:
# Register the customised environment in the workspace.
spark_environment.register(ws)

# Re-read the workspace environments and show the ones whose name starts with
# "Custom", together with their serialized conda dependencies.
envs = Environment.list(workspace=ws)

for env_name, registered_env in envs.items():
    if not env_name.startswith("Custom"):
        continue
    print("Environment Name", env_name)
    print("packages", registered_env.python.conda_dependencies.serialize_to_string())

Environment Name Custom-AzureML-PySpark-Environment
packages channels:
- conda-forge
dependencies:
- python=3.6.2
- pip:
  - azureml-core==1.0.81.1
  - azureml-defaults==1.0.81
  - azureml-telemetry==1.0.81.1
  - azureml-train-restclients-hyperdrive==1.0.81
  - azureml-train-core==1.0.81
name: azureml_2d6f32a8b5b445b7627fd1ae36599989



In [26]:
# Create (or attach to) the experiment that the Spark run is submitted under.
from azureml.core import Experiment

experiment_name = 'test-spark-job-on-amlcompute'
experiment = Experiment(ws, experiment_name)

In [32]:
# Stage everything the remote run needs inside a dedicated project directory.
import os
import shutil

project_folder = './project-submit-folder'
os.makedirs(project_folder, exist_ok=True)

# Copy the two job scripts first ...
for script_name in ('spark-job.py', 'spark-job-simple.py'):
    shutil.copy(script_name, project_folder)

# ... and the data file last; kept as the cell's final expression so the
# destination path returned by shutil.copy is still echoed as the cell output.
shutil.copy('iris.csv', project_folder)

'./project-submit-folder/iris.csv'

In [33]:
# Connect to an existing remote AML compute cluster, or create it if it is missing.
# Further docs on Remote Compute Target: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-remote

# Imported here because the "create" branch below needs both names; without this
# import the branch raises NameError when the cluster does not exist yet.
from azureml.core.compute import AmlCompute, ComputeTarget

# Choose a name for your cluster.
amlcompute_cluster_name = "cesardl-cpu-clus"

# Check if this compute target already exists in the workspace as an AmlCompute target.
cts = ws.compute_targets
found = amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute'

if found:
    print('Found existing training cluster.')
    # Get existing cluster
    # Method 1:
    aml_remote_compute = cts[amlcompute_cluster_name]
    # Method 2:
    # aml_remote_compute = ComputeTarget(ws, amlcompute_cluster_name)
else:
    print('Creating a new training cluster...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D13_V2",  # for GPU, use "STANDARD_NC12"
        # vm_priority='lowpriority',  # optional
        max_nodes=20,
    )
    # Create the cluster.
    aml_remote_compute = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)

print('Checking cluster status...')
# Can poll for a minimum number of nodes and for a specific timeout.
# If no min_node_count is provided, it will use the scale settings for the cluster.
aml_remote_compute.wait_for_completion(show_output=True, min_node_count=0, timeout_in_minutes=20)

# For a more detailed view of current AmlCompute status, use get_status().

Found existing training cluster.
Checking cluster status...
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [34]:
# Configure the ScriptRunConfig describing the PySpark job to submit:
# which script to run, on which compute target, and in which environment.
from azureml.core import ScriptRunConfig, RunConfiguration, Experiment
from azureml.core.conda_dependencies import CondaDependencies

## use pyspark framework
# spark_run_config = RunConfiguration(framework="pyspark")
## Set compute target to the cluster
# spark_run_config.target = aml_remote_compute.name

# Specify a CondaDependencies object that asks the system to install numpy.
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments#add-packages-to-an-environment
# `cd` has to be created in this cell: it is assigned to the environment at the
# bottom, and no other visible cell defines it, so a fresh kernel would
# otherwise fail with NameError.
cd = CondaDependencies()
cd.add_conda_package('numpy')

script_runconfig = ScriptRunConfig(source_directory=project_folder,
                                   script="spark-job-simple.py"
                                   # run_config = spark_run_config
                                  )

# Attach compute target to run config
script_runconfig.run_config.target = aml_remote_compute
# script_runconfig.run_config.target = "local"

# Attach environment to run config
script_runconfig.run_config.environment = spark_environment
script_runconfig.run_config.framework="pyspark"
# NOTE: this replaces the conda dependencies of the attached environment with `cd`.
script_runconfig.run_config.environment.python.conda_dependencies = cd

In [35]:
# Submit the configured script run to the experiment.
run = experiment.submit(config=script_runconfig)
# Bare final expression: the notebook displays the run object.
run

Experiment,Id,Type,Status,Details Page,Docs Page
test-spark-job-on-amlcompute,test-spark-job-on-amlcompute_1579975186_f1de880d,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [36]:
# Monitor the submitted run with the RunDetails notebook widget.
from azureml.widgets import RunDetails

run_details_widget = RunDetails(run)
run_details_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'NOTSET',…

In [None]:
# Get all metrics logged in the run and print them.
metrics = run.get_metrics()
print(metrics)

In [None]:
# Register the model produced by the run, taken from the given path in the
# run's outputs, under the model name below.
model = run.register_model(
    model_path='outputs/iris-spark.model',
    model_name='iris-spark.model',
)