In [24]:
import azureml.core

print("Azure SDK version:", azureml.core.VERSION)

Azure SDK version: 1.0.76


In [25]:
from azureml.core import Workspace

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

cesardl-automl-northcentralus-ws
automlpmdemo
northcentralus
102a16c3-37d3-48a8-9237-4c9b1e8e80e0


In [26]:
from azureml.core import Environment

custom_env_name = "Custom-AzureML-PySpark-Environment"
curated_env_name = "AzureML-PySpark-MmlSpark-0.15"

environments = Environment.list(workspace=ws)
found = False
if custom_env_name in environments:
     found = True
     print('Found existing custom environment in Workspace.')
     spark_env = Environment.get(workspace=ws, name=custom_env_name)
     
if not found:
    print('Creating a new custom environment..')    
    # Use curated environment for Spark
    curated_spark_env = Environment.get(workspace=ws, name=curated_env_name)
    curated_spark_env.save_to_directory(path="./curated_environment_definition", overwrite=True)

    # Copy to custom environment
    spark_env = curated_spark_env
    # Need a custom name for a custom environment
    spark_env.name = custom_env_name
    # Enable Docker so it is mandatory
    spark_env.docker.enabled = True

    # Save to local files to investigate definition
    spark_env.save_to_directory(path="./spark_environment_definition", overwrite=True)

    # Register Environment in the Workspace
    spark_env.register(ws)

# List my Spark Environment loaded from the Workspace
envs = Environment.list(workspace=ws)
for env in envs:
    if env.startswith("Custom"):
        print("Environment Name",env)
        print("packages", envs[env].python.conda_dependencies.serialize_to_string())

Found existing custom environment in Workspace.
Environment Name Custom-AzureML-PySpark-Environment
packages channels:
- conda-forge
dependencies:
- python=3.6.2
- pip:
  - azureml-core==1.0.81.1
  - azureml-defaults==1.0.81
  - azureml-telemetry==1.0.81.1
  - azureml-train-restclients-hyperdrive==1.0.81
  - azureml-train-core==1.0.81
name: azureml_2d6f32a8b5b445b7627fd1ae36599989

Environment Name Custom-AzureML-PySpark-Environment-TEST
packages channels:
- conda-forge
dependencies:
- python=3.6.2
- pip:
  - azureml-core==1.0.81.1
  - azureml-defaults==1.0.81
  - azureml-telemetry==1.0.81.1
  - azureml-train-restclients-hyperdrive==1.0.81
  - azureml-train-core==1.0.81
name: azureml_2d6f32a8b5b445b7627fd1ae36599989



In [30]:
from azureml.core import Experiment
experiment_name = 'spark-experiment-poc'
experiment = Experiment(workspace=ws, name=experiment_name)

In [31]:
# Create project directory and copy the training script into the project directory
import os
import shutil

project_folder = './project-submit-folder'
os.makedirs(project_folder, exist_ok=True)

# Copy the needed files
shutil.copy('spark-job.py', project_folder)
shutil.copy('spark-job-simple.py', project_folder)
shutil.copy('iris.csv', project_folder)

'./project-submit-folder/iris.csv'

In [32]:
from azureml.core import ScriptRunConfig, RunConfiguration, Experiment
from azureml.core.conda_dependencies import CondaDependencies

## use pyspark framework
spark_run_config = RunConfiguration(framework="pyspark")
spark_run_config.environment = spark_env

# compute = ws.compute_targets["cpu-cluster"]
# spark_run_config.target = compute
spark_run_config.target = "local"

scriptconfig = ScriptRunConfig(source_directory="project-submit-folder", 
                               script="spark-job-simple.py",
                               run_config = spark_run_config)

In [33]:
run = experiment.submit(scriptconfig)
run

Experiment,Id,Type,Status,Details Page,Docs Page
spark-experiment-poc,spark-experiment-poc_1580174412_29c0b213,azureml.scriptrun,Running,Link to Azure Machine Learning studio,Link to Documentation


In [35]:
# Monitor run

from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'NOTSET',…

In [36]:
# See files associated with the 'Best run'
print(run.get_file_names())

['azureml-logs/60_control_log.txt', 'azureml-logs/70_driver_log.txt', 'logs/azureml/49_azureml.log']


In [37]:
# get all metris logged in the run
metrics = run.get_metrics()
print(metrics)

{'PythonVersion': '3.6.2 | packaged by conda-forge | (default, Jul 23 2017, 22:59:30) \n[GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]', 'SparkVersion': '2.4.0'}
