In [4]:
import os

from azure.identity import DefaultAzureCredential
from azure.ml import MLClient

In [5]:
# Workspace coordinates. Each value may be overridden through an environment
# variable so the notebook can target another workspace without being edited;
# when a variable is unset (or empty) the original value is used.
subscription_id = os.environ.get("AZURE_SUBSCRIPTION_ID") or "15ae9cb6-95c1-483d-a0e3-b1a1a3b06324"
resource_group = os.environ.get("AZURE_RESOURCE_GROUP") or "ray"
workspace = os.environ.get("AZURE_ML_WORKSPACE") or "ray"

# Client handle used by the later cells to submit jobs to the workspace.
ml_client = MLClient(DefaultAzureCredential(), subscription_id, resource_group, workspace)

## Command Job

In [21]:
from azure.ml import command, Input, Output, PyTorchDistribution
from azure.ml.entities import ResourceConfiguration, Environment 
# Data-preparation job definition: invokes startDask.py with prep-nyctaxi.py as
# the script to run, passing the mounted raw dataset and the output folder, on
# 4 instances of the 'daniel-big' compute target.
prep = command(
    code="src",
    command=(
        "python startDask.py "
        "--script prep-nyctaxi.py "
        "--nyc_taxi_dataset ${{inputs.nyc_taxi_dataset}} "
        "--output_folder ${{outputs.output_folder}}"
    ),
    inputs={
        # Raw dataset, mounted read-only.
        "nyc_taxi_dataset": Input(
            path="wasbs://datasets@azuremlexamples.blob.core.windows.net/nyctaxi/",
            mode="ro_mount",
        ),
    },
    outputs={
        "output_folder": Output(type="uri_folder"),
    },
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
        conda_file="conda.yml",
    ),
    compute="daniel-big",
    resources=ResourceConfiguration(instance_count=4),
    # NOTE(review): presumably startDask.py relies on the PyTorch distribution
    # set-up to bootstrap the Dask cluster across the instances - confirm.
    distribution=PyTorchDistribution(),
    experiment_name="dask-nyctaxi-pipeline-example",
    description=(
        "This sample shows how to run a distributed DASK job on AzureML. "
        "The 24GB NYC Taxi dataset is read in CSV format by a 4 node DASK cluster, "
        "processed and then written as job output in parquet format."
    ),
)

In [22]:
# Submit the prep job definition through the workspace client and keep the
# returned job object for later reference.
prep_submitted = ml_client.jobs.create_or_update(prep)
# Bare last expression so the notebook renders the submitted job's summary.
prep_submitted

Experiment,Name,Type,Status,Details Page
dask-nyctaxi-pipeline-example,jovial_oregano_6lvd1jv4gd,command,Starting,Link to Azure Machine Learning studio


In [23]:
# Training job definition: runs train-xgboost.py against the prepared parquet
# data and declares an MLflow model as its output. The scalar inputs below are
# the defaults used when the job is submitted as-is; callers may override them
# by calling `train(...)` with keyword arguments.
train = command(
    code="src",
    command=(
        "python train-xgboost.py "
        "--nyc_taxi_parquet ${{inputs.nyc_taxi_parquet}} "
        "--model ${{outputs.model}} "
        "--tree_method ${{inputs.tree_method}} "
        "--learning_rate ${{inputs.learning_rate}} "
        "--gamma ${{inputs.gamma}} "
        "--max_depth ${{inputs.max_depth}} "
        "--num_boost_round ${{inputs.num_boost_round}} "
    ),
    inputs={
        # NOTE(review): hard-coded data asset reference; it looks like the
        # output of an earlier prep run rather than the one submitted in this
        # notebook - confirm it exists in the target workspace.
        "nyc_taxi_parquet": Input(
            path="azureml:azureml_polite_loquat_c3x4fj4l4m_output_data_output_folder:1",
            mode="ro_mount",
        ),
        "tree_method": "auto",
        "learning_rate": 0.3,
        "gamma": 1,
        "max_depth": 7,
        "num_boost_round": 20,
    },
    outputs={
        "model": Output(type="mlflow_model"),
    },
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
        conda_file="conda.yml",
    ),
    compute="daniel-big",
    experiment_name="dask-nyctaxi-example",
)

In [24]:
# Submit the training job definition (with its default inputs) and keep the
# returned job object.
train_submitted = ml_client.jobs.create_or_update(train)
# Bare last expression so the notebook renders the submitted job's summary.
train_submitted

Experiment,Name,Type,Status,Details Page
dask-nyctaxi-example,gentle_muscle_3hznn2h0yd,command,Starting,Link to Azure Machine Learning studio


## Pipeline Job

In [26]:
from azure.ml.dsl import pipeline

# Two-step pipeline: prepare the supplied dataset, then train on the prepared
# output. `dataset` is forwarded to the prep step's `nyc_taxi_dataset` input,
# and the training step's `model` output is returned as the pipeline's `model`.
# NOTE(review): documented with comments rather than a docstring, and local
# names left untouched, in case the DSL introspects either - confirm.
@pipeline()
def prep_and_train(dataset: Input):
  # Step 1: data preparation on the supplied dataset.
  prep_job = prep(nyc_taxi_dataset=dataset)
  # Step 2: training, fed by the prep step's `output_folder` output.
  # num_boost_round is 12 here, whereas `train` declares 20 as its default.
  train_node = train( nyc_taxi_parquet=prep_job.outputs.output_folder,
                      tree_method='auto',
                      learning_rate= 0.3,
                      gamma= 1,
                      max_depth= 7,
                      num_boost_round= 12)
  return dict(model=train_node.outputs.model)

In [29]:
# Raw dataset location handed to the pipeline's `dataset` parameter.
nyc_raw_data = Input(path= 'wasbs://datasets@azuremlexamples.blob.core.windows.net/nyctaxi/')

# Build the pipeline job by calling the decorated pipeline function.
pipeline_job = prep_and_train(dataset=nyc_raw_data)

# Keep the submitted job object (as the command-job cells do) instead of
# discarding it, so later cells can refer to it; the bare last expression
# still renders the job summary.
pipeline_submitted = ml_client.jobs.create_or_update(pipeline_job)
pipeline_submitted

Experiment,Name,Type,Status,Details Page
nyctaxi,bold_hook_yc7szz0mjw,pipeline,Preparing,Link to Azure Machine Learning studio


## Sweep Job

In [38]:
from azure.ml.sweep import Choice, Uniform, LogUniform
# Prepared dataset used as the training input for every sweep trial.
# NOTE(review): hard-coded data asset reference - confirm it exists in the
# target workspace.
nyc_taxi_processed = Input(
    path="azureml:azureml_polite_loquat_c3x4fj4l4m_output_data_output_folder:1"
)

# Instantiate the training job with search-space expressions in place of fixed
# hyperparameter values; num_boost_round stays fixed at 20.
train_job = train(
    tree_method=Choice(["auto", "exact", "approx", "hist"]),
    learning_rate=Uniform(0, 1),
    gamma=Choice(range(7)),
    max_depth=Choice(range(4, 8)),
    num_boost_round=20,
    nyc_taxi_parquet=nyc_taxi_processed,
)

In [41]:
from azure.ml.sweep import BayesianSamplingAlgorithm

# Derive a sweep job from the parameterized training job: minimize the
# 'test-rmse' metric using Bayesian sampling on the 'daniel-big' compute.
sweep_job = train_job.sweep(
    primary_metric="test-rmse",
    goal="minimize",
    sampling_algorithm=BayesianSamplingAlgorithm(),
    compute="daniel-big",
)

# Cap the sweep at 25 trials in total, with at most 5 running concurrently.
sweep_job.set_limits(max_concurrent_trials=5, max_total_trials=25)

# NOTE(review): presumably clears a name carried over from `train_job` so the
# service assigns a fresh one on submission - confirm.
sweep_job.name = None

In [42]:
# Keep the submitted sweep job object (as the command-job cells do) instead of
# discarding it; the bare last expression still renders the job summary.
sweep_submitted = ml_client.jobs.create_or_update(sweep_job)
sweep_submitted

Experiment,Name,Type,Status,Details Page
nyctaxi,bright_grape_l8ycb35m8g,sweep,Running,Link to Azure Machine Learning studio
