In [1]:
from azure.ml import MLClient
from azure.identity import DefaultAzureCredential

In [2]:
import os

# Workspace coordinates. Each value can be overridden through an environment
# variable so the notebook can be pointed at a different workspace without
# editing (or committing) identifiers. The fallbacks are the values that were
# originally hard-coded here, so behaviour is unchanged when the variables are
# not set.
subscription_id = os.environ.get("AZURE_SUBSCRIPTION_ID", "15ae9cb6-95c1-483d-a0e3-b1a1a3b06324")
resource_group = os.environ.get("AZURE_RESOURCE_GROUP", "ray")
workspace = os.environ.get("AZUREML_WORKSPACE_NAME", "ray")

# Client handle used by the later cells to submit jobs. Authentication is
# delegated to DefaultAzureCredential; no secret is stored in the notebook.
ml_client = MLClient(DefaultAzureCredential(), subscription_id, resource_group, workspace)

## Command Job

In [3]:
from azure.ml import command, Input, Output, PyTorchDistribution
from azure.ml.entities import ResourceConfiguration, Environment 
# Data-preparation step: runs `startDask.py` with `prep-nyctaxi.py` as the
# script, reading the mounted NYC taxi dataset and writing to a folder output.
# NOTE(review): PyTorchDistribution is requested even though the workload is
# Dask - presumably to get one launcher process per node for startDask.py;
# confirm against src/startDask.py.
prep = command(
    experiment_name="dask-nyctaxi-pipeline-example",
    description="This sample shows how to run a distributed DASK job on AzureML. The 24GB NYC Taxi dataset is read in CSV format by a 4 node DASK cluster, processed and then written as job output in parquet format.",
    code="src",
    command=(
        "python startDask.py "
        "--script prep-nyctaxi.py "
        "--nyc_taxi_dataset ${{inputs.nyc_taxi_dataset}} "
        "--output_folder ${{outputs.output_folder}}"
    ),
    inputs=dict(
        nyc_taxi_dataset=Input(
            path="wasbs://datasets@azuremlexamples.blob.core.windows.net/nyctaxi/",
            mode="ro_mount",
        ),
    ),
    outputs=dict(
        output_folder=Output(type="uri_folder"),
    ),
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
        conda_file="conda.yml",
    ),
    compute="daniel-big",
    # Four instances, matching the "4 node" cluster named in the description.
    resources=ResourceConfiguration(instance_count=4),
    distribution=PyTorchDistribution(),
)

In [4]:
# Submit the data-preparation command through the workspace client and keep
# the object that create_or_update returns.
prep_submitted = ml_client.jobs.create_or_update(prep)
# Bare name as the cell's last expression so the notebook renders it.
prep_submitted

Experiment,Name,Type,Status,Details Page
dask-nyctaxi-pipeline-example,orange_parang_lwn6p3hn2t,command,Starting,Link to Azure Machine Learning studio


In [8]:
# Training step: runs `train-xgboost.py` on the mounted parquet data. The
# literal values under `inputs` are what the step uses unless a caller passes
# something else when invoking `train(...)`.
train = command(
    experiment_name="dask-nyctaxi-example",
    compute="daniel-big",
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
        conda_file="conda.yml",
    ),
    code="src",
    command=(
        "python train-xgboost.py "
        "--nyc_taxi_parquet ${{inputs.nyc_taxi_parquet}} "
        "--model ${{outputs.model}} "
        "--tree_method ${{inputs.tree_method}} "
        "--learning_rate ${{inputs.learning_rate}} "
        "--gamma ${{inputs.gamma}} "
        "--max_depth ${{inputs.max_depth}} "
        "--num_boost_round ${{inputs.num_boost_round}} "
    ),
    inputs=dict(
        # Registered data asset; the name looks like the output of an earlier
        # job run, so it is workspace-specific.
        nyc_taxi_parquet=Input(
            path="azureml:azureml_polite_loquat_c3x4fj4l4m_output_data_output_folder:1",
            mode="ro_mount",
        ),
        tree_method="auto",
        learning_rate=0.3,
        gamma=1,
        max_depth=7,
        num_boost_round=20,
    ),
    outputs=dict(
        model=Output(type="mlflow_model"),
    ),
)

In [9]:
from azure.ml import command, Input
from azure.ml.entities import Environment

# Inspection step: mounts a model asset and runs `find` on the mount point,
# which lists the files the model contains.
test = command(
    experiment_name="dask-nyctaxi-example",
    compute="daniel-big",
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    ),
    code="src",
    command=("find ${{inputs.model}} "),
    inputs=dict(
        model=Input(
            path="azureml:azureml_yellow_nail_h3t6w6r57k_output_model:1",
            mode="ro_mount",
        ),
    ),
)

## Pipeline Job

In [12]:
from azure.ml.dsl import pipeline

# Three-step pipeline: prepare the raw dataset, train on the prepared output,
# then run the `test` step against the trained model.
#
# dataset: pipeline-level Input that is forwarded to the preparation step.
@pipeline()
def prep_train_test(dataset: Input):
  # Step 1: hand the pipeline input to the data-preparation command.
  prep_job = prep(nyc_taxi_dataset=dataset)
  # Step 2: train on the folder produced by step 1. Hyperparameters are fixed
  # here; num_boost_round is 12, not the 20 declared on the `train` command.
  train_job = train(nyc_taxi_parquet=prep_job.outputs.output_folder,
                    tree_method='auto',
                    learning_rate= 0.3,
                    gamma= 1,
                    max_depth= 7,
                    num_boost_round= 12)
  # Step 3: pass the trained model to the `test` command.
  test_job = test(model=train_job.outputs.model)
  # An empty dict is returned - presumably meaning the pipeline exposes no
  # outputs of its own; confirm against the DSL documentation.
  return {}

In [13]:
# Raw NYC taxi data addressed by its blob-storage (wasbs) URI; this becomes
# the `dataset` argument of the pipeline.
nyc_raw_data = Input(
    path="wasbs://datasets@azuremlexamples.blob.core.windows.net/nyctaxi/",
)

# Calling the pipeline function builds the job object; nothing is submitted yet.
pipeline_job = prep_train_test(dataset=nyc_raw_data)

# Submit. Left as the cell's last expression so the returned job is rendered.
ml_client.jobs.create_or_update(pipeline_job)

Experiment,Name,Type,Status,Details Page
nyctaxi,coral_wall_26t0mzd9tw,pipeline,Preparing,Link to Azure Machine Learning studio


## Sweep Job

In [18]:
from azure.ml.dsl import pipeline
from azure.ml.sweep import Choice, Uniform

# Pipeline that prepares the raw dataset, sweeps the training step over a
# hyperparameter search space, and runs the `test` step on the sweep's model
# output.
#
# dataset: pipeline-level Input that is forwarded to the preparation step.
@pipeline()
def prep_sweep_test(dataset: Input):
  # Step 1: data preparation on the pipeline input.
  prep_job = prep(nyc_taxi_dataset=dataset)

  # Step 2: training step whose hyperparameters are search-space expressions
  # rather than fixed values: gamma ranges over 0..6 and max_depth over 4..7.
  # num_boost_round stays fixed at 20.
  train_job = train(nyc_taxi_parquet=prep_job.outputs.output_folder,
                    tree_method=Choice(['auto', 'exact', 'approx', 'hist']),
                    learning_rate=Uniform(0, 1),
                    gamma= Choice(range(7)),
                    max_depth= Choice(range(4,8)),
                    num_boost_round= 20,)

  # Turn the parameterised step into a sweep that minimises 'test-rmse'.
  sweep_job = train_job.sweep(primary_metric='test-rmse',
                              goal='minimize',
                              sampling_algorithm='bayesian',
                              compute='daniel-big')

  # Limits: 25 trials in total, at most 5 at the same time.
  sweep_job.set_limits(max_concurrent_trials=5,
                      max_total_trials=25)

  # Step 3: run `test` on the sweep's `model` output.
  # NOTE(review): presumably this is the best trial's model - confirm.
  test_job = test(model=sweep_job.outputs.model)
  # Empty dict returned; no pipeline-level outputs are named here.
  return {}

In [19]:
# Raw NYC taxi data addressed by its blob-storage (wasbs) URI; this becomes
# the `dataset` argument of the pipeline.
nyc_raw_data = Input(
    path="wasbs://datasets@azuremlexamples.blob.core.windows.net/nyctaxi/",
)

# Calling the pipeline function builds the job object; nothing is submitted yet.
pipeline_job = prep_sweep_test(dataset=nyc_raw_data)

# Submit. Left as the cell's last expression so the returned job is rendered.
ml_client.jobs.create_or_update(pipeline_job)

Experiment,Name,Type,Status,Details Page
nyctaxi,sad_candle_bmxm6nnv0w,pipeline,Preparing,Link to Azure Machine Learning studio


## Standalone Sweep Job

In [38]:
from azure.ml.sweep import Choice, Uniform, LogUniform

# Parameterise the training command with a search space instead of fixed
# hyperparameters. `nyc_taxi_parquet` is not passed, so the value declared on
# the `train` command itself is used.
train_job = train(
    tree_method=Choice(["auto", "exact", "approx", "hist"]),
    learning_rate=Uniform(0, 1),
    gamma=Choice(range(7)),          # candidates 0..6
    max_depth=Choice(range(4, 8)),   # candidates 4..7
    num_boost_round=20,
)

In [41]:
from azure.ml.sweep import BayesianSamplingAlgorithm

# Convert the parameterised training step into a sweep that minimises the
# 'test-rmse' metric using the Bayesian sampling algorithm object.
sweep_job = train_job.sweep(
    primary_metric="test-rmse",
    goal="minimize",
    sampling_algorithm=BayesianSamplingAlgorithm(),
    compute="daniel-big",
)

# Limits: 25 trials in total, at most 5 at the same time.
sweep_job.set_limits(max_total_trials=25, max_concurrent_trials=5)

# Last expression of the cell: show the sweep's `model` output.
sweep_job.outputs.model

In [42]:
# Submit the standalone sweep; as the cell's last expression, the returned job
# is rendered by the notebook.
ml_client.jobs.create_or_update(sweep_job)

Experiment,Name,Type,Status,Details Page
nyctaxi,bright_grape_l8ycb35m8g,sweep,Running,Link to Azure Machine Learning studio


In [14]:
from azure.ml.dsl import pipeline
from azure.ml.sweep import Choice, Uniform

# Pipeline containing only a hyperparameter sweep over the training step. The
# sweep's model is returned under the key 'model'.
#
# parquet_dataset: pipeline-level Input passed to the training step as
# `nyc_taxi_parquet`.
@pipeline()
def just_sweep(parquet_dataset: Input):

  # Training step with search-space expressions for the tunable
  # hyperparameters: gamma ranges over 0..6 and max_depth over 4..7.
  # num_boost_round is fixed at 20.
  train_job = train(nyc_taxi_parquet=parquet_dataset,
                    tree_method=Choice(['auto', 'exact', 'approx', 'hist']),
                    learning_rate=Uniform(0, 1),
                    gamma= Choice(range(7)),
                    max_depth= Choice(range(4,8)),
                    num_boost_round= 20,)

  # Turn the parameterised step into a sweep that minimises 'test-rmse'.
  sweep_job = train_job.sweep(primary_metric='test-rmse',
                              goal='minimize',
                              sampling_algorithm='bayesian',
                              compute='daniel-big')

  # Limits: 25 trials in total, at most 5 at the same time.
  sweep_job.set_limits(max_concurrent_trials=5,
                      max_total_trials=25)

  
  # Unlike the other pipelines in this notebook, this one returns a non-empty
  # mapping: the sweep's `model` output under the key 'model'.
  return dict(model=sweep_job.outputs.model)


# Registered parquet data asset used as the pipeline's `parquet_dataset`
# argument. The asset name is workspace-specific.
nyc_parquet = Input(
    path="azureml:azureml_polite_loquat_c3x4fj4l4m_output_data_output_folder:1",
)

# Calling the pipeline function builds the job object; nothing is submitted yet.
pipeline_job = just_sweep(parquet_dataset=nyc_parquet)

# Submit. Left as the cell's last expression so the returned job is rendered.
ml_client.jobs.create_or_update(pipeline_job)

Experiment,Name,Type,Status,Details Page
nyctaxi,happy_bread_2qlqn224rp,pipeline,Preparing,Link to Azure Machine Learning studio


## Test Sweep Output

In [3]:
from azure.ml import command, Input
from azure.ml.entities import Environment

# Inspection step: mounts a model asset and runs `find` on the mount point,
# which lists the files the model contains. This re-declares `test` with the
# same arguments as the earlier definition in this notebook.
test = command(
    experiment_name="dask-nyctaxi-example",
    compute="daniel-big",
    environment=Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04",
    ),
    code="src",
    command=("find ${{inputs.model}} "),
    inputs=dict(
        model=Input(
            path="azureml:azureml_yellow_nail_h3t6w6r57k_output_model:1",
            mode="ro_mount",
        ),
    ),
)

In [4]:
# Submit the `test` command on its own, using the model asset declared in its
# inputs. As the cell's last expression, the returned job is rendered.
ml_client.jobs.create_or_update(test)

Experiment,Name,Type,Status,Details Page
dask-nyctaxi-example,affable_oxygen_y7nm88zsld,command,Starting,Link to Azure Machine Learning studio


In [14]:
from azure.ml.dsl import pipeline
from azure.ml.sweep import Choice, Uniform

# Pipeline that sweeps the training step over a hyperparameter search space
# and then runs the `test` step on the sweep's model output.
#
# parquet_dataset: pipeline-level Input passed to the training step as
# `nyc_taxi_parquet`.
@pipeline()
def sweep_and_test(parquet_dataset: Input):

  # Training step with search-space expressions for the tunable
  # hyperparameters: gamma ranges over 0..6 and max_depth over 4..7.
  # num_boost_round is fixed at 20.
  train_job = train(nyc_taxi_parquet=parquet_dataset,
                    tree_method=Choice(['auto', 'exact', 'approx', 'hist']),
                    learning_rate=Uniform(0, 1),
                    gamma= Choice(range(7)),
                    max_depth= Choice(range(4,8)),
                    num_boost_round= 20,)

  # Turn the parameterised step into a sweep that minimises 'test-rmse'.
  sweep_job = train_job.sweep(primary_metric='test-rmse',
                              goal='minimize',
                              sampling_algorithm='bayesian',
                              compute='daniel-big')

  # Limits: 25 trials in total, at most 5 at the same time.
  sweep_job.set_limits(max_concurrent_trials=5,
                      max_total_trials=25)

  # Run `test` on the sweep's `model` output.
  # NOTE(review): presumably this is the best trial's model - confirm.
  test_job = test(model=sweep_job.outputs.model)
  
  # Empty dict returned; no pipeline-level outputs are named here.
  return dict()


# Registered parquet data asset used as the pipeline's `parquet_dataset`
# argument. The asset name is workspace-specific.
nyc_parquet = Input(
    path="azureml:azureml_polite_loquat_c3x4fj4l4m_output_data_output_folder:1",
)

# Calling the pipeline function builds the job object; nothing is submitted yet.
pipeline_job = sweep_and_test(parquet_dataset=nyc_parquet)

# Submit. Left as the cell's last expression so the returned job is rendered.
ml_client.jobs.create_or_update(pipeline_job)

Experiment,Name,Type,Status,Details Page
nyctaxi,sharp_cassava_5d3nv4qkmd,pipeline,Preparing,Link to Azure Machine Learning studio


In [1]:
import pandas as pd 

# Small single-column frame (column 'a' holding 1, 2, 3) used as scratch data.
df = pd.DataFrame(data={"a": [1, 2, 3]})

In [2]:
# Bare name as the cell's only expression: the notebook renders the frame.
df

Unnamed: 0,a
0,1
1,2
2,3
