
# Prerequisites

In [None]:

#import required libraries
from azure.ml import MLClient
from azure.ml.entities import Code, Dataset
from azure.identity import InteractiveBrowserCredential

In [None]:

# Replace each placeholder with the details of your Azure ML (AML) workspace.
subscription_id = "<SUBSCRIPTION_ID>"
resource_group = "<RESOURCE_GROUP>"
workspace = "<AML_WORKSPACE_NAME>"

In [None]:
# Get a handle to the workspace, authenticating through an interactive browser login.
ml_client = MLClient(
    InteractiveBrowserCredential(),
    subscription_id,
    resource_group,
    workspace,
)

# Basic pipeline job

## Build pipeline

In [None]:
from azure.ml import dsl
from azure.ml.dsl import Pipeline
from pathlib import Path

parent_dir = './basic/3a_basic_pipeline'


def generate_dsl_pipeline(component_dir: str = parent_dir) -> Pipeline:
    """Build the basic pipeline job from three component YAML files.

    Args:
        component_dir: Directory that contains ``componentA.yml``,
            ``componentB.yml`` and ``componentC.yml``. The default is captured
            when this cell runs, so a later cell that rebinds the notebook-level
            ``parent_dir`` does not change which components are loaded here.

    Returns:
        The object produced by calling the ``@dsl.pipeline``-decorated
        ``sample_pipeline`` function.
    """
    # 1. Load component funcs
    a_func = dsl.load_component(yaml_file=component_dir + "/componentA.yml")
    b_func = dsl.load_component(yaml_file=component_dir + "/componentB.yml")
    c_func = dsl.load_component(yaml_file=component_dir + "/componentC.yml")

    # 2. Construct pipeline
    @dsl.pipeline(
        compute="cpu-cluster",
        description="Basic Pipeline Job with 3 Hello World components",
    )
    def sample_pipeline():
        # The three steps take no inputs and are not wired to each other.
        # NOTE(review): the local names are never read afterwards; they are kept
        # because the DSL presumably derives node names from them -- confirm
        # before removing or renaming.
        componentA_job = a_func()
        componentB_job = b_func()
        componentC_job = c_func()

    pipeline = sample_pipeline()
    return pipeline

## Submit pipeline job

In [None]:
# Build the pipeline job object locally.
pipeline = generate_dsl_pipeline()

# Send the job to the workspace under the "basic_pipeline" experiment.
ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="basic_pipeline",
    continue_run_on_step_failure=True,
)

# Pipeline job with registered components
## Register components

In [None]:
from azure.ml.entities import CommandComponent
# Sample folder and environment name reused by the component definitions in this section.
parent_dir = "./basic/1b_e2e_registered_components"
environment = "AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:5"

# Training step: runs train.py from train_src with the inputs/outputs declared below.
train_component = CommandComponent(
    name="Train",
    version="32",
    inputs={
        "training_data": {"type": "path"},
        "max_epocs": {"type": "integer"},
        "learning_rate": {"type": "number", "default": 0.01},
        "learning_rate_schedule": {"type": "string", "default": "time-based"},
    },
    outputs={
        "model_output": {"type": "path"},
    },
    code=Code(local_path=parent_dir + "/train_src"),
    environment=environment,
    # One flag per declared input/output; the pieces concatenate into a single command line.
    command=(
        "python train.py"
        " --training_data ${{inputs.training_data}}"
        " --max_epocs ${{inputs.max_epocs}}"
        " --learning_rate ${{inputs.learning_rate}}"
        " --learning_rate_schedule ${{inputs.learning_rate_schedule}}"
        " --model_output ${{outputs.model_output}} "
    ),
)

# Register (or update) the component in the workspace.
ml_client.components.create_or_update(train_component)


In [None]:
# Scoring step: runs score.py from score_src on a model and a test data path.
score_component = CommandComponent(
    # NOTE: the name was changed temporarily because a component with the same
    # name already exists with different code.
    name="Score",
    version="32",
    inputs={
        "model_input": {"type": "path"},
        "test_data": {"type": "path"},
    },
    outputs={
        "score_output": {"type": "path"},
    },
    code=Code(local_path=parent_dir + "/score_src"),
    environment=environment,
    # One flag per declared input/output; the pieces concatenate into a single command line.
    command=(
        "python score.py"
        " --model_input ${{inputs.model_input}}"
        " --test_data ${{inputs.test_data}}"
        " --score_output ${{outputs.score_output}} "
    ),
)

# Register (or update) the component in the workspace.
ml_client.components.create_or_update(score_component)

In [None]:
# Evaluation step: runs eval.py from eval_src on the scoring result.
eval_component = CommandComponent(
    name="Eval",
    version="32",
    inputs={
        "scoring_result": {"type": "path"},
    },
    outputs={
        "eval_output": {"type": "path"},
    },
    code=Code(local_path=parent_dir + "/eval_src"),
    environment=environment,
    # One flag per declared input/output; the pieces concatenate into a single command line.
    command=(
        "python eval.py"
        " --scoring_result ${{inputs.scoring_result}}"
        " --eval_output ${{outputs.eval_output}}"
    ),
)

# Register (or update) the component in the workspace.
ml_client.components.create_or_update(eval_component)

## Build pipeline

In [None]:
from azure.ml import dsl, MLClient
from azure.ml.dsl import Pipeline
from azure.ml.entities import Component as ComponentEntity, Dataset
from pathlib import Path

def generate_dsl_pipeline(
        client: MLClient,
        pipeline_samples_e2e_registered_train_components: ComponentEntity,
        pipeline_samples_e2e_registered_score_components: ComponentEntity,
        pipeline_samples_e2e_registered_eval_components: ComponentEntity,
    ) -> Pipeline:
    """Assemble the train -> score -> eval pipeline job from registered components.

    Args:
        client: Client handed to ``dsl.load_component`` for each lookup.
        pipeline_samples_e2e_registered_train_components: Entity whose ``name``
            and ``version`` identify the training component.
        pipeline_samples_e2e_registered_score_components: Entity whose ``name``
            and ``version`` identify the scoring component.
        pipeline_samples_e2e_registered_eval_components: Entity whose ``name``
            and ``version`` identify the evaluation component.

    Returns:
        The result of calling the decorated ``sample_pipeline`` with the sample
        data folder under the notebook-level ``parent_dir`` and fixed
        hyperparameters, with its three pipeline outputs set to ``"upload"`` mode.
    """

    def _load_registered(component):
        # Look up a component function by the name/version of the given entity.
        return dsl.load_component(
            client=client,
            name=component.name,
            version=component.version,
        )

    # 1. Load component funcs
    train_func = _load_registered(pipeline_samples_e2e_registered_train_components)
    score_func = _load_registered(pipeline_samples_e2e_registered_score_components)
    eval_func = _load_registered(pipeline_samples_e2e_registered_eval_components)

    # 2. Construct pipeline
    @dsl.pipeline(
        compute="cpu-cluster",
        description="E2E dummy train-score-eval pipeline with registered components",
    )
    def sample_pipeline(
            pipeline_job_training_input,
            pipeline_job_test_input,
            pipeline_job_training_max_epocs,
            pipeline_job_training_learning_rate,
            pipeline_job_learning_rate_schedule,
    ):
        # Train on the training input with the pipeline-level hyperparameters.
        train_job = train_func(
            training_data=pipeline_job_training_input,
            max_epocs=pipeline_job_training_max_epocs,
            learning_rate=pipeline_job_training_learning_rate,
            learning_rate_schedule=pipeline_job_learning_rate_schedule,
        )

        # Score the test input with the model produced by the training step.
        score_job = score_func(
            model_input=train_job.outputs.model_output,
            test_data=pipeline_job_test_input,
        )
        score_job.outputs.score_output.mode = "upload"

        # Evaluate the scoring result.
        evaluate_job = eval_func(scoring_result=score_job.outputs.score_output)

        # Expose each step's output as a named pipeline-level output.
        return {
            "pipeline_job_trained_model": train_job.outputs.model_output,
            "pipeline_job_scored_data": score_job.outputs.score_output,
            "pipeline_job_evaluation_report": evaluate_job.outputs.eval_output,
        }

    # The same local sample folder is used for both the training and the test input.
    data_path = parent_dir + "/data/"
    training_input = Dataset(local_path=data_path)
    test_input = Dataset(local_path=data_path)

    # Positional order: training input, test input, max epochs, learning rate, schedule.
    pipeline_job = sample_pipeline(training_input, test_input, 20, 1.8, "time-based")

    # Switch every pipeline-level output to "upload" mode.
    for output_name in (
        "pipeline_job_trained_model",
        "pipeline_job_scored_data",
        "pipeline_job_evaluation_report",
    ):
        getattr(pipeline_job.outputs, output_name).mode = "upload"

    return pipeline_job

## Submit pipeline job

In [None]:
# Build the pipeline job from the three components registered above.
pipeline = generate_dsl_pipeline(
    ml_client,
    train_component,
    score_component,
    eval_component,
)

# Send the job to the workspace under the "e2e_registered_components" experiment.
ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="e2e_registered_components",
    continue_run_on_step_failure=True,
)

# Try other samples
You can find more pipeline DSL examples under the `pipelines-with-components` folder. To try another example, follow these steps:
- Import pipeline func
- Submit pipeline job

Let's take `nyc_taxi_data_regression` as an example.

## Import pipeline func

In [None]:
from nyc_taxi_data_regression.pipeline import generate_dsl_pipeline

## Submit pipeline job

In [None]:
# The code below is commented out because it fails in automated tests; uncomment it to run manually.

# create pipeline instance
# pipeline = generate_dsl_pipeline()

# submit job to workspace
# ml_client.jobs.create_or_update(pipeline, experiment_name="nyc_taxi_data_regression", continue_run_on_step_failure=True)