## 1. Connecting to the Azure ML workspace

In [1]:
from azure.ai.ml import MLClient

# Authentication package
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# Pick the credential used for the workspace connection: the default
# credential when it can obtain a token, the interactive browser one otherwise.
try:
    default_credential = DefaultAzureCredential()
    # Probe the credential by requesting a token for the Azure management scope.
    default_credential.get_token("https://management.azure.com/.default")
except Exception:
    # Construction or the token probe failed: fall back to the browser login.
    credential = InteractiveBrowserCredential()
else:
    credential = default_credential

# Workspace coordinates, grouped so they are easy to locate and change.
workspace_scope = {
    "subscription_id": "59b214a7-65cd-41ae-b95a-6d56d0039476",
    "resource_group_name": "disease-detection-m-dep-res",
    "workspace_name": "disease-detection-workspace",
}

# Client handle that the following cells use for every workspace operation.
ml_client = MLClient(credential=credential, **workspace_scope)

# Create a job environment for pipeline steps

In [2]:
from azure.ai.ml.entities import Environment
import os

custom_env_name = "aml-deploynment-demo"

# Describe the environment (base image + conda file) and register it in one
# step; `pipeline_job_env` ends up holding the object returned by the workspace.
pipeline_job_env = ml_client.environments.create_or_update(
    Environment(
        name=custom_env_name,
        version="0.1.1",
        description="Custom environment for demo pipeline",
        image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
        conda_file=os.path.join("./dependencies", "env_config.yaml"),
        tags={"scikit-learn": "0.24.2"},
    )
)

# Report the name and version read back from the registered environment.
registration_message = (
    f"Environment with name {pipeline_job_env.name} is registered to workspace, "
    f"the environment version is {pipeline_job_env.version}"
)
print(registration_message)

Environment with name aml-deploynment-demo is registered to workspace, the environment version is 0.1.1


## Creating ML Components for Our Pipeline

### Create component 1: data prep (using programmatic definition)

In [3]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Define the data-preparation step programmatically. The command line maps the
# declared inputs and outputs onto the arguments of preprocessing.py.
data_preprocessing_component = command(
    name="data_preprocessing_component",
    display_name="Data preparation for training",
    description="reads a .xl input, split the input to train and test",
    # The source folder of the component
    code="./components/data_preprocessing",
    command="""python preprocessing.py \
            --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
    # Run inside the environment registered earlier, pinned as "<name>:<version>"
    environment="{}:{}".format(pipeline_job_env.name, pipeline_job_env.version),
    inputs=dict(
        data=Input(type="uri_folder"),
        test_train_ratio=Input(type="number"),
    ),
    outputs={
        "train_data": Output(type="uri_folder", mode="rw_mount"),
        "test_data": Output(type="uri_folder", mode="rw_mount"),
    },
)

# Register the component in the workspace; from here on the name refers to the
# object returned by the workspace rather than the local command definition.
data_preprocessing_component = ml_client.create_or_update(
    data_preprocessing_component.component
)

# Confirm the registration using the name and version of the returned object
registered_summary = (
    f"Component {data_preprocessing_component.name} "
    f"with Version {data_preprocessing_component.version} is registered"
)
print(registered_summary)

[32mUploading data_preprocessing (0.0 MBs):   0%|          | 0/1861 [00:00<?, ?it/s][32mUploading data_preprocessing (0.0 MBs): 100%|██████████| 1861/1861 [00:00<00:00, 40680.23it/s]
[39m



Component data_preprocessing_component with Version 2024-08-24-08-19-15-1754573 is registered


### Create component 2: training (using yaml definition)

- The second component that we'll create will consume the training and test data.
- In the training script, after the model is trained, the model file is saved and registered to the workspace, so the registered model can be used in inferencing endpoints.

We create the training script and its YAML file inside the `components/training` directory.

**create your component using load_component()**

In [4]:
# importing the Component Package
from azure.ai.ml import load_component

# Build the training component from its YAML definition on disk
training_yaml_path = os.path.join("./components/training", "training.yaml")
training_component = load_component(source=training_yaml_path)

**Now create and register the component:**

In [5]:
# Register the loaded training component in the workspace and keep the
# returned object under the same name.
training_component = ml_client.create_or_update(training_component)

# Confirm the registration using the name and version of the returned object
registered_summary = (
    f"Component {training_component.name} "
    f"with Version {training_component.version} is registered"
)
print(registered_summary)

[32mUploading training (0.0 MBs):   0%|          | 0/4328 [00:00<?, ?it/s][32mUploading training (0.0 MBs): 100%|██████████| 4328/4328 [00:00<00:00, 91794.21it/s]
[39m



Component train_credit_defaults_model with Version 2024-08-24-08-19-20-8179980 is registered


## Create the pipeline from components

Here we take the input data, call the components, and connect them through their input/output identifiers.

To code the pipeline, we use a specific `@dsl.pipeline` decorator that identifies the Azure ML pipelines. In the decorator, we can specify the pipeline description and default resources like compute (serverless is used here) and storage. Like a Python function, a pipeline can have inputs, so you can create multiple instances of a single pipeline with different inputs.

In [6]:
# The dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output


# Two-step pipeline: run the data-preprocessing component, then feed its
# train/test outputs into the training component.
#
# Inputs:
#   pipeline_job_data_input            - forwarded as `data` to the preprocessing step
#   pipeline_job_test_train_ratio      - forwarded as `test_train_ratio` to the preprocessing step
#   pipeline_job_n_estimators          - forwarded as `n_estimators` to the training step
#   pipeline_job_registered_model_name - forwarded as `registered_model_name` to the training step
#
# Returns a dict exposing the preprocessing step's train/test outputs as
# pipeline outputs.
#
# NOTE(review): the step variable names (`data_preprocessing_job`,
# `training_job`) are presumably what the SDK uses as node names in the
# submitted job — confirm before renaming them.
@dsl.pipeline(
    compute="serverless",
    description="E2E data_perp-train pipeline",
)
def disease_random_forest_pipeline(
    pipeline_job_data_input,
    pipeline_job_test_train_ratio,
    pipeline_job_n_estimators,
    pipeline_job_registered_model_name,
):
    # call the registered preprocessing component like a python function with its own inputs
    data_preprocessing_job = data_preprocessing_component(
        data=pipeline_job_data_input,
        test_train_ratio=pipeline_job_test_train_ratio,
    )

    # call the training component like a python function with its own inputs;
    # `training_job` is not referenced again in this function
    training_job = training_component(
        train_data=data_preprocessing_job.outputs.train_data,  # note: using outputs from previous step
        test_data=data_preprocessing_job.outputs.test_data,  # note: using outputs from previous step
        n_estimators=pipeline_job_n_estimators,  # note: using a pipeline input as parameter
        registered_model_name=pipeline_job_registered_model_name,
    )

    # a pipeline returns a dictionary of outputs;
    # each key is the identifier of a pipeline-level output
    return {
        "pipeline_job_train_data": data_preprocessing_job.outputs.train_data,
        "pipeline_job_test_data": data_preprocessing_job.outputs.test_data,
    }

**Use our pipeline definition to instantiate a pipeline with your dataset**

In [7]:
from azure.ai.ml.constants import AssetTypes, InputOutputModes
# Fetch version "1" of the "disease" data asset from the workspace
data_asset = ml_client.data.get("disease", version="1")

registered_model_name = "disease_random_forest_model"

# Wrap the data asset as a read-only mounted pipeline input.
# NOTE(review): URI_FILE is passed here, while the `data` input of
# data_preprocessing_component is declared as "uri_folder" earlier in this
# notebook — confirm which type matches the "disease" asset.
disease_data_input = Input(
    path=data_asset.id,
    type=AssetTypes.URI_FILE,
    mode=InputOutputModes.RO_MOUNT,
)

# Instantiate the pipeline with the parameters of our choice
pipeline = disease_random_forest_pipeline(
    pipeline_job_data_input=disease_data_input,
    pipeline_job_test_train_ratio=0.15,
    pipeline_job_n_estimators=10,
    pipeline_job_registered_model_name=registered_model_name,
)

## Submit the job 

In [8]:
# Submit the instantiated pipeline under the given experiment name
pipeline_job = ml_client.jobs.create_or_update(
    pipeline, experiment_name="e2e_registered_components_pipeline"
)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
pathOnCompute is not a known attribute