In [9]:
from azure.ai.ml import load_component
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import MLClient, Input
from azure.identity import DefaultAzureCredential, EnvironmentCredential
from azure.ai.ml.entities import AmlCompute

import pandas as pd

Definimos el cluster donde correrá el pipeline

In [10]:
def get_comput_target(ml_client, name="cpu-cluster", family='Standard_E4s_v3'):
    """Return the compute cluster called ``name``, creating it when the lookup fails.

    Args:
        ml_client: Workspace client exposing ``compute.get`` and
            ``compute.begin_create_or_update``.
        name: Name of the compute target to look up or create.
        family: VM size used only when a new cluster has to be created.

    Returns:
        The object returned by ``ml_client.compute.get(name)`` when the lookup
        succeeds; otherwise the result of the create-or-update operation.
    """
    try:
        # Reuse the compute target if it already exists.
        return ml_client.compute.get(name)
    except Exception:
        # NOTE(review): every lookup failure (not only "not found") lands here
        # and triggers a create attempt; narrow the exception type if the SDK
        # error class is available in this environment.
        cluster_spec = AmlCompute(
            name=name,
            type="amlcompute",
            size=family,
            min_instances=0,
            max_instances=4,
            idle_time_before_scale_down=180,
            tier="Dedicated",
        )

        # Wait for the operation to finish and hand the resulting cluster back
        # to the caller (previously the result was computed and then dropped).
        return ml_client.compute.begin_create_or_update(cluster_spec).result()

In [11]:
# Credentials are resolved by DefaultAzureCredential; no secret is stored in the notebook.
credential = DefaultAzureCredential()

# Build the workspace client from the workspace configuration file.
ml_client = MLClient.from_config(credential)

Found the config file in: /config.json


In [12]:
# Look up (or create) the compute cluster with the helper's default name and
# VM size, keeping whatever the helper returns.
compute_target = get_comput_target(ml_client=ml_client)

In [16]:
# Load every pipeline step from its YAML definition, in pipeline order.
(
    preprocess_component,
    split_component,
    train_logistic_component,
    train_tree_component,
    eval_component,
) = [
    load_component(source=yaml_path)
    for yaml_path in (
        "./preprocess-ds-component/preprocess.yml",
        "./split-component/split.yml",
        "./train-logistic-component/train_logistic.yml",
        "./train-tree-component/train_tree.yml",
        "./eval-model-component/eval.yml",
    )
]

In [17]:
# Pipeline definition: preprocess -> split -> train (logistic + tree) -> evaluate each model.
# `default_compute` is set to the 'cpu-cluster' compute target by name.
# NOTE(review): the Azure ML DSL is believed to derive node names from the local
# variable names below and the pipeline description from the function docstring —
# confirm before renaming variables or adding a docstring.
@pipeline(
    default_compute='cpu-cluster',
)
def water_potability_pipeline(pipeline_input_data):
    # Step 1: clean the raw dataset; this node also exposes a pair-plot folder output.
    preprocess_node = preprocess_component(
        dataset=pipeline_input_data,
        plot_style='dark'
    )

    # Step 2: split the cleaned dataset into train/test parts (test_size=0.2).
    split_node = split_component(
        dataset=preprocess_node.outputs.dataset_cleaned,
        test_size=0.2
    )

    # Step 3a: train the logistic model on the training split.
    train_node_logistic = train_logistic_component(
        X_train=split_node.outputs.X_train,
        y_train=split_node.outputs.y_train
    )

    # Step 3b: train the tree model on the same training split with fixed hyperparameters.
    train_node_tree = train_tree_component(
        X_train=split_node.outputs.X_train,
        y_train=split_node.outputs.y_train,
        criterion='entropy',
        min_samples_split = 3,
        max_depth=4
    )
    
    # Step 4: the same evaluation component is instantiated once per trained model,
    # both times against the shared test split.
    eval_node_logistic = eval_component(
        model_folder=train_node_logistic.outputs.model_folder,
        X_test=split_node.outputs.X_test,
        y_test=split_node.outputs.y_test
    )

    eval_node_tree = eval_component(
        model_folder=train_node_tree.outputs.model_folder,
        X_test=split_node.outputs.X_test,
        y_test=split_node.outputs.y_test
    )

    # Pipeline-level outputs: keys are the output names, values are the node outputs they expose.
    return {
        "pair_plot_output": preprocess_node.outputs.pair_plot_folder,
        "logistic_model_output": train_node_logistic.outputs.model_folder,
        "tree_model_output": train_node_tree.outputs.model_folder,
        "logistic_report": eval_node_logistic.outputs.report_folder,
        "tree_report": eval_node_tree.outputs.report_folder
    }


# Input dataset: a single CSV file referenced by its full datastore URI
# (the `workspaceblobstore` datastore of the `project-2-pipeline` workspace).
water_potability_ds = Input(
    path="azureml://subscriptions/d4e39a00-586b-4eea-9d7a-5c200a16ba64/resourcegroups/pipeline/workspaces/project-2-pipeline/datastores/workspaceblobstore/paths/UI/2023-11-08_204801_UTC/water_potability_ds.csv",
    type="uri_file",
)

# Build the pipeline job object by calling the pipeline function with that input.
pipeline_job = water_potability_pipeline(
    pipeline_input_data=water_potability_ds,
)


In [18]:
# Submit the pipeline under the "project_pipeline_water" experiment. The name
# `pipeline_job` is rebound to the object returned by the call, which later
# cells read `.name` from.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job,
    experiment_name="project_pipeline_water",
)

# Last expression of the cell: display the returned job.
pipeline_job

[32mUploading preprocess_src (0.0 MBs):   0%|          | 0/1763 [00:00<?, ?it/s][32mUploading preprocess_src (0.0 MBs): 100%|██████████| 1763/1763 [00:00<00:00, 175068.85it/s]
[39m



Experiment,Name,Type,Status,Details Page
project_pipeline_water,wheat_sponge_npy4v0twsg,pipeline,Preparing,Link to Azure Machine Learning studio


In [19]:
# Stream the execution logs of the submitted job into the cell output,
# waiting until the job completes.
ml_client.jobs.stream(name=pipeline_job.name)


RunId: wheat_sponge_npy4v0twsg
Web View: https://ml.azure.com/runs/wheat_sponge_npy4v0twsg?wsid=/subscriptions/d4e39a00-586b-4eea-9d7a-5c200a16ba64/resourcegroups/pipeline/workspaces/project-2-pipeline

Streaming logs/azureml/executionlogs.txt

[2023-11-10 08:12:40Z] Submitting 1 runs, first five are: 2ecc2c54:45c6f5bc-cab3-464a-9544-dd92dca722b8
[2023-11-10 08:13:36Z] Completing processing run id 45c6f5bc-cab3-464a-9544-dd92dca722b8.
[2023-11-10 08:13:37Z] Submitting 1 runs, first five are: bfed7293:35f87f28-c282-4144-a278-4bf4e8042325
[2023-11-10 08:14:13Z] Completing processing run id 35f87f28-c282-4144-a278-4bf4e8042325.
[2023-11-10 08:14:17Z] Submitting 2 runs, first five are: 6dd9fcea:48cb4832-a01b-4405-9004-f76e90c50f5d,73e280da:4de150b5-5bb7-488e-9251-bd6be530fecf
[2023-11-10 08:14:59Z] Completing processing run id 48cb4832-a01b-4405-9004-f76e90c50f5d.
[2023-11-10 08:15:00Z] Submitting 1 runs, first five are: c67bba5b:4f71b0d3-4c75-4068-aaa0-d3260159a7d7
[2023-11-10 08:15:30Z] 

In [20]:
# Fetch every output of the finished job into ./pipeline_output.
output = ml_client.jobs.download(
    name=pipeline_job.name,
    download_path='./pipeline_output',
    all=True,
)

Downloading artifact azureml://subscriptions/d4e39a00-586b-4eea-9d7a-5c200a16ba64/resourcegroups/pipeline/workspaces/project-2-pipeline/datastores/workspaceblobstore/paths/azureml/45c6f5bc-cab3-464a-9544-dd92dca722b8/pair_plot_folder/ to pipeline_output/named-outputs/pair_plot_output
Downloading artifact azureml://subscriptions/d4e39a00-586b-4eea-9d7a-5c200a16ba64/resourcegroups/pipeline/workspaces/project-2-pipeline/datastores/workspaceblobstore/paths/azureml/48cb4832-a01b-4405-9004-f76e90c50f5d/model_folder/ to pipeline_output/named-outputs/logistic_model_output
Downloading artifact azureml://subscriptions/d4e39a00-586b-4eea-9d7a-5c200a16ba64/resourcegroups/pipeline/workspaces/project-2-pipeline/datastores/workspaceblobstore/paths/azureml/4de150b5-5bb7-488e-9251-bd6be530fecf/model_folder/ to pipeline_output/named-outputs/tree_model_output
Downloading artifact azureml://subscriptions/d4e39a00-586b-4eea-9d7a-5c200a16ba64/resourcegroups/pipeline/workspaces/project-2-pipeline/datastores/