In [1]:
# import required libraries
# %pip install azure-ai-ml
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

from azure.ai.ml import MLClient
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component

In [2]:
import os
# Workspace coordinates. Each value comes from an environment variable when it is
# set; otherwise the hard-coded fallback (the second argument) is used.
# NOTE(review): the fallbacks point at one specific subscription / resource group;
# export the environment variables (or edit the fallbacks) to target another workspace.
workspace_name = os.getenv("WORKSPACE", "mlops")
subscription_id = os.getenv("SUBSCRIPTION_ID", "9dfa7b7b-77cd-4d7c-bcab-e0756bdf40a9")
resource_group = os.getenv("RESOURCE_GROUP", "mlops-labo3-tibe")

In [3]:
# Authenticate interactively. InteractiveBrowserCredential is used because this
# notebook is run by a person: signing in happens through a browser window.
# NOTE(review): DefaultAzureCredential is imported at the top but not used here;
# presumably it is meant for non-interactive runs — confirm before switching.
credential = InteractiveBrowserCredential()

In [4]:
# Client handle for the Azure ML workspace selected by the configuration cell;
# every later cell talks to the workspace through this object.
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)

Class WorkspaceHubOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


# Prepare a compute instance (virtual machine) if needed

In [5]:
from azure.ai.ml.entities import ComputeInstance, AmlCompute
import datetime

# Describe the compute instance the pipelines run on.
# NOTE(review): the pipeline definitions further down refer to this compute by the
# literal name "mlopsci"; keep them in sync if the name changes here.
ci_basic_name = "mlopsci"
ci_basic = ComputeInstance(
    name=ci_basic_name,
    size="STANDARD_DS3_v2",
)

# Start the create-or-update operation and wait for it to finish. As the last
# expression of the cell, the resulting object is displayed as the cell output.
ml_client.begin_create_or_update(ci_basic).result()

ComputeInstance({'state': 'Running', 'last_operation': {'operation_name': 'Create', 'operation_time': '2023-10-10T12:09:41.616Z', 'operation_status': 'Succeeded', 'operation_trigger': 'User'}, 'os_image_metadata': <azure.ai.ml.entities._compute._image_metadata.ImageMetadata object at 0x000002342D29E920>, 'services': [{'display_name': 'Jupyter', 'endpoint_uri': 'https://mlopsci.westeurope.instances.azureml.ms/tree/'}, {'display_name': 'Jupyter Lab', 'endpoint_uri': 'https://mlopsci.westeurope.instances.azureml.ms/lab'}], 'type': 'computeinstance', 'created_on': None, 'provisioning_state': 'Succeeded', 'provisioning_errors': None, 'name': 'mlopsci', 'description': None, 'tags': None, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/9dfa7b7b-77cd-4d7c-bcab-e0756bdf40a9/resourceGroups/mlops-labo3-tibe/providers/Microsoft.MachineLearningServices/workspaces/mlops/computes/mlopsci', 'Resource__source_path': None, 'base_path': 'c:\\Users\\tibed\\OneDrive - Hogeschool West-Vlaande

In [None]:
from azure.ai.ml.entities import AmlCompute

# Name of the compute target to look up.
# NOTE(review): the original cell carried a bare "STANDARD_A4M_V2" comment here,
# presumably the intended VM size for this target — confirm.
cpu_compute_target = "cpu-automated-test"

# Fetch the compute target by name.
# NOTE(review): a missing target is not handled; presumably the lookup raises in
# that case, so the message below is only printed when the target exists — confirm.
cpu_machine = ml_client.compute.get(cpu_compute_target)
print(f"You already have a machine named {cpu_compute_target}, we'll reuse it as is.")

In [5]:
from azure.ai.ml.entities import Environment
import os

# Custom environment for the image-processing components: a base Docker image plus
# the conda specification stored at components/dataprep/conda.yaml.
custom_env_name = "aml-Pillow"

pipeline_job_env = Environment(
    name=custom_env_name,
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file=os.path.join("components", "dataprep", "conda.yaml"),
    description="Custom environment for Image Processing (with Pillow)",
    tags={"Pillow": "10.0.1"},
)
# Register the environment; the returned object carries the version that the
# component definitions below use to reference it.
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, "
    f"the environment version is {pipeline_job_env.version}"
)

Environment with name aml-Pillow is registered to workspace, the environment version is 1


In [6]:
from azure.ai.ml import Input

# Dataprep

In [12]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Define the "data_prep_image_resize" command component. Once registered, it can
# also be used under that name in the pipeline editor of the Azure portal.
data_prep_component = command(
    name="data_prep_image_resize",
    display_name="Data preparation, Image Resizing",
    description="Reads a data asset of images and preprocesses them by resizing them to 64 to 64.",
    # Environment registered earlier in the notebook, referenced as "<name>:<version>".
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    # Source folder of the component; the command below runs dataprep.py from it.
    code=os.path.join("components", "dataprep"),
    # The ${{...}} placeholders stand for the component's input and output locations.
    command="""python dataprep.py \
            --data ${{inputs.data}} \
            --output_data ${{outputs.output_data}} \
            """,
    inputs={"data": Input(type="uri_folder")},
    outputs={"output_data": Output(type="uri_folder", mode="rw_mount")},
)

In [13]:
# Register the component in the workspace and rebind the name to the registered
# object, which the pipeline definition calls like a function.
#
# On the first run `data_prep_component` is the object built by `command(...)`, whose
# underlying component is reached through `.component`. After this cell has run once,
# the name refers to the object returned by create_or_update instead, which presumably
# no longer exposes `.component` (NOTE(review): confirm against the SDK). The getattr
# fallback keeps the cell safe to re-run in either state.
data_prep_component = ml_client.create_or_update(
    getattr(data_prep_component, "component", data_prep_component)
)

# Report the name and version the workspace assigned to the registered component.
print(
    f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered"
)

Component data_prep_image_resize with Version 2023-10-10-12-23-40-8725006 is registered


In [15]:
from typing import List

In [14]:
# the dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output

@dsl.pipeline(
    compute="mlopsci",
    description="Custom data_prep pipeline",
)
def animal_images_preprocessing_pipeline(
    input_version: str,  # Not used yet; kept so version numbers can be wired in later.
    output_version: str,  # Not used yet; kept so version numbers can be wired in later.
):
    """Run the image-resize component once per animal and return the resized folders.

    For every animal a data_prep_component step reads the data asset
    "azureml:<animal>:<version>" and writes its result to a fixed folder on the
    workspaceblobstore datastore, registered under the name "<animal>_resized".
    The returned dictionary maps each animal name to that step's output.
    """
    # (data asset name, data asset version) pairs. They are hard-coded because there
    # is no upstream component yet that could supply them.
    animals = [
        ("pandas", "1"),
        ("cats", "1"),
        ("dogs", "1"),
    ]

    # Build the output root from the notebook's workspace settings instead of a
    # hard-coded subscription / resource group / workspace, so that the environment
    # variable overrides in the configuration cell apply to the outputs as well.
    processed_root = (
        f"azureml://subscriptions/{subscription_id}"
        f"/resourcegroups/{resource_group}"
        f"/workspaces/{workspace_name}"
        "/datastores/workspaceblobstore/paths/processed_animals/"
    )

    jobs = {}
    for animal_name, animal_version in animals:
        # Call the registered component like a function to add one step per animal.
        data_prep_job = data_prep_component(
            data=Input(
                type="uri_folder",
                path=f"azureml:{animal_name}:{animal_version}",
            ),
        )

        # Replace the step's default output with a named one at a fixed path.
        data_prep_job.outputs.output_data = Output(
            type="uri_folder",
            path=processed_root + animal_name,
            name=animal_name + "_resized",
            mode="rw_mount",
        )

        jobs[animal_name] = data_prep_job

    # The keys of the returned dictionary become the pipeline's output identifiers.
    return {key: job.outputs.output_data for key, job in jobs.items()}

In [16]:
# Instantiate the preprocessing pipeline. No arguments are passed: its
# input_version / output_version parameters are not used by the pipeline body yet.
# NOTE(review): this rebinds the name `pipeline`, shadowing the `pipeline` decorator
# imported from azure.ai.ml.dsl in the first cell.
pipeline = animal_images_preprocessing_pipeline()

In [17]:
import webbrowser

# Submit the preprocessing pipeline as a job; experiment_name is the project name
# the run is filed under.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline, experiment_name="image_preprocessing_pipeline"
)

# Open the job's studio page in a web browser to follow its progress.
webbrowser.open(pipeline_job.studio_url)

True

# Train test split

In [18]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Define the "data_split" command component: it receives three image folders and
# writes a training folder and a testing folder.
data_split_component = command(
    name="data_split",
    display_name="Data Splitting to Train and Test",
    description="Reads a data asset of images and combines them into a training and testing dataset",
    # Environment registered earlier in the notebook, referenced as "<name>:<version>".
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    # Source folder of the component; the command below runs traintestsplit.py from it.
    code=os.path.join("components", "dataprep"),
    # The three animal folders are passed together after --datasets.
    command="""python traintestsplit.py \
            --datasets ${{inputs.animal_1}} ${{inputs.animal_2}} ${{inputs.animal_3}} \
            --training_data ${{outputs.training_data}} \
            --testing_data ${{outputs.testing_data}} \
            --split_size ${{inputs.train_test_split_factor}}
            """,
    # The datasets are inputs so that they can be chosen when the pipeline is built ...
    inputs={
        "animal_1": Input(type="uri_folder"),
        "animal_2": Input(type="uri_folder"),
        "animal_3": Input(type="uri_folder"),
        # Percentage of the data to use as testing data; always a positive value.
        "train_test_split_factor": Input(type="number"),
    },
    # ... and the outputs are declared so the pipeline can override where the
    # training and testing sets are written.
    outputs={
        "training_data": Output(type="uri_folder", mode="rw_mount"),
        "testing_data": Output(type="uri_folder", mode="rw_mount"),
    },
)

In [19]:
# Register the component in the workspace and rebind the name to the registered
# object, which the pipeline definition calls like a function.
#
# On the first run `data_split_component` is the object built by `command(...)`, whose
# underlying component is reached through `.component`. After this cell has run once,
# the name refers to the object returned by create_or_update instead, which presumably
# no longer exposes `.component` (NOTE(review): confirm against the SDK). The getattr
# fallback keeps the cell safe to re-run in either state.
data_split_component = ml_client.create_or_update(
    getattr(data_split_component, "component", data_split_component)
)

# Report the name and version the workspace assigned to the registered component.
print(
    f"Component {data_split_component.name} with Version {data_split_component.version} is registered"
)

Component data_split with Version 2023-10-10-12-24-12-9683420 is registered


In [20]:
# the dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output

@dsl.pipeline(
    compute="mlopsci",
    description="Custom data_prep pipeline",
)
def animal_images_traintest_split_pipeline(
    train_test_split: int,  # Forwarded to the component's train_test_split_factor input.
    animal_1: Input,
    animal_2: Input,
    animal_3: Input,
):
    """Split three animal image folders into one training set and one testing set.

    The three folders and the split factor are handed to data_split_component in a
    single step. Its two outputs are exposed as the pipeline outputs "training_data"
    and "testing_data".
    """
    # Call the registered component like a function to add the split step.
    data_split_job = data_split_component(
        animal_1=animal_1,
        animal_2=animal_2,
        animal_3=animal_3,
        train_test_split_factor=train_test_split,
    )

    # Replace the step's default outputs with named ones ("training_data" and
    # "testing_data"). No path is given for either output.
    data_split_job.outputs.training_data = Output(
        type="uri_folder",
        name="training_data",
        mode="rw_mount",
    )
    data_split_job.outputs.testing_data = Output(
        type="uri_folder",
        name="testing_data",
        mode="rw_mount",
    )

    # The keys of the returned dictionary become the pipeline's output identifiers.
    return {
        "training_data": data_split_job.outputs.training_data,
        "testing_data": data_split_job.outputs.testing_data,
    }

In [21]:
# Version of the "<animal>_resized" data assets to feed into the split.
version = "1"
animals = ["pandas", "cats", "dogs"]

# Map the animals onto the pipeline's parameters animal_1, animal_2, animal_3.
# The assets are named "<animal>_resized", matching the output names assigned in
# the preprocessing pipeline.
animals_datasets = {
    f"animal_{position}": Input(
        type="uri_folder",
        path=f"azureml:{animal}_resized:{version}",
    )
    for position, animal in enumerate(animals, start=1)
}

print(animals_datasets)

# Instantiate the split pipeline with the datasets above and a split factor of 20.
train_test_pipeline = animal_images_traintest_split_pipeline(
    train_test_split=20,
    **animals_datasets,
)

{'animal_1': {'type': 'uri_folder', 'path': 'azureml:pandas_resized:1'}, 'animal_2': {'type': 'uri_folder', 'path': 'azureml:cats_resized:1'}, 'animal_3': {'type': 'uri_folder', 'path': 'azureml:dogs_resized:1'}}


In [22]:
import webbrowser

In [23]:
# Submit the train/test split pipeline as a job; experiment_name is the project
# name the run is filed under (the same one as the preprocessing pipeline).
train_test_pipeline_job = ml_client.jobs.create_or_update(
    train_test_pipeline, experiment_name="image_preprocessing_pipeline"
)

# Open the job's studio page in a web browser to follow its progress.
webbrowser.open(train_test_pipeline_job.studio_url)

True

# Training

In [6]:
from azure.ai.ml.entities import Environment
import os

# Custom environment for the training component: a base Docker image plus the
# conda specification stored at components/training/conda.yaml.
# NOTE(review): this cell rebinds `custom_env_name` and `pipeline_job_env`, which
# the data-prep and data-split component cells also read. Re-running those cells
# after this one would make them reference this environment instead.
custom_env_name = "aml-Tensorflow-Pillow"

pipeline_job_env = Environment(
    name=custom_env_name,
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file=os.path.join("components", "training", "conda.yaml"),
    description="Custom environment for AI Training (with Pillow)",
    tags={"Pillow": "0.0.1", "Tensorflow": "2.4.1"},
)
# Register the environment and keep the returned object, which carries its version.
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, "
    f"the environment version is {pipeline_job_env.version}"
)

Environment with name aml-Tensorflow-Pillow is registered to workspace, the environment version is 2


In [7]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Define the "training" command component: it receives a training folder, a testing
# folder and an epoch count, and writes its results to a single output folder.
training_component = command(
    name="training",
    display_name="Training an AI model",
    description="Trains an AI model by inputting a lot of training and testing data.",
    # The environment is referenced by name with the "@latest" label rather than by
    # the version returned when it was registered.
    environment="aml-Tensorflow-Pillow@latest",
    # Source folder of the component; the command below runs train.py from it.
    code=os.path.join("components", "training"),
    command="""python train.py \
            --training_folder ${{inputs.training_folder}} \
            --testing_folder ${{inputs.testing_folder}} \
            --output_folder ${{outputs.output_folder}} \
            --epochs ${{inputs.epochs}} \
            """,
    inputs={
        "training_folder": Input(type="uri_folder"),
        "testing_folder": Input(type="uri_folder"),
        # Passed to train.py as --epochs.
        "epochs": Input(type="number"),
    },
    outputs={
        "output_folder": Output(type="uri_folder", mode="rw_mount"),
    },
)

In [8]:
# Register the component in the workspace and rebind the name to the registered
# object, which the pipeline definition calls like a function.
#
# On the first run `training_component` is the object built by `command(...)`, whose
# underlying component is reached through `.component`. After this cell has run once,
# the name refers to the object returned by create_or_update instead, which presumably
# no longer exposes `.component` (NOTE(review): confirm against the SDK). The getattr
# fallback keeps the cell safe to re-run in either state.
training_component = ml_client.create_or_update(
    getattr(training_component, "component", training_component)
)

# Report the name and version the workspace assigned to the registered component.
print(
    f"Component {training_component.name} with Version {training_component.version} is registered"
)

Component training with Version 2023-10-10-12-16-00-3187527 is registered


In [9]:
# the dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output

@dsl.pipeline(
    description="Custom Animals Training pipeline",
    compute="mlopsci",
)
def animals_training_pipeline(
    training_folder: Input,
    testing_folder: Input,
    epochs: int,
):
    """Train the model in a single step and expose its output folder.

    The two data folders and the epoch count are passed straight through to
    training_component; the step's output folder is returned as the pipeline
    output "output_data".
    """
    # Call the registered component like a function to add the training step.
    training_job = training_component(
        epochs=epochs,
        training_folder=training_folder,
        testing_folder=testing_folder,
    )

    # No path is given, so Azure picks a unique storage location on every run.
    training_job.outputs.output_folder = Output(
        type="uri_folder", name="output_data", mode="rw_mount"
    )

    # The key of the returned dictionary becomes the pipeline's output identifier.
    return {"output_data": training_job.outputs.output_folder}

In [10]:
# Instantiate the training pipeline on the registered "training_data" and
# "testing_data" assets. The trailing ":1" is the data asset version — make sure
# it matches the version you actually want to train on.
training_pipeline = animals_training_pipeline(
    training_folder=Input(type="uri_folder", path="azureml:training_data:1"),
    testing_folder=Input(type="uri_folder", path="azureml:testing_data:1"),
    epochs=5,
)

In [11]:
import webbrowser
# Submit the training pipeline as a job; experiment_name is the project name the
# run is filed under.
training_pipeline_job = ml_client.jobs.create_or_update(
    training_pipeline, experiment_name="training_pipeline"
)

# Open the job's studio page in a web browser to follow its progress.
webbrowser.open(training_pipeline_job.studio_url)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


True