In [None]:
# MLClient gives us a handle to the workspace
from azure.ml import MLClient

# Authentication package
from azure.identity import InteractiveBrowserCredential

In [None]:
# Connect to the Azure ML workspace; authentication goes through an
# interactive browser login. Replace the <...> placeholders with real values.
ml_client = MLClient(
    InteractiveBrowserCredential(),
    workspace="<AML_WORKSPACE_NAME>",
    resource_group="<RESOURCE_GROUP>",
    subscription_id="<SUBSCRIPTION_ID>",
)

In [None]:
from azure.ml.entities import Dataset

# URL of the COCO 2017 train/val annotations archive
coco_trainval_path = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"

# Dataset entity describing that archive as a single web-hosted file
coco_trainval_dataset = Dataset(
    name="coco_trainval2017_zip",
    description="annotations_trainval2017.zip",
    paths=[{"file": coco_trainval_path}],
    tags=dict(source_type="web", source="cocodataset.org"),
)

In [None]:
from azure.ml.entities import CommandComponent, JobInput, JobOutput

# Code-free component: it only runs a shell command that lists the archive
# and then extracts it into the output location.
unzip_component = CommandComponent(
    name="Unzip",
    # curated environment, referenced by name and version
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:9",
    # the ${{inputs.*}} / ${{outputs.*}} placeholders must match the keys
    # declared in `inputs` and `outputs` below
    command="ls -lr ${{inputs.archive_path}}; unzip ${{inputs.archive_path}} -d ${{outputs.extracted_data}}",
    inputs=dict(archive_path=dict(type="path")),
    outputs=dict(extracted_data=dict(type="path")),
)

In [None]:
from azure.ml import dsl

# Turn both components into callables usable as steps inside a pipeline:
# the in-memory unzip component, and the annotation parser described by its
# YAML spec.
unzip_component_func = dsl.load_component(
    component=unzip_component,
)
parse_annotations_func = dsl.load_component(
    yaml_file="./components/coco_extract_annotations/spec.yaml",
)

In [None]:
# inspect the component function loaded from the YAML spec (prints its help text)
help(parse_annotations_func)

In [None]:
from azure.ml import dsl

# `unzip_component_func` and `parse_annotations_func` were already created by
# the component-loading cell earlier in this notebook; they are reused here
# instead of loading the same components (and re-parsing the YAML spec) a
# second time.

# the dsl decorator tells the sdk that we are defining an AML pipeline
#
# Pipeline: unzip three archives and extract annotations from one of them.
# Inputs:
#   annotations_archive, train_archive, valid_archive
#       archives, each passed to its own unzip step
#   category_id, category_name
#       forwarded unchanged to the annotation-parsing step
# Outputs (keys of the returned dict):
#   train_images, train_annotations, valid_images, valid_annotations
#
# NOTE(review): documented with comments rather than a docstring because the
# SDK may read the function docstring as pipeline metadata -- confirm.
# NOTE(review): the SDK may derive step (node) names from the local variable
# names below -- confirm before renaming any of them.
@dsl.pipeline(
    compute="cpu-d14-v2", # alternative compute target: "cpu-cluster"
    description="e2e images preparation",
)
def coco_preparation_pipeline(annotations_archive, train_archive, valid_archive, category_id, category_name):
    # one unzip step per archive; the three steps do not depend on each other
    annotations_unzip_step = unzip_component_func(
        archive_path=annotations_archive
    )
    train_unzip_step = unzip_component_func(
        archive_path=train_archive
    )
    valid_unzip_step = unzip_component_func(
        archive_path=valid_archive
    )
    
    # the parsing step consumes the extracted annotations, which makes it
    # depend on annotations_unzip_step
    parse_annotations_step = parse_annotations_func(
        annotations_dir=annotations_unzip_step.outputs.extracted_data,
        category_id=category_id,
        category_name=category_name
    )

    # expose the extracted images and the parsed annotations as pipeline outputs
    return {
        "train_images": train_unzip_step.outputs.extracted_data,
        "train_annotations": parse_annotations_step.outputs.train_annotations,
        "valid_images": valid_unzip_step.outputs.extracted_data,
        "valid_annotations": parse_annotations_step.outputs.valid_annotations,
    }

# Instantiate the pipeline on the public COCO 2017 archives.
# The annotations URL reuses `coco_trainval_path` (defined next to the Dataset
# earlier in this notebook) instead of repeating the same literal, so the two
# cannot drift apart.
pipeline_instance = coco_preparation_pipeline(
    annotations_archive=JobInput(file=coco_trainval_path),
    train_archive=JobInput(file="http://images.cocodataset.org/zips/train2017.zip"),
    valid_archive=JobInput(file="http://images.cocodataset.org/zips/val2017.zip"),
    # both values are forwarded by the pipeline to the annotation-parsing step
    category_id=1,
    category_name="contains_person"
)

In [None]:
# Submit the pipeline as a job in the workspace
returned_job = ml_client.jobs.create_or_update(
    pipeline_instance,
    # components that do not depend on a failed component keep running
    continue_run_on_step_failure=True,
    # experiment (project) name under which the run is recorded
    experiment_name="e2e_image_preparation",
)

# Studio URL where the job status can be followed (shown as the cell output)
returned_job.services["Studio"].endpoint