# Create Azure Machine Learning training pipeline

This notebook shows how to create an Azure Machine Learning (AML) pipeline for training a clustering model. It consists of the following steps:
1. AML environment setup 
2. Pipeline configuration preparation 
3. Pipeline creation
4. Pipeline run

The training step will also register the resulting model in the AML workspace as an artifact. 

In [None]:
%load_ext dotenv
%dotenv

import os
from os.path import join
import sys

sys.path.append("../")

import pandas as pd
import numpy as np

from azureml.core import Environment, Datastore, Workspace, Experiment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration

from mlops.common.attach_compute import get_compute
from mlops.common.get_datastores import get_blob_datastore

In [None]:
from azureml.core.authentication import InteractiveLoginAuthentication

# Build an interactive login object for the tenant named by the TENANT_ID
# environment variable (os.getenv yields None when the variable is unset).
tenant_id = os.getenv("TENANT_ID")
interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id)

In [None]:
# Restore the AML workspace from a config.json file (can be downloaded through the portal).
# The tenant-scoped interactive login created earlier in this notebook is passed
# explicitly; without `auth`, from_config() uses the SDK's default authentication
# and the TENANT_ID setting would never be applied to the workspace connection.

ws = Workspace.from_config(auth=interactive_auth)
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

# Compute target used by the training step.
# NOTE(review): get_compute is a project helper (mlops.common.attach_compute);
# presumably it attaches to an existing cluster or provisions one with these
# settings, and scale_down is an idle timeout in seconds -- confirm against the helper.
compute_target = get_compute(
    workspace=ws,
    compute_name="trainclust",
    vm_size='Standard_NC6',
    vm_priority='lowpriority', 
    min_nodes=0,
    max_nodes=4,
    scale_down=120
)

In [None]:
# Get or register the blob datastore that holds our data.
# Account name and key are read from the environment (loaded via dotenv above).
blob_account_name = os.getenv("AML_BLOB_ACCOUNT_NAME")
blob_account_key = os.getenv("AML_BLOB_ACCOUNT_KEY")
datastore = get_blob_datastore(
    ws,
    "data",
    blob_account_name,
    blob_account_key,
    "oneweek-sample-dataset",
)

# Create the input data reference: the "books" path on the datastore, mounted
# on the compute target.
# WARNING (original author's note): DataReference works up to 12x faster than
# Dataset for small files.
img_dir = DataReference(
    data_reference_name="books",
    path_on_datastore="books",
    datastore=datastore,
    mode="mount",
)

In [None]:
# Build the task-specific environment for the pipeline run.
# RunConfiguration and CondaDependencies are imported once in the notebook's
# top import cell, so they are not re-imported here.

# Create the pipeline run configuration: Docker is enabled and the Python
# environment is built from the pip packages listed below.
run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    pip_packages=[
        'azureml-sdk==1.15.0',
        'numpy==1.18.5',
        'pandas==1.1.3',
        'pillow==7.2.0',
        'pyarrow==1.0.1',
        'scikit-image==0.17.2',
        'scikit-learn==0.23.2',
        'scipy==1.5.2',
        'tqdm==4.48.2',
        # NOTE(review): the only unpinned package in this list -- consider
        # pinning a version so environment builds stay reproducible.
        'opencv-python-headless',
        'tensorflow==2.3.0',
        'PyYAML==5.3.1'
    ]
)

In [None]:
# Create the clustering model training step

source_directory = '../'

# Command-line arguments handed to train.py.
# NOTE(review): "--recursive" receives the Python bool False; confirm how
# train.py parses this flag once the value reaches it on the command line.
train_arguments = [
    "--input_dir", img_dir,
    "--fraction", 1.0,  # fraction of the dataset
    "--recursive", False,
    "--eps", 0.64,
    "--min_samples", 3,
    "--metric", "cosine",
    "--model_name", "dbscan_test",
]

train_step = PythonScriptStep(
    source_directory=source_directory,
    script_name="mlops/clustering_pipeline/steps/train.py",
    arguments=train_arguments,
    inputs=[img_dir],
    runconfig=run_config,
    compute_target=compute_target,
)

# Assemble the pipeline from the single training step
train_pipeline = Pipeline(workspace=ws, steps=[train_step])

# Check that the pipeline is consistent before publishing
train_pipeline.validate()

# Publish the pipeline under a fixed name and description
published_pipeline = train_pipeline.publish(
    name="similarity_train",
    description="Pipeline to train a similarity model for a custom Azure Search skill",
)

In [None]:
# Submit the training pipeline under the 'similarity-train-exp' experiment
# and wait for the run to complete
experiment = Experiment(ws, 'similarity-train-exp')
pipeline_run = experiment.submit(train_pipeline)
pipeline_run.wait_for_completion()