# Azure ML Training Pipeline for HIFIS-RNN-MLP
This notebook defines an Azure machine learning pipeline for a multi-train experiment and submits the pipeline as an experiment to be run on an Azure virtual machine. It then publishes the pipeline in the workspace.

In [None]:
# Import statements
import azureml.core
from azureml.core import Experiment
from azureml.core import Workspace, Datastore
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.train.dnn import TensorFlow
from azureml.train.estimator import Estimator
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.environment import Environment
from azureml.core.runconfig import RunConfiguration
import shutil
import datetime

### Load the workspace and register its Python environment.

In [None]:
# Load the workspace described by the local configuration file
ws = Workspace.from_config("./ws_config.json")

# Build the run environment from the project's pip requirements, append the
# extra pip packages, then register the environment with the workspace
env = Environment.from_pip_requirements(name="HIFIS_env", file_path="./../requirements.txt")
for extra_pip_package in ("azureml-core", "sendgrid"):
    env.python.conda_dependencies.add_pip_package(extra_pip_package)
env.register(workspace=ws)

# Run configuration shared by the script steps; it reuses the environment's dependencies
runconfig = RunConfiguration(conda_dependencies=env.python.conda_dependencies)
print(env.python.conda_dependencies.serialize_to_string())

# Place a copy of the AML ignore file in the parent (root) folder and remember where it went
aml_ignore_path = shutil.copy('./.amlignore', './../.amlignore')

### Create references to persistent and intermediate data
Create DataReference objects that point to persistent folders on the blob datastores: the raw data, the inference folder and the outputs folder. Configure PipelineData objects for the preprocessed, training and interpretability outputs. Pipeline data is intermediate, meaning that it is produced by one step and fed as input to a subsequent step.

In [None]:
# Look up the two blob datastores by the names they have in this workspace
hifis_blob_ds = Datastore(ws, name='hifisrnnmlp_ds')
raw_data_blob_ds = Datastore(ws, name='hifis_raw_ds')

# Persistent folders on the blobs: raw data lives on its own datastore,
# inference and outputs folders live on the project datastore
raw_data_dr = DataReference(datastore=raw_data_blob_ds,
                            data_reference_name="raw_data",
                            path_on_datastore="hifis/")
inference_dr = DataReference(datastore=hifis_blob_ds,
                             data_reference_name="inference",
                             path_on_datastore="inference/")
outputs_dr = DataReference(datastore=hifis_blob_ds,
                           data_reference_name="outputs",
                           path_on_datastore="outputs/")


def _mounted_pipeline_data(label):
    """Return a mount-mode PipelineData on the project datastore, named and output-named `label`."""
    return PipelineData(label, datastore=hifis_blob_ds, output_name=label, output_mode="mount")


# Intermediate pipeline storage, one object per producing step
preprocess_pd = _mounted_pipeline_data("preprocessed_output")
train_pd = _mounted_pipeline_data("train_output")
interpretability_pd = _mounted_pipeline_data("interpretability_output")

### Compute Target
Specify and configure the compute target for this workspace. If a compute cluster by the name we specified does not exist, create a new compute cluster.

In [None]:
# Cluster settings
CT_NAME = "d13v2-train"        # Name of our compute cluster
VM_SIZE = "STANDARD_D13_V2"    # Azure VM size used to execute the pipeline
MIN_NODES = 0                  # Lower bound on the number of compute nodes
MAX_NODES = 3                  # Upper bound on the number of compute nodes

# Reuse the named cluster when the lookup succeeds; otherwise provision a new one
try:
    compute_target = AmlCompute(ws, CT_NAME)
except ComputeTargetException:
    print("Creating new compute target")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=VM_SIZE,
        min_nodes=MIN_NODES,
        max_nodes=MAX_NODES,
    )
    compute_target = ComputeTarget.create(ws, CT_NAME, provisioning_config)

    # Wait (up to 20 minutes) for the new cluster to be provisioned
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
else:
    print("Found existing compute target.")

print("Azure Machine Learning Compute attached")
print("Compute targets: ", ws.compute_targets)

# Re-read the target from the workspace's own listing
compute_target = ws.compute_targets[CT_NAME]

### Define pipeline and submit experiment.
Define the steps of an Azure machine learning pipeline. Create an Azure Experiment that will run our pipeline. Submit the experiment to the execution environment.

In [None]:
# Build the four-step pipeline (preprocess -> train -> interpretability -> save),
# validate it and submit it as an experiment run.
#
# All steps use the parent directory ("./../") as their source directory, which is
# where a temporary copy of the AML ignore file was placed earlier (aml_ignore_path).
# The work is wrapped in try/finally so that the copy is always moved back to this
# folder, even when a step definition, pipeline validation or submission raises.
try:
    # Define preprocessing step in the ML pipeline
    step1 = PythonScriptStep(name="preprocess_step",
                             script_name="azure/preprocess_step/preprocess_step.py",
                             arguments=["--rawdatadir", raw_data_dr, "--inferencedir", inference_dr,
                                        "--preprocessedoutputdir", preprocess_pd],
                             inputs=[raw_data_dr, inference_dr],
                             outputs=[preprocess_pd],
                             compute_target=compute_target,
                             source_directory="./../",
                             runconfig=runconfig,
                             params={"PIPELINE": "train"},
                             allow_reuse=False)

    # Define training step in the ML pipeline
    # NOTE(review): use_gpu=True is requested here while the cluster defined in this
    # notebook uses VM size STANDARD_D13_V2 - confirm that a GPU image is intended.
    est = TensorFlow(source_directory='./../',
                     script_params=None,
                     compute_target=compute_target,
                     entry_script='azure/train_step/train_step.py',
                     pip_packages=['tensorboard', 'pandas', 'dill', 'numpy', 'imblearn', 'matplotlib', 'tqdm',
                                   'scikit-learn', 'category_encoders', 'scipy', 'xgboost'],
                     use_gpu=True,
                     framework_version='2.0')
    step2 = EstimatorStep(name="multi_train_step",
                          estimator=est,
                          estimator_entry_script_arguments=["--preprocessedoutputdir", preprocess_pd,
                                                            "--trainoutputdir", train_pd],
                          runconfig_pipeline_params=None,
                          inputs=[preprocess_pd],
                          outputs=[train_pd],
                          compute_target=compute_target)

    # Define interpretability step in the ML pipeline
    step3 = PythonScriptStep(name="interpretability_step",
                             script_name="azure/interpretability_step/interpretability_step.py",
                             arguments=["--preprocessedoutputdir", preprocess_pd, "--trainoutputdir", train_pd,
                                        "--interpretabilityoutputdir", interpretability_pd],
                             inputs=[preprocess_pd, train_pd],
                             outputs=[interpretability_pd],
                             compute_target=compute_target,
                             source_directory="./../",
                             runconfig=runconfig,
                             allow_reuse=False)

    # Define final step to save all produced files to persistent blob storage
    step4 = PythonScriptStep(name="save_step",
                             script_name="azure/save_step/save_step.py",
                             arguments=["--preprocessedoutputdir", preprocess_pd, "--trainoutputdir", train_pd,
                                        "--interpretabilityoutputdir", interpretability_pd,
                                        "--outputsdir", outputs_dr],
                             inputs=[preprocess_pd, train_pd, interpretability_pd, outputs_dr],
                             outputs=[],
                             compute_target=compute_target,
                             source_directory="./../",
                             runconfig=runconfig,
                             allow_reuse=False)

    # Construct the ML pipeline from the steps and check it before submitting
    steps = [step1, step2, step3, step4]
    single_train_pipeline = Pipeline(workspace=ws, steps=steps)
    single_train_pipeline.validate()

    # Define a new experiment and submit a new pipeline run to the compute target.
    experiment = Experiment(workspace=ws, name='MultiTrainExperiment_v1')
    train_run = experiment.submit(single_train_pipeline, regenerate_outputs=False)
    print("Pipeline is submitted for execution")
finally:
    # Move AML ignore file back to original folder, whether or not submission succeeded
    aml_ignore_path = shutil.move(aml_ignore_path, './.amlignore')

### Publish the pipeline
Wait for the pipeline run to finish. Then publish the pipeline. The pipeline will be visible as an endpoint in the Pipelines tab in the workspace on Azure Machine Learning studio. The training compute cluster is deleted in the next section to prevent further cost.

In [None]:
# Publishing happens only after the submitted run has finished, so wait for it first
train_run.wait_for_completion()

# Publish the pipeline behind that run under a fixed name, description and version
published_pipeline = train_run.publish_pipeline(
    name="HIFIS-RNN-MLP Training Pipeline",
    description="Azure ML Pipeline that trains HIFIS-RNN-MLP model and runs LIME submodular pick.",
    version="1.0",
)

### Delete the compute target
Delete the training compute cluster to prevent further cost.

In [None]:
# Wait for the pipeline to finish running, so the cluster is not deleted while
# the submitted run may still be using it.
train_run.wait_for_completion()

# Delete compute cluster to avoid extra charges
compute_target.delete()