# Build AML Pipeline with "CDML Interpret ML" module and built-in modules

In this tutorial you will learn how to use Designer built-in modules and a custom module together to create a pipeline.

1. Set up the environment - install the module CLI and the module/pipeline SDK
2. Register the "CDML Interpret ML" module into your AML workspace using the CLI
3. Use the module/pipeline SDK to create a pipeline with the module registered in step 2 and the built-in modules available in AML designer

## Prerequisite
* Install the Azure CLI with the azure-cli-ml extension and the module SDK by following the [instructions here](setup-environment.ipynb)


In [None]:
# Fill in the details of the Azure ML workspace this notebook should target.
resource_group = "<your resource group>"
workspace_name = "<your workspace name>"
subscription_id = "<your subscription ID>"

## Register azureml module

In [None]:
# Log in to Azure, select the subscription configured above, and set the default
# workspace for later `az ml` commands using the `az ml folder attach` command.
# The $name references are filled in by IPython from the notebook's Python variables.
!az login -o none
!az account set -s $subscription_id
!az ml folder attach -w $workspace_name -g $resource_group 

In [None]:
# Register the custom module described by the spec file below.
# Note: if this is a new Azure ML workspace, you need to open the designer once to activate the data types; otherwise you'll get the error message
# "Error occurred when loading YAML file iml_module_spec.yaml, details: Module CDML Interpret ML has invalid DataType references: Input Trained_model uses DataType ModelDirectory which does not exist."
!az ml module register --spec-file=aml_module/InterpretML/iml_module_spec.yaml

## Setup azureml workspace

In [None]:
import json
from azureml.data.data_reference import DataReference
from azureml.core import Workspace, Run, Dataset, Datastore
from azureml.pipeline.wrapper import Pipeline, Module, dsl

# Look up the existing workspace identified by the settings configured earlier.
ws = Workspace.get(
    name=workspace_name,
    resource_group=resource_group,
    subscription_id=subscription_id,
)

In [None]:

# Reuse the AML compute cluster named below if the workspace already has it;
# otherwise provision a new one.
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cpu-cluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so create the cluster and wait for provisioning to finish.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_v2",
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
else:
    print('Found existing compute target {}.'.format(cluster_name))

print("Azure Machine Learning Compute attached")

## Prepare datasets and load modules

In [None]:
# Name under which the sample dataset is registered in the workspace.
data_name = 'Automobile_price_data'

# Register the dataset only when the workspace does not have it yet.
if data_name not in ws.datasets:
    global_datastore = Datastore(ws, name="azureml_globaldatasets")
    automobile_data = Dataset.File.from_files(
        global_datastore.path('GenericCSV/Automobile_price_data_(Raw)')
    ).register(
        workspace=ws,
        # Use data_name rather than a repeated literal so the existence check,
        # the registration and the lookup below always refer to the same name.
        name=data_name,
        description='Automobile_price_data',
    )
    print('Registered')

# Fetch the registered dataset; it is the input of the pipeline defined later.
blob_input_data = ws.datasets[data_name]

In [None]:
def _load_builtin(module_name):
    """Load the module called *module_name* from the 'azureml' namespace of ``ws``."""
    return Module.load(ws, namespace='azureml', name=module_name)


# Designer built-in modules
select_column_func = _load_builtin('Select Columns in Dataset')
clean_data_func = _load_builtin('Clean Missing Data')
split_data_func = _load_builtin('Split Data')
linear_regression_func = _load_builtin('Linear Regression')
train_func = _load_builtin('Train Model')
score_func = _load_builtin('Score Model')
eval_func = _load_builtin('Evaluate Model')

# Custom "CDML Interpret ML" module
iml_func = Module.load(
    ws,
    namespace='microsoft.com/cosinedata',
    name='CDML Interpret ML',
)

## Create azure ml pipeline

In [None]:
# Pipeline definition: prepare the automobile data, train a linear regression
# model on it, and pass the trained model to the CDML Interpret ML module.
# The column selectors are JSON strings; single-quoted Python literals are used
# so the JSON needs no backslash escapes.
@dsl.pipeline(
    name='Designer Sample with CDML Interpret ML',
    description='Regression - Automobile Price Prediction with Interpret machine learning',
    default_compute_target=cluster_name,
)
def sample1_pipeline():
    # Keep the four feature columns plus the "price" label.
    select = select_column_func(
        dataset=blob_input_data,
        select_columns=(
            '{"isFilter":true,"rules":'
            '[{"exclude":false,"ruleType":"ColumnNames","columns":'
            '["engine-size", "horsepower", "compression-ratio", "city-mpg", "price"]}]}'
        ),
    )

    # Clean missing data across all columns using the 'Remove entire row' mode.
    clean = clean_data_func(
        dataset=select.outputs.results_dataset,
        columns_to_be_cleaned='{"isFilter":true,"rules":[{"ruleType":"AllColumns","exclude":false}]}',
        minimum_missing_value_ratio=0.0,
        maximum_missing_value_ratio=1.0,
        cleaning_mode='Remove entire row',
    )

    # Randomized row split: a 0.7 fraction of the rows goes to the first output.
    split = split_data_func(
        dataset=clean.outputs.cleaned_dataset,
        splitting_mode='Split Rows',
        fraction_of_rows_in_the_first_output_dataset=0.7,
        randomized_split='True',
        stratified_split='False',
    )

    # Feature-only view (no "price") of the first split output.
    splittedTrainDataFeatures = select_column_func(
        dataset=split.outputs.results_dataset1,
        select_columns=(
            '{"isFilter":true,"rules":'
            '[{"exclude":false,"ruleType":"ColumnNames","columns":'
            '["engine-size", "horsepower", "compression-ratio", "city-mpg"]}]}'
        ),
    )

    # Feature-only view (no "price") of the second split output.
    splittedTestDataFeatures = select_column_func(
        dataset=split.outputs.results_dataset2,
        select_columns=(
            '{"isFilter":true,"rules":'
            '[{"exclude":false,"ruleType":"ColumnNames","columns":'
            '["engine-size", "horsepower", "compression-ratio", "city-mpg"]}]}'
        ),
    )

    # Untrained linear regression model.
    algo = linear_regression_func(
        solution_method='Ordinary Least Squares',
        l2_regularization_weight=0.001,
        include_intercept_term='True',
        random_number_seed=0,
    )

    # Train on the first split output, which still contains the "price" label.
    train = train_func(
        dataset=split.outputs.results_dataset1,
        untrained_model=algo.outputs.untrained_model,
        label_column=(
            '{"isFilter":true,"rules":'
            '[{"exclude":false,"ruleType":"ColumnNames","columns":["price"]}]}'
        ),
    )

    # Give the trained model and both feature-only datasets to the custom module.
    iml = iml_func(
        trained_model=train.outputs.trained_model,
        dataset_to_train=splittedTrainDataFeatures.outputs.results_dataset,
        dataset_to_test=splittedTestDataFeatures.outputs.results_dataset,
        feature_names="engine-size,horsepower,compression-ratio,city-mpg",
    )

    # Return a copy of the interpret module's outputs from the pipeline function.
    return {**iml.outputs}

In [None]:
# Create a pipeline by calling the function decorated with @dsl.pipeline
pipeline = sample1_pipeline()

In [None]:
# Validate the pipeline and visualize the graph before saving or submitting it
pipeline.validate()

In [None]:
# Save the pipeline as a draft under the given experiment name
pipeline.save(experiment_name = 'pipeline-with-cdml-iml-module')

In [None]:
# Submit the pipeline to the experiment with a tag describing how it was built,
# then wait for the run to complete.
run = pipeline.submit(
    experiment_name='pipeline-with-cdml-iml-module',
    tags={'mode': 'module-SDK'},
)

run.wait_for_completion()