## Check Azure ML SDK version

In [27]:
from azureml.core import ComputeTarget, Dataset, Datastore, Experiment, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import AutoMLStep, PythonScriptStep

from azureml.train.automl import AutoMLConfig

import os

In [28]:
import azureml.core

# Print the SDK version this notebook was authored against next to the version
# installed in the current kernel, so a mismatch is visible at a glance.
installed_sdk_version = azureml.core.VERSION
print("This notebook was created and tested using version 1.3.0 of the Azure ML SDK")
print("You are currently using version", installed_sdk_version, "of the Azure ML SDK")

This notebook was created and tested using version 1.3.0 of the Azure ML SDK
You are currently using version 1.3.0 of the Azure ML SDK


## Retrieve initial dataset

In [29]:
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()

# Register the Titanic baseline data only when the workspace does not already
# hold a dataset under the name 'titanic_ds'.
if 'titanic_ds' not in ws.datasets:
    # CSV files from which the TabularDataset is created
    web_paths = [
        'https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
        'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv',
    ]
    titanic_ds = Dataset.Tabular.from_delimited_files(path=web_paths)
    titanic_ds.register(
        workspace=ws,
        name='titanic_ds',
        description='Titanic baseline data',
        create_new_version=True,
    )

# Whether it was just registered or already present, fetch the dataset by name.
titanic_ds = Dataset.get_by_name(ws, 'titanic_ds')

## Configure your storage and compute target

In [30]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core import Datastore

# Default datastore of the workspace; used later for intermediate pipeline data.
datastore = ws.get_default_datastore()

compute_name = 'cpu-compute'

# Provision the cluster only if the workspace has no compute target with this name.
if compute_name not in ws.compute_targets:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        min_nodes=0,
        max_nodes=1,
    )
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # Block until provisioning finishes (or the 20 minute timeout elapses).
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )

    # Show the result
    print(compute_target.get_status().serialize())

# Look the target up by name so both branches end with the same object source.
compute_target = ws.compute_targets[compute_name]

The intermediate data between the data preparation and the automated ML step can be stored in the workspace's default datastore (object 'datastore' in the notebook), so we don't need to do more than call get_default_datastore() on the Workspace object.

### Configure the training run

The next step is making sure that the remote training run has all the dependencies that are required by the training steps. Dependencies and the runtime context are set by creating and configuring a RunConfiguration object.

In [31]:
from azureml.core.runconfig import RunConfiguration, CondaDependencies

# Runtime context for the remote pipeline steps: where they run and which
# packages are installed in the environment that Azure ML builds for them.
aml_run_config = RunConfiguration()
# Use just-specified compute target ("cpu-compute")
aml_run_config.target = compute_target
# Let Azure ML build and manage the Python environment from the dependencies below.
aml_run_config.environment.python.user_managed_dependencies = False

# Add some packages relied on by data prep step.
# pin_sdk_version=True pins the SDK packages to the version of the client that
# submits the run, so the remote environment cannot drift to a different SDK release.
# NOTE(review): the pipeline failure recorded in this notebook
# ("ImportError: cannot import name 'CHILD_RUNS_SUMMARY_PATH'") looks like a
# mismatch between AutoML package versions in the remote environment; an unpinned
# 'azureml-sdk' is the presumed cause -- confirm by re-running the pipeline.
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn'], 
    pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'], 
    pin_sdk_version=True)

### Prepare data for automated machine learning

#### Write the data preparation code
The baseline Titanic dataset consists of mixed numerical and text data, with some values missing. To prepare it for automated machine learning, the data preparation pipeline step will:

- Fill missing data with either random data or a category corresponding to "Unknown"
- Transform categorical data to integers
- Drop columns that we don't intend to use
- Split the data into training and testing sets (note: the `dataprep.py` listing below does not perform this split; it writes all prepared rows to a single Parquet file)
- Write the transformed data to the PipelineData output paths

In [32]:
%%writefile dataprep.py
# dataprep.py
from azureml.core import Run
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
import argparse
import pyarrow as pa
import pyarrow.parquet as pq

# Seed constant for the script's random operations; it only takes effect where
# it is explicitly passed to a seeding call.
RANDOM_SEED=42

def prepare_age(df):
    """Impute missing ages and replace the 'Age' column with a quantile bucket.

    Missing 'Age' values are filled with random integers drawn from the range
    [mean - std, mean + std) of the ages that are present. The column is then
    cast to int, bucketed into 5 quantile classes stored in 'Age_Group', and
    dropped.

    Args:
        df: DataFrame with a numeric 'Age' column. It is modified in place.

    Returns:
        The same DataFrame, now carrying 'Age_Group' instead of 'Age'.
    """
    ages = df["Age"].copy()
    age_mean, age_std = df["Age"].mean(), df["Age"].std()
    missing_count = df["Age"].isnull().sum()

    # One random replacement per missing value, drawn within one standard
    # deviation of the mean of the observed ages.
    replacements = np.random.randint(age_mean - age_std, age_mean + age_std, size=missing_count)
    ages[np.isnan(ages)] = replacements
    df["Age"] = ages.astype(int)

    # Quantize age into 5 classes, then discard the raw column.
    df["Age_Group"] = pd.qcut(df["Age"], 5, labels=False)
    df.drop(columns=["Age"], inplace=True)
    return df

def prepare_fare(df):
    """Fill missing fares with 0 and replace 'Fare' with a quantile bucket.

    Args:
        df: DataFrame with a numeric 'Fare' column. It is modified in place.

    Returns:
        The same DataFrame, with 'Fare' replaced by 'Fare_Group' (5 quantile
        classes encoded as integers).
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df['Fare'] selection: the chained in-place form is deprecated and does
    # not update the frame when pandas Copy-on-Write is active.
    df['Fare'] = df['Fare'].fillna(0)
    df['Fare_Group'] = pd.qcut(df['Fare'], 5, labels=False)
    df.drop(['Fare'], axis=1, inplace=True)
    return df

def prepare_genders(df):
    """Encode the 'Sex' column as integers.

    "male" maps to 0, "female" to 1 and "unknown" to 2. Any value that is
    missing or not one of these labels is also encoded as 2.

    Args:
        df: DataFrame with a 'Sex' column. It is modified in place.

    Returns:
        The same DataFrame with an integer 'Sex' column.
    """
    genders = {"male": 0, "female": 1, "unknown": 2}
    # Build the encoded column in one expression and assign it back; the former
    # df['Sex'].fillna(2, inplace=True) was a chained in-place call, which is
    # deprecated and ineffective under pandas Copy-on-Write (the subsequent
    # astype(int) would then fail on the remaining NaN values).
    df['Sex'] = df['Sex'].map(genders).fillna(genders["unknown"]).astype(int)
    return df

def prepare_embarked(df):
    """Encode the 'Embarked' port column as integers.

    Empty strings and missing values are treated as the unknown port "U".
    "S" maps to 0, "C" to 1, "Q" to 2 and "U" to 3. Values outside this
    mapping become NaN, as produced by Series.map.

    Args:
        df: DataFrame with an 'Embarked' column. It is modified in place.

    Returns:
        The same DataFrame with the encoded 'Embarked' column.
    """
    ports = {"S": 0, "C": 1, "Q": 2, "U": 3}
    # Work on the returned Series and assign the result back; the former chained
    # df['Embarked'].replace(..., inplace=True) / .fillna(..., inplace=True)
    # calls are deprecated and leave the frame untouched under Copy-on-Write.
    embarked = df['Embarked'].replace('', 'U').fillna('U')
    df['Embarked'] = embarked.map(ports)
    return df
    
# Script entry point: read the registered Titanic dataset, run the preparation
# functions above, and write the result as one Parquet file at --output_path.
import os  # used for the output directory handling below; the script did not import it

parser = argparse.ArgumentParser()
parser.add_argument('--output_path', dest='output_path', required=True)
args = parser.parse_args()

# Seed NumPy's global generator so the random Age imputation in prepare_age
# produces the same values on every run.
np.random.seed(RANDOM_SEED)

# The dataset is supplied to the run as the named input 'titanic_ds'.
titanic_ds = Run.get_context().input_datasets['titanic_ds']
df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df = prepare_embarked(prepare_genders(prepare_fare(prepare_age(df))))

# Create the parent directory only when the path has one; os.makedirs('')
# raises for a bare file name.
output_dir = os.path.dirname(args.output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
pq.write_table(pa.Table.from_pandas(df), args.output_path)

# A single file is written; there is no separate train/test output.
print(f"Wrote prepped data to {args.output_path}")

Overwriting dataprep.py


The code parses the input argument `--output_path`, which is the path to which we want to write our data. (This value will be determined by a PipelineData object that will be discussed in the next step.) The code retrieves the registered 'titanic_ds' Dataset and calls the various data preparation functions.

The code uses `os.makedirs` to create the directory for the output data file (args.output_path) and then writes the prepared data as a Parquet file at that destination.

#### Write the data preparation pipeline step (PythonScriptStep)

The data preparation code described above must be associated with a PythonScriptStep object in order to be used with a pipeline. The path to which the Parquet data-preparation output is written is generated by a PipelineData object. The resources prepared earlier, such as the ComputeTarget, the RunConfiguration, and the 'titanic_ds' Dataset, are used to complete the specification.

In [33]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Output of the data preparation step, backed by the default datastore and
# exposed as a dataset so that later steps can consume it.
prepped_data_path = PipelineData("titanic_train", datastore).as_dataset()

# Pipeline step that runs dataprep.py on the compute target. The same output
# object is passed both as the --output_path argument and as a declared output.
dataprep_step = PythonScriptStep(
    script_name="dataprep.py",
    name="dataprep",
    arguments=["--output_path", prepped_data_path],
    inputs=[titanic_ds.as_named_input("titanic_ds")],
    outputs=[prepped_data_path],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=True,
)

The prepped_data_path object is of type PipelineOutputFileDataset. Notice that it is specified in both the arguments and outputs arguments. If you review the previous step, you'll see that within the data preparation code, the value of the argument '--output_path' is the file path to which the Parquet file was written.

### Train with AutoMLStep

Configuring an automated ML pipeline step is done with the AutoMLConfig class. This flexible class is described in Configure automated ML experiments in Python. Data input and output are the only aspects of configuration that require special attention in an ML pipeline. Input and output for AutoMLConfig in pipelines is discussed in detail below. Beyond data, an advantage of ML pipelines is the ability to use different compute targets for different steps. You might choose to use a more powerful ComputeTarget only for the automated ML process. Doing so is as straightforward as assigning a more powerful RunConfiguration to the AutoMLConfig object's run_configuration parameter.

#### Send data to AutoMLStep

As discussed above, configuring input to your automated ML step requires the use of certain configurations. In an ML pipeline, you must provide your data using an `X,y` technique and cannot use the `training_data` technique. You may provide all your data in `X` and `y` and use `n_cross_validations` or you may provide your own validation data in `X_valid` and `y_valid` and leave `n_cross_validations` to the default `None` value.

In an ML pipeline, the input data must be a Dataset object. The highest-performing way is to provide the input data in the form of `PipelineOutputTabularDataset` objects. You create an object of that type with the `parse_parquet_files()` or `parse_delimited_files()` on a `PipelineOutputFileDataset`, such as the `prepped_data_path` object.

In [34]:
# prepped_data_path is a PipelineOutputFileDataset; parsing its Parquet files
# yields prepped_data_potds, a PipelineOutputTabularDataset.
prepped_data_potds = prepped_data_path.parse_parquet_files(file_extension=None)

# Features are every column except the label; the target is the label alone.
label_column = 'Survived'
X = prepped_data_potds.drop_columns(label_column)
y = prepped_data_potds.keep_columns(label_column)

#### Specify automated ML outputs

The outputs of the `AutoMLStep` are the final metric scores of the highest-performing model and that model itself. To use these outputs in further pipeline steps, prepare `PipelineData` objects to receive them.

**TODO (open question):** What's the difference between this line:

`dstor = Datastore.get_default(ws)`

and the earlier one, where we also obtained a default Datastore?

`datastore = ws.get_default_datastore()`

Can't we reuse the same Datastore object? (To be confirmed.)

In [35]:
from azureml.pipeline.core import TrainingOutput

# Datastore on which both AutoML outputs are stored.
dstor = Datastore.get_default(ws)

# Kinds of training output the AutoML step produces.
metrics_output = TrainingOutput(type='Metrics')
model_output = TrainingOutput(type='Model')

# Receives the metric scores of the AutoML run.
metrics_data = PipelineData(
    name='metrics_data',
    datastore=dstor,
    pipeline_output_name='metrics_output',
    training_output=metrics_output,
)

# Receives the model produced by the AutoML run.
model_data = PipelineData(
    name='best_model_data',
    datastore=dstor,
    pipeline_output_name='model_output',
    training_output=model_output,
)

The snippet above assigns the default datastore of the workspace to `dstor`. Then, it creates the two `PipelineData` objects for the metrics and model output. Each is named, assigned `dstor` as the datastore on which the output will be stored, and associated with the particular `type` of `TrainingOutput` from the `AutoMLStep`.

###  Configure and create the automated ML pipeline step

In [36]:
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep

# Change timeouts and increase iterations to a reasonable number (e.g., 50) for better accuracy
automl_settings = dict(
    iteration_timeout_minutes=10,
    iterations=2,
    experiment_timeout_hours=0.25,
    primary_metric='AUC_weighted',
    n_cross_validations=3,
)

# Classification experiment over the prepared features (X) and label (y),
# run on the same compute target and run configuration as the other steps.
automl_config = AutoMLConfig(
    task='classification',
    path='.',
    debug_log='automated_ml_errors.log',
    compute_target=compute_target,
    run_configuration=aml_run_config,
    featurization='auto',
    X=X,
    y=y,
    **automl_settings
)

# Pipeline step wrapping the AutoML configuration; its metrics and model are
# delivered to the two PipelineData outputs.
train_step = AutoMLStep(
    name='AutoML_Classification',
    automl_config=automl_config,
    passthru_automl_config=False,
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)



### Register the model generated by automated ML

In [37]:
%%writefile register_model.py

# register_model.py
# Registers the model file found at --model_path in the workspace under --model_name.
from azureml.core.model import Model, Dataset
from azureml.core.run import Run, _OfflineRun
from azureml.core import Workspace
import argparse

# Both command-line arguments are mandatory.
arg_parser = argparse.ArgumentParser()
for flag in ("--model_name", "--model_path"):
    arg_parser.add_argument(flag, required=True)
args = arg_parser.parse_args()

print(f"model_name : {args.model_name}")
print(f"model_path: {args.model_path}")

# When the run context is an _OfflineRun, load the workspace from the local
# config; otherwise take it from the experiment the run belongs to.
run = Run.get_context()
if type(run) == _OfflineRun:
    ws = Workspace.from_config()
else:
    ws = run.experiment.workspace

model = Model.register(
    workspace=ws,
    model_path=args.model_path,
    model_name=args.model_name,
)

print("Registered version {0} of model {1}".format(model.version, model.name))

Overwriting register_model.py


#### Write the PythonScriptStep code

In [38]:
from azureml.pipeline.core.graph import PipelineParameter

# The model name with which to register the trained model in the workspace.
model_name = PipelineParameter("model_name", default_value="TitanicSurvivalInitial")

# Pipeline step that runs register_model.py, passing it the model name
# parameter and the model output of the AutoML step. Reuse is disabled.
register_step = PythonScriptStep(
    name="register_model",
    script_name="register_model.py",
    arguments=["--model_name", model_name, "--model_path", model_data],
    inputs=[model_data],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=False,
)

### Create and run your automated ML pipeline

In [39]:
from azureml.pipeline.core import Pipeline

# Assemble the three steps: data preparation, AutoML training, model registration.
pipeline_steps = [dataprep_step, train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

In [40]:
from azureml.core import Experiment

# Experiment under which the pipeline run is recorded.
experiment = Experiment(ws, 'titanic_automl')

# Submit the pipeline, then block until the run finishes.
run = experiment.submit(pipeline, show_output=True)
run.wait_for_completion()

Created step dataprep [f976bbfb][73f6cd66-6673-40fe-9663-0c22be511500], (This step will run and generate new outputs)
Created step AutoML_Classification [33b9d1f1][7e415b64-6545-4001-bd68-3c8ef67bea23], (This step will run and generate new outputs)
Created step register_model [e4f1e548][b0e0dea0-1724-4b21-a37f-c456eaad11dd], (This step will run and generate new outputs)
Submitted PipelineRun ebeab107-cd1b-4c4a-8fb4-f397fbdc52be
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/ebeab107-cd1b-4c4a-8fb4-f397fbdc52be?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/cesardl-automl-ncentralus-demo-ws-resgrp/workspaces/cesardl-automl-ncentralus-demo-ws
PipelineRunId: ebeab107-cd1b-4c4a-8fb4-f397fbdc52be
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/ebeab107-cd1b-4c4a-8fb4-f397fbdc52be?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/cesardl-automl-ncentralus-de




StepRunId: 403702f5-cabd-4748-b4b0-4c8dfaa39100
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/403702f5-cabd-4748-b4b0-4c8dfaa39100?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/cesardl-automl-ncentralus-demo-ws-resgrp/workspaces/cesardl-automl-ncentralus-demo-ws
StepRun( AutoML_Classification ) Status: NotStarted
StepRun( AutoML_Classification ) Status: Queued
StepRun( AutoML_Classification ) Status: Running

StepRun(AutoML_Classification) Execution Summary
StepRun( AutoML_Classification ) Status: Failed


ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with ImportError: cannot import name 'CHILD_RUNS_SUMMARY_PATH'",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "ImportError",
            "message": "cannot import name 'CHILD_RUNS_SUMMARY_PATH'",
            "stackTrace": "  File \"/mnt/batch/tasks/shared/LS_root/jobs/cesardl-automl-ncentralus-demo-ws/azureml/403702f5-cabd-4748-b4b0-4c8dfaa39100_setup/mounts/workspaceblobstore/azureml/403702f5-cabd-4748-b4b0-4c8dfaa39100_setup/azureml-setup/context_manager_injector.py\", line 127, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"setup_403702f5-cabd-4748-b4b0-4c8dfaa39100.py\", line 26, in <module>\n    from azureml.train.automl import automl\n  File \"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/site-packages/azureml/train/automl/__init__.py\", line 28, in <module>\n    from azureml.train.automl.automlconfig import AutoMLConfig\n  File \"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/site-packages/azureml/train/automl/automlconfig.py\", line 25, in <module>\n    from . import constants\n  File \"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/site-packages/azureml/train/automl/constants.py\", line 22, in <module>\n    from azureml.automl.core.shared.constants import (\n"
        },
        "messageParameters": {}
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with ImportError: cannot import name 'CHILD_RUNS_SUMMARY_PATH'\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"ImportError\",\n            \"message\": \"cannot import name 'CHILD_RUNS_SUMMARY_PATH'\",\n            \"stackTrace\": \"  File \\\"/mnt/batch/tasks/shared/LS_root/jobs/cesardl-automl-ncentralus-demo-ws/azureml/403702f5-cabd-4748-b4b0-4c8dfaa39100_setup/mounts/workspaceblobstore/azureml/403702f5-cabd-4748-b4b0-4c8dfaa39100_setup/azureml-setup/context_manager_injector.py\\\", line 127, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"setup_403702f5-cabd-4748-b4b0-4c8dfaa39100.py\\\", line 26, in <module>\\n    from azureml.train.automl import automl\\n  File \\\"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/site-packages/azureml/train/automl/__init__.py\\\", line 28, in <module>\\n    from azureml.train.automl.automlconfig import AutoMLConfig\\n  File \\\"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/site-packages/azureml/train/automl/automlconfig.py\\\", line 25, in <module>\\n    from . 
import constants\\n  File \\\"/azureml-envs/azureml_bb2553c4c8454b03df8cbb6c7c3bc444/lib/python3.6/site-packages/azureml/train/automl/constants.py\\\", line 22, in <module>\\n    from azureml.automl.core.shared.constants import (\\n\"\n        },\n        \"messageParameters\": {}\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

#### Download the results of an automated ML run

In [None]:
# Run on local machine
# Locates a pipeline run of the 'titanic_automl' experiment, finds its AutoML
# child step, and downloads that step's metrics and model outputs to the
# current directory.
ws = Workspace.from_config()

experiment = ws.experiments['titanic_automl']
# The run id below looks like a placeholder -- substitute the id of the pipeline run to inspect.
# (The lookup previously iterated over the undefined name `ex`; it must use `experiment`.)
run = next(r for r in experiment.get_runs() if r.id == 'aaaaaaaa-bbbb-cccc-dddd-0123456789AB')
automl_run = next(r for r in run.get_children() if r.name == 'AutoML_Classification')
outputs = automl_run.get_outputs()
metrics = outputs['default_metrics_AutoML_Classification']
model = outputs['default_model_AutoML_Classification']

metrics.get_port_data_reference().download('.')
model.get_port_data_reference().download('.')