In [1]:
from azureml.core import ComputeTarget, Dataset, Datastore, Experiment, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import AutoMLStep, PythonScriptStep

from azureml.train.automl import AutoMLConfig

import os

In [3]:
# ws = Workspace.from_config(auth=InteractiveLoginAuthentication(tenant_id=os.environ["AML_TENANT_ID"]))

ws = Workspace.from_config()
ws

Workspace.create(name='cesardl-automl-ncentralus-demo-ws', subscription_id='381b38e9-9840-4719-a5a0-61d9585e1e91', resource_group='cesardl-automl-ncentralus-demo-ws-resgrp')

In [4]:
compute_name = "cpu-compute3"
if not compute_name in ws.compute_targets :
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                                min_nodes=0,
                                                                max_nodes=1)
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # Show the result
    print(compute_target.get_status().serialize())

creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-04-29T18:43:29.045000+00:00', 'errors': None, 'creationTime': '2020-04-29T18:43:20.856719+00:00', 'modifiedTime': '2020-04-29T18:43:36.931537+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 1, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [8]:
compute = AmlCompute(ws, compute_name)
print(compute)

AmlCompute(workspace=Workspace.create(name='cesardl-automl-ncentralus-demo-ws', subscription_id='381b38e9-9840-4719-a5a0-61d9585e1e91', resource_group='cesardl-automl-ncentralus-demo-ws-resgrp'), name=cpu-compute3, id=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourceGroups/cesardl-automl-ncentralus-demo-ws-resgrp/providers/Microsoft.MachineLearningServices/workspaces/cesardl-automl-ncentralus-demo-ws/computes/cpu-compute3, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None)


In [9]:
datastore = ws.get_default_datastore()

In [10]:
aml_run_config = RunConfiguration()
aml_run_config.target = compute

# Use conda_dependencies.yml to create a conda environment in the Docker image for execution
aml_run_config.environment.python.user_managed_dependencies = False

# Specify CondaDependencies obj, add necessary packages
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn', 'pyarrow'], 
    pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'], 
    pin_sdk_version=False)

## Step 0: Grab an open dataset and register it

This is baseline data. If the `Dataset` does not exist, create and register it. Not a part of the Pipeline.

In [11]:
if not 'titanic_ds' in ws.datasets.keys() :
    # create a TabularDataset from Titanic training data
    web_paths = ['https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
                 'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv']
    titanic_ds = Dataset.Tabular.from_delimited_files(path=web_paths)

    titanic_ds.register(workspace = ws,
                                     name = 'titanic_ds',
                                     description = 'new titanic training data',
                                     create_new_version = True)

titanic_ds = Dataset.get_by_name(ws, 'titanic_ds')

In [12]:
type(titanic_ds)

azureml.data.tabular_dataset.TabularDataset

In [13]:
if not 'titanic_files_ds' in ws.datasets.keys() :
    # create a TabularDataset from Titanic training data
    web_paths = ['https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
                 'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv']
    titanic_ds = Dataset.File.from_files(path=web_paths)

    titanic_ds.register(workspace = ws,
                                     name = 'titanic_files_ds',
                                     description = 'File Dataset of titanic training data',
                                     create_new_version = True)

## Step 1: Dataprep

In [14]:
%%writefile dataprep.py
# dataprep.py
from azureml.core import Run
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
import argparse
import pyarrow as pa
import pyarrow.parquet as pq

RANDOM_SEED=42

def prepare_age(df):
    # Fill in missing Age values from distribution of present Age values 
    mean = df["Age"].mean()
    std = df["Age"].std()
    is_null = df["Age"].isnull().sum()
    # compute enough (== is_null().sum()) random numbers between the mean, std
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    # fill NaN values in Age column with random values generated
    age_slice = df["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    df["Age"] = age_slice
    df["Age"] = df["Age"].astype(int)
    
    # Quantize age into 5 classes
    df['Age_Group'] = pd.qcut(df['Age'],5, labels=False)
    df.drop(['Age'], axis=1, inplace=True)
    return df

def prepare_fare(df):
    df['Fare'].fillna(0, inplace=True)
    df['Fare_Group'] = pd.qcut(df['Fare'],5,labels=False)
    df.drop(['Fare'], axis=1, inplace=True)
    return df 

def prepare_genders(df):
    genders = {"male": 0, "female": 1, "unknown": 2}
    df['Sex'] = df['Sex'].map(genders)
    df['Sex'].fillna(2, inplace=True)
    df['Sex'] = df['Sex'].astype(int)
    return df

def prepare_embarked(df):
    df['Embarked'].replace('', 'U', inplace=True)
    df['Embarked'].fillna('U', inplace=True)
    ports = {"S": 0, "C": 1, "Q": 2, "U": 3}
    df['Embarked'] = df['Embarked'].map(ports)
    return df
    
parser = argparse.ArgumentParser()
parser.add_argument('--output_path', dest='output_path', required=True)
args = parser.parse_args()
    
titanic_ds = Run.get_context().input_datasets['titanic_ds']
df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df = prepare_embarked(prepare_genders(prepare_fare(prepare_age(df))))

os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
pq.write_table(pa.Table.from_pandas(df), args.output_path)

print(f"Wrote test to {args.output_path} and train to {args.output_path}")

Overwriting dataprep.py


In [15]:
prepped_data_path = PipelineData("titanic_train", datastore).as_dataset()

In [16]:
dataprep_step = PythonScriptStep(
    name="dataprep", 
    script_name="dataprep.py", 
    compute_target=compute, 
    runconfig=aml_run_config,
    arguments=["--output_path", prepped_data_path],
    inputs=[titanic_ds.as_named_input("titanic_ds")],
    outputs=[prepped_data_path],
    allow_reuse=True
)

### Step 2: Train with AutoMLStep

In [17]:
prepped_data_potds = prepped_data_path.parse_parquet_files(file_extension=None)

X = prepped_data_potds.drop_columns('Survived')
y = prepped_data_potds.keep_columns('Survived')


# Change iterations to a reasonable number (50) to get better accuracy
automl_settings = {
    "iteration_timeout_minutes" : 10,
    "iterations" : 50,
    "experiment_timeout_hours" : 1,
    "primary_metric" : 'AUC_weighted',
    "n_cross_validations" : 2
}

automl_config = AutoMLConfig(task = 'classification',
                             path = '.',
                             debug_log = 'automated_ml_errors.log',
                             compute_target = compute,
                             run_configuration = aml_run_config,
                             featurization = 'auto',
                             X = X,
                             y = y,
                             **automl_settings)
                             
print("AutoML config created.")



AutoML config created.


In [18]:
dstor = Datastore.get_default(ws)

metrics_data = PipelineData(name='metrics_data',
                           datastore=dstor,
                           pipeline_output_name='metrics_output',
                           training_output=TrainingOutput(type='Metrics'))
model_data = PipelineData(name='best_model_data',
                           datastore=dstor,
                           pipeline_output_name='model_output',
                           training_output=TrainingOutput(type='Model'))


train_step = AutoMLStep(name='AutoML_Classification',
                                 automl_config=automl_config,
                                 passthru_automl_config=False,
                                 outputs=[metrics_data,model_data],
                                 allow_reuse=True)
print("train_step created.")

train_step created.


## Step 3: Register the model

In [19]:
%%writefile register_model.py
from azureml.core.model import Model, Dataset
from azureml.core.run import Run, _OfflineRun
from azureml.core import Workspace
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model_name", required=True)
parser.add_argument("--model_path", required=True)
args = parser.parse_args()

print(f"model_name : {args.model_name}")
print(f"model_path: {args.model_path}")

run = Run.get_context()
ws = Workspace.from_config() if type(run) == _OfflineRun else run.experiment.workspace

model = Model.register(workspace=ws,
                       model_path=args.model_path,
                       model_name=args.model_name)

print("Registered version {0} of model {1}".format(model.version, model.name))


Overwriting register_model.py


In [20]:
# The model name with which to register the trained model in the workspace.
model_name = PipelineParameter("model_name", default_value="TitanicSurvival")

register_step = PythonScriptStep(script_name="register_model.py",
                                       name="register_model",
                                       allow_reuse=False,
                                       arguments=["--model_name", model_name, "--model_path", model_data],
                                       inputs=[model_data],
                                       compute_target=compute,
                                       runconfig=aml_run_config)

## Submit it

In [21]:
if not 'titanic_automl' in ws.experiments.keys() :
    Experiment(ws, 'titanic_automl')
experiment = ws.experiments['titanic_automl']

In [22]:
pipeline = Pipeline(ws, [dataprep_step, train_step, register_step])

In [23]:
run = experiment.submit(pipeline, show_output=True)

Created step dataprep [3e4ebdcb][dd4ac83b-8444-4c84-b536-2e6aafeab11a], (This step will run and generate new outputs)
Created step AutoML_Classification [3460776d][ea57090e-b15a-4b04-81ac-1807bc7f5220], (This step will run and generate new outputs)
Created step register_model [b522e10f][05bd3f97-f8cb-457c-a578-d88be5ee553b], (This step will run and generate new outputs)
Submitted PipelineRun dd4537ff-efb2-4d0f-bf1a-6f345e4e0327
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/dd4537ff-efb2-4d0f-bf1a-6f345e4e0327?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/cesardl-automl-ncentralus-demo-ws-resgrp/workspaces/cesardl-automl-ncentralus-demo-ws


In [24]:
run.wait_for_completion()

PipelineRunId: dd4537ff-efb2-4d0f-bf1a-6f345e4e0327
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/dd4537ff-efb2-4d0f-bf1a-6f345e4e0327?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/cesardl-automl-ncentralus-demo-ws-resgrp/workspaces/cesardl-automl-ncentralus-demo-ws
PipelineRun Status: Running


StepRunId: a9ad403d-ab3b-44bb-a5d0-5af9d887f0f0
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/a9ad403d-ab3b-44bb-a5d0-5af9d887f0f0?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/cesardl-automl-ncentralus-demo-ws-resgrp/workspaces/cesardl-automl-ncentralus-demo-ws
StepRun( dataprep ) Status: NotStarted
StepRun( dataprep ) Status: Running

Streaming azureml-logs/20_image_build_log.txt
2020/04/29 18:49:06 Downloading source code...
2020/04/29 18:49:07 Finished downloading source code
2020/04/29 18:49:08 Creating Docker network: acb_default_network, driver: 

[91m
mkl-2019.4           | 204.1 MB  | ########   |  80% [0m[91m
mkl-2019.4           | 204.1 MB  | ########1  |  82% [0m[91m
mkl-2019.4           | 204.1 MB  | ########2  |  83% [0m[91m
mkl-2019.4           | 204.1 MB  | ########3  |  84% [0m[91m
mkl-2019.4           | 204.1 MB  | ########4  |  84% [0m[91m
mkl-2019.4           | 204.1 MB  | ########4  |  85% [0m[91m
mkl-2019.4           | 204.1 MB  | ########4  |  85% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  |  85% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  |  85% [0m
[91m
mkl-2019.4           | 204.1 MB  | ########5  |  85% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  |  86% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  |  86% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  |  86% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  |  86% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  |  86% [0m[91m
mkl-2019.4           | 204.1 MB  | ########5  | 

[91m
mkl-2019.4           | 204.1 MB  | #########7 |  97% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  97% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  97% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  97% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########7 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########8 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########8 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########8 |  98% [0m[91m
mkl-2019.4           | 204.1 MB  | #########8 |  


lz4-c-1.8.1.2        | 158 KB    |            |   0% [0m[91m
lz4-c-1.8.1.2        | 158 KB    | ########## | 100% [0m[91m

openssl-1.0.2u       | 3.1 MB    |            |   0% [0m[91m
openssl-1.0.2u       | 3.1 MB    | #######6   |  76% [0m[91m
openssl-1.0.2u       | 3.1 MB    | #########1 |  91% [0m[91m
openssl-1.0.2u       | 3.1 MB    | ########## | 100% [0m[91m

libgcc-ng-9.1.0      | 8.1 MB    |            |   0% [0m[91m
libgcc-ng-9.1.0      | 8.1 MB    | ####6      |  46% [0m[91m
libgcc-ng-9.1.0      | 8.1 MB    | #######5   |  76% [0m[91m
libgcc-ng-9.1.0      | 8.1 MB    | #########4 |  95% [0m[91m
libgcc-ng-9.1.0      | 8.1 MB    | ########## | 100% [0m[91m

snappy-1.1.8         | 39 KB     |            |   0% [0m[91m
snappy-1.1.8         | 39 KB     | ########## | 100% [0m[91m

pyarrow-0.13.0       | 2.2 MB    |            |   0% [0m[91m
pyarrow-0.13.0       | 2.2 MB    | #######5   |  75% [0m[91m
pyarrow-0.13.0       | 2.2 MB    | #########7 |  9

  Created wheel for fusepy: filename=fusepy-3.0.1-py3-none-any.whl size=10503 sha256=81bffda4757a1ffbcf8dedaeb61b87d07293ca0cc1267c8d3297890f20a042bc
  Stored in directory: /root/.cache/pip/wheels/21/5c/83/1dd7e8a232d12227e5410120f4374b33adeb4037473105b079
Successfully built fusepy
Installing collected packages: mccabe, pyflakes, entrypoints, pycodestyle, flake8, chardet, idna, urllib3, requests, oauthlib, requests-oauthlib, isodate, msrest, pycparser, cffi, cryptography, PyJWT, adal, msrestazure, azureml-train-restclients-hyperdrive, azure-common, websocket-client, docker, jmespath, pyopenssl, azure-mgmt-resource, pyasn1, ndg-httpsclient, ruamel.yaml, azure-mgmt-keyvault, azure-mgmt-authorization, jeepney, SecretStorage, azure-mgmt-containerregistry, zipp, importlib-metadata, jsonpickle, azure-mgmt-storage, pathspec, contextlib2, backports.weakref, backports.tempfile, azure-graphrbac, azureml-core, applicationinsights, azureml-telemetry, azureml-train-core, azureml-train, cloudpickle,

a6c378d11cbf: Verifying Checksum
a6c378d11cbf: Download complete
a1298f4ce990: Pull complete
04a3282d9c4b: Pull complete
9b0d3db6dc03: Pull complete
8269c605f3f1: Pull complete
dd8bc8ef7897: Verifying Checksum
dd8bc8ef7897: Download complete
6504d449e70c: Pull complete
4e38f320d0d4: Pull complete
b0a763e8ee03: Pull complete
11917a028ca4: Pull complete
a6c378d11cbf: Pull complete
6cc007ad9140: Pull complete
6c1698a608f3: Pull complete
b42202f2f29b: Pull complete
0293ce4cde4b: Pull complete
faf4a144b597: Pull complete
9353f057e3d3: Pull complete
dd8bc8ef7897: Pull complete
6bb4f3f71377: Pull complete
Digest: sha256:7bd670545e6f02976bcc589eab181c573d9b971f24e5b53d19e7bad625610064
Status: Downloaded newer image for cesardlautoma5f87185.azurecr.io/azureml/azureml_b75709396e112eb3c528ff9421e54d7f:latest
78e2cea629f571d8885f2e518b2a923fcff5b0bdfe9013c7f37f74621e81ed47
2020/04/29 19:00:33 Version: 3.0.01196.0002 Branch: hotfix1 Commit: bc95bff5
2020/04/29 19:00:33 /dev/infiniband/uverbs0 found

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with AttributeError: 'FileDataset' object has no attribute 'to_pandas_dataframe'",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "AttributeError",
            "message": "'FileDataset' object has no attribute 'to_pandas_dataframe'",
            "stackTrace": "  File \"/mnt/batch/tasks/shared/LS_root/jobs/cesardl-automl-ncentralus-demo-ws/azureml/a9ad403d-ab3b-44bb-a5d0-5af9d887f0f0/mounts/workspaceblobstore/azureml/a9ad403d-ab3b-44bb-a5d0-5af9d887f0f0/azureml-setup/context_manager_injector.py\", line 127, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/azureml-envs/azureml_0bdf841bedfca5de2e4cf97ccc64431d/lib/python3.6/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/azureml-envs/azureml_0bdf841bedfca5de2e4cf97ccc64431d/lib/python3.6/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/azureml-envs/azureml_0bdf841bedfca5de2e4cf97ccc64431d/lib/python3.6/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"dataprep.py\", line 55, in <module>\n    df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)\n"
        },
        "messageParameters": {}
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with AttributeError: 'FileDataset' object has no attribute 'to_pandas_dataframe'\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"AttributeError\",\n            \"message\": \"'FileDataset' object has no attribute 'to_pandas_dataframe'\",\n            \"stackTrace\": \"  File \\\"/mnt/batch/tasks/shared/LS_root/jobs/cesardl-automl-ncentralus-demo-ws/azureml/a9ad403d-ab3b-44bb-a5d0-5af9d887f0f0/mounts/workspaceblobstore/azureml/a9ad403d-ab3b-44bb-a5d0-5af9d887f0f0/azureml-setup/context_manager_injector.py\\\", line 127, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/azureml-envs/azureml_0bdf841bedfca5de2e4cf97ccc64431d/lib/python3.6/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/azureml-envs/azureml_0bdf841bedfca5de2e4cf97ccc64431d/lib/python3.6/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/azureml-envs/azureml_0bdf841bedfca5de2e4cf97ccc64431d/lib/python3.6/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"dataprep.py\\\", line 55, in <module>\\n    df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)\\n\"\n        },\n        \"messageParameters\": {}\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

In [21]:
# automl_run = next(r for r in run.get_children() if r.name == 'AutoML_Classification')
# outputs = automl_run.get_outputs()
# metrics = outputs['default_metrics_AutoML_Classification']
# model = outputs['default_model_AutoML_Classification']

# metrics.get_port_data_reference().download('.')
# model.get_port_data_reference().download('.')

In [22]:
#metrics

In [23]:
#type(model)

In [24]:
#model