In [1]:
from azureml.core import Workspace, Experiment, Environment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-141195
Azure region: southcentralus
Subscription id: aa7cf8e8-d23f-4bce-a7b9-1f0b4e0ac8ee
Resource group: aml-quickstarts-141195


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

# cpu_cluster_name is just that: a name. Nothing more, nothing less.
cpu_cluster_name= "udacity-project"
compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2", max_nodes=4)
compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# additional code, that is not needed for our cluster to work, but gives us info about 
# the status and properties of the new cluster

compute_target.wait_for_completion(show_output=True)
print(compute_target.get_status().serialize())


Running
{'errors': [], 'creationTime': '2021-03-23T07:45:09.404869+00:00', 'createdBy': {'userObjectId': '6008a759-f99a-46f6-a63e-3b0fa2448e12', 'userTenantId': '660b3398-b80e-49d2-bc5b-ac1dc93b5254', 'userName': 'ODL_User 141195'}, 'modifiedTime': '2021-03-23T07:47:41.555346+00:00', 'state': 'Running', 'vmSize': 'STANDARD_DS3_V2'}


In [7]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import shutil

## Reference: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters

ps = RandomParameterSampling({
    "--C" : choice(0.01, 0.1, 1),
    "--max_iter" : choice(20, 40, 60, 100, 150, 200)
})

# Specify an early termination Policy. Early termination improves computational efficiency.
# Bandit is an aggresive policy, saving a lot of time but mabe missing some promising candidates

policy = BanditPolicy(slack_factor = 0.15, evaluation_interval=1, delay_evaluation=5)


if "training" not in os.listdir():
    os.mkdir("./training")

est = SKLearn(source_directory ='.', entry_script="train.py",compute_target=compute_target, vm_size="Standard_d2_V2", vm_priority="lowpriority")

hyperdrive_config = HyperDriveConfig(hyperparameter_sampling= ps,
                                    primary_metric_name='Accuracy',
                                    primary_metric_goal= PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=4,
                                    policy = policy,
                                    estimator=est)



In [8]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(hyperdrive_config)

# visualize with widget
from azureml.widgets import RunDetails
RunDetails(hyperdrive_run).show()




_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [9]:
import joblib
# Get your best run and save the model from that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()


print('Best Run Id: ', best_run.id)
print('\n Best run metrics:', best_run_metrics)
print('\n Filename:', best_run.get_file_names())
print('\n Run details:',parameter_values)

Best Run Id:  HD_2351a703-71fa-47d4-b164-e17bc178212d_0

 Best run metrics: {'Regularization Strength:': 1.0, 'Max iterations:': 200, 'Accuracy': 0.908649468892261}

 Filename: ['azureml-logs/55_azureml-execution-tvmps_f31324087969327116086b43e4bc39c86c10d554d41eaaffd2da84ebc6d4020a_d.txt', 'azureml-logs/65_job_prep-tvmps_f31324087969327116086b43e4bc39c86c10d554d41eaaffd2da84ebc6d4020a_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_f31324087969327116086b43e4bc39c86c10d554d41eaaffd2da84ebc6d4020a_d.txt', 'logs/azureml/106_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']

 Run details: {'runId': 'HD_2351a703-71fa-47d4-b164-e17bc178212d_0', 'target': 'udacity-project', 'status': 'Completed', 'startTimeUtc': '2021-03-23T09:07:49.731015Z', 'endTimeUtc': '2021-03-23T09:08:32.660039Z', 'properties': {'_azureml.ComputeTarge

In [11]:
model = best_run.register_model(model_name="hyerdrive_best_run",
model_path="azureml-logs/model.joblib",
tags={"Method":"Hyperdrive"},
properties={"Accuracy": best_run.get_metrics()["Accuracy"]})

In [12]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv')

In [25]:
from train import clean_data
from azureml.core import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
# Use the clean_data function to clean your data.
x, y = clean_data(ds)

cleaned_data =pd.concat([x,y], axis=1)

# Since we´e working with a VM, we need to reupload our cleaned data. Reference:  https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets#create-a-tabulardataset
train_data, test_data = train_test_split(cleaned_data, test_size=0.2, random_state=66)
train_data.to_csv("./training_data.csv")
datastore = ws.get_default_datastore()
datastore.upload(src_dir="./")

train_ds = Dataset.Tabular.from_delimited_files(path=[(datastore, ("./training_data.csv"))])




Uploading an estimated of 10 files
Target already exists. Skipping upload for .amlignore
Target already exists. Skipping upload for .amlignore.amltmp
Target already exists. Skipping upload for README.md
Target already exists. Skipping upload for train.py
Target already exists. Skipping upload for training
Target already exists. Skipping upload for training_data.csv
Target already exists. Skipping upload for udacity-project.ipynb
Target already exists. Skipping upload for udacity-project.ipynb.amltmp
Target already exists. Skipping upload for .ipynb_aml_checkpoints/udacity-project-checkpoint2021-2-23-7-51-54.ipynb
Target already exists. Skipping upload for __pycache__/train.cpython-36.pyc
Uploaded 0 files


In [26]:
from azureml.train.automl import AutoMLConfig
import pandas as pd
# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric='accuracy',
    training_data=train_ds,
    label_column_name="y",
    n_cross_validations=5,
    compute_target=compute_target,
    enable_early_stopping=True)

In [27]:
# Submit your automl run
remote_run = exp.submit(config=automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on udacity-project with default configuration
Running on remote compute: udacity-project
Parent Run ID: AutoML_856b37e5-4bcf-4a40-ad42-583f49016138

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead 



In [14]:
# Retrieve and save your best automl model.

print(remote_run.get_metrics())
print(remote_run.get_tags())

from azureml.automl.runtime.onnx_convert import OnnxConverter

automl_best_run_onnx, automl_fitted_model_onnx = remote_run.get_output(return_onnx_model=True)
OnnxConverter.save_onnx_model(automl_fitted_model_onnx, "./outputs/best_automl_model.onnx")

NameError: name 'remote_run' is not defined

In [15]:
remote_run

NameError: name 'remote_run' is not defined

In [21]:
#cluster cleanup

compute_target.delete()

Current provisioning state of AmlCompute is "Deleting"

