In [1]:
from azureml.core import Workspace, Experiment

# Look up the Azure ML workspace by name and open the experiment that every
# later run in this notebook is submitted to.
ws = Workspace.get(name="quick-starts-ws-126349")
exp = Experiment(workspace=ws, name="udacity-project")

# Print one line per workspace property so the connection can be verified.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive logging run on the experiment.
# NOTE(review): `run` is not referenced again in the cells shown here; confirm
# whether it should be completed/closed once the notebook is finished.
run = exp.start_logging()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code RWB8ADKSG to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Workspace name: quick-starts-ws-126349
Azure region: southcentralus
Subscription id: 8e713106-916f-4177-890e-435b90d7adc4
Resource group: aml-quickstarts-126349


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
# Name of the compute cluster used by both the HyperDrive and AutoML runs.
aml_compute_target = "cpu-cluster"
try:
    # Reuse the cluster when one with this name is already attached to the
    # workspace; the lookup raises ComputeTargetException otherwise.
    aml_compute = AmlCompute(ws, aml_compute_target)
    print("Found existing compute target!")
except ComputeTargetException:
    print("Creating new compute cluster...")
    # VM size and node limit follow the assignment constraints stated above
    # (Standard_D2_V2, at most 4 nodes).
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=1,
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    # Report creation only on the path that actually created the cluster, so
    # the message is not printed misleadingly when an existing one was reused.
    print("Azure Machine Learning Compute Cluster Created!")

Creating new compute cluster...
Creating
Succeeded........................
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
Azure Machine Learning Compute Cluster Created!


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Hyperparameter search space handed to the training script as command-line
# arguments: a continuous range for --C and a discrete set for --max_iter.
search_space = {
    "--C": uniform(0.01, 1.0),
    "--max_iter": choice(10, 50, 90, 100, 150, 200),
}
ps = RandomParameterSampling(search_space)

# Early-termination policy for the sweep.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Make sure a local "training" folder exists in the working directory.
training_dir = "./training"
if "training" not in os.listdir():
    os.mkdir(training_dir)

# Fetch the curated Azure ML environment the training script runs in.
from azureml.core import Environment
sklearn_env = Environment.get(workspace=ws, name='AzureML-Tutorial')

# SKLearn estimator that runs train.py from the current directory on the cluster.
est = SKLearn(
    source_directory=".",
    entry_script='train.py',
    compute_target=aml_compute,
    environment_definition=sklearn_env,
)

# Combine estimator, sampler and policy into the HyperDrive sweep definition.
hyperdrive_settings = {
    "estimator": est,
    "hyperparameter_sampling": ps,
    "policy": policy,
    "primary_metric_name": "Accuracy",
    "primary_metric_goal": PrimaryMetricGoal.MAXIMIZE,
    "max_total_runs": 40,
    "max_concurrent_runs": 4,
}
hyperdrive_config = HyperDriveConfig(**hyperdrive_settings)



In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
# Submit the HyperDrive sweep to the experiment, then render the run-details
# widget for it in the notebook.
hyperdrive_run = exp.submit(config=hyperdrive_config, show_output=True)
details_widget = RunDetails(hyperdrive_run)
details_widget.show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [5]:
# Block until the HyperDrive parent run finishes, streaming its log output
# into the cell while waiting.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_c834d800-d195-4959-827b-b464437a3f6e
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_c834d800-d195-4959-827b-b464437a3f6e?wsid=/subscriptions/8e713106-916f-4177-890e-435b90d7adc4/resourcegroups/aml-quickstarts-126349/workspaces/quick-starts-ws-126349

Streaming azureml-logs/hyperdrive.txt

"<START>[2020-11-15T18:47:58.705569][API][INFO]Experiment created<END>\n""<START>[2020-11-15T18:47:59.276177][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2020-11-15T18:47:59.462020][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2020-11-15T18:48:00.4063566Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_c834d800-d195-4959-827b-b464437a3f6e
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_c834d800-d195-4959-827b-b464437a3f6e?wsid=/subscriptions/8e713

{'runId': 'HD_c834d800-d195-4959-827b-b464437a3f6e',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-11-15T18:47:58.439237Z',
 'endTimeUtc': '2020-11-15T19:09:34.167477Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '8c82e34e-a64f-4dc2-8167-803ceee86818',
  'score': '0.9088012139605463',
  'best_child_run_id': 'HD_c834d800-d195-4959-827b-b464437a3f6e_7',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg126349.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_c834d800-d195-4959-827b-b464437a3f6e/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=F5HdK%2FgXVL6LVoZDxUCC2hpbG3dIa%2F2aB2D8A8rE4Sw%3D&st=2020-11-15T18%3A59%3A37Z&se=2020-11-16T03%3A09%3A37Z&sp=r'}}

In [6]:
# Stop here unless the HyperDrive run reports the "Completed" status.
assert hyperdrive_run.get_status() == "Completed"

In [7]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
# Pick the child run that scored best on the sweep's primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# Report which run won, the arguments it was launched with, and its accuracy.
print("Run ID:", best_run.id)
hd_best_arguments = best_run.get_details()['runDefinition']['arguments']
print(hd_best_arguments)
hd_best_accuracy = best_run.get_metrics()['Accuracy']
print("Accuracy =", hd_best_accuracy)

# Show every file the best run uploaded (logs and model artifacts).
hd_best_files = best_run.get_file_names()
print("\n\n", hd_best_files)

# Register the run's serialized model file under the name 'hyperdrive-model'.
h_model = best_run.register_model(
    model_name='hyperdrive-model',
    model_path='outputs/hyperdrive-model.joblib',
)

Run ID: HD_c834d800-d195-4959-827b-b464437a3f6e_7
['--C', '0.22220228393275646', '--max_iter', '90']
Accuracy = 0.9088012139605463


 ['azureml-logs/55_azureml-execution-tvmps_0a0bb0692b30eaee2e5aad894a65600ed974e603e2c410ff3c82cb719738cf02_d.txt', 'azureml-logs/65_job_prep-tvmps_0a0bb0692b30eaee2e5aad894a65600ed974e603e2c410ff3c82cb719738cf02_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_0a0bb0692b30eaee2e5aad894a65600ed974e603e2c410ff3c82cb719738cf02_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/106_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_42538d7b-9ada-453e-b559-244995cff2e3.jsonl', 'logs/azureml/dataprep/python_span_42538d7b-9ada-453e-b559-244995cff2e3.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/hyperdrive-model.joblib']


In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
# Build a TabularDataset directly from the public bank-marketing training CSV.
bank_marketing_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(
    path=bank_marketing_url,
    separator=',',
    encoding='utf8',
)

In [9]:
from train import clean_data
import pandas as pd

# Run the shared clean_data helper from train.py on the dataset, then attach
# the label column `y` back onto the features so `x` holds one combined frame.
features, y = clean_data(ds)
x = features.join(y)

In [10]:
from azureml.core.datastore import Datastore
# Workspace default datastore; the dataframe is uploaded there.
datas = Datastore.get_default(ws)

# Upload the combined pandas dataframe and register it as the TabularDataset
# named 'tdf'.
x_tdf = TabularDatasetFactory.register_pandas_dataframe(
    x,
    target=datas,
    name='tdf',
    description=None,
    tags=None,
    show_progress=True,
)



Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/b530b242-c3b5-4332-9bd0-d1e96618f958/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [11]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
# AutoML settings gathered in one place: a 30-minute classification experiment
# optimising accuracy on the registered dataset, with label column 'y',
# 10-fold cross-validation, run on the shared compute cluster.
automl_settings = {
    "task": 'classification',
    "primary_metric": 'accuracy',
    "experiment_timeout_minutes": 30,
    "training_data": x_tdf,
    "label_column_name": 'y',
    "n_cross_validations": 10,
    "num_classes": 2,
    "compute_target": aml_compute,
    "max_concurrent_iterations": 4,
}
automl_config = AutoMLConfig(**automl_settings)

In [12]:
# Submit your automl run

### YOUR CODE HERE ###
# Submit the AutoML configuration to the same experiment; show_output=True
# prints the run's progress into this cell while it executes.
automl_run = exp.submit(config=automl_config, show_output=True)

Running on remote.
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_f0c3a86c-6ef8-4548-b5e7-b475fc1da2e0

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Generating individually featurized CV splits.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+--------------------------

In [15]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
import os

# Fetch the best AutoML child run together with its fitted model.
best_automl_run, model = automl_run.get_output()

# joblib.dump does not create missing parent directories, so make sure the
# local output folder exists before the serialized model is written to it.
os.makedirs('outputs', exist_ok=True)
joblib.dump(model, 'outputs/automl-model.joblib')

# Model registration is currently disabled.
# NOTE(review): register_model presumably resolves model_path against the
# files uploaded by the run, not the local file written above - confirm the
# correct path before re-enabling the line below.
#a_model = best_automl_run.register_model(model_name='automl-model', model_path='outputs/automl-model.joblib')

['outputs/automl-model.joblib']

In [16]:
# Identify the best AutoML child run and show the arguments recorded for it.
print("Run ID:", best_automl_run.id)
automl_best_arguments = best_automl_run.get_details()['runDefinition']['arguments']
print(automl_best_arguments)
# Accuracy lookup is disabled.
# NOTE(review): the metric key may differ from the HyperDrive run's 'Accuracy' - confirm before re-enabling.
#print("Accuracy =", best_automl_run.get_metrics()['Accuracy'])

# Show every file the best AutoML run uploaded.
automl_best_files = best_automl_run.get_file_names()
print("\n\n", automl_best_files)

Run ID: AutoML_f0c3a86c-6ef8-4548-b5e7-b475fc1da2e0_62
[]


 ['accuracy_table', 'automl_driver.py', 'azureml-logs/55_azureml-execution-tvmps_c68897f839327a173880358f86e8c4d600d4e1e86aaa2d4ad907ab96d1a50073_d.txt', 'azureml-logs/65_job_prep-tvmps_c68897f839327a173880358f86e8c4d600d4e1e86aaa2d4ad907ab96d1a50073_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_c68897f839327a173880358f86e8c4d600d4e1e86aaa2d4ad907ab96d1a50073_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'confusion_matrix', 'explanation/3d53ef51/classes.interpret.json', 'explanation/3d53ef51/expected_values.interpret.json', 'explanation/3d53ef51/features.interpret.json', 'explanation/3d53ef51/global_names/0.interpret.json', 'explanation/3d53ef51/global_rank/0.interpret.json', 'explanation/3d53ef51/global_values/0.interpret.json', 'explanation/3d53ef51/local_importance_values.interpret.json', 'explanation/3d53ef51/per_class_names/0.interpret.json', 'explanation/3d53ef5

In [17]:
# Clean up deployed resources.
# Deletes the compute cluster referenced by `aml_compute`; any cell that
# submits work to that cluster needs it to be recreated before it can run again.

aml_compute.delete()