In [4]:
from azureml.core import Workspace, Experiment, Environment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-141143
Azure region: southcentralus
Subscription id: 1b944a9b-fdae-4f97-aeb1-b7eea0beac53
Resource group: aml-quickstarts-141143


In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

# cpu_cluster_name is just that: a name. Nothing more, nothing less.
cpu_cluster_name= "udacity-project"
compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2", max_nodes=4)
compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

# additional code, that is not needed for our cluster to work, but gives us info about 
# the status and properties of the new cluster

compute_target.wait_for_completion(show_output=True)
print(compute_target.get_status().serialize())

SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 1, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-03-22T16:35:18.122000+00:00', 'errors': None, 'creationTime': '2021-03-22T16:00:06.502536+00:00', 'modifiedTime': '2021-03-22T16:00:21.970937+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [18]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import shutil

ps = RandomParameterSampling({
    "--C" : choice(0.01, 0.1, 1),
    "--max_iter" : choice(20, 40, 60, 100, 150, 200)
})
# Specify an early termination Policy. Early termination improves computational efficiency.
# Bandit is an aggresive policy, saving a lot of time but mabe missing some promising candidates

policy = BanditPolicy(slack_factor = 0.15, evaluation_interval=1, delay_evaluation=5)

ps = RandomParameterSampling({
    "--C" : choice(0.01, 0.1, 1),
    "--max_iter" : choice(20, 40, 60, 100, 150, 200)
})
# Specify an early termination Policy. Early termination improves computational efficiency.
# Bandit is an aggresive policy, saving a lot of time but mabe missing some promising candidates
if "training" not in os.listdir():
    os.mkdir("./traning")

script_folder = "./training"
os.makedirs(script_folder, exist_ok=True)
shutil.copy("./train.py", script_folder)

est = SKLearn(source_directory = script_folder, entry_script="train.py",compute_target=compute_target, vm_size="Standard_d2_V2", vm_priority="lowpriority")

hyperdrive_config = HyperDriveConfig(hyperparameter_sampling= ps,
                                    primary_metric_name='Accuracy',
                                    primary_metric_goal= PrimaryMetricGoal.MAXIMIZE,
                                    max_total_runs=4,
                                    policy = policy,
                                    estimator=est)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [19]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hyperdrive_run = exp.submit(hyperdrive_config)

# visualize with widget
from azureml.widgets import RunDetails
RunDetails(hyperdrive_run).show()




_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [20]:
import joblib
# Get your best run and save the model from that run.

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()


print('Best Run Id: ', best_run.id)
print('\n Best run metrics:', best_run_metrics)
print('\n Filename:', best_run.get_file_names())
print('\n Run details:',parameter_values)

Best Run Id:  HD_e2e09f28-5fdc-414c-b16b-af046be69b66_1

 Best run metrics: {'Regularization Strength:': 1.0, 'Max iterations:': 100, 'Accuracy': 0.908649468892261}

 Filename: ['azureml-logs/55_azureml-execution-tvmps_0dbfb494511a95e4d1419d5fd35a1ee9bd84c44c05fd14ab3d574a922a6f07fc_d.txt', 'azureml-logs/65_job_prep-tvmps_0dbfb494511a95e4d1419d5fd35a1ee9bd84c44c05fd14ab3d574a922a6f07fc_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_0dbfb494511a95e4d1419d5fd35a1ee9bd84c44c05fd14ab3d574a922a6f07fc_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/104_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']

 Run details: {'runId': 'HD_e2e09f28-5fdc-414c-b16b-af046be69b66_1', 'target': 'udacity-project', 'status': 'Completed', 'startTimeUtc': '2021-03-22T16:23:04.494089Z', 'endTimeUtc': '2021-03-22T16:25:28.995286Z', 'properties': {'_azureml.ComputeTargetType': 'amlcompute', 'ContentSnap

In [21]:
model = best_run.register_model(model_name="hyerdrive_log_reg_best_run",
model_path="azureml-logs/model.joblib",
tags={"Method":"Hyperdrive"},
properties={"Accuracy": best_run.get_metrics()["Accuracy"]})

In [1]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = TabularDatasetFactory.from_delimited_files('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv')

In [13]:
from train import clean_data
from azureml.core import Dataset
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

cleaned_data =pd.concat([x,y], axis=1)

train_data, test_data = train_test_split(cleaned_data, test_size=0.2, random_state=66)
train_data.to_csv("training/training_data.csv")
datastore = ws.get_default_datastore()
datastore.upload(src_dir="training", target_path="training_data")

train_ds = Dataset.Tabular.from_delimited_files(path=[(datastore, ("training_data/training_data.csv"))])

Uploading an estimated of 2 files
Target already exists. Skipping upload for training_data/train.py
Target already exists. Skipping upload for training_data/training_data.csv
Uploaded 0 files


In [14]:
from azureml.train.automl import AutoMLConfig
import pandas as pd
# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric='accuracy',
    training_data=train_ds,
    label_column_name="y",
    n_cross_validations=5,
    compute_target=compute_target,
    enable_early_stopping=True)

In [15]:
# Submit your automl run
remote_run = exp.submit(config=automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on udacity-project with default configuration
Running on remote compute: udacity-project
Parent Run ID: AutoML_d841eeff-58e0-4628-b050-8bf39d149a72

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because t

In [20]:
# Retrieve and save your best automl model.


print(remote_run.get_metrics())
print(remote_run.get_tags())

from azureml.automl.runtime.onnx_convert import OnnxConverter

automl_best_run_onnx, automl_fitted_model_onnx = remote_run.get_output(return_onnx_model=True)
OnnxConverter.save_onnx_model(automl_fitted_model_onnx, "./outputs/best_automl_model.onnx")

{'experiment_status': ['DatasetEvaluation', 'FeaturesGeneration', 'DatasetFeaturization', 'DatasetFeaturizationCompleted', 'DatasetBalancing', 'DatasetCrossValidationSplit', 'ModelSelection', 'BestRunExplainModel', 'ModelExplanationDataSetSetup', 'PickSurrogateModel', 'EngineeredFeatureExplanations', 'EngineeredFeatureExplanations', 'RawFeaturesExplanations', 'RawFeaturesExplanations', 'BestRunExplainModel'], 'experiment_status_description': ['Gathering dataset statistics.', 'Generating features for the dataset.', 'Beginning to fit featurizers and featurize the dataset.', 'Completed fit featurizers and featurizing the dataset.', 'Performing class balancing sweeping', 'Generating individually featurized CV splits.', 'Beginning model selection.', 'Best run model explanations started', 'Model explanations data setup completed', 'Choosing LightGBM as the surrogate model for explanations', 'Computation of engineered features started', 'Computation of engineered features completed', 'Computa

In [21]:
#cluster cleanup

compute_target.delete()

Current provisioning state of AmlCompute is "Deleting"

