In [2]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute

cluster_name = "ebailey-udacity-cluster"

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.
try:
    cluster = ComputeTarget(ws, cluster_name)
    print("cluster exists already")
except: 
    cluster_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2", max_nodes=4)
    cluster = ComputeTarget.create(ws, cluster_name, cluster_config)

cluster.wait_for_completion(show_output=True)


InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [17]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy, MedianStoppingPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os

# Specify parameter sampler
ps = RandomParameterSampling({
    "C": uniform(0.25, 1.75), 
    "max_iter": choice(60, 80, 100, 120, 140)
})

# Specify a Policy
policy = MedianStoppingPolicy()

if "training" not in os.listdir():
    os.mkdir("./training")

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(name='sklearn-env', file_path='conda_dependencies.yml')

# Create a ScriptRunConfig Object to specify the configuration details of your training job
src = ScriptRunConfig(".", "train.py", compute_target=cluster, environment=sklearn_env)

# Create a HyperDriveConfig using the src object, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=ps, primary_metric_name="Accuracy", primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, max_total_runs=20, policy=policy, run_config=src)

In [18]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hd_run = exp.submit(hyperdrive_config)

In [19]:
RunDetails(hd_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [13]:
import joblib
# Get your best run and save the model from that run.
best_run = hd_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print(best_run.id)
print('Accuracy:', best_run_metrics['Accuracy'])
print('Best Params:', parameter_values)


HD_4f3fdaa3-52f0-4c87-bd5f-c989f34e7e91_1
Accuracy: 0.9153262518968134
Best Params: ['--C', '1.4135706071135155', '--max_iter', '80']


In [16]:
best_fitted_model = best_run.download_file('outputs/model.pkl')

In [4]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

automl_ds = TabularDatasetFactory().from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

In [5]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(automl_ds)
automl_cleaned = x.join(y)
automl_training_data = TabularDatasetFactory.register_pandas_dataframe(automl_cleaned, ws.get_default_datastore(), "bankmarketing-cleaned")

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/53a89a74-54c5-4819-a744-fda082d13a2a/
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'emp.var.rate' -> 'emp_var_rate'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.price.idx' -> 'cons_price_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.conf.idx' -> 'cons_conf_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'nr.employed' -> 'nr_employed'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'job_admin.' -> 'job_admin_'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'education_basic.4y'

In [7]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=automl_training_data,
    label_column_name='y',
    n_cross_validations=5, 
    compute_target=cluster)

In [8]:
# Submit your automl run

automl_run = exp.submit(automl_config, show_output=True)

Submitting remote run.
No run_configuration provided, running on ebailey-udacity-cluster with default configuration
Running on remote compute: ebailey-udacity-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_b675a962-c075-4462-9b94-e4bc51c77a72,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+------------------------------+--------------------------------+-------------------------------------

In [12]:
# Retrieve and save your best automl model.

best_automl_run = automl_run.get_best_child()
print(best_automl_run.properties['model_name'])
best_automl_run.get_metrics()
best_automl_run.download_file('outputs/model.pkl', 'automl_best_model.pkl')

AutoMLb675a962c29


In [13]:
cluster.delete()
cluster.wait_for_completion()
print('Deleted compute resource')

Provisioning operation finished, operation "Succeeded"


ComputeTargetException: ComputeTargetException:
	Message: ComputeTargetNotFound: Compute Target with name ebailey-udacity-cluster not found in provided workspace
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "ComputeTargetNotFound: Compute Target with name ebailey-udacity-cluster not found in provided workspace"
    }
}