In [28]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-127542")
exp = Experiment(workspace=ws, name="project-1")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-127542
Azure region: southcentralus
Subscription id: 422c78cb-7b9f-41e7-a94c-e144f7afde5e
Resource group: aml-quickstarts-127542


In [29]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
cpu_cluster_name = "cluster-project"
vm_size = "Standard_D2_V2"
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print("Found existing cluster, use it.")
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

CreatingAmlCompute is getting created. Consider calling wait_for_completion() first

AmlCompute is getting created. Consider calling wait_for_completion() first


Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [30]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy, MedianStoppingPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, normal, choice
import os

ps = RandomParameterSampling( {

        "--C": uniform(0.1, 1.0),
        "--max_iter": choice(10, 20, 40, 50)
    })

policy = BanditPolicy(evaluation_interval=2, slack_factor=0.2)

if "training" not in os.listdir():
    os.mkdir("./training")

est = SKLearn(source_directory="./training", compute_target=cpu_cluster, entry_script="train.py")
hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=ps, primary_metric_name='Accuracy', estimator=est, policy=policy,
                    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, max_total_runs=4)

In [31]:
hd_run = exp.submit(hyperdrive_config, show_output=True)
RunDetails(hd_run).show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [32]:
import joblib
best_run = hd_run.get_best_run_by_primary_metric()
best_model = best_run.register_model(model_name="project-1-hd", model_path="./")

In [33]:
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()

print("Best Run Id: ", best_run.id)
print("Accuracy: ", best_run_metrics["Accuracy"])
print("Parameters: ", parameter_values["runDefinition"]["arguments"])

Best Run Id:  HD_3924ded8-61c2-4549-8275-63aedc87742f_0
Accuracy:  0.95
Parameters:  ['--C', '0.6827778770979058', '--max_iter', '50']


In [34]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.widgets import RunDetails
### YOUR CODE HERE ###
ds = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

In [35]:
import numpy as np
import pandas as pd
def clean_data(data):
    # Dict for cleaning data
    months = {"jan":1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}
    weekdays = {"mon":1, "tue":2, "wed":3, "thu":4, "fri":5, "sat":6, "sun":7}

    # Clean and one hot encode data
    x_df = data.to_pandas_dataframe().dropna()
    jobs = pd.get_dummies(x_df.job, prefix="job")
    x_df.drop("job", inplace=True, axis=1)
    x_df = x_df.join(jobs)
    x_df["marital"] = x_df.marital.apply(lambda s: 1 if s == "married" else 0)
    x_df["default"] = x_df.default.apply(lambda s: 1 if s == "yes" else 0)
    x_df["housing"] = x_df.housing.apply(lambda s: 1 if s == "yes" else 0)
    x_df["loan"] = x_df.loan.apply(lambda s: 1 if s == "yes" else 0)
    contact = pd.get_dummies(x_df.contact, prefix="contact")
    x_df.drop("contact", inplace=True, axis=1)
    x_df = x_df.join(contact)
    education = pd.get_dummies(x_df.education, prefix="education")
    x_df.drop("education", inplace=True, axis=1)
    x_df = x_df.join(education)
    x_df["month"] = x_df.month.map(months)
    x_df["day_of_week"] = x_df.day_of_week.map(weekdays)
    x_df["poutcome"] = x_df.poutcome.apply(lambda s: 1 if s == "success" else 0)

    y_df = x_df.pop("y").apply(lambda s: 1 if s == "yes" else 0)
    return x_df, y_df

x, y = clean_data(ds)

from sklearn.model_selection import train_test_split

x_train, x_test , y_train, y_test = train_test_split(x, y, test_size=20, random_state=42)
x_train = pd.concat([x_train, y_train], axis=1)

In [36]:
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=x_train,
    label_column_name="y",
    n_cross_validations=3)

In [37]:
import sys
!{sys.executable} -m pip install zipp==3.1.0



In [38]:
# Submit your automl run
### YOUR CODE HERE ###

automl_run = exp.submit(automl_config, show_output=True)

RunDetails(automl_run).show()

Running on local machine
Parent Run ID: AutoML_d511bd9c-ad55-4804-b246-8cefb24a4352

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely p

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [39]:
# Retrieve and save your best automl model.

### YOUR CODE HERE ###
best_run_automl, _ = automl_run.get_output()

best_model_automl = best_run_automl.register_model(model_name="project-1-automl", model_path="./")

In [40]:
best_run_metrics = best_run_automl.get_metrics()
parameter_values = best_run_automl.get_details()

print("Best Run Id: ", best_run_automl.id)
print("\n")
print("\n")
print("Accuracy: ", best_run_metrics["accuracy"])
print("\n")
print("\n")
print("Parameters: ", parameter_values)
print("\n")
print("\n")
print("Metrics for best run: ", best_run_automl.get_metrics())

Best Run Id:  AutoML_d511bd9c-ad55-4804-b246-8cefb24a4352_44




Accuracy:  0.9168843146324238




Parameters:  {'runId': 'AutoML_d511bd9c-ad55-4804-b246-8cefb24a4352_44', 'status': 'Completed', 'startTimeUtc': '2020-11-23T15:47:52.480792Z', 'endTimeUtc': '2020-11-23T15:48:57.063625Z', 'properties': {'runTemplate': 'automl_child', 'pipeline_id': '__AutoML_Ensemble__', 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'project-1\',\'compute_target\':\'local\',\'subscription_id\':\'422c78cb-7b9f-41e7-a94c-e144f7afde5e\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_d511bd9c-ad55-4804-b246-8cefb24a4352_44","experiment_name":null,"workspace_name":"quick-starts-

In [41]:
cpu_cluster.delete()
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print("Cluster Deleted !")
except:
    print("Cannot find cluster. Can't be deleted.")

Cluster Deleted !
