In [1]:
from azureml.core import Workspace, Experiment

# subscription_id = ''
# resource_group = ''
# workspace_name = ''

# ws = Workspace(subscription_id, resource_group, workspace_name)

# Connect to the workspace from the saved configuration and open the experiment
# that every run in this notebook is submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# One line per workspace property, so the cell output is easy to scan.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

# Start an interactive run on the experiment.
run = exp.start_logging()

Workspace name: quick-starts-ws-139362
Azure region: southcentralus
Subscription id: f5091c60-1c3c-430f-8d81-d802f6bf2414
Resource group: aml-quickstarts-139362


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "cpu-cluster"

# Reuse the cluster when the lookup succeeds; provision a new one only when the
# workspace reports that no compute target with this name exists.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster is usable, streaming provisioning status to the cell.
cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os

# Specify parameter sampler: random sampling over the two arguments passed to
# the training script ("--C" continuous, "--max_iter" from a fixed set).
ps = RandomParameterSampling(
    {
        "--C": uniform(0, 1),
        "--max_iter": choice(50, 100, 150, 200),
    }
)

# Specify a Policy: bandit policy evaluated every 2 intervals with a 0.1 slack factor.
policy = BanditPolicy(
    evaluation_interval=2,
    slack_factor=0.1,
)

# Create a "training" folder in the current working directory if it is missing.
if "training" not in os.listdir():
    os.mkdir("./training")

from azureml.core import ScriptRunConfig

# SKLearn estimator that runs train.py from the current directory on the cluster.
est = SKLearn(
    source_directory='./',
    compute_target=cpu_cluster,
    entry_script='train.py',
)

# HyperDriveConfig tying together the estimator, the sampler and the policy;
# the sweep maximizes the 'Accuracy' metric over at most 20 runs, 4 at a time.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)



In [6]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hdr = exp.submit(config=hyperdrive_config)
RunDetails(hdr).show()

# Block until the sweep has finished. Without this, running the notebook top to
# bottom reaches the "best run" cell while child runs are still in progress.
hdr.wait_for_completion(show_output=True)



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [9]:
import joblib
# Get your best run and save the model from that run.

# Best child run of the sweep, its logged metrics, and the command-line
# arguments (the sampled hyperparameters) it was launched with.
best_run = hdr.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print(best_run_metrics, parameter_values)

# Register the model file produced by that run, recording its accuracy.
model = best_run.register_model(
    model_name='project_model',
    model_path='outputs/model.joblib',
    tags={"Method": "Hyperdrive"},
    properties={"Accuracy": best_run_metrics["Accuracy"]},
)

{'Regularization Strength:': 0.15800619280767247, 'Max iterations:': 100, 'Accuracy': 0.9156348628307842} ['--C', '0.15800619280767247', '--max_iter', '100']


In [10]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset straight from the public bank-marketing training CSV.
ds = TabularDatasetFactory.from_delimited_files(
    path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
)

In [11]:
from train import clean_data
from azureml.core.datastore import Datastore

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

# AutoML takes a single table, so attach the label to the features as 'target'.
x['target'] = y

# Upload the combined frame to the workspace file store and register it.
datastore = Datastore.get(workspace=ws, datastore_name='workspacefilestore')
dataset = TabularDatasetFactory.register_pandas_dataframe(
    dataframe=x,
    target=datastore,
    name='ProjectData',
)



Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/6919b22a-a8b6-485d-827e-a70b110f2236/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [12]:
# `logging` is needed for the "verbosity" setting below; importing it here keeps
# the cell runnable on a fresh kernel.
import logging

from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.

automl_settings = {
       "n_cross_validations": 3,
       "primary_metric": 'accuracy',
       "enable_early_stopping": True,
       # 0.5 hours is the same 30-minute budget as experiment_timeout_minutes below.
       "experiment_timeout_hours": 0.5,
       "max_concurrent_iterations": 4,
       "max_cores_per_iteration": -1,
       "verbosity": logging.INFO,
   }

# Classification on the registered dataset, predicting the 'target' column,
# executed on the CPU cluster.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    training_data=dataset,
    label_column_name='target',
    compute_target = cpu_cluster,
    **automl_settings)

In [13]:
# Submit the AutoML configuration to the experiment and stream progress to the cell.
automl_run = exp.submit(
    config=automl_config,
    show_output=True,
)

Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_41557248-c943-4b95-8bd0-4b4db9afc09c

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a fal

In [14]:
# Retrieve and save your best automl model.
import os

# get_output() yields the best child run together with its fitted model.
best_auto_run, fitted_model = automl_run.get_output()

# joblib.dump does not create missing parent folders, so make sure the local
# "outputs" directory exists before writing the model file into it.
os.makedirs('outputs', exist_ok=True)
joblib.dump(fitted_model, 'outputs/model2.joblib')

Package:azureml-automl-runtime, training version:1.22.0, current version:1.20.0
Package:azureml-core, training version:1.22.0, current version:1.20.0
Package:azureml-dataprep, training version:2.9.1, current version:2.7.3
Package:azureml-dataprep-native, training version:29.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.7.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.22.0, current version:1.20.0
Package:azureml-defaults, training version:1.22.0, current version:1.20.0
Package:azureml-interpret, training version:1.22.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.22.0, current version:1.20.0
Package:azureml-telemetry, training version:1.22.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.22.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.22.0, current version:1.20.0


['outputs/model2.joblib']

In [15]:
# Show the metrics recorded for the best AutoML run (displayed as the cell output).
best_auto_run.get_metrics()

{'matthews_correlation': 0.5466949004536809,
 'average_precision_score_weighted': 0.9558866406098433,
 'recall_score_macro': 0.7476835595145617,
 'AUC_micro': 0.9808022015910606,
 'weighted_accuracy': 0.9587786552206112,
 'precision_score_micro': 0.9167525992022322,
 'norm_macro_recall': 0.4953671190291234,
 'average_precision_score_micro': 0.9816157791713662,
 'balanced_accuracy': 0.7476835595145617,
 'f1_score_weighted': 0.9126940409745611,
 'average_precision_score_macro': 0.8265346648776983,
 'precision_score_weighted': 0.9107206964563432,
 'f1_score_macro': 0.7707410281049357,
 'log_loss': 0.17673347395291927,
 'accuracy': 0.9167525992022322,
 'precision_score_macro': 0.8019566172523597,
 'recall_score_weighted': 0.9167525992022322,
 'recall_score_micro': 0.9167525992022322,
 'AUC_macro': 0.9477213207536668,
 'AUC_weighted': 0.9477213207536668,
 'f1_score_micro': 0.9167525992022322,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_41557248-c943-4b95-8bd0-4b4db9afc09c

In [34]:


from pprint import pprint

# Function to list the hyperparameters 

def print_model(model, prefix=""):
    '''
    Print each step of a fitted pipeline together with its hyperparameters.

    Adapted from: https://github.com/gauravyadav04/Optimizing_ML_Pipeline_Azure

    A step that exposes both ``estimators`` and ``weights`` is treated as a
    weighted ensemble: its member names and weights are printed, then every
    member is printed recursively with the member name added to the prefix.

    Args:
        model: Object whose ``steps`` attribute is a sequence of
            ``(name, estimator)`` pairs. An object without ``steps`` is
            treated as a single estimator: its lower-cased class name and
            ``get_params()`` are printed instead of raising AttributeError.
        prefix: Text printed in front of every step name. Nested ensembles
            extend it, so the full path to a step stays visible.
    '''
    steps = getattr(model, 'steps', None)
    if steps is None:
        # Ensemble members are not guaranteed to be pipelines; report a bare
        # estimator the same way a pipeline step would be reported.
        print(prefix + type(model).__name__.lower())
        pprint(model.get_params())
        print()
        return

    for step in steps:
        step_name, estimator = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(estimator, 'estimators') and hasattr(estimator, 'weights')
        if not is_ensemble:
            pprint(estimator.get_params())
            print()
            continue

        members = list(estimator.estimators)
        pprint({'estimators': [member[0] for member in members],
                'weights': estimator.weights})
        print()

        for member in members:
            # Keep the outer prefix so nested ensembles show their full path.
            print_model(member[1], prefix + member[0] + ' - ')
        
# Print every step of the fitted AutoML model together with its parameters.
print_model(fitted_model)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['31', '37', '17', '40', '1', '42', '30', '33', '6', '34'],
 'weights': [0.06666666666666667,
             0.26666666666666666,
             0.06666666666666667,
             0.06666666666666667,
             0.13333333333333333,
             0.13333333333333333,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667]}

31 - maxabsscaler
{'copy': True}

31 - lightgbmclassifier
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.2977777777777778,
 'importance_type': 'split',
 'learning_rate': 0.0842121052631579,
 'max_bin