In [1]:
from azureml.core import Workspace, Experiment

# Attach to the workspace described by the local config file
# (Workspace.get(name=...) is the by-name alternative) and open the
# experiment that the runs in this notebook are submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# Print one line per workspace attribute so the reader can confirm
# which workspace is attached.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Start an interactive logging run in the experiment.
run = exp.start_logging()

Workspace name: quick-starts-ws-147016
Azure region: southcentralus
Subscription id: 976ee174-3882-4721-b90a-b5fef6b72f24
Resource group: aml-quickstarts-147016


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Create (or reuse) the compute cluster used for training.
# Requirements: vm_size = "Standard_D2_V2", max_nodes no greater than 4.
compute_cluster_name = "compute-cluster"

try:
    # Reuse the cluster if one with this name already exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=compute_cluster_name)
    print("Found the cluster, you can use it.")
except ComputeTargetException:
    # Catch only the SDK's compute-target error (raised when the lookup fails)
    # so unrelated problems such as typos are not silently swallowed.
    print("Creating a new compute cluster...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        min_nodes=0,
        max_nodes=4,
        vm_priority='lowpriority',
    )
    # The cluster must be created under the same name that was looked up above;
    # the previous code referenced an undefined `cpu_cluster_name` here.
    compute_target = ComputeTarget.create(ws, compute_cluster_name, compute_config)

# Can poll for a minimum number of nodes and for a specific timeout.
# If no min node count is provided it uses the scale settings for the cluster.
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=60)

# Use get_status() to get a detailed status for the current cluster.
print(compute_target.get_status().serialize())


Found the cluster, you can use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 4, 'targetNodeCount': 4, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 4, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-06-11T20:40:11.445000+00:00', 'errors': None, 'creationTime': '2021-06-11T20:29:09.121695+00:00', 'modifiedTime': '2021-06-11T20:29:54.539307+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS2_V2'}


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice, uniform
import os

# Search space for the sweep: every trial draws one value for each
# command-line argument at random from the discrete options below.
regularization_choices = choice(0.001, 0.01, 0.1, 1, 10, 20, 50, 100, 200, 500, 1000)
iteration_choices = choice(50, 100, 200, 300, 400, 500)
ps = RandomParameterSampling(
    {'--C': regularization_choices, '--max_iter': iteration_choices}
)

# Early-termination policy applied to the trials (bandit policy with a
# 10% slack factor, evaluated every 2 intervals).
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)


# Make sure the local "training" folder exists. makedirs(exist_ok=True) is a
# single idempotent call, so re-running the cell is safe and there is no
# check-then-create race as with listdir() followed by mkdir().
os.makedirs("./training", exist_ok=True)

# SKLearn estimator that runs train.py, taken from the current directory,
# on the compute cluster.
est = SKLearn(
    source_directory="./",
    entry_script="train.py",
    compute_target=compute_target,
    vm_size='STANDARD_D2_V2',
)

# Gather the sweep settings in one place, then build the HyperDriveConfig
# from the estimator, the parameter sampler and the termination policy.
hyperdrive_settings = dict(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=16,
    max_concurrent_runs=4,
)
hyperdrive_config = HyperDriveConfig(**hyperdrive_settings)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.
'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [4]:
# Launch the HyperDrive sweep inside the experiment.
hyperdrive_run = exp.submit(config=hyperdrive_config)

# Progress can alternatively be followed with the Jupyter widget:
# RunDetails(hyperdrive_run).show()

# Wait for the sweep to finish, streaming its log output into the cell.
hyperdrive_run.wait_for_completion(show_output=True)





RunId: HD_032f49e9-65d7-48fe-adae-bdb0f8892db3
Web View: https://ml.azure.com/runs/HD_032f49e9-65d7-48fe-adae-bdb0f8892db3?wsid=/subscriptions/976ee174-3882-4721-b90a-b5fef6b72f24/resourcegroups/aml-quickstarts-147016/workspaces/quick-starts-ws-147016&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-06-11T20:44:42.980889][API][INFO]Experiment created<END>\n""<START>[2021-06-11T20:44:43.430861][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-06-11T20:44:43.609476][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"

Execution Summary
RunId: HD_032f49e9-65d7-48fe-adae-bdb0f8892db3
Web View: https://ml.azure.com/runs/HD_032f49e9-65d7-48fe-adae-bdb0f8892db3?wsid=/subscriptions/976ee174-3882-4721-b90a-b5fef6b72f24/resourcegroups/aml-quickstarts-147016/workspaces/quick-starts-ws-147016&tid=660b3398-b80e-49d2-bc5b-ac1dc93b5254



{'runId': 'HD_032f49e9-65d7-48fe-adae-bdb0f8892db3',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-06-11T20:44:42.702087Z',
 'endTimeUtc': '2021-06-11T20:50:16.706045Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': 'cc2c52c9-04c7-4f13-8eb1-2765cd276f77',
  'score': '0.9137076378351037',
  'best_child_run_id': 'HD_032f49e9-65d7-48fe-adae-bdb0f8892db3_1',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg147016.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_032f49e9-65d7-48fe-adae-bdb0f8892db3/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=O5sVZOCXr6YeuOtnaRQckeQD1Gs2Z6eSyCrNHfzuA%2BM%3D&st=2021-06-11T20%3A40%3A33Z&se=2021-06-12T04%3A50%3A33Z&sp=r'},
 'submittedBy': 'ODL_User 147016'

In [7]:
import joblib
# Select the child run with the best primary metric and register the model
# file it wrote, tagging the registered model with that run's metrics.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
best_hyperdrive_model = best_run.register_model(
    model_name="best_hyperdrive_model",
    model_path="./outputs/model.joblib",
    tags=best_run_metrics,
)


In [9]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset from the bank-marketing training CSV hosted in the
# AutoML sample-notebook storage account.
bankmarketing_train_url = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv'
ds = TabularDatasetFactory.from_delimited_files(path=[bankmarketing_train_url])


In [10]:
from train import clean_data

# Clean the raw dataset with clean_data() from train.py. The call returns two
# values: `x` (the features) and `y` (the labels). Both are reused by the
# later cells that export the cleaned data and build the train/test split.
x, y = clean_data(ds)

In [13]:
import os
import pandas as pd
# Local staging directory for the CSV files written by the following cells.
path = "./data"
try:
    # exist_ok=True makes the cell safe to re-run when the folder already exists.
    os.makedirs(path, exist_ok=True)
    # Interpolate the path; the message previously printed a literal '%s'.
    print("Directory '%s' created..." % path)
except OSError as err:
    # Report the underlying reason so the failure can be diagnosed.
    print("Directory '%s' cannot be created: %s" % (path, err))

Dicrectory '%s' created...


In [15]:
# Combine the cleaned features and the label into one frame and save it locally.
cleaned_df = x.copy()
cleaned_df['y'] = y

# index=False keeps the DataFrame's row index out of the CSV; otherwise it is
# written as an extra unnamed first column and shows up as a spurious column
# when the file is later loaded as a TabularDataset.
cleaned_df.to_csv('./data/cleaned_df.csv', index=False)

print(x.shape, y.shape, cleaned_df.shape)
print(cleaned_df.head())

(32950, 39) (32950,) (32950, 40)
   age  marital  default  housing  loan  month  day_of_week  duration  \
0   57        1        0        0     1      5            1       371   
1   55        1        0        1     0      5            4       285   
2   33        1        0        0     0      5            5        52   
3   36        1        0        0     0      6            5       355   
4   27        1        0        1     0      7            5       189   

   campaign  pdays  ...  contact_telephone  education_basic.4y  \
0         1    999  ...                  0                   0   
1         2    999  ...                  1                   0   
2         1    999  ...                  0                   0   
3         4    999  ...                  1                   0   
4         2    999  ...                  0                   0   

   education_basic.6y  education_basic.9y  education_high.school  \
0                   0                   0                      

In [17]:
# split data into test and train sets

from sklearn.model_selection import train_test_split
import pandas as pd

# Hold out 30% of the rows for testing. The fixed random_state makes the split
# reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)

# Re-attach the label column to the training features.
train_df = train_x.copy(deep=True)
train_df['y'] = train_y
print(train_x.shape, train_y.shape)

# Re-attach the label column to the test features.
test_df = test_x.copy(deep=True)
test_df['y'] = test_y
print(test_x.shape, test_y.shape)

print(train_df.shape)
print(test_df.shape)

# Save both splits locally. index=False keeps the (shuffled) row index out of
# the CSVs; written as a column it would be picked up as a meaningless extra
# feature when the training file is loaded as a TabularDataset for AutoML.
train_df.to_csv("./data/train_data.csv", index=False)
test_df.to_csv("./data/test_data.csv", index=False)

(23065, 39) (23065,)
(9885, 39) (9885,)
(23065, 40)
(9885, 40)


In [43]:
# Copy the local ./data folder to the workspace's default datastore,
# replacing any files that are already there.
default_store = ws.get_default_datastore()
upload_options = {"src_dir": "./data", "target_path": "./data", "overwrite": True}
default_store.upload(**upload_options)

Uploading an estimated of 5 files
Uploading ./data/.amlignore
Uploaded ./data/.amlignore, 1 files out of an estimated total of 5
Uploading ./data/.amlignore.amltmp
Uploaded ./data/.amlignore.amltmp, 2 files out of an estimated total of 5
Uploading ./data/cleaned_df.csv
Uploaded ./data/cleaned_df.csv, 3 files out of an estimated total of 5
Uploading ./data/test_data.csv
Uploaded ./data/test_data.csv, 4 files out of an estimated total of 5
Uploading ./data/train_data.csv
Uploaded ./data/train_data.csv, 5 files out of an estimated total of 5
Uploaded 5 files


$AZUREML_DATAREFERENCE_c4665492a2bb4fe8b8a1b1cd71f3cc26

In [44]:
# Display the default datastore so its name, container and storage account can be checked.
default_store

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-56ae27ec-2965-4bcd-8be2-ec669d89221d",
  "account_name": "mlstrg147016",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [46]:
# Create TabularDatasets from the CSV files uploaded to the default datastore.
from azureml.core import Dataset, Datastore

# The full cleaned dataset is named `cleaned_data` rather than `clean_data` so
# that it does not shadow the clean_data() function imported from train.py.
cleaned_data = Dataset.Tabular.from_delimited_files(path=(default_store, "data/cleaned_df.csv"))
train_data = Dataset.Tabular.from_delimited_files(path=(default_store, "data/train_data.csv"))
test_data = Dataset.Tabular.from_delimited_files(path=(default_store, "data/test_data.csv"))

In [51]:
from azureml.train.automl import AutoMLConfig

# AutoML settings, collected in one dictionary and expanded into AutoMLConfig.
# NOTE: keep experiment_timeout_minutes at 30, otherwise the lab instance will
# time out. Longer experiments have to be run in your own Azure tenant, which
# will incur personal costs.
automl_settings = {
    "task": 'classification',
    "primary_metric": 'accuracy',
    "training_data": train_data,
    "label_column_name": 'y',
    "n_cross_validations": 5,
    "experiment_timeout_minutes": 30,
    "compute_target": compute_target,
}
automl_config = AutoMLConfig(**automl_settings)

In [52]:
# Submit the AutoML configuration to the experiment without streaming its
# output, then block until the remote run has finished.
remote_run = exp.submit(config=automl_config, show_output=False)
remote_run.wait_for_completion()

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_7fc03281-25e3-4a13-b192-2ec3a9e44928,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


{'runId': 'AutoML_7fc03281-25e3-4a13-b192-2ec3a9e44928',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-06-11T21:42:24.763785Z',
 'endTimeUtc': '2021-06-11T22:24:06.379791Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'compute-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"2a74c216-0e6f-45a5-93da-b9dfc1278677\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.28.0", "azureml-train": "1.28.0", "azureml-train-restclients-hyperdrive": "1.28.0", "azureml-train-core": "1.28.0", "azureml-train-automl": "1.28.0", "azureml-train-automl-runtime": "1.28.0", "azureml-train-automl-client": "1.28.

In [53]:
# Fetch the best AutoML child run together with its fitted pipeline.
best_run, fitted_model = remote_run.get_output()

# Show the run and the fitted model.
for automl_result in (best_run, fitted_model):
    print(automl_result)

# get_metrics() returns the metrics of the run.
print(f"Best run metrics : {best_run.get_metrics()}")
# get_details() returns a dictionary with the details for the run.
print(f"Best run details : {best_run.get_details()}")

Package:azureml-automl-runtime, training version:1.29.0, current version:1.28.0.post2
Package:azureml-core, training version:1.29.0, current version:1.28.0
Package:azureml-dataset-runtime, training version:1.29.0, current version:1.28.0
Package:azureml-defaults, training version:1.29.0, current version:1.28.0
Package:azureml-interpret, training version:1.29.0, current version:1.28.0
Package:azureml-mlflow, training version:1.29.0, current version:1.28.0
Package:azureml-pipeline-core, training version:1.29.0, current version:1.28.0
Package:azureml-telemetry, training version:1.29.0, current version:1.28.0
Package:azureml-train-automl-client, training version:1.29.0, current version:1.28.0
Package:azureml-train-automl-runtime, training version:1.29.0, current version:1.28.0


Run(Experiment: udacity-project,
Id: AutoML_7fc03281-25e3-4a13-b192-2ec3a9e44928_27,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
), random_state=0, reg_alpha=0, reg_lambda=0.7291666666666667, subsample=0.9, tree_method='auto'))], verbose=False)), ('9', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('logisticregression', LogisticRegression(C=2.559547922699533, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='saga', tol=0.0001, verbose=0, warm_start=False))]

In [54]:
# Register the best AutoML run's ./outputs/ path in the workspace as a model
# named "AutoML_best_model.pkl"; the returned Model object is displayed.
best_run.register_model(model_name="AutoML_best_model.pkl", model_path ="./outputs/")

Model(workspace=Workspace.create(name='quick-starts-ws-147016', subscription_id='976ee174-3882-4721-b90a-b5fef6b72f24', resource_group='aml-quickstarts-147016'), name=AutoML_best_model.pkl, id=AutoML_best_model.pkl:1, version=1, tags={}, properties={})

In [55]:
# Show the tags attached to the best AutoML run.
best_run.get_tags()

{'_aml_system_azureml.automlComponent': 'AutoML',
 '_aml_system_ComputeTargetStatus': '{"AllocationState":"steady","PreparingNodeCount":0,"RunningNodeCount":1,"CurrentNodeCount":1}',
 'mlflow.source.type': 'JOB',
 'mlflow.source.name': 'automl_driver.py',
 '_aml_system_automl_is_child_run_end_telemetry_event_logged': 'True',
 'model_explain_run_id': 'AutoML_7fc03281-25e3-4a13-b192-2ec3a9e44928_ModelExplain',
 'model_explanation': 'True'}

In [56]:
# Display the best AutoML run (experiment, run id, type and status).
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project,AutoML_7fc03281-25e3-4a13-b192-2ec3a9e44928_27,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [58]:
# Display the fitted pipeline returned for the best AutoML run.
fitted_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
), random_state=0, reg_alpha=0, reg_lambda=0.7291666666666667, subsample=0.9, tree_method='auto'))], verbose=False)), ('9', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('logisticregression', LogisticRegression(C=2.559547922699533, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='saga', tol=0.0001, verbose=0, warm_start=False))], verbose=False))], flatten_transform=None, weights=[0.13333333333333333, 0.26666666666666666, 0.13333333333333333, 0.13333333333