# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
import joblib
import pandas as pd

from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.hyperdrive.parameter_expressions import uniform, loguniform, choice, randint
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails

In [2]:
# Connect to the workspace via its saved configuration and open the
# experiment under which the HyperDrive runs will be submitted.
experiment_name = 'fraud-detection'
ws = Workspace.from_config()

experiment = Experiment(workspace=ws, name=experiment_name)

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code FCBJRNN27 to authenticate.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.


## Dataset

We have imported the Credit Card Fraud Detection dataset into Azure ML Studio and registered it in the workspace under the name `creditcardfraud`.

In [3]:
# Look up the registered fraud dataset in the workspace, load it into a
# pandas DataFrame and display its summary statistics.
key = "creditcardfraud"
description_text = "Dataset for capstone project"

found = key in ws.datasets.keys()
if not found:
    # Fail here with an actionable message; continuing would only surface
    # later as a NameError on the undefined `dataset` variable.
    raise LookupError(
        "Dataset '{}' is not registered in the workspace. "
        "Please create a dataset".format(key)
    )

dataset = ws.datasets[key]
df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284806.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.841366,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.958151e-16,-3.14764e-15,...,1.471982e-16,8.042109e-16,5.28245e-16,4.458267e-15,1.426896e-15,1.70164e-15,-3.671606e-16,-1.218152e-16,88.349619,0.001727
std,47488.22833,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.25,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84691.5,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.75,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [4]:
# Reuse the compute cluster when it already exists in the workspace;
# otherwise provision a new one with the configuration below.
cluster_name = 'cluster'

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

# Wait for the cluster operation to finish, printing its progress.
compute_target.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Hyperdrive Configuration

We are going to randomly sample the hyperparameters `n_estimators` and `max_depth` from the search space using `RandomParameterSampling` in our HyperDrive experiment.

In [5]:
# Early termination policy: Bandit policy with slack_factor 0.1 and
# evaluation_interval 3.
etp = BanditPolicy(evaluation_interval=3, slack_factor=0.1)

# Search space: each hyperparameter is drawn from a discrete set of choices
# and passed to the training script as a command-line argument.
search_space = {
    "--n_estimators": choice(25, 50, 75, 100),
    "--max_depth": choice(2, 5, 10, 15),
}
ps = RandomParameterSampling(search_space)

# Estimator that runs train.py from the current directory on the cluster.
# NOTE(review): the SDK reports 'SKLearn' as deprecated in favour of
# 'ScriptRunConfig'; consider migrating once the training environment is defined.
est = SKLearn(
    source_directory=".",
    entry_script='train.py',
    compute_target=compute_target,
)

# HyperDrive sweep: maximise AUC_weighted over at most 8 runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=etp,
    primary_metric_name='AUC_weighted',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=8,
    max_concurrent_runs=4,
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [6]:
# Submit the HyperDrive configuration to the experiment; `hdr` is the
# handle to the resulting parent run used by the cells below.
hdr = experiment.submit(
    config=hyperdrive_run_config,
    show_output=True,
)



## Run Details


In [7]:
# Render the run-details widget for the HyperDrive run.
run_widget = RunDetails(hdr)
run_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [8]:
# Wait for the HyperDrive run to finish while streaming its log output; the
# returned run details are displayed as this cell's output.
hdr.wait_for_completion(show_output=True)

RunId: HD_8bbe25c0-9b7b-4043-a238-36513509eb18
Web View: https://ml.azure.com/experiments/fraud-detection/runs/HD_8bbe25c0-9b7b-4043-a238-36513509eb18?wsid=/subscriptions/d7f39349-a66b-446e-aba6-0053c2cf1c11/resourcegroups/aml-quickstarts-138174/workspaces/quick-starts-ws-138174

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-02-08T17:45:51.288240][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-02-08T17:45:50.372127][API][INFO]Experiment created<END>\n""<START>[2021-02-08T17:45:51.593269][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-02-08T17:45:52.3330123Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_8bbe25c0-9b7b-4043-a238-36513509eb18
Web View: https://ml.azure.com/experiments/fraud-detection/runs/HD_8bbe25c0-9b7b-4043-a238-36513509eb18?wsid=/subscriptions/d7f39

{'runId': 'HD_8bbe25c0-9b7b-4043-a238-36513509eb18',
 'target': 'cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-08T17:45:50.097131Z',
 'endTimeUtc': '2021-02-08T17:58:20.970007Z',
 'properties': {'primary_metric_config': '{"name": "AUC_weighted", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '9ce4bac8-b8fd-46e5-adc5-729ef994248c',
  'score': '0.9822392202519724',
  'best_child_run_id': 'HD_8bbe25c0-9b7b-4043-a238-36513509eb18_1',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://mlstrg138174.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_8bbe25c0-9b7b-4043-a238-36513509eb18/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=CYk4TlK5meP9BVMmQqBOirxPt%2FGUYCkN7iV1kC%2BUvaA%3D&st=2021-02-08T17%3A48%3A22Z&se=2021-02-09T01%3A58%3A22Z&sp=r'},
 'submittedBy': 'ODL_User 138174'}

## Best Model

In [9]:
# Pick the child run with the best primary metric (AUC_weighted) and list
# the files stored with it, including the saved model under outputs/.
fraud_detector = hdr.get_best_run_by_primary_metric()
fraud_detector.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_5bb793346ddb66ba129a50203d28feb5489a6ff846c35706d4078aba8dbff1d3_d.txt',
 'azureml-logs/65_job_prep-tvmps_5bb793346ddb66ba129a50203d28feb5489a6ff846c35706d4078aba8dbff1d3_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_5bb793346ddb66ba129a50203d28feb5489a6ff846c35706d4078aba8dbff1d3_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/106_azureml.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/run_0.9822392202519724__100_15.joblib']

In [10]:
# Display the summary of the best run (experiment, id, type, status).
fraud_detector

Experiment,Id,Type,Status,Details Page,Docs Page
fraud-detection,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_1,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [15]:
# Save the best model locally.
# The model file name embeds the run's metric and hyperparameters, so it is
# looked up from the run's file list rather than hard-coded; this keeps the
# cell valid on a re-run and keeps the local name identical to the remote one.
model_files = [
    name for name in fraud_detector.get_file_names()
    if name.startswith('outputs/') and name.endswith('.joblib')
]
if not model_files:
    raise FileNotFoundError("The best run has no .joblib model under outputs/")

best_model_file = model_files[0]
fraud_detector.download_file(
    name=best_model_file,
    output_file_path='./' + best_model_file,
)

##  Best Model Details:

In [16]:
# Gather the run definition and the logged metrics of the best run, then
# report the run id together with its metrics.
best_run_details = fraud_detector.get_details()
fraud_detector_parameters = best_run_details['runDefinition']
fraud_detector_metrics = fraud_detector.get_metrics()

print('Best Run Id: ', fraud_detector.id)
print('\n Metrics', fraud_detector_metrics)

Best Run Id:  HD_8bbe25c0-9b7b-4043-a238-36513509eb18_1

 Metrics {'n_estimators:': 100.0, 'max_depth:': 15, 'AUC_weighted': 0.9822392202519724}


In [17]:
# Show the full details of the best run, including the arguments that were
# passed to train.py (see 'runDefinition' -> 'arguments').
fraud_detector.get_details()

{'runId': 'HD_8bbe25c0-9b7b-4043-a238-36513509eb18_1',
 'target': 'cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-08T17:49:38.917306Z',
 'endTimeUtc': '2021-02-08T17:54:25.675399Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '9ce4bac8-b8fd-46e5-adc5-729ef994248c',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '5cb844ff-e534-4b73-91df-5301685c3005'}, 'consumptionDetails': {'type': 'Reference'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--max_depth', '15', '--n_estimators', '100'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cluster',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': None,
  'nodeCount': 1,
  'priority': None,
  'credenti

## Effect of Hyperparameters on Primary Metric

In [18]:
# Tabulate all child runs, sorted by the primary metric, to compare how each
# hyperparameter combination performed. pandas is imported with the other
# dependencies in the import cell at the top of the notebook.
pd.DataFrame(hdr.get_children_sorted_by_primary_metric())

Unnamed: 0,run_id,hyperparameters,best_primary_metric,status
0,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_1,"{""--max_depth"": 15, ""--n_estimators"": 100}",0.982239,Completed
1,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_7,"{""--max_depth"": 10, ""--n_estimators"": 100}",0.982197,Completed
2,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_5,"{""--max_depth"": 15, ""--n_estimators"": 75}",0.980011,Completed
3,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_0,"{""--max_depth"": 5, ""--n_estimators"": 75}",0.97531,Completed
4,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_2,"{""--max_depth"": 15, ""--n_estimators"": 25}",0.966491,Completed
5,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_3,"{""--max_depth"": 2, ""--n_estimators"": 100}",0.946405,Completed
6,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_4,"{""--max_depth"": 5, ""--n_estimators"": 50}",0.938298,Completed
7,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_6,"{""--max_depth"": 2, ""--n_estimators"": 50}",0.936977,Completed
8,HD_8bbe25c0-9b7b-4043-a238-36513509eb18_prepar...,,,Completed


Using the information above, we can see that the AUC generally increases as `max_depth` increases. Likewise, as the number of estimators increases, the AUC of the model also increases.

In [25]:
# Register the best model.
# The model file name embeds the run's metric and hyperparameters, so it is
# resolved from the run's file list rather than hard-coded; a hard-coded
# path stops matching as soon as the sweep is re-run.
candidate_model_paths = [
    name for name in fraud_detector.get_file_names()
    if name.startswith('outputs/') and name.endswith('.joblib')
]
if not candidate_model_paths:
    raise FileNotFoundError("The best run has no .joblib model under outputs/")

best_model_path = candidate_model_paths[0]
model = fraud_detector.register_model(
    model_name='credit-card-fraud-detection-hyperdrive',
    model_path=best_model_path,
)

In [26]:
# Display the registered model (name, id, version).
model

Model(workspace=Workspace.create(name='quick-starts-ws-138174', subscription_id='d7f39349-a66b-446e-aba6-0053c2cf1c11', resource_group='aml-quickstarts-138174'), name=credit-card-fraud-detection-hyperdrive, id=credit-card-fraud-detection-hyperdrive:1, version=1, tags={}, properties={})