# Hyperparameter Tuning using HyperDrive


In [1]:
from azureml.train.hyperdrive import RandomParameterSampling
from azureml.train.hyperdrive import normal, uniform, choice
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.dataset import Dataset
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Load the workspace from the local configuration and open the experiment
# that the HyperDrive sweep is submitted to.
ws = Workspace.from_config()
experiment_name = 'predict_rain_hyperdrive'

experiment=Experiment(ws, experiment_name)
# No interactive run is started here: submitting the HyperDrive config creates
# its own run, and an extra start_logging() run that is never completed would
# remain in the "Running" state in the experiment.

amlcompute_cluster_name = "notebook144713"

# Reuse the compute cluster if it already exists; otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

# Block until the cluster is ready, for both the reused and the new cluster.
compute_target.wait_for_completion(show_output=True)

Found existing cluster, use it.

Jobrunning.........................................
Running


## Dataset

In [2]:
# Name under which the dataset is registered in the workspace.
key = "rain-in-australia"
description_text = """
This dataset contains about 10 years of daily weather observations from many locations across Australia. 
RainTomorrow is the target variable to predict. It means -- did it rain the next day, Yes or No? This column is Yes if the rain for that day was 1mm or more.

Source & Acknowledgements
Observations were drawn from numerous weather stations. The daily observations are available from http://www.bom.gov.au/climate/data.
An example of latest weather observations in Canberra: http://www.bom.gov.au/climate/dwo/IDCJDW2801.latest.shtml

Definitions adapted from http://www.bom.gov.au/climate/dwo/IDCJDW0000.shtml
Data source: http://www.bom.gov.au/climate/dwo/ and http://www.bom.gov.au/climate/data.

Copyright Commonwealth of Australia 2010, Bureau of Meteorology.2
Dataset compiled by Joe Young, hosted on kaggle.com 
https://www.kaggle.com/jsphyg/weather-dataset-rattle-package?select=weatherAUS.csv
"""

# Look the registered datasets up once and reuse the result for both the
# membership test and the retrieval.
registered_datasets = ws.datasets
found = key in registered_datasets

if found:
    # The dataset is already registered in this workspace: reuse it.
    dataset = registered_datasets[key]
else:
    # Create a tabular AML dataset from the hosted CSV and register it so
    # later runs of this cell take the branch above.
    url = 'https://raw.githubusercontent.com/Aschteroth/udacity_capstone_project/main/weatherAUS.csv'
    dataset = Dataset.Tabular.from_delimited_files(url)
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description=description_text)

# Materialize as a pandas DataFrame and summarize columns, dtypes and null counts.
df = dataset.to_pandas_dataframe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
Date             145460 non-null object
Location         145460 non-null object
MinTemp          143975 non-null float64
MaxTemp          144199 non-null float64
Rainfall         142199 non-null float64
Evaporation      82670 non-null object
Sunshine         75625 non-null object
WindGustDir      135134 non-null object
WindGustSpeed    135197 non-null float64
WindDir9am       134894 non-null object
WindDir3pm       141232 non-null object
WindSpeed9am     143693 non-null float64
WindSpeed3pm     142398 non-null float64
Humidity9am      142806 non-null float64
Humidity3pm      140953 non-null float64
Pressure9am      130395 non-null float64
Pressure3pm      130432 non-null float64
Cloud9am         89572 non-null float64
Cloud3pm         86102 non-null float64
Temp9am          143693 non-null float64
Temp3pm          141851 non-null float64
RainToday        142199 non-null object

## Hyperdrive Configuration

For the HyperDrive experiment, I chose a [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/tree.html) model from the scikit-learn library. A decision tree is a non-parametric supervised learning method used for classification and regression that predicts a label column by learning decision rules from the data features.

I decided to use config parameters similar to the parameters used in the first project of this course: 
- Early termination improves computational efficiency, but might return a slightly worse result by missing some good candidates. I chose the "Bandit" policy, an aggressive policy based on a slack factor/slack amount and an evaluation interval: it terminates early any run whose primary metric is not within the specified slack factor/slack amount of the best-performing training run. I specified a slack factor of 0.1 and an evaluation interval of 3. [Source and further reading](https://azure.github.io/azureml-sdk-for-r/reference/bandit_policy.html)
- Random search is a technique where random combinations of hyperparameter values are tried in order to find the best configuration for the model. Compared to grid search, where every combination from a preset list of hyperparameter values is tried and the model is evaluated for each one, random search can yield equal or even better results with comparatively fewer resources. Since the provided lab is limited to 4 hours, I chose the faster RandomParameterSampling. [Source and further reading](https://medium.com/@senapati.dipak97/grid-search-vs-random-search-d34c92946318#:~:text=Random%20search%20works%20best%20for,are%20less%20number%20of%20dimensions)




In [10]:
# Bandit early-termination policy: slack factor 0.1, evaluation interval 3.
early_termination_policy = BanditPolicy(evaluation_interval=3, slack_factor=0.1)

# Discrete search space, sampled at random; each key is passed to train.py
# as a command-line argument.
param_sampling = RandomParameterSampling(
    {
        "--criterion": choice("gini", "entropy"),
        "--splitter": choice("best", "random"),
        "--max_depth": choice(*range(3, 11)),  # 3, 4, ..., 10
    }
)

# Estimator that runs train.py from the current directory on the cluster.
estimator = SKLearn(
    source_directory='.',
    compute_target=compute_target,
    entry_script="train.py",
)

# Sweep settings: maximize AUC_weighted over at most 8 runs, 4 at a time.
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="AUC_weighted",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=8,
    max_concurrent_runs=4,
)



In [11]:
# Submit the HyperDrive configuration to the experiment and keep the run handle.
hyperdrive_run = experiment.submit(config=hyperdrive_run_config)



## Run Details

In [12]:
# Show the notebook widget for the submitted HyperDrive run.
run_details_widget = RunDetails(hyperdrive_run)
run_details_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

The best run from the HyperDrive experiment had a **weighted AUC** of 0.86481; its parameters were criterion: gini, splitter: best, and a max depth of 6.
The run took 5 minutes and 48 seconds to complete.

In [13]:
# The RunDetails widget does not block, so on a top-to-bottom execution the
# sweep may still be running here. Wait for it to finish before asking for
# the best child run.
hyperdrive_run.wait_for_completion(show_output=False)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message instead of an AttributeError on the next line.
    raise RuntimeError(
        "HyperDrive finished without a child run that reported the primary metric."
    )
best_run_metrics = best_run.get_metrics()
 
print('Best Run Id: ', best_run.id)
print('AUC_weighted of Best Run is:', best_run_metrics['AUC_weighted'])
# The sampled hyperparameters are read from the command-line arguments
# recorded in the child run's definition.
print('Parameter Values are:',best_run.get_details()['runDefinition']['arguments'])

Best Run Id:  HD_32cb3902-72f1-44c4-aaa5-9cec212577f9_6
AUC_weighted of Best Run is: 0.8648180704984888
Parameter Values are: ['--criterion', 'gini', '--max_depth', '6', '--splitter', 'best']


In [15]:
#Save the best model
hyperdrivemodel = best_run.register_model(model_name='rain_hyperdrive_model', 
                                    model_path='outputs/model.joblib',
                                    tags={'Method':'Hyperdrive'},
                                    properties={'AUC_weighted': best_run_metrics['AUC_weighted']})

print(hyperdrivemodel)

Model(workspace=Workspace.create(name='quick-starts-ws-144713', subscription_id='61c5c3f0-6dc7-4ed9-a7f3-c704b20e3b30', resource_group='aml-quickstarts-144713'), name=rain_hyperdrive_model, id=rain_hyperdrive_model:1, version=1, tags={'Method': 'Hyperdrive'}, properties={'AUC_weighted': '0.8648180704984888'})


## Model Deployment

Since the AutoML experiment performed slightly better than the HyperDrive run, I will deploy the best AutoML model instead of the HyperDrive model.