## Hyper Parameter Tuning with XGBoost

In [1]:
import boto3
import sagemaker
import pandas as pd
from sagemaker.inputs import TrainingInput

In [2]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

### Load S3 Location of Data

In [3]:
processed_train_data_s3_uri = 's3://ads-508-group-6-final/churn_model_data/train/data.csv'
processed_validation_data_s3_uri = 's3://ads-508-group-6-final/churn_model_data/validation/data.csv'
processed_test_data_s3_uri = 's3://ads-508-group-6-final/churn_model_data/test/data.csv'

In [4]:
s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri)
s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri)
s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri)

print(s3_input_train_data.config)
print(s3_input_validation_data.config)
print(s3_input_test_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ads-508-group-6-final/churn_model_data/train/data.csv', 'S3DataDistributionType': 'FullyReplicated'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ads-508-group-6-final/churn_model_data/validation/data.csv', 'S3DataDistributionType': 'FullyReplicated'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ads-508-group-6-final/churn_model_data/test/data.csv', 'S3DataDistributionType': 'FullyReplicated'}}}


### Setup Static Hyper-Parameters

In [11]:
max_depth = 6
eta = .3
gamma = 0
min_child_weight = 1
subsample = 1
verbosity = 1
objective = 'binary:hinge'
tree_method = 'auto'
predictor = 'auto'

enable_sagemaker_debugger = False
enable_checkpointing = False
enable_tensorboard = False
input_mode = "File"
run_validation = True
run_test = True
run_sample_predictions = True

### Setup Hyper-Parameter Ranges to Explore

In [12]:
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import CategoricalParameter
from sagemaker.tuner import HyperparameterTuner

hyperparameter_ranges = {"max_depth": IntegerParameter(3, 15, scaling_type = 'Auto'),
                         "min_child_weight": IntegerParameter(1, 7, scaling_type = 'Auto'),
                         "gamma": ContinuousParameter(0, .4, scaling_type = 'Auto'), 
                         "eta": ContinuousParameter(.3, .7, scaling_type = 'Auto')}

### Setup Metrics

In [13]:
metrics_definitions = [{"Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)"},
                       {"Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"}]

### Create Estimator

In [18]:
from sagemaker.xgboost.estimator import XGBoost

xgb_estimator = XGBoost(
    entry_point="abilone.py",
    source_dir="src",
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    framework_version="1.0-1",    
    hyperparameters={
        "max_depth": max_depth,
        "eta": eta,
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "verbosity": verbosity,
        "objective": objective,
        "tree_method": tree_method,
        "predictor": predictor},
    input_mode=input_mode,
    metric_definitions=metrics_definitions
#                       max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute
)

### Setup HyperparameterTuner with Estimator and Hyper-Parameter Ranges

In [15]:
objective_metric_name = "validation:accuracy"

tuner = HyperparameterTuner(
    estimator=xgb_estimator,
    objective_type="Maximize",
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metrics_definitions,
    max_jobs=2,
    max_parallel_jobs=1,
    strategy="Bayesian",
    early_stopping_type="Auto",
)

### Start Tuning Job

In [16]:
tuner.fit(
    inputs={"train": s3_input_train_data, "validation": s3_input_validation_data, "test": s3_input_test_data},
    include_cls_metadata=False,
    wait=False,
)

ClientError: An error occurred (ValidationException) when calling the CreateHyperParameterTuningJob operation: You can't override the metric definitions for Amazon SageMaker algorithms. Please retry the request without specifying metric definitions.

### Tuning Job Status

In [13]:
from pprint import pprint

tuning_job_name = tuner.latest_tuning_job.job_name

In [14]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/{}">Hyper-Parameter Tuning Job</a></b>'.format(
            region, tuning_job_name
        )
    )
)

### Show the Tuning Job
### _Note:  This will fail at first.  Please wait about 15-30 seconds and re-run._

In [None]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

hp_results = HyperparameterTuningJobAnalytics(sagemaker_session=sess, hyperparameter_tuning_job_name=tuning_job_name)

df_results = hp_results.dataframe()
df_results.shape

In [None]:
df_results.sort_values("FinalObjectiveValue", ascending=0)

# Show the Best Candidate

In [None]:
df_results.sort_values("FinalObjectiveValue", ascending=0).head(1)

# Log the Best Hyper-Parameter and Objective Metric in the Experiment

Logging `learning_rate` parameter and `accuracy` metric

In [None]:
best_learning_rate = df_results.sort_values("FinalObjectiveValue", ascending=0).head(1)["learning_rate"]
print(best_learning_rate)

In [None]:
best_accuracy = df_results.sort_values("FinalObjectiveValue", ascending=0).head(1)["FinalObjectiveValue"]
print(best_accuracy)

In [None]:
tracker_optimize.log_parameters({"learning_rate": float(best_learning_rate)})

# must save after logging
tracker_optimize.trial_component.save()

In [None]:
tracker_optimize.log_metric("accuracy", float(best_accuracy))

# must save after logging
tracker_optimize.trial_component.save()

# Show Experiment Analytics

In [None]:
from sagemaker.analytics import ExperimentAnalytics

lineage_table = ExperimentAnalytics(
    sagemaker_session=sess,
    experiment_name=experiment_name,
    metric_names=["validation:accuracy"],
    sort_by="CreationTime",
    sort_order="Descending",
)

df_lineage = lineage_table.dataframe()
df_lineage.shape

In [None]:
df_lineage

# Pass `tuning_job_name` to the Next Notebook

In [None]:
print(tuning_job_name)

In [None]:
%store tuning_job_name

In [None]:
%store

# Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}