# 2) Model choice and hpo

In [None]:
%%capture
!pip install smdebug
!pip install torchvision --no-cache-dir  

In [3]:
import json
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot

import boto3
import os
import numpy as np

from PIL import Image
import IPython

# Key prefix under which everything belonging to this project is stored in S3.
prefix = "capstone-inventory-project"

# A single SageMaker session provides the default bucket and the region;
# the execution role is looked up separately.
session = Session()
bucket = session.default_bucket()
region = session.boto_region_name
role = get_execution_role()

# Echo the resolved settings so the run context is visible in the notebook.
print("Default Bucket: {}".format(bucket))
print("AWS Region: {}".format(region))
print("RoleArn: {}".format(role))

Default Bucket: sagemaker-us-east-1-837030799965
AWS Region: us-east-1
RoleArn: arn:aws:iam::837030799965:role/service-role/AmazonSageMaker-ExecutionRole-20211207T163039


In this part we fine-tune a pretrained model using hyperparameter tuning.

We will use the hpo.py script to perform hyperparameter tuning.

I implemented my training in a slightly different way than in the exercises:

At the end of each epoch, the loss and accuracy are calculated on the entire validation set. I then use those metrics to keep track of the best-performing model (model weights), which is saved at the end of the training.
Once the training is done, the model is evaluated against the test dataset. This test metric is the one the hyperparameter tuning job optimizes.

In [12]:
# Search space handed to the HyperparameterTuner: one continuous range
# ("lr") and two categorical choices ("batch-size" and "model").
hyperparameter_ranges = {
    "lr": ContinuousParameter(min_value=0.001, max_value=0.1),
    "batch-size": CategoricalParameter(values=[16, 32, 64]),
    "model": CategoricalParameter(values=["resnet", "vgg", "alexnet"]),
}

In [13]:
# Base estimator: every training job of the tuning sweep runs scripts/hpo.py
# with this configuration on a single ml.g4dn.xlarge instance.
estimator = PyTorch(
    entry_point="scripts/hpo.py",
    role=role,
    framework_version="1.8",
    py_version="py36",
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    # The training jobs' output (mainly model artefacts) is written here.
    output_path="s3://{}/{}/hyperparameter_tuning".format(bucket, prefix),
    # Run on managed spot capacity.
    use_spot_instances=True,
    # Upper bound on the training run time of a job.
    max_run=1800,
    # NOTE(review): per the SageMaker SDK docs, max_wait is the overall limit
    # (waiting for spot capacity plus training), so it must be >= max_run -- confirm.
    max_wait=3600,
)

# Objective of the sweep: the value captured by the regex from the job logs,
# to be minimized.
# NOTE(review): the metric is named "average test loss" but the regex captures
# lines starting with "Validation set:" -- confirm against the log lines
# printed by scripts/hpo.py that this is the intended metric.
objective_metric_name = "average test loss"
objective_type = "Minimize"
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": "Validation set: Average loss: ([0-9\\.]+)",
    }
]

# Four training jobs in total, run one after another.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=4,
    max_parallel_jobs=1,
)

In [None]:
# Start the tuning job; the "train" channel points at the project's data
# prefix in the default bucket.
tuner.fit(
    inputs={
        "train": "s3://{}/{}/data".format(bucket, prefix),
    }
)

...........................................................................................................................................................................................................

In [None]:
# Ask the tuner for the estimator of its best training job.
best_estimator = tuner.best_estimator()

# Show the hyperparameters of that best job as the cell output.
best_hyperparameters = best_estimator.hyperparameters()
best_hyperparameters

The kernel died, so I had to manually re-instantiate the best estimator from its training job name.

In [None]:
# Name of the training job used to re-create the best estimator by hand.
BestTrainingJobName = "pytorch-training-211211-0226-002-b56fce2a"

In [None]:
# Re-create an estimator from the existing training job, then display the
# hyperparameters it was trained with.
my_estimator = sagemaker.estimator.Estimator.attach(
    training_job_name=BestTrainingJobName
)
my_estimator.hyperparameters()