In [None]:
'''
0. Prerequisites

Python 3
TensorFlow 1.15
'''

# download the CIFAR-10 dataset, a collection of images with 10 target classes
!pip install ipywidgets
!wget https://raw.githubusercontent.com/awslabs/amazon-sagemaker-examples/master/advanced_functionality/tensorflow_bring_your_own/utils/generate_cifar10_tfrecords.py
!python generate_cifar10_tfrecords.py --data-dir cifar10

In [None]:
# import libraries
import time, os, sys
import sagemaker, boto3
import numpy as np
import pandas as pd

# one boto3 session backs both the low-level SageMaker client (sm) and the
# SageMaker SDK session; the execution role is looked up separately
sess = boto3.Session()
sm = sess.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=sess)
role = sagemaker.get_execution_role()

# upload the local ./cifar10 directory to S3 under the
# datasets/cifar10-dataset prefix and keep the value returned by upload_data
datasets = sagemaker_session.upload_data(
    key_prefix="datasets/cifar10-dataset",
    path="cifar10",
)

# last expression of the cell: display the upload location
datasets

In [None]:
'''
1. Start a SageMaker Experiment

A SageMaker Experiment groups the processing and training jobs that belong
to the same machine learning project. SageMaker Experiments automatically
tracks the training runs of that project.
'''

# classes from the sagemaker experiments package
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

# create the experiment through the sagemaker client opened earlier (sm)
training_experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name="sagemaker-training-experiments",
    description="Experiment to track cifar10 training trials",
)

# the new experiment is listed in the left toolbar, under
# "Components and registries" -> "Experiments and trials"

In [None]:
'''
2. Create the Trial and Training Script.

The training script trains a classifier on the CIFAR-10 dataset and is
defined in ./training_script.py.

A Trial is one iteration of the end-to-end training job. Besides the
training job itself, a Trial can also track:
    - pre-processing jobs
    - post-processing jobs
    - metadata
    - datasets

One SageMaker Experiment can hold several trials, so successive training
iterations can be followed over time in the left toolbar.
'''

# register a trial under the experiment created in step 1
single_gpu_trial = Trial.create(
    sagemaker_boto_client=sm,
    experiment_name=training_experiment.experiment_name,
    trial_name='sagemaker-single-gpu-training',
)

# display name for the trial component of the training job
trial_comp_name = 'single-gpu-training-job'

# experiment / trial / component names, bundled for estimator.fit()
experiment_config = dict(
    ExperimentName=training_experiment.experiment_name,
    TrialName=single_gpu_trial.trial_name,
    TrialComponentDisplayName=trial_comp_name,
)

# the trial shows up in the left toolbar:
# double-click the experiment name to list its associated trials

In [None]:
'''
3. Run the TensorFlow training job and visualize the results.

The training job is defined in ./training_script.py.
'''

from sagemaker.tensorflow import TensorFlow

# S3 prefix for job output and uploaded code, inside the session's default bucket
bucket_name = sagemaker_session.default_bucket()
output_path = 's3://{}/jobs'.format(bucket_name)

# metric captured from the training logs by regex
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]

# fixed hyperparameter values for this single training run
hyperparams = {
    'epochs': 30,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
}

# estimator: one ml.g4dn.xlarge instance running training_script.py
# on TensorFlow 1.15.2 / Python 3 in script mode
tf_estimator = TensorFlow(
    entry_point='training_script.py',
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version='1.15.2',
    py_version='py3',
    script_mode=True,
    train_instance_type='ml.g4dn.xlarge',
    train_instance_count=1,
    hyperparameters=hyperparams,
    metric_definitions=metric_definitions,
    output_path=output_path + '/',
    code_location=output_path,
)

# start the job under a UTC-timestamped name; every input channel points at
# the same uploaded dataset location, and the run is recorded in the trial
job_name = 'tensorflow-single-gpu-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
tf_estimator.fit(
    dict.fromkeys(('training', 'validation', 'eval'), datasets),
    job_name=job_name,
    experiment_config=experiment_config,
)

In [None]:
# Test accuracy of the single training job in step 3 is about 67%

'''
4. Automatically tune the model with SageMaker for better results.

In this step, we run a SageMaker automatic model tuning job to find the best
hyperparameters for the model. Instead of specifying exact hyperparameter
values, we specify ranges (integer, continuous, categorical) and run training
jobs for each set of values.
'''

from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# search space explored by the tuning job
# NOTE(review): 'weight-decay' is neither tuned nor passed to the estimator
# below, so the training script's own default (if any) applies; confirm
hyperparameter_ranges = {
    'epochs'        : IntegerParameter(5, 30),
    'learning-rate' : ContinuousParameter(0.001, 0.1, scaling_type='Logarithmic'), 
    'batch-size'    : CategoricalParameter(['128', '256', '512']),
    'momentum'      : ContinuousParameter(0.9, 0.99),
    'optimizer'     : CategoricalParameter(['sgd', 'adam'])
}

# the tuner maximizes the validation accuracy captured from the training logs
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]

# Use the same entry point (training_script.py) and framework version
# (1.15.2) as the single training job in step 3, so the tuned results are
# comparable with that baseline. This cell previously pointed at
# 'cifar10-training-sagemaker.py', a script the notebook never mentions.
# No fixed hyperparameters are passed: the tuner supplies them per job.
tf_estimator = TensorFlow(entry_point          = 'training_script.py', 
                          output_path          = f'{output_path}/',
                          code_location        = output_path,
                          role                 = role,
                          train_instance_count = 1, 
                          train_instance_type  = 'ml.g4dn.xlarge',
                          framework_version    = '1.15.2', 
                          py_version           = 'py3',
                          script_mode          = True,
                          metric_definitions   = metric_definitions,
                          sagemaker_session    = sagemaker_session)

# the tuner is now what will be fitted to the data:
# at most 16 training jobs in total, up to 8 running at the same time
tuner = HyperparameterTuner(estimator             = tf_estimator,
                            objective_metric_name = objective_metric_name,
                            hyperparameter_ranges = hyperparameter_ranges,
                            metric_definitions    = metric_definitions,
                            max_jobs              = 16,
                            max_parallel_jobs     = 8,
                            objective_type        = objective_type)

# start the tuning job under a UTC-timestamped name; all three channels
# point at the same uploaded dataset location
job_name=f'tf-hpo-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
tuner.fit({'training'  : datasets,
           'validation': datasets,
           'eval'      : datasets},
            job_name = job_name)

In [None]:
# Best test accuracy is about ~80%

'''
5. Clean up resources.
'''
!aws s3 rm --recursive s3://sagemaker-REGION-ACCOUNTNUMBER/datasets/cifar10-dataset
!aws s3 rm --recursive s3://sagemaker-REGION-ACCOUNTNUMBER/jobs