In [31]:
import os
import json

import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role

# IAM role that is handed to the estimator further down.
role = get_execution_role()
# SageMaker session; used later to upload the dataset to S3.
sess = sagemaker.Session()

In [34]:
import logging
import boto3
from botocore.exceptions import ClientError

# Local directory the MNIST archives are downloaded into.
local_data_dir = "/tmp/data"
# S3 bucket the MNIST archives are downloaded from.
public_bucket = "sagemaker-sample-files"

def download_from_s3(data_dir="/tmp/data", train=True):
    """Download one MNIST split from the ``public_bucket`` S3 bucket.

    Args:
        data_dir: Local directory that receives the files. It is created
            (including parents) when it does not exist yet.
        train: When true, fetch the ``train-*`` image/label archives;
            otherwise fetch the ``t10k-*`` archives.

    Returns:
        None. A file that is already present in ``data_dir`` is left as is
        and not downloaded again.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory.
    os.makedirs(data_dir, exist_ok=True)

    split = "train" if train else "t10k"
    file_names = [
        split + "-images-idx3-ubyte.gz",
        split + "-labels-idx1-ubyte.gz",
    ]

    s3 = boto3.client("s3")
    for file_name in file_names:
        # S3 keys always use "/" as separator; os.path.join would produce
        # backslashes on Windows and point at a non-existent object.
        key = "datasets/image/MNIST/" + file_name
        # The destination is a local path, so the OS-specific join is correct.
        dest = os.path.join(data_dir, file_name)
        if not os.path.exists(dest):
            s3.download_file(public_bucket, key, dest)


# Fetch both splits: first the training archives, then the test archives.
download_from_s3(data_dir=local_data_dir, train=True)
download_from_s3(data_dir=local_data_dir, train=False)

In [35]:
# TensorFlow estimator describing a single training job; the tuner below
# launches copies of it with different hyperparameter values.
est = TensorFlow(
    # Training code: script name and the directory that contains it.
    source_dir="code",
    entry_point="train.py",
    # Runtime: framework and Python versions.
    framework_version="2.3.1",
    py_version="py37",
    # Permissions and hardware.
    role=role,
    instance_count=1,
    instance_type="ml.m5.4xlarge",
    volume_size=250,
    # Model directory passed to the estimator.
    model_dir="/opt/ml/model",
    # Hyperparameters that stay fixed for every job.
    hyperparameters={"batch-size": 512, "epochs": 4},
)

In [36]:
# Destination for the training data: bucket name and key prefix inside it.
bucket = "test-sagemaker-examples-1357942113492"
prefix = "mnist"
print(bucket)

# Upload the local data directory and keep the location returned by the upload.
loc = sess.upload_data(path=local_data_dir, bucket=bucket, key_prefix=prefix)

# Both channels point at the same uploaded location.
channels = dict.fromkeys(("training", "testing"), loc)

test-sagemaker-examples-1357942113492


In [37]:
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner

# Range the tuner searches over. The name is hyphenated to match the
# estimator's fixed hyperparameters ("batch-size").
# NOTE(review): train.py is not visible from this notebook -- confirm the
# spelling against the script's argument parser. A name the script does not
# recognise is presumably what makes every training job fail.
hyperparameter_range = {"learning-rate": ContinuousParameter(1e-4, 1e-3)}

In [38]:
# Direction of optimisation and the name of the metric being optimised.
objective_type = "Minimize"
objective_metric_name = "average test loss"

# One metric: its name plus a regex whose capture group matches a run of
# digits and dots following "Test Loss: ".
metric_definitions = [
    dict(Name=objective_metric_name, Regex="Test Loss: ([0-9\\.]+)"),
]

In [33]:
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

# Profiler configuration with framework profiling left at its defaults.
# NOTE(review): confirm this object has any effect when training is started
# through HyperparameterTuner rather than through the estimator directly.
profiler_config = ProfilerConfig(framework_profile_params=FrameworkProfile())

Framework profiling will be deprecated from tensorflow 2.12 and pytorch 2.0 in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [42]:
# Tune the estimator over the ranges in hyperparameter_range: at most three
# training jobs in total, all of which may run at the same time.
# Keyword arguments make the role of each value explicit.
tuner = HyperparameterTuner(
    estimator=est,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_range,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=3,
    max_parallel_jobs=3,
)

# Only the input channels are passed: a profiler configuration is not a
# parameter of HyperparameterTuner.fit, so passing one here had no effect.
tuner.fit(inputs=channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


Using provided s3_resource
.............................*


UnexpectedStatusException: Error for HyperParameterTuning job tensorflow-training-230517-1205: Failed. Reason: All training jobs failed. Please take a look at the training jobs failures to get more details.

In [None]:
# Deploy from the tuner onto a single ml.m5.xlarge instance.
predictor = tuner.deploy(
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)