In [2]:
import boto3
import sagemaker

In [3]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

bucket = sagemaker_session.default_bucket()

In [4]:
bucket

'sagemaker-us-east-1-322961843176'

### Upload training data to S3

In [8]:
data_dir = 'robot_reboot_data'
prefix = 'sagemaker/robot_reboot'
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [9]:
sagemaker_session.upload_data(path='robot_reboot_model', bucket=bucket, key_prefix=prefix)

's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot'

In [10]:
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    print(obj.key)

sagemaker/robot_reboot/model_0/.data-00000-of-00001
sagemaker/robot_reboot/model_0/.index
sagemaker/robot_reboot/model_0/checkpoint
sagemaker/robot_reboot/train.tfrecords
sagemaker/robot_reboot/validation.tfrecords


In [11]:
!pygmentize src/ml/train.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m

[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36mcallbacks[39;49;00m [34mimport[39;49;00m ModelCheckpoint
[34mfrom[39;49;00m [04m[36mmodel[39;49;00m [34mimport[39;49;00m get_model

logging.getLogger().setLevel(logging.INFO)

HEIGHT = [34m31[39;49;00m
WIDTH = [34m31[39;49;00m
DEPTH = [34m9[39;49;00m
NUM_CLASSES = [34m16[39;49;00m
SHUFFLE_BUFFER_SIZE = [34m100[39;49;00m


[34mdef[39;49;00m [32mget_filenames[39;49;00m(channel_name, channel):
    [34mif[39;49;00m channel_name [35min[39;49;00m [[33m'[39;49;00m[33mtrain[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mvalidation[39;49;00m[33m'[39;49;00m, [33m'

### Create estimator

In [12]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.32.0.tar.gz (405 kB)
[K     |████████████████████████████████| 405 kB 17.2 MB/s eta 0:00:01
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.32.0-py2.py3-none-any.whl size=570899 sha256=0e480fdc36e8b22e324376ecede44846de207f119bbceee47903af4b694bd9e9
  Stored in directory: /home/ec2-user/.cache/pip/wheels/69/97/97/3cc021580a00f6ec531125f88d49baf9ec4385afabe45ecf11
Successfully built sagemaker
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.31.0
    Uninstalling sagemaker-2.31.0:
      Successfully uninstalled sagemaker-2.31.0
Successfully installed sagemaker-2.32.0


In [None]:
from sagemaker.tensorflow import TensorFlow
estimator = TensorFlow(base_job_name='robot_reboot',
                      entry_point='train.py',
                      source_dir='src/ml',
                      role=role,
                      framework_version="2.2.0",
                      py_version='py37',
                      hyperparameters={
                          'epochs': 1,
                          'model': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/model_0/',
                          'train': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot',
                          'validation': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot',
                          'eval': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot',
                          'model_version': 1, 
                          'optimizer': 'sgd',
                          'learning_rate': 0.01
                      },
                      train_instance_count=1, 
                      train_instance_type='local')

In [11]:
estimator.fit({'train': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/train.tfrecords',
              'validation': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/validation.tfrecords',
              'eval': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/validation.tfrecords'})

Creating 9a4wozzkmt-algo-1-575yo ... 
Creating 9a4wozzkmt-algo-1-575yo ... done
Attaching to 9a4wozzkmt-algo-1-575yo
[36m9a4wozzkmt-algo-1-575yo |[0m 2021-03-31 22:13:08.749991: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.
[36m9a4wozzkmt-algo-1-575yo |[0m 2021-03-31 22:13:08.750172: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:106] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
[36m9a4wozzkmt-algo-1-575yo |[0m 2021-03-31 22:13:08.773741: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.
[36m9a4wozzkmt-algo-1-575yo |[0m 2021-03-31 22:13:10,864 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
[36m9a4wozzkmt-algo-1-575yo |[0m 2021-03-31 22:13:10,871 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36m9a4w



[36m9a4wozzkmt-algo-1-575yo |[0m 2021-03-31 22:13:53,741 sagemaker-training-toolkit INFO     Reporting training SUCCESS
[36m9a4wozzkmt-algo-1-575yo exited with code 0
[0mAborting on container exit...
===== Job Complete =====


In [17]:
from sagemaker.tensorflow import TensorFlow

metric_definitions = [
    {'Name': 'loss', 'Regex': 'loss: ([0-9\\.]+)'},
    {'Name': 'v_loss', 'Regex': 'v_loss: ([0-9\\.]+)'},
    {'Name': 'p_loss', 'Regex': 'p_loss: ([0-9\\.]+)'},
    {'Name': 'v_accuracy', 'Regex': 'v_accuracy: ([0-9\\.]+)'},
    {'Name': 'p_accuracy', 'Regex': 'p_accuracy: ([0-9\\.]+)'}
]

estimator = TensorFlow(base_job_name='robot-reboot',
                      entry_point='train.py',
                      source_dir='src/ml',
                      role=role,
                      framework_version="2.2.0",
                      py_version='py37',
                      metric_definitions=metric_definitions,
                      hyperparameters={
                          'epochs': 20,
                          'model': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/model_0/',
                          'train': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot',
                          'validation': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot',
                          'eval': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot',
                          'model_version': 1,
                          'optimizer': 'sgd',
                          'learning_rate': 0.01
                      },
                      train_instance_count=1, 
                      train_instance_type='ml.c4.xlarge')

train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
estimator.fit({'train': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/train.tfrecords',
              'validation': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/validation.tfrecords',
              'eval': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/validation.tfrecords'})

2021-03-31 01:27:58 Starting - Starting the training job...
2021-03-31 01:28:24 Starting - Launching requested ML instancesProfilerReport-1617154077: InProgress
......
2021-03-31 01:29:24 Starting - Preparing the instances for training......
2021-03-31 01:30:24 Downloading - Downloading input data...
2021-03-31 01:30:49 Training - Training image download completed. Training in progress.[34m2021-03-31 01:30:51.097287: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.[0m
[34m2021-03-31 01:30:51.104686: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:106] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2021-03-31 01:30:51.271422: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.[0m
[34m2021-03-31 01:30:55,561 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_contai

[34m#015      1/Unknown - 0s 21us/step - loss: 8.4672 - v_loss: 1.1168 - p_loss: 7.3504 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015      6/Unknown - 0s 10ms/step - loss: 3.3733 - v_loss: 0.4924 - p_loss: 2.8809 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#

[34m00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   1269/Unknown - 14s 11ms/step - loss: 0.5928 - v_loss: 0.0679 - p_loss: 0.5249 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#0

[34m#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   1940/Unknown - 21s 11ms/step - loss: 0.4901 - v_loss: 0.0468 - p_loss: 0.4433 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   1945/Unknown - 21s 11ms/step - loss: 0.4897 - v_loss: 0.0467 - p_loss: 0.4431 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01

[34mknown - 35s 11ms/step - loss: 0.4016 - v_loss: 0.0306 - p_loss: 0.3710 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   3286/Unknown - 35s 11ms/step - loss: 0.4012 - v_loss: 0.0306 - p_loss: 0.3706 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#

[34m#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   3965/Unknown - 43s 11ms/step - loss: 0.3810 - v_loss: 0.0265 - p_loss: 0.3545 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#

[34moss: 0.3647 - v_loss: 0.0241 - p_loss: 0.3407 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   4617/Unknown - 51s 11ms/step - loss: 0.3647 - v_loss: 0.0240 - p_loss: 0.3406 - v_accuracy: 0.0000e+00 - p_accuracy: 0.0000e+00#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#0

[34m202 - p_loss: 0.3226 - v_accuracy: 0.0000e+00 - p_accuracy: 1.0731e-05#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   5830/Unknown - 65s 11ms/step - loss: 0.3426 - v_loss: 0.0201 - p_loss: 0.3224 - v_accuracy: 0.0000e+00 - p_accuracy: 1.0720e-05#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#01

[34m#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   6498/Unknown - 73s 11ms/step - loss: 0.3323 - v_loss: 0.0184 - p_loss: 0.3139 - v_accuracy: 0.0000e+00 - p_accuracy: 9.6183e-06#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   6503/Unknown - 73s 11ms/step - loss: 0.3324 - v_loss: 0.0184 - p_loss: 0.3140 - v_accuracy: 0.0000e+00 - p

[34maccuracy: 1.3916e-04 - p_accuracy: 3.8269e-04#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   7192/Unknown - 81s 11ms/step - loss: 0.3243 - v_loss: 0.0170 - p_loss: 0.3073 - v_accuracy: 1.3904e-04 - p_accuracy: 3.8237e-04#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010

[34mss: 0.0155 - p_loss: 0.2975 - v_accuracy: 4.6882e-04 - p_accuracy: 0.0012#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#015   8538/Unknown - 95s 11ms/step - loss: 0.3129 - v_loss: 0.0155 - p_loss: 0.2974 - v_accuracy: 4.6849e-04 - p_accuracy: 0.0012#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010#010

In [21]:
from sagemaker.tuner import HyperparameterTuner,CategoricalParameter, ContinuousParameter

metric_definitions = [
    {'Name': 'loss', 'Regex': 'loss: ([0-9\\.]+)'},
    {'Name': 'v_loss', 'Regex': 'v_loss: ([0-9\\.]+)'},
    {'Name': 'p_loss', 'Regex': 'p_loss: ([0-9\\.]+)'},
    {'Name': 'v_accuracy', 'Regex': 'v_accuracy: ([0-9\\.]+)'},
    {'Name': 'p_accuracy', 'Regex': 'p_accuracy: ([0-9\\.]+)'}
]

hyperparameter_tuner = HyperparameterTuner(estimator = estimator,
                                          objective_metric_name = 'loss',
                                          objective_type = 'Minimize',
                                          max_jobs = 20,
                                          max_parallel_jobs = 10,
                                          metric_definitions=metric_definitions,
                                          hyperparameter_ranges = {
                                                'optimizer': CategoricalParameter(['adam', 'RMSprop', 'sgd']), 
                                                'learning_rate': ContinuousParameter(0.001, 0.01)
                                         })

In [None]:
hyperparameter_tuner.fit({'train': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/train.tfrecords',
              'validation': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/validation.tfrecords',
              'eval': 's3://sagemaker-us-east-1-322961843176/sagemaker/robot_reboot/validation.tfrecords'})

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................