In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
from sagemaker import get_execution_role
import sagemaker as sage
from sagemaker.tensorflow import TensorFlow
import boto3

In [23]:
sess = sage.Session()
role = get_execution_role()
s3 = boto3.client('s3')

In [9]:
help(sage.estimator.EstimatorBase)

Help on class EstimatorBase in module sagemaker.estimator:

class EstimatorBase(builtins.object)
 |  Handle end-to-end Amazon SageMaker training and deployment tasks.
 |  
 |  For introduction to model training and deployment, see
 |  http://docs.aws.amazon.com/sagemaker/latest/dg/how-it-works-training.html
 |  
 |  Subclasses must define a way to determine what image to use for training,
 |  what hyperparameters to use, and how to create an appropriate predictor
 |  instance.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, role, instance_count=None, instance_type=None, volume_size=30, volume_kms_key=None, max_run=86400, input_mode='File', output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None, tags=None, subnets=None, security_group_ids=None, model_uri=None, model_channel_name='model', metric_definitions=None, encrypt_inter_container_traffic=False, use_spot_instances=False, max_wait=None, checkpoint_s3_uri=None, checkpoint_local_path=None, rules=None,

In [8]:
help(sage.estimator.Framework)

Help on class Framework in module sagemaker.estimator:

class Framework(EstimatorBase)
 |  Base class that cannot be instantiated directly.
 |  
 |  Subclasses define functionality pertaining to specific ML frameworks,
 |  such as training/deployment images and predictor instances.
 |  
 |  Method resolution order:
 |      Framework
 |      EstimatorBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, entry_point, source_dir=None, hyperparameters=None, container_log_level=20, code_location=None, image_uri=None, dependencies=None, enable_network_isolation=False, git_config=None, checkpoint_s3_uri=None, checkpoint_local_path=None, enable_sagemaker_metrics=None, **kwargs)
 |      Base class initializer.
 |      
 |      Subclasses which override ``__init__`` should invoke ``super()``.
 |      
 |      Args:
 |          entry_point (str): Path (absolute or relative) to the local Python
 |              source file which should be executed as the entry point to

In [6]:
help(TensorFlow)

Help on class TensorFlow in module sagemaker.tensorflow.estimator:

class TensorFlow(sagemaker.estimator.Framework)
 |  Handle end-to-end training and deployment of user-provided TensorFlow code.
 |  
 |  Method resolution order:
 |      TensorFlow
 |      sagemaker.estimator.Framework
 |      sagemaker.estimator.EstimatorBase
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, py_version=None, framework_version=None, model_dir=None, image_uri=None, distribution=None, **kwargs)
 |      Initialize a ``TensorFlow`` estimator.
 |      
 |      Args:
 |          py_version (str): Python version you want to use for executing your model training
 |              code. Defaults to ``None``. Required unless ``image_uri`` is provided.
 |          framework_version (str): TensorFlow version you want to use for executing your model
 |              training code. Defaults to ``None``. Required unless ``image_uri`` is provided.
 |              List of supported versions:
 

In [33]:
tf_estimator = TensorFlow(entry_point='TransferNetworkTraining.py', role=role, 
                         train_instance_count=1, train_instance_type='ml.p2.xlarge',
                          framework_version='1.12', py_version='py3',
                          distributions={
                              'mpi': {
                                  'enabled': True,
                                  'processes_per_host':1,
                                  'custom_mpi_options': '--NCCL_DEBUG INFO'
                              }
                          },
                          output_path='s3://otolith-everything/publication_models'
                         )

distributions has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [35]:
tf_estimator.fit('s3://otolith-everything/publication_training', logs='Training', job_name='TrainJob-WithValidation-v3')

2021-03-04 23:24:55 Starting - Starting the training job...
2021-03-04 23:25:19 Starting - Launching requested ML instances......
2021-03-04 23:26:19 Starting - Preparing the instances for training............
2021-03-04 23:28:21 Downloading - Downloading input data
2021-03-04 23:28:21 Training - Downloading the training image...
2021-03-04 23:28:41 Training - Training image download completed. Training in progress.[34m2021-03-04 23:28:41,968 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2021-03-04 23:28:42,305 sagemaker-containers INFO     Starting MPI run as worker node.[0m
[34m2021-03-04 23:28:42,305 sagemaker-containers INFO     Creating SSH daemon.[0m
[34m2021-03-04 23:28:42,311 sagemaker-containers INFO     Waiting for MPI workers to establish their SSH connections[0m
[34m2021-03-04 23:28:42,311 sagemaker-containers INFO     Env Hosts: ['algo-1'] Hosts: ['algo-1'] process_per_hosts: 1 num_processes: 1[0m
[34m2021-03-04 

In [None]:
 [ 14  98  75  16 131  56 141  44  29 120  94   5 102  51  78  42  92  66
[1,0]<stdout>:  31  35  90  84  77  40 125  99  33  19  73 146  91 135  69 128 114  48
[1,0]<stdout>:  53  28  54 108 112  17 119 103  58 118  18   4  45  59  39  36 117 139
[1,0]<stdout>: 107 132 126  85 122  95  11 113 123  12   2 104   6 127 110  65  55 144
[1,0]<stdout>: 138  46  62  74 116  93 100  89  10  34  32 124  38  83 111 149  27  23
[1,0]<stdout>:  67   9 130  97 105 145  87 148 109  64  15  82  41  80  52  26  76  43
[1,0]<stdout>:  24 136 121 143  49  21  70   3 142  30 147 106  47 115  13  88   8  81
[1,0]<stdout>:  60   0   1  57  22  61  63   7  86  96  68  50 101  20  25 134  71 129
[1,0]<stdout>:  79 133 137  72 140  37]