# DeepFM with TensorFlow and Horovod on SageMaker: Sample

### This sample demonstrates how to train a DeepFM model with TensorFlow and Horovod on Amazon SageMaker

Notice:

1. Dataset format is TFRecord

2. Model training in this sample runs on **GPU** instances

3. Using [SageMaker Python SDK 2.x](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html)

In [1]:
# Print the installed SageMaker Python SDK version; this notebook is written against SDK 2.x.
import sagemaker
print(sagemaker.__version__)

2.25.1


## File mode

In [1]:
# Configure a Horovod (MPI) distributed training job on managed spot instances.
# This cell builds the File-mode estimator: `pipe_mode = 0`, so input data is
# downloaded to the instance before training starts.
import sagemaker
from sagemaker.tensorflow.estimator import TensorFlow
from datetime import datetime
import os

dt_now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Use the session's default bucket ("sagemaker-<region>-<account-id>") rather than
# a hard-coded, account-specific bucket name so the notebook runs in any account.
bucket = sagemaker.Session().default_bucket()
checkpoint_s3_uri = 's3://{}/deepfm-checkpoint/{}'.format(bucket, dt_now)  # Change to your own path if you want to save ckpt during training
checkpoint_dir = '/opt/ml/deepfm/checkpoints'  # checkpoint directory inside the training container
model_dir = '/opt/ml/model'                    # export directory inside the training container
output_path = 's3://{}/deepfm-2021'.format(bucket)

# Channel names; the same names are used as keys of the `inputs` dict passed to fit().
training_channel_name = 'training'
evaluation_channel_name = 'evaluation'

train_instance_type = 'ml.p3.8xlarge'
hvd_processes_per_host = 4  # Horovod processes launched on each instance
train_instance_count = 1

train_use_spot_instances = True
enable_s3_shard = True
enable_data_multi_path = True

# Pipe-mode switch (0 = File mode, 1 = Pipe mode). It is passed to the training
# script and also drives the estimator's `input_mode` below so the two cannot disagree.
pipe_mode = 0

train_max_run = 36000 * 2
# max_wait is only meaningful for spot training and must not be smaller than max_run,
# so derive it from max_run instead of repeating the number.
train_max_wait = train_max_run if train_use_spot_instances else None

distributions = {
    'mpi': {
        'enabled': True,
        'processes_per_host': hvd_processes_per_host,
        'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none',
    }
}

deep_layer = '128,64,32'

batch_size = 1024
feature_size = 117581

base_job_name = 'tf-scriptmode-deepfm'

# Hyperparameters forwarded to the entry-point script. Container paths are derived
# from the variables above so each value is defined in exactly one place.
hyperparameters = {
    'servable_model_dir': model_dir,
    'checkpoint_dir': checkpoint_dir,
    'training_data_dir': '/opt/ml/input/data/{}/'.format(training_channel_name),
    'val_data_dir': '/opt/ml/input/data/{}/'.format(evaluation_channel_name),
    'log_steps': 10,
    'num_epochs': 10,
    'field_size': 39,
    'feature_size': feature_size,
    'deep_layers': deep_layer,
    'perform_shuffle': 0,
    'batch_size': batch_size,
    'pipe_mode': pipe_mode,
    'enable_s3_shard': enable_s3_shard,
    'training_channel_name': training_channel_name,
    'evaluation_channel_name': evaluation_channel_name,
    'worker_per_host': hvd_processes_per_host,
    'enable_data_multi_path': enable_data_multi_path,
}

estimator = TensorFlow(
    #source_dir='./',
    entry_point='DeepFM-hvd-tfrecord-vectorized-map.py',
    # model_dir=False: do not pass a model_dir hyperparameter to the script.
    model_dir=False,
    # Uncomment both lines below to sync checkpoints to S3, so a job interrupted by a
    # spot reclaim does not lose its progress.
    # NOTE(review): confirm the entry-point script resumes from `checkpoint_dir`.
    #checkpoint_s3_uri=checkpoint_s3_uri,
    #checkpoint_local_path=checkpoint_dir,
    output_path=output_path,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    #volume_size=500,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    base_job_name=base_job_name,
    framework_version='1.15.2',
    py_version='py3',
    # NOTE(review): `script_mode` appears to be a legacy SDK v1 argument - confirm whether it can be dropped.
    script_mode=True,
    input_mode='Pipe' if pipe_mode else 'File',
    distribution=distributions,
    use_spot_instances=train_use_spot_instances,
    max_wait=train_max_wait,
    max_run=train_max_run,
    debugger_hook_config=False,
    disable_profiler=True,
)

In [2]:
# File-mode test: launch the training job with one training and one evaluation channel.
from sagemaker.inputs import TrainingInput

train_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/training/'
validate_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/val/'

# Only the training channel is optionally sharded by S3 key; the validation
# channel always uses TrainingInput's default distribution.
train_input = (
    TrainingInput(train_s3_uri, distribution='ShardedByS3Key')
    if enable_s3_shard
    else TrainingInput(train_s3_uri)
)
val_input = TrainingInput(validate_s3_uri)

inputs = {
    training_channel_name: train_input,
    evaluation_channel_name: val_input,
}

estimator.fit(inputs)

2021-02-23 07:51:26 Starting - Starting the training job...
2021-02-23 07:51:28 Starting - Launching requested ML instances......
2021-02-23 07:52:38 Starting - Preparing the instances for training............
2021-02-23 07:54:55 Downloading - Downloading input data
2021-02-23 07:54:55 Training - Downloading the training image...
[0m
[34m2021-02-23 07:55:22,065 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2021-02-23 07:55:22,508 sagemaker-containers INFO     Starting MPI run as worker node.[0m
[34m2021-02-23 07:55:22,508 sagemaker-containers INFO     Creating SSH daemon.[0m
[34m2021-02-23 07:55:22,514 sagemaker-containers INFO     Waiting for MPI workers to establish their SSH connections[0m
[34m2021-02-23 07:55:22,514 sagemaker-containers INFO     Env Hosts: ['algo-1'] Hosts: ['algo-1:4'] process_per_hosts: 4 num_processes: 4[0m
[34m2021-02-23 07:55:22,516 sagemaker-containers INFO     Network interface name: eth0[0m
[34

[34m[1,2]<stderr>:INFO:tensorflow:Done calling model_fn.[0m
[34m[1,2]<stderr>:I0223 07:55:34.749670 139772761040704 estimator.py:1150] Done calling model_fn.[0m
[34m[1,2]<stderr>:INFO:tensorflow:Create CheckpointSaverHook.[0m
[34m[1,2]<stderr>:I0223 07:55:34.751027 139772761040704 basic_session_run_hooks.py:541] Create CheckpointSaverHook.[0m
[34m[1,3]<stderr>:INFO:tensorflow:Done calling model_fn.[0m
[34m[1,3]<stderr>:I0223 07:55:34.783824 140482554521408 estimator.py:1150] Done calling model_fn.[0m
[34m[1,1]<stderr>:INFO:tensorflow:Done calling model_fn.[0m
[34m[1,1]<stderr>:I0223 07:55:34.784728 139660100417344 estimator.py:1150] Done calling model_fn.[0m
[34m[1,3]<stderr>:INFO:tensorflow:Create CheckpointSaverHook.[0m
[34m[1,3]<stderr>:I0223 07:55:34.785248 140482554521408 basic_session_run_hooks.py:541] Create CheckpointSaverHook.[0m
[34m[1,1]<stderr>:INFO:tensorflow:Create CheckpointSaverHook.[0m
[34m[1,1]<stderr>:I0223 07:55:34.786159 139660100417344 basic

[34m[1,1]<stderr>:INFO:tensorflow:Saving checkpoints for 291 into /tmp/tmpp2thczv5/model.ckpt.[0m
[34m[1,2]<stderr>:INFO:tensorflow:Saving checkpoints for 291 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,1]<stderr>:I0223 07:55:44.708660 139660100417344 basic_session_run_hooks.py:606] Saving checkpoints for 291 into /tmp/tmpp2thczv5/model.ckpt.[0m
[34m[1,0]<stderr>:INFO:tensorflow:Saving checkpoints for 291 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,2]<stderr>:I0223 07:55:44.708693 139772761040704 basic_session_run_hooks.py:606] Saving checkpoints for 291 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,0]<stderr>:I0223 07:55:44.708797 139872637613888 basic_session_run_hooks.py:606] Saving checkpoints for 291 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,3]<stderr>:INFO:tensorflow:Saving checkpoints for 291 into /tmp/tmp4nc129__/model.ckpt.[0m
[34m[1,3]<stderr>:I0223 07:55:44.713233 140482554521408 basic_session_run_hooks.py:606] Saving checkpoints for 291 in

[34m[1,0]<stderr>:INFO:tensorflow:Saving checkpoints for 485 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,2]<stderr>:INFO:tensorflow:Saving checkpoints for 485 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,0]<stderr>:I0223 07:55:55.248675 139872637613888 basic_session_run_hooks.py:606] Saving checkpoints for 485 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,1]<stderr>:INFO:tensorflow:Saving checkpoints for 485 into /tmp/tmpp2thczv5/model.ckpt.[0m
[34m[1,2]<stderr>:I0223 07:55:55.248750 139772761040704 basic_session_run_hooks.py:606] Saving checkpoints for 485 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,1]<stderr>:I0223 07:55:55.248817 139660100417344 basic_session_run_hooks.py:606] Saving checkpoints for 485 into /tmp/tmpp2thczv5/model.ckpt.[0m
[34m[1,3]<stderr>:INFO:tensorflow:Saving checkpoints for 485 into /tmp/tmp4nc129__/model.ckpt.[0m
[34m[1,3]<stderr>:I0223 07:55:55.254686 140482554521408 basic_session_run_hooks.py:606] Saving checkpoints for 485 in

[34m[1,2]<stderr>:INFO:tensorflow:Saving checkpoints for 679 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,2]<stderr>:I0223 07:56:05.956721 139772761040704 basic_session_run_hooks.py:606] Saving checkpoints for 679 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,0]<stderr>:INFO:tensorflow:Saving checkpoints for 679 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,0]<stderr>:I0223 07:56:05.958564 139872637613888 basic_session_run_hooks.py:606] Saving checkpoints for 679 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,3]<stderr>:INFO:tensorflow:Saving checkpoints for 679 into /tmp/tmp4nc129__/model.ckpt.[0m
[34m[1,3]<stderr>:I0223 07:56:05.963402 140482554521408 basic_session_run_hooks.py:606] Saving checkpoints for 679 into /tmp/tmp4nc129__/model.ckpt.[0m
[34m[1,1]<stderr>:INFO:tensorflow:Saving checkpoints for 679 into /tmp/tmpp2thczv5/model.ckpt.[0m
[34m[1,1]<stderr>:I0223 07:56:05.965289 139660100417344 basic_session_run_hooks.py:606] Saving checkpoints for 679 in


2021-02-23 07:56:24 Uploading - Uploading generated training model[34m[1,1]<stderr>:INFO:tensorflow:Saving checkpoints for 873 into /tmp/tmpp2thczv5/model.ckpt.[0m
[34m[1,1]<stderr>:I0223 07:56:16.498607 139660100417344 basic_session_run_hooks.py:606] Saving checkpoints for 873 into /tmp/tmpp2thczv5/model.ckpt.[0m
[34m[1,0]<stderr>:INFO:tensorflow:Saving checkpoints for 873 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,0]<stderr>:I0223 07:56:16.498697 139872637613888 basic_session_run_hooks.py:606] Saving checkpoints for 873 into /opt/ml/deepfm/checkpoints/model.ckpt.[0m
[34m[1,2]<stderr>:INFO:tensorflow:Saving checkpoints for 873 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,2]<stderr>:I0223 07:56:16.504280 139772761040704 basic_session_run_hooks.py:606] Saving checkpoints for 873 into /tmp/tmp8wl08_sc/model.ckpt.[0m
[34m[1,3]<stderr>:INFO:tensorflow:Saving checkpoints for 873 into /tmp/tmp4nc129__/model.ckpt.[0m
[34m[1,3]<stderr>:I0223 07:56:16.504509 1404825545


2021-02-23 07:56:30 Completed - Training job completed
Training seconds: 117
Billable seconds: 35
Managed Spot Training savings: 70.1%


## Pipe mode

In [9]:
# Configure a Horovod (MPI) distributed training job on managed spot instances.
# This cell builds the Pipe-mode estimator: `pipe_mode = 1`, so input data is
# streamed to the training container instead of being downloaded first.
import sagemaker
from sagemaker.tensorflow.estimator import TensorFlow
from datetime import datetime
import os

dt_now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Use the session's default bucket ("sagemaker-<region>-<account-id>") rather than
# a hard-coded, account-specific bucket name so the notebook runs in any account.
bucket = sagemaker.Session().default_bucket()
checkpoint_s3_uri = 's3://{}/deepfm-checkpoint/{}'.format(bucket, dt_now)  # Change to your own path if you want to save ckpt during training
checkpoint_dir = '/opt/ml/deepfm/checkpoints'  # checkpoint directory inside the training container
model_dir = '/opt/ml/model'                    # export directory inside the training container
output_path = 's3://{}/deepfm-2021'.format(bucket)

# Base channel names; the fit cell derives the per-worker training channels from these.
training_channel_name = 'training'
evaluation_channel_name = 'evaluation'

train_instance_type = 'ml.p3.8xlarge'
hvd_processes_per_host = 4  # Horovod processes launched on each instance
train_instance_count = 1

train_use_spot_instances = True
enable_s3_shard = True
enable_data_multi_path = False

# Pipe-mode switch (0 = File mode, 1 = Pipe mode). It is passed to the training
# script and also drives the estimator's `input_mode` below so the two cannot disagree.
pipe_mode = 1

train_max_run = 36000 * 2
# max_wait is only meaningful for spot training and must not be smaller than max_run,
# so derive it from max_run instead of repeating the number.
train_max_wait = train_max_run if train_use_spot_instances else None

distributions = {
    'mpi': {
        'enabled': True,
        'processes_per_host': hvd_processes_per_host,
        'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none',
    }
}

deep_layer = '128,64,32'

batch_size = 1024
feature_size = 117581

base_job_name = 'tf-scriptmode-deepfm'

# Hyperparameters forwarded to the entry-point script. Container paths are derived
# from the variables above so each value is defined in exactly one place.
hyperparameters = {
    'servable_model_dir': model_dir,
    'checkpoint_dir': checkpoint_dir,
    'training_data_dir': '/opt/ml/input/data/{}/'.format(training_channel_name),
    'val_data_dir': '/opt/ml/input/data/{}/'.format(evaluation_channel_name),
    'log_steps': 10,
    'num_epochs': 10,
    'field_size': 39,
    'feature_size': feature_size,
    'deep_layers': deep_layer,
    'perform_shuffle': 0,
    'batch_size': batch_size,
    'pipe_mode': pipe_mode,
    'enable_s3_shard': enable_s3_shard,
    'training_channel_name': training_channel_name,
    'evaluation_channel_name': evaluation_channel_name,
    'worker_per_host': hvd_processes_per_host,
    'enable_data_multi_path': enable_data_multi_path,
}

estimator = TensorFlow(
    #source_dir='./',
    entry_point='DeepFM-hvd-tfrecord-vectorized-map.py',
    # model_dir=False: do not pass a model_dir hyperparameter to the script.
    model_dir=False,
    # Uncomment both lines below to sync checkpoints to S3, so a job interrupted by a
    # spot reclaim does not lose its progress.
    # NOTE(review): confirm the entry-point script resumes from `checkpoint_dir`.
    #checkpoint_s3_uri=checkpoint_s3_uri,
    #checkpoint_local_path=checkpoint_dir,
    output_path=output_path,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    #volume_size=500,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    base_job_name=base_job_name,
    # NOTE(review): the File-mode cell uses framework_version '1.15.2' - confirm that '1.14' is intentional here.
    framework_version='1.14',
    py_version='py3',
    # NOTE(review): `script_mode` appears to be a legacy SDK v1 argument - confirm whether it can be dropped.
    script_mode=True,
    input_mode='Pipe' if pipe_mode else 'File',
    distribution=distributions,
    use_spot_instances=train_use_spot_instances,
    max_wait=train_max_wait,
    max_run=train_max_run,
    debugger_hook_config=False,
    disable_profiler=True,
)

In [10]:
# Pipe-mode test: launch the training job with one training channel per Horovod worker.
from sagemaker.inputs import TrainingInput

# In Pipe mode several training channels must be configured, and the number of
# training channels must exactly match worker_per_host (hvd_processes_per_host).
# For 4 workers the channels are:
#   inputs = {'training': ..., 'training-1': ..., 'training-2': ..., 'training-3': ..., 'evaluation': ...}
#
# There are two ways to lay out the training data:
#   * every channel reads from the same S3 prefix, or
#   * every channel has its own S3 prefix (each prefix must then hold the same
#     number of samples).
# Choose whichever fits how the dataset was prepared; `enable_data_multi_path`
# selects the second layout.

train_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/training/'
validate_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/val/'

# Derive the channel count from the worker count instead of hard-coding four channels.
num_train_channels = hvd_processes_per_host

# Channel names: 'training', 'training-1', ..., 'training-<N-1>'.
# NOTE(review): presumably the entry-point script looks up channels by this naming scheme - confirm.
train_channel_names = [training_channel_name] + [
    '{}-{}'.format(training_channel_name, idx) for idx in range(1, num_train_channels)
]

# Extra TrainingInput arguments shared by every training channel.
shard_kwargs = {'distribution': 'ShardedByS3Key'} if enable_s3_shard else {}

if enable_data_multi_path:
    # One dedicated S3 prefix per training channel - fill these in before running.
    train_s3_uris = [
        '',  # channel 'training'
        '',  # channel 'training-1'
        '',  # channel 'training-2'
        '',  # channel 'training-3'
    ]
    if len(train_s3_uris) != num_train_channels:
        raise ValueError(
            'Expected {} training S3 URIs (one per worker), got {}'.format(
                num_train_channels, len(train_s3_uris)))
    if not all(train_s3_uris):
        # Fail fast instead of submitting a job with an empty S3 URI.
        raise ValueError('Fill in every entry of train_s3_uris before calling fit()')
    train_inputs = [TrainingInput(uri, **shard_kwargs) for uri in train_s3_uris]
else:
    # All channels share the single training prefix `train_s3_uri`.
    train_input = TrainingInput(train_s3_uri, **shard_kwargs)
    train_inputs = [train_input] * num_train_channels

val_input = TrainingInput(validate_s3_uri)

inputs = dict(zip(train_channel_names, train_inputs))
inputs[evaluation_channel_name] = val_input

estimator.fit(inputs)

2021-02-22 16:22:47 Starting - Starting the training job...
2021-02-22 16:22:50 Starting - Launching requested ML instances.........
2021-02-22 16:24:31 Starting - Preparing the instances for training...
2021-02-22 16:25:07 Downloading - Downloading input data...
2021-02-22 16:25:39 Training - Downloading the training image...
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])[0m
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])
 Data for

[34m[1,1]<stderr>:2021-02-22 16:26:25.744873: W tensorflow/core/framework/dataset.cc:404] Input of PipeModeDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.[0m
[34m[1,2]<stderr>:2021-02-22 16:26:25.756013: W tensorflow/core/framework/dataset.cc:404] Input of PipeModeDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.[0m
[34m[1,0]<stderr>:2021-02-22 16:26:25.759490: W tensorflow/core/framework/dataset.cc:404] Input of PipeModeDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.[0m
[34m[1,3]<stderr>:2021-02-22 16:26:25.783438: W tensorflow/core/framework/dataset.cc:404] Input of PipeModeDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply

[34m[1,2]<stderr>:I0222 16:26:51.539284 140389551736576 basic_session_run_hooks.py:692] global_step/sec: 12.6856[0m
[34m[1,2]<stderr>:I0222 16:26:51.540243 140389551736576 basic_session_run_hooks.py:260] loss = 0.4352495, step = 300 (7.883 sec)[0m
[34m[1,0]<stderr>:I0222 16:26:51.540313 140014048130816 basic_session_run_hooks.py:692] global_step/sec: 12.6636[0m
[34m[1,3]<stderr>:I0222 16:26:51.541116 140121141290752 basic_session_run_hooks.py:692] global_step/sec: 12.6806[0m
[34m[1,1]<stderr>:I0222 16:26:51.541151 140432338822912 basic_session_run_hooks.py:692] global_step/sec: 12.6432[0m
[34m[1,0]<stderr>:I0222 16:26:51.541295 140014048130816 basic_session_run_hooks.py:260] loss = 0.43264875, step = 300 (7.897 sec)[0m
[34m[1,3]<stderr>:I0222 16:26:51.542168 140121141290752 basic_session_run_hooks.py:260] loss = 0.43114898, step = 300 (7.886 sec)[0m
[34m[1,1]<stderr>:I0222 16:26:51.542221 140432338822912 basic_session_run_hooks.py:260] loss = 0.4257103, step = 300 (7.910

[34m[1,2]<stderr>:I0222 16:28:03.062580 140389551736576 basic_session_run_hooks.py:692] global_step/sec: 12.7164[0m
[34m[1,1]<stderr>:I0222 16:28:03.062526 140432338822912 basic_session_run_hooks.py:692] global_step/sec: 12.6971[0m
[34m[1,0]<stderr>:I0222 16:28:03.063098 140014048130816 basic_session_run_hooks.py:692] global_step/sec: 12.6959[0m
[34m[1,3]<stderr>:I0222 16:28:03.063445 140121141290752 basic_session_run_hooks.py:692] global_step/sec: 12.7147[0m
[34m[1,2]<stderr>:I0222 16:28:03.063532 140389551736576 basic_session_run_hooks.py:260] loss = 0.34033877, step = 1200 (7.864 sec)[0m
[34m[1,1]<stderr>:I0222 16:28:03.063655 140432338822912 basic_session_run_hooks.py:260] loss = 0.33552784, step = 1200 (7.876 sec)[0m
[34m[1,0]<stderr>:I0222 16:28:03.064073 140014048130816 basic_session_run_hooks.py:260] loss = 0.33531767, step = 1200 (7.877 sec)[0m
[34m[1,3]<stderr>:I0222 16:28:03.064216 140121141290752 basic_session_run_hooks.py:260] loss = 0.3347452, step = 1200 (

[34m[1,2]<stderr>:I0222 16:29:14.463954 140389551736576 basic_session_run_hooks.py:692] global_step/sec: 12.4373[0m
[34m[1,2]<stderr>:I0222 16:29:14.464845 140389551736576 basic_session_run_hooks.py:260] loss = 0.22476599, step = 2100 (8.040 sec)[0m
[34m[1,0]<stderr>:I0222 16:29:14.478156 140014048130816 basic_session_run_hooks.py:692] global_step/sec: 12.451[0m
[34m[1,0]<stderr>:I0222 16:29:14.478946 140014048130816 basic_session_run_hooks.py:260] loss = 0.21923453, step = 2100 (8.031 sec)[0m
[34m[1,1]<stderr>:I0222 16:29:14.490466 140432338822912 basic_session_run_hooks.py:692] global_step/sec: 12.3899[0m
[34m[1,3]<stderr>:I0222 16:29:14.491314 140121141290752 basic_session_run_hooks.py:692] global_step/sec: 12.4107[0m
[34m[1,1]<stderr>:I0222 16:29:14.491379 140432338822912 basic_session_run_hooks.py:260] loss = 0.2205246, step = 2100 (8.071 sec)[0m
[34m[1,3]<stderr>:I0222 16:29:14.492099 140121141290752 basic_session_run_hooks.py:260] loss = 0.22831568, step = 2100 (8

[34m[1,2]<stderr>:I0222 16:30:33.686388 140389551736576 basic_session_run_hooks.py:692] global_step/sec: 12.629[0m
[34m[1,3]<stderr>:I0222 16:30:33.686393 140121141290752 basic_session_run_hooks.py:692] global_step/sec: 12.6416[0m
[34m[1,2]<stderr>:I0222 16:30:33.687398 140389551736576 basic_session_run_hooks.py:260] loss = 0.22957885, step = 3100 (7.918 sec)[0m
[34m[1,3]<stderr>:I0222 16:30:33.687475 140121141290752 basic_session_run_hooks.py:260] loss = 0.222332, step = 3100 (7.911 sec)[0m
[34m[1,0]<stderr>:I0222 16:30:33.713181 140014048130816 basic_session_run_hooks.py:692] global_step/sec: 12.5658[0m
[34m[1,1]<stderr>:I0222 16:30:33.713722 140432338822912 basic_session_run_hooks.py:692] global_step/sec: 12.5963[0m
[34m[1,0]<stderr>:I0222 16:30:33.714257 140014048130816 basic_session_run_hooks.py:260] loss = 0.21288922, step = 3100 (7.958 sec)[0m
[34m[1,1]<stderr>:I0222 16:30:33.714750 140432338822912 basic_session_run_hooks.py:260] loss = 0.22213861, step = 3100 (7.


2021-02-22 16:31:42 Uploading - Uploading generated training model
2021-02-22 16:31:58 Completed - Training job completed
Training seconds: 411
Billable seconds: 123
Managed Spot Training savings: 70.1%
