# DeepFM TensorFlow Parameter Server on SageMaker Sample

### This sample demonstrates how to run DeepFM training code with the TensorFlow parameter server on SageMaker

Notice:

1. Dataset format is TFRecord

2. Training uses **CPU** instances: in our experience, running the DeepFM script with TensorFlow parameter servers on CPU is more efficient and more cost-effective.

3. Using [SageMaker Python SDK 2.x](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html)

In [1]:
# Print the installed SageMaker Python SDK version; this notebook is written against SDK 2.x.
import sagemaker
print(sagemaker.__version__)

2.25.1


## File mode

In [7]:
# Configure parameter-server distributed training on multiple Spot instances (File input mode).
import sagemaker
from sagemaker.tensorflow.estimator import TensorFlow
import time
import os

# Every S3 location used by this cell lives under one bucket; change it to your own.
s3_bucket_uri = 's3://sagemaker-us-west-2-169088282855'

checkpoint_s3_uri = '{}/deepfm-checkpoint'.format(s3_bucket_uri)  # Change to your own path if you want to save ckpt during training
checkpoint_local_path = '/opt/ml/checkpoints'
# The timestamp suffix gives each run of this cell its own model_dir.
model_dir = '{}/deepfm-ps-ckpt/{}'.format(s3_bucket_uri, int(time.time()))
output_path = '{}/deepfm-2021'.format(s3_bucket_uri)

# Channel names are shared with the cell that builds the inputs dict for fit().
training_channel_name = 'training'
evaluation_channel_name = 'evaluation'
# Each input channel is exposed inside the training container under this root.
channel_data_root = '/opt/ml/input/data'

train_instance_type = 'ml.c5.18xlarge'
train_instance_count = 2

train_use_spot_instances = True
enable_s3_shard = True

# Input mode for this section; the 'pipe_mode' hyperparameter below is derived
# from it so the script flag and the estimator setting cannot drift apart.
input_mode = 'File'

train_max_run = 20 * 60 * 60  # 72000 seconds (20 hours)
# Spot training needs max_wait >= max_run, so derive it from max_run instead of
# repeating the literal; max_wait is only passed when Spot instances are used.
train_max_wait = train_max_run if train_use_spot_instances else None

distributions = {'parameter_server': {'enabled': True}}

deep_layer = '128,64,32'

batch_size = 1024
feature_size = 117581

base_job_name = 'tf-scriptmode-deepfm'

# Hyperparameters are forwarded to the entry-point script; their exact meaning
# is defined by that script (not shown in this notebook).
hyperparameters = {
    'servable_model_dir': '/opt/ml/model',
    'training_data_dir': '{}/{}/'.format(channel_data_root, training_channel_name),
    'val_data_dir': '{}/{}/'.format(channel_data_root, evaluation_channel_name),
    'log_steps': 10,
    'num_epochs': 10,
    'field_size': 39,
    'feature_size': feature_size,
    'deep_layers': deep_layer,
    'perform_shuffle': 0,
    'batch_size': batch_size,
    'pipe_mode': int(input_mode == 'Pipe'),
    'enable_s3_shard': enable_s3_shard,
    'training_channel_name': training_channel_name,
    'evaluation_channel_name': evaluation_channel_name,
}

# NOTE(review): Spot jobs can be interrupted. The checkpoint settings stay
# disabled here, as in the original cell; confirm the script can resume from
# model_dir before relying on Spot for long runs.
# `script_mode=True` is intentionally not passed: SDK 2.x removed that argument
# because script mode is the only supported mode.
estimator = TensorFlow(
    #source_dir='./',
    entry_point='DeepFM-dist-ps-for-multipleCPU-multiInstance.py',
    model_dir=model_dir,
    #checkpoint_s3_uri=checkpoint_s3_uri,
    #checkpoint_local_path=checkpoint_local_path,
    output_path=output_path,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    #volume_size=500,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    base_job_name=base_job_name,
    framework_version='1.15.2',
    py_version='py3',
    input_mode=input_mode,
    distribution=distributions,
    use_spot_instances=train_use_spot_instances,
    max_wait=train_max_wait,
    max_run=train_max_run,
    # Debugger hooks and the profiler are switched off for this job.
    debugger_hook_config=False,
    disable_profiler=True,
)

In [None]:
# Build the input channels and start the File-mode training job.
from sagemaker.inputs import TrainingInput

train_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/training/'
validate_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/val/'

# Only the training channel is optionally sharded by S3 key; when sharding is
# off, no distribution argument is passed at all.
shard_options = {'distribution': 'ShardedByS3Key'} if enable_s3_shard else {}
train_input = TrainingInput(train_s3_uri, **shard_options)

# The validation channel is created the same way whether or not sharding is on.
val_input = TrainingInput(validate_s3_uri)

inputs = {
    training_channel_name: train_input,
    evaluation_channel_name: val_input,
}

estimator.fit(inputs)

2021-02-20 16:09:27 Starting - Starting the training job...
2021-02-20 16:09:29 Starting - Launching requested ML instances......
2021-02-20 16:10:54 Starting - Preparing the instances for training......
2021-02-20 16:11:56 Downloading - Downloading input data
2021-02-20 16:11:56 Training - Downloading the training image...
[0m
[34m2021-02-20 16:12:13,721 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2021-02-20 16:12:13,729 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-02-20 16:12:13,935 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[34m2021-02-20 16:12:13,935 sagemaker_tensorflow_container.training INFO     Launching parameter server process[0m
[34m2021-02-20 16:12:13,935 sagemaker_tensorflow_container.training INFO     Running distributed training job with parameter servers[0m
[0m
[0m
[0m
[0m
[34m2021-02-20 16:12:14

[34mINFO:tensorflow:loss = 0.6992709, step = 0[0m
[34mI0220 16:12:24.013165 139871979530048 basic_session_run_hooks.py:262] loss = 0.6992709, step = 0[0m
[35mInstructions for updating:[0m
[35mUse Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.[0m
[35mW0220 16:12:27.601269 139777914935104 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.[0m
[35mInstructions for updating:[0m
[35mUse Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.[0m
[0m
[35mW0220 16:12:27.663391 139777914935104 module_wrapper.py:139] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.parse_example is deprecated.

[35mINFO:tensorflow:global_step/sec: 28.3958[0m
[35mI0220 16:12:33.810918 139777914935104 basic_session_run_hooks.py:692] global_step/sec: 28.3958[0m
[35mINFO:tensorflow:loss = 0.4933927, step = 288 (6.869 sec)[0m
[35mI0220 16:12:36.797049 139777914935104 basic_session_run_hooks.py:260] loss = 0.4933927, step = 288 (6.869 sec)[0m
[35mINFO:tensorflow:global_step/sec: 32.0494[0m
[35mI0220 16:12:36.993429 139777914935104 basic_session_run_hooks.py:692] global_step/sec: 32.0494[0m
[34mINFO:tensorflow:loss = 0.3878574, step = 315 (6.685 sec)[0m
[34mI0220 16:12:37.598397 139871979530048 basic_session_run_hooks.py:260] loss = 0.3878574, step = 315 (6.685 sec)[0m
[35mINFO:tensorflow:global_step/sec: 35.6388[0m
[35mI0220 16:12:39.827474 139777914935104 basic_session_run_hooks.py:692] global_step/sec: 35.6388[0m
[35mINFO:tensorflow:loss = 0.39676228, step = 462 (4.857 sec)[0m
[35mI0220 16:12:41.654518 139777914935104 basic_session_run_hooks.py:260] loss = 0.39676228, step 

[35mINFO:tensorflow:global_step/sec: 36.2259[0m
[35mI0220 16:13:48.839520 139777914935104 basic_session_run_hooks.py:692] global_step/sec: 36.2259[0m
[34mINFO:tensorflow:loss = 0.18661487, step = 2964 (6.718 sec)[0m
[34mI0220 16:13:50.066300 139871979530048 basic_session_run_hooks.py:260] loss = 0.18661487, step = 2964 (6.718 sec)[0m
[35mINFO:tensorflow:global_step/sec: 36.0413[0m
[35mI0220 16:13:51.641853 139777914935104 basic_session_run_hooks.py:692] global_step/sec: 36.0413[0m
[35mINFO:tensorflow:loss = 0.14178167, step = 3024 (4.664 sec)[0m
[35mI0220 16:13:51.683806 139777914935104 basic_session_run_hooks.py:260] loss = 0.14178167, step = 3024 (4.664 sec)[0m
[35mINFO:tensorflow:global_step/sec: 36.3999[0m
[35mI0220 16:13:54.389117 139777914935104 basic_session_run_hooks.py:692] global_step/sec: 36.3999[0m
[35mINFO:tensorflow:loss = 0.15827554, step = 3194 (4.633 sec)[0m
[35mI0220 16:13:56.317152 139777914935104 basic_session_run_hooks.py:260] loss = 0.158275

## Pipe mode

In [None]:
# Configure parameter-server distributed training on multiple Spot instances (Pipe input mode).
import sagemaker
from sagemaker.tensorflow.estimator import TensorFlow
import time
import os

# Every S3 location used by this cell lives under one bucket; change it to your own.
s3_bucket_uri = 's3://sagemaker-us-west-2-169088282855'

checkpoint_s3_uri = '{}/deepfm-checkpoint'.format(s3_bucket_uri)  # Change to your own path if you want to save ckpt during training
checkpoint_local_path = '/opt/ml/checkpoints'
# The timestamp suffix gives each run of this cell its own model_dir.
model_dir = '{}/deepfm-ps-ckpt/{}'.format(s3_bucket_uri, int(time.time()))
output_path = '{}/deepfm-2021'.format(s3_bucket_uri)

# Channel names are shared with the cell that builds the inputs dict for fit().
training_channel_name = 'training'
evaluation_channel_name = 'evaluation'
# Each input channel is exposed inside the training container under this root.
channel_data_root = '/opt/ml/input/data'

train_instance_type = 'ml.c5.18xlarge'
train_instance_count = 2

train_use_spot_instances = True
enable_s3_shard = True

# Input mode for this section; the 'pipe_mode' hyperparameter below is derived
# from it so the script flag and the estimator setting cannot drift apart.
input_mode = 'Pipe'

train_max_run = 20 * 60 * 60  # 72000 seconds (20 hours)
# Spot training needs max_wait >= max_run, so derive it from max_run instead of
# repeating the literal; max_wait is only passed when Spot instances are used.
train_max_wait = train_max_run if train_use_spot_instances else None

distributions = {'parameter_server': {'enabled': True}}

deep_layer = '128,64,32'

batch_size = 1024
feature_size = 117581

base_job_name = 'tf-scriptmode-deepfm'

# Hyperparameters are forwarded to the entry-point script; their exact meaning
# is defined by that script (not shown in this notebook).
hyperparameters = {
    'servable_model_dir': '/opt/ml/model',
    'training_data_dir': '{}/{}/'.format(channel_data_root, training_channel_name),
    'val_data_dir': '{}/{}/'.format(channel_data_root, evaluation_channel_name),
    'log_steps': 10,
    'num_epochs': 10,
    'field_size': 39,
    'feature_size': feature_size,
    'deep_layers': deep_layer,
    'perform_shuffle': 0,
    'batch_size': batch_size,
    'pipe_mode': int(input_mode == 'Pipe'),
    'enable_s3_shard': enable_s3_shard,
    'training_channel_name': training_channel_name,
    'evaluation_channel_name': evaluation_channel_name,
}

# NOTE(review): Spot jobs can be interrupted. The checkpoint settings stay
# disabled here, as in the original cell; confirm the script can resume from
# model_dir before relying on Spot for long runs.
# NOTE(review): this cell pins framework_version '1.14' while the File-mode cell
# uses '1.15.2'; confirm whether the difference is intentional.
# `script_mode=True` is intentionally not passed: SDK 2.x removed that argument
# because script mode is the only supported mode.
estimator = TensorFlow(
    #source_dir='./',
    entry_point='DeepFM-dist-ps-for-multipleCPU-multiInstance.py',
    model_dir=model_dir,
    #checkpoint_s3_uri=checkpoint_s3_uri,
    #checkpoint_local_path=checkpoint_local_path,
    output_path=output_path,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    #volume_size=500,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    base_job_name=base_job_name,
    framework_version='1.14',
    py_version='py3',
    input_mode=input_mode,
    distribution=distributions,
    use_spot_instances=train_use_spot_instances,
    max_wait=train_max_wait,
    max_run=train_max_run,
    # Debugger hooks and the profiler are switched off for this job.
    debugger_hook_config=False,
    disable_profiler=True,
)

In [None]:
# Build the input channels and start the Pipe-mode training job.
from sagemaker.inputs import TrainingInput

train_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/training/'
validate_s3_uri = 's3://sagemaker-us-west-2-169088282855/tf-SM-deepctr-deepfm-sample/data-tfrecord/val/'

# Only the training channel is optionally sharded by S3 key; when sharding is
# off, no distribution argument is passed at all.
shard_options = {'distribution': 'ShardedByS3Key'} if enable_s3_shard else {}
train_input = TrainingInput(train_s3_uri, **shard_options)

# The validation channel is created the same way whether or not sharding is on.
val_input = TrainingInput(validate_s3_uri)

inputs = {
    training_channel_name: train_input,
    evaluation_channel_name: val_input,
}

estimator.fit(inputs)