In [1]:
#1. To make debugging easier, first train, deploy and predict on a small dataset directly on the AWS SageMaker notebook instance.

In [None]:
#A. Set up the environment for training on the notebook instance.

In [51]:
# Fetch local_mode_setup.sh and daemon.json from the aws-samples script-mode repository,
# then run the setup script with bash.
!wget -q https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-script-mode/master/local_mode_setup.sh
!wget -q https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-script-mode/master/daemon.json    
!/bin/bash ./local_mode_setup.sh

SageMaker instance route table setup is ok. We are good to go.
SageMaker instance routing for Docker is ok. We are good to go!


In [None]:
# Print the current working directory of the notebook kernel.
!pwd

In [None]:
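#B. Train on the notebook instance. Note that model_dir and servable_model_dir must both be set to /opt/ml/model, otherwise deploying the model later will fail.
#For the hyperparameter settings below, see the notes in https://github.com/lambdaji/tf_repos.git. For a quick test, the number of epochs is set to 1.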

In [9]:
import sagemaker
from sagemaker.tensorflow import TensorFlow

# 'local' makes the SageMaker SDK run the job on this notebook instance.
train_instance_type = 'local'

# Launch the training processes through MPI, one process per host.
# Disabled alternative (parameter server):
#   distributions = {'parameter_server': {'enabled': True}}
distributions = {
    'mpi': {
        'enabled': True,
        'processes_per_host': 1,
    },
}

# Hyperparameters forwarded to the entry-point script.
# The script's `checkpoinPath` argument should agree with the estimator's
# checkpoint_local_path when one is configured.
hyperparameters = dict(
    servable_model_dir='/opt/ml/model',
    data_dir='/opt/ml/input/data/training/',
    checkpoinPath='/opt/ml/checkpoints',
    log_steps=10,
    num_epochs=1,
    field_size=39,
    feature_size=117581,
    deep_layers='2,2,2',
    pipe_mode=0,
)

# Estimator for the local-mode training run.
local_estimator = TensorFlow(
    entry_point='DeepFM-hvd.py',
    base_job_name='tf-scriptmode-deepctr-deepfm',
    role=sagemaker.get_execution_role(),
    framework_version='1.13',
    py_version='py3',
    train_instance_type=train_instance_type,
    train_instance_count=3,
    distributions=distributions,
    hyperparameters=hyperparameters,
    model_dir='/opt/ml/model',
)


In [10]:
import os
# Both local-mode channels ('training' and 'eval') read from the 'raw' folder
# under the current working directory, addressed with a file:// URI.
train_dir = os.path.join(os.getcwd(), 'raw')
inputs = dict.fromkeys(('training', 'eval'), 'file://' + train_dir)
train_dir



'/home/ec2-user/SageMaker/deepfm test/raw'

In [11]:
# Start training with the local estimator, using the channels defined in `inputs`.
local_estimator.fit(inputs)

RuntimeError: Invalid data source: /home/ec2-user/SageMaker/deepfm test/raw does not exist.

In [None]:
#C. Deploy on the notebook instance by setting instance_type to local.

In [54]:
# Deploy the locally trained model with instance_type='local' and keep the returned predictor.
local_predictor = local_estimator.deploy(initial_instance_count=1,instance_type='local')

Attaching to tmpszqjxs73_algo-1-foa5s_1
[36malgo-1-foa5s_1  |[0m INFO:__main__:starting services
[36malgo-1-foa5s_1  |[0m INFO:__main__:using default model name: model
[36malgo-1-foa5s_1  |[0m INFO:__main__:tensorflow serving model config: 
[36malgo-1-foa5s_1  |[0m model_config_list: {
[36malgo-1-foa5s_1  |[0m   config: {
[36malgo-1-foa5s_1  |[0m     name: "model",
[36malgo-1-foa5s_1  |[0m     base_path: "/opt/ml/model",
[36malgo-1-foa5s_1  |[0m     model_platform: "tensorflow"
[36malgo-1-foa5s_1  |[0m   },
[36malgo-1-foa5s_1  |[0m }
[36malgo-1-foa5s_1  |[0m 
[36malgo-1-foa5s_1  |[0m 
[36malgo-1-foa5s_1  |[0m INFO:__main__:nginx config: 
[36malgo-1-foa5s_1  |[0m load_module modules/ngx_http_js_module.so;
[36malgo-1-foa5s_1  |[0m 
[36malgo-1-foa5s_1  |[0m worker_processes auto;
[36malgo-1-foa5s_1  |[0m daemon off;
[36malgo-1-foa5s_1  |[0m pid /tmp/nginx.pid;
[36malgo-1-foa5s_1  |[0m error_log  /dev/stderr info;
[36malgo-1-foa5s_1  |[0m 
[36malgo-

In [None]:
#D. Predict with the model deployed on the notebook instance. Note that the input data format must match the feature spec in deepfm.py.

In [55]:
# One sample: 39 feature ids paired with 39 feature values.
test_example = dict(
    feat_ids=[1,2,3,4,5,6,7,8,9,10,11,12,13,15,555,1078,17797,26190,26341,28570,35361,35613,35984,48424,51364,64053,65964,66206,71628,84088,84119,86889,88280,88283,100288,100300,102447,109932,111823],
    feat_vals=[0.05,0.006633,0.1,0,0.021594,0.008,0.15,0.04,0.362,0.1,0.2,0,0.04,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
)

# Send the sample to the locally deployed endpoint.
local_results = local_predictor.predict(test_example)

[36malgo-1-foa5s_1  |[0m 172.18.0.1 - - [16/Feb/2020:09:41:54 +0000] "POST /invocations HTTP/1.1" 200 38 "-" "-"


In [56]:
# Display the prediction returned by the local endpoint.
local_results

{'predictions': [0.296024]}

In [57]:
#E. Delete the model deployment on the notebook instance.
local_predictor.delete_endpoint()

Gracefully stopping... (press Ctrl+C again to force)


In [None]:
#2. Once the test above succeeds, train and deploy with SageMaker hosted infrastructure.
#Sagemaker hosted training

In [None]:
#A. Upload the dataset to S3.

In [23]:
# Key prefix shared by everything this experiment stores in S3.
s3_prefix = 'tf-SM-deepctr-deepfm'

# Training data is placed under "<s3_prefix>/data".
traindata_s3_prefix = f'{s3_prefix}/data'


In [143]:
#Use a sample of the data for training and evaluation
#train_s3 = sagemaker.Session().upload_data(path='raw', key_prefix=traindata_s3_prefix)

#Use the full dataset for training and evaluation
#NOTE(review): the path uploaded here is 'mini_data' while the disabled line above uploads 'raw';
#the two descriptions look swapped -- confirm which folder holds the full dataset.
train_s3 = sagemaker.Session().upload_data(path='mini_data', key_prefix=traindata_s3_prefix)


In [48]:
# Point train_s3 at a single libsvm object and use it as the only ('training') channel.
train_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/data/tr.libsvm'
inputs = dict(training=train_s3)

print(inputs)


{'training': 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/data/tr.libsvm'}


In [None]:
#B. Train on a GPU instance. The hyperparameters ought to be adjusted for this; to keep testing simple they were left unchanged.

In [7]:
import sagemaker
from sagemaker.tensorflow import TensorFlow

# Instance type used for the hosted training job.
train_instance_type = 'ml.p3.8xlarge'

# Number of MPI (Horovod) processes started on each host.
hvd_processes_per_host = 3

# Disabled alternatives, kept for reference:
#
#   * CPU instances: use slightly fewer processes than physical cores, because
#     the MPI options below bind each process to a core.
#       hvd_processes_per_host = 2
#       'custom_mpi_options': '--bind-to core -verbose  -x OMPI_MCA_btl_vader_single_copy_mechanism=none'
#
#   * Horovod autotune to speed up training:
#       'custom_mpi_options': '-x HOROVOD_AUTOTUNE=1 -verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none'
distributions = {
    'mpi': {
        'enabled': True,
        'processes_per_host': hvd_processes_per_host,
        'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none',
    },
}

# Spot training settings. 432000 s is the default maximum training time of an
# AWS account; contact AWS support to raise that limit.
train_use_spot_instances = True
train_max_run = 432000
if train_use_spot_instances:
    train_max_wait = 432000
else:
    train_max_wait = None

model_dir = '/opt/ml/model'
batch_size = 32 * 1024

# Enlarged fully connected layers to put load on the GPU.
# Previous setting: deep_layer = '256,128,64'
deep_layer = '4096,4096,4096'

# The channel name must be the same one later passed to fit(). A more elegant
# approach is to let the BYOS script read the channel name (and therefore the
# dataset path) from the SageMaker environment variables.
train_channel = 'training'
data_dir = f'/opt/ml/input/data/{train_channel}'

# Hyperparameters forwarded to the entry-point script. The script's
# `checkpoinPath` argument is kept equal to the estimator's checkpoint_local_path.
hyperparameters = dict(
    servable_model_dir='/opt/ml/model',
    data_dir=data_dir,
    checkpoinPath='/home/checkpoints',
    log_steps=100,
    num_epochs=200,
    field_size=39,
    feature_size=117581,
    deep_layers=deep_layer,
    perform_shuffle=0,
    batch_size=batch_size,
    pipe_mode=1,
    worker_per_host=hvd_processes_per_host,
)

# Other entry points that were tried:
#   'DeepFM-hvd.py'
#   'DeepFM-dist-ps-for-multipleCPU-multiInstance.py'
estimator = TensorFlow(
    entry_point='DeepFM-hvd-tfrecord-vectorized-map.py',
    model_dir=model_dir,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    checkpoint_s3_uri='s3://liang200/deepfm-checkpointtest07899987',
    checkpoint_local_path='/home/checkpoints',
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    base_job_name='tf-scriptmode-deepctr-deepfm',
    framework_version='1.14',
    py_version='py3',
    script_mode=True,
    input_mode='Pipe',
    distributions=distributions,
    train_use_spot_instances=train_use_spot_instances,
    train_max_wait=train_max_wait,
    train_max_run=train_max_run,
)

Parameter distribution will be renamed to {'mpi': {'enabled': True, 'processes_per_host': 3, 'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none'}} in SageMaker Python SDK v2.


In [None]:

# Disabled experiments, kept for reference.
#
# --- pipe mode with libsvm data ---
# train_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode-training-data'
# validate_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode-evaluation-data'
# #train2_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode-training-data-2'
# #train3_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode-training-data-3'
#
# inputs = {'training':train_s3, 'training-2':train_s3, 'evaluation': validate_s3}
#
# #inputs = {'training':train_s3, 'training-2':train_s3, 'training-3':train_s3, 'evaluation': validate_s3}
# print(inputs)
# estimator.fit(inputs)
#
# --- file mode with libsvm data ---
# train_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode-training-data'
# #inputs = {'training':train_s3}
# inputs = {train_channel: train_s3}
#
# print(inputs)
# estimator.fit(inputs)
#
# --- file mode with tfrecord data ---
# train_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/tfrecords'
# inputs = {'training':train_s3}
# print(inputs)
# estimator.fit(inputs)

# Active experiment: tfrecord data in pipe mode.
train_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode_tfrecord_train'
validate_s3 = 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode_tfrecord_evaluation'

# Three training channels that all read the same S3 prefix, plus one evaluation channel.
inputs = {channel: train_s3 for channel in ('training', 'training-2', 'training-3')}
inputs['evaluation'] = validate_s3

print(inputs)
estimator.fit(inputs)


{'training': 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode_tfrecord_train', 'training-2': 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode_tfrecord_train', 'training-3': 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode_tfrecord_train', 'evaluation': 's3://sagemaker-us-east-1-514385905925/tf-SM-deepctr-deepfm/pipemode_tfrecord_evaluation'}


's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-10-21 10:01:39 Starting - Starting the training job...
2020-10-21 10:01:41 Starting - Launching requested ML instances............
2020-10-21 10:03:42 Starting - Preparing the instances for training...
2020-10-21 10:04:29 Downloading - Downloading input data...
2020-10-21 10:05:01 Training - Downloading the training image...
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])[0m
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])[0m
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])[0m
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])[0m
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])[0m
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])[0m
  np_resource = np.dtype([("resource", np.ubyte, 1)])[0m
[

In [19]:
#Show where the trained model artifact was saved.
estimator.model_data

's3://sagemaker-us-east-1-514385905925/tf-scriptmode-deepctr-deepfm-2020-05-06-10-21-43-837/output/model.tar.gz'

In [6]:
#Test deploying directly from a model artifact; env can be set to control the TFS batching feature.


import sagemaker
from sagemaker.tensorflow import TensorFlow

# Environment variables for the serving model (passed as env= when the Model is built).
# Fix: the original literal listed SAGEMAKER_TFS_BATCH_TIMEOUT_MICROS twice
# ('50000', then "100000"). Python keeps only the last value of a duplicated key,
# so '100000' was always the effective setting; the shadowed entry is removed so
# the literal states what is actually sent.
env = {
    'SAGEMAKER_TFS_ENABLE_BATCHING': 'true',
    'SAGEMAKER_TFS_BATCH_TIMEOUT_MICROS': '100000',
    'SAGEMAKER_TFS_MAX_BATCH_SIZE': '128',
    'SAGEMAKER_TFS_NUM_BATCH_THREADS': '16',
    'SAGEMAKER_TFS_MAX_ENQUEUED_BATCHES': '10000',
}


# Build a serving Model straight from an existing model.tar.gz in S3,
# passing the batching-related environment settings.
sagemaker_model = sagemaker.tensorflow.serving.Model(
    model_data='s3://sagemaker-us-east-1-514385905925/tf-scriptmode-deepctr-deepfm-2020-03-02-11-02-03-098/output/model.tar.gz',
    role=sagemaker.get_execution_role(),
    framework_version='1.14',
    env=env,
)

# Deploy it to a single ml.p3.2xlarge instance.
predictor = sagemaker_model.deploy(
    initial_instance_count=1,
    instance_type='ml.p3.2xlarge',
)

-------------!

In [78]:
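#C. Deploy the model to a SageMaker hosted endpoint. (The original note said "GPU instance", but the deploy call below uses ml.c5.2xlarge, which is a CPU instance type.)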

In [20]:
# Deploy the trained estimator to one ml.c5.2xlarge instance; this rebinds `predictor`.
predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.c5.2xlarge')

-----------!

In [None]:
#D. Predict with a single sample.

In [21]:
# The label of this sample is 0.
test_example = dict(
    feat_ids=[1,2,3,4,5,6,7,8,9,10,11,12,13,15,555,1078,17797,26190,26341,28570,35361,35613,35984,48424,51364,64053,65964,66206,71628,84088,84119,86889,88280,88283,100288,100300,102447,109932,111823],
    feat_vals=[0.05,0.006633,0.05,0,0.021594,0.008,0.15,0.04,0.362,0.1,0.2,0,0.04,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
)

# Query the hosted endpoint and display its answer.
results = predictor.predict(test_example)
results

{'predictions': [0.371603608]}

In [81]:
#Show the endpoint name of the deployed model.
predictor.endpoint

'tf-scriptmode-deepctr-deepfm-2020-02-16-14-59-46-265'

In [None]:
#3. To call the endpoint for predictions from outside SageMaker, refer to the code below.

In [17]:
import os
import io
import boto3
import json
import csv

# Endpoint to call. To target the endpoint created earlier in this notebook instead:
#endpointname = predictor.endpoint
endpointname = 'tensorflow-inference-2020-03-02-14-12-31-923'
runtime = boto3.client('runtime.sagemaker')

# A single sample: 39 feature ids and their 39 values.
test_example = dict(
    feat_ids=[1,2,3,4,5,6,7,8,9,10,11,12,13,15,555,1078,17797,26190,26341,28570,35361,35613,35984,48424,51364,64053,65964,66206,71628,84088,84119,86889,88280,88283,100288,100300,102447,109932,111823],
    feat_vals=[0.05,0.006633,0.05,0,0.021594,0.008,0.15,0.04,0.362,0.1,0.2,0,0.04,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
)

print(test_example)

# Disabled alternative request body ("inputs" / columnar layout) carrying the same
# sample twice:
#   more_example = {
#    "inputs": {
#      "feat_ids": [test_example['feat_ids'], test_example['feat_ids']],
#      "feat_vals": [test_example['feat_vals'], test_example['feat_vals']]
#    }
#   }

# Active request body ("instances" / row layout): two independent copies of the
# sample above.
more_example = {
    "instances": [
        {field: list(values) for field, values in test_example.items()}
        for _ in range(2)
    ]
}

response = runtime.invoke_endpoint(
    EndpointName=endpointname,
    ContentType='application/json',
    Body=json.dumps(more_example),
)
print(response)

# The response body is a stream; read and decode it before parsing the JSON.
result = json.loads(response['Body'].read().decode())
print(result)


{'feat_ids': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 555, 1078, 17797, 26190, 26341, 28570, 35361, 35613, 35984, 48424, 51364, 64053, 65964, 66206, 71628, 84088, 84119, 86889, 88280, 88283, 100288, 100300, 102447, 109932, 111823], 'feat_vals': [0.05, 0.006633, 0.05, 0, 0.021594, 0.008, 0.15, 0.04, 0.362, 0.1, 0.2, 0, 0.04, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'ResponseMetadata': {'RequestId': '61c4bd2e-0f1d-4e8c-a861-ad482ab5caab', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '61c4bd2e-0f1d-4e8c-a861-ad482ab5caab', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Fri, 6 Mar 2020 05:30:45 GMT', 'content-type': 'application/json', 'content-length': '54'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f2011fcd518>}
{'predictions': [0.403204918, 0.403204918]}


In [112]:
#4. When the experiment is done, delete the deployed endpoint to avoid unnecessary charges.
#NOTE(review): `predictor` is assigned by two different deploy() calls earlier in this notebook;
#only the endpoint it currently references is deleted here -- confirm no other endpoint is left running.
sagemaker.Session().delete_endpoint(predictor.endpoint)

ClientError: An error occurred (ValidationException) when calling the DeleteEndpoint operation: Could not find endpoint "arn:aws:sagemaker:us-east-1:514385905925:endpoint/tf-scriptmode-deepctr-deepfm-2020-02-24-07-02-13-403".

In [8]:
import tensorflow as tf


# Toy check: compute "square of the sum minus sum of the squares" over the latent
# factors in two ways -- (a) embedding lookup, (b) matrix multiplication -- and
# print both results for comparison.
#
# Changes from the original cell:
#   * tf.initialize_all_variables() is deprecated; tf.global_variables_initializer()
#     is its replacement.
#   * a bare `feat_ids` expression inside the session block did nothing and was removed.

# Feature values of one sample, reshaped to one row of three fields.
feat_vals = tf.constant([1,0,1])
feat_vals = tf.reshape(feat_vals,shape=[-1,3])

# Latent factor table: one 2-element row per feature id.
FM_V = tf.constant([[1,2],[3,4],[5,6]])

#feat_ids = tf.constant([0,2])

feat_ids = tf.constant([0,1,2])
feat_ids = tf.reshape(feat_ids,shape=[-1,3])

# (a) Look up each id's factors, then scale them by the matching feature value.
embeddings = tf.nn.embedding_lookup(FM_V, feat_ids)
#feat_vals_2 = tf.nn.embedding_lookup(feat_vals, feat_ids)
feat_vals_2 = tf.reshape(feat_vals, shape=[-1, 3, 1])

embeddings = tf.multiply(embeddings, feat_vals_2)
sum_square = tf.square(tf.reduce_sum(embeddings,1))

square_sum = tf.reduce_sum(tf.square(embeddings),1)

y_v = tf.reduce_sum(tf.subtract(sum_square, square_sum),1)


# (b) Same quantity via matrix multiplication.
feat_vals_1 = tf.reshape(feat_vals, shape=[-1,3])
Y_FM = tf.reduce_sum(
        tf.pow(tf.matmul(feat_vals_1, FM_V), 2) -
        tf.matmul(feat_vals_1, tf.pow(FM_V, 2)), 1)


init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # Evaluate each tensor; the names are rebound to the resulting arrays.
    Y_FM = sess.run(Y_FM)
    FM_SHAPE = sess.run(FM_V)
    print("FM_SHAPE",FM_SHAPE.shape)
    feat_ids_SHAPE = sess.run(feat_ids)
    print("feat_ids_SHAPE",feat_ids_SHAPE.shape)
    print(Y_FM)
    embeddings = sess.run(embeddings)
    data = sess.run(sum_square)
    square_sum = sess.run(square_sum)
    y_v = sess.run(y_v)
    print(embeddings)
    print(data)
    print(square_sum)
    print(y_v)

FM_SHAPE (3, 2)
feat_ids_SHAPE (1, 3)
[34]
[[[1 2]
  [0 0]
  [5 6]]]
[[36 64]]
[[26 40]]
[34]


In [46]:
# Floor division scratch check: 0 // 1.
0//1

0

In [47]:
# Floor division scratch check: 1 // 1.
1//1

1

In [48]:
# True division scratch check: 7 / 2 yields a float.
7/2

3.5

In [49]:
# True division scratch check: 7 / 4 yields a float.
7/4

1.75

In [50]:
# Floor division scratch check: 7 // 4 drops the fractional part.
7//4

1

In [7]:
import tensorflow as tf
# Turn on eager execution (batch.numpy() below needs eager tensors).
#tf.compat.v1.disable_eager_execution()
tf.enable_eager_execution()

# Pipeline order under test: cache -> shuffle -> repeat -> batch.
ds = (
    tf.data.Dataset.range(10)
    .cache()
    .shuffle(5, reshuffle_each_iteration=True)
    .repeat(9)
    .batch(2)
)

# Iterate the whole dataset nine times, printing every batch and then the
# number of batches seen in that pass.
for epoch in range(9):
  i = 0
  for i, batch in enumerate(ds, start=1):
    print(batch.numpy())
  print("End of epoch: ", epoch)
  print(i)

[4 1]
[2 3]
[7 9]
[5 0]
[8 6]
[0 5]
[6 3]
[7 1]
[8 2]
[4 9]
[4 0]
[1 7]
[8 3]
[5 6]
[9 2]
[2 4]
[0 5]
[3 6]
[9 8]
[7 1]
[0 2]
[3 7]
[5 6]
[4 8]
[9 1]
[0 2]
[1 5]
[6 7]
[8 4]
[9 3]
[3 1]
[2 5]
[8 6]
[4 9]
[0 7]
[2 5]
[1 0]
[7 6]
[3 8]
[9 4]
[2 4]
[5 6]
[0 7]
[8 1]
[9 3]
End of epoch:  0
45
[4 1]
[2 3]
[7 9]
[5 0]
[8 6]
[0 5]
[6 3]
[7 1]
[8 2]
[4 9]
[4 0]
[1 7]
[8 3]
[5 6]
[9 2]
[2 4]
[0 5]
[3 6]
[9 8]
[7 1]
[0 2]
[3 7]
[5 6]
[4 8]
[9 1]
[0 2]
[1 5]
[6 7]
[8 4]
[9 3]
[3 1]
[2 5]
[8 6]
[4 9]
[0 7]
[2 5]
[1 0]
[7 6]
[3 8]
[9 4]
[2 4]
[5 6]
[0 7]
[8 1]
[9 3]
End of epoch:  1
45
[4 1]
[2 3]
[7 9]
[5 0]
[8 6]
[0 5]
[6 3]
[7 1]
[8 2]
[4 9]
[4 0]
[1 7]
[8 3]
[5 6]
[9 2]
[2 4]
[0 5]
[3 6]
[9 8]
[7 1]
[0 2]
[3 7]
[5 6]
[4 8]
[9 1]
[0 2]
[1 5]
[6 7]
[8 4]
[9 3]
[3 1]
[2 5]
[8 6]
[4 9]
[0 7]
[2 5]
[1 0]
[7 6]
[3 8]
[9 4]
[2 4]
[5 6]
[0 7]
[8 1]
[9 3]
End of epoch:  2
45
[4 1]
[2 3]
[7 9]
[5 0]
[8 6]
[0 5]
[6 3]
[7 1]
[8 2]
[4 9]
[4 0]
[1 7]
[8 3]
[5 6]
[9 2]
[2 4]
[0 5]
[3 6]
[9 8]
[7 1]
[0 2]
[3 7

In [8]:
import tensorflow as tf
# Turn on eager execution (batch.numpy() below needs eager tensors).
#tf.compat.v1.disable_eager_execution()
tf.enable_eager_execution()

# Pipeline order under test: cache -> shuffle -> batch -> repeat.
ds = (
    tf.data.Dataset.range(10)
    .cache()
    .shuffle(5, reshuffle_each_iteration=True)
    .batch(2)
    .repeat(9)
)

# Iterate the whole dataset nine times, printing every batch and then the
# number of batches seen in that pass.
for epoch in range(9):
  i = 0
  for i, batch in enumerate(ds, start=1):
    print(batch.numpy())
  print("End of epoch: ", epoch)
  print(i)

[1 4]
[2 3]
[0 5]
[8 7]
[9 6]
[0 3]
[2 7]
[8 5]
[1 6]
[9 4]
[0 5]
[6 3]
[8 1]
[2 4]
[7 9]
[4 3]
[2 1]
[7 9]
[5 0]
[6 8]
[1 4]
[0 7]
[6 2]
[5 3]
[8 9]
[2 0]
[5 1]
[4 8]
[6 3]
[9 7]
[3 0]
[1 2]
[5 4]
[6 9]
[7 8]
[2 0]
[5 4]
[7 6]
[1 8]
[3 9]
[0 4]
[2 5]
[7 3]
[6 9]
[8 1]
End of epoch:  0
45
[1 4]
[2 3]
[0 5]
[8 7]
[9 6]
[0 3]
[2 7]
[8 5]
[1 6]
[9 4]
[0 5]
[6 3]
[8 1]
[2 4]
[7 9]
[4 3]
[2 1]
[7 9]
[5 0]
[6 8]
[1 4]
[0 7]
[6 2]
[5 3]
[8 9]
[2 0]
[5 1]
[4 8]
[6 3]
[9 7]
[3 0]
[1 2]
[5 4]
[6 9]
[7 8]
[2 0]
[5 4]
[7 6]
[1 8]
[3 9]
[0 4]
[2 5]
[7 3]
[6 9]
[8 1]
End of epoch:  1
45
[1 4]
[2 3]
[0 5]
[8 7]
[9 6]
[0 3]
[2 7]
[8 5]
[1 6]
[9 4]
[0 5]
[6 3]
[8 1]
[2 4]
[7 9]
[4 3]
[2 1]
[7 9]
[5 0]
[6 8]
[1 4]
[0 7]
[6 2]
[5 3]
[8 9]
[2 0]
[5 1]
[4 8]
[6 3]
[9 7]
[3 0]
[1 2]
[5 4]
[6 9]
[7 8]
[2 0]
[5 4]
[7 6]
[1 8]
[3 9]
[0 4]
[2 5]
[7 3]
[6 9]
[8 1]
End of epoch:  2
45
[1 4]
[2 3]
[0 5]
[8 7]
[9 6]
[0 3]
[2 7]
[8 5]
[1 6]
[9 4]
[0 5]
[6 3]
[8 1]
[2 4]
[7 9]
[4 3]
[2 1]
[7 9]
[5 0]
[6 8]
[1 4]
[0 7

In [11]:
import tensorflow as tf
from tensorflow import keras

# Session options: two intra-op threads, two inter-op threads, soft device
# placement enabled, and a CPU device count of 2.
config = tf.ConfigProto(
    intra_op_parallelism_threads=2,
    inter_op_parallelism_threads=2,
    allow_soft_placement=True,
    device_count={'CPU': 2},
)

# Open a session with those options and register it as the Keras backend session.
session = tf.Session(config=config)
tf.keras.backend.set_session(session)

In [None]:
# Sample cluster description used to try out the master -> chief rewrite below.
test_config = {
    'cluster': {
        'master': ['algo-1:2222'],
        'ps': ['algo-1:2223', 'algo-2:2223'],
        'worker': ['algo-2:2222'],
    },
    'environment': 'cloud',
    'task': {'index': 0, 'type': 'worker'},
}

In [None]:
import json
import os

# Rename the 'master' role to 'chief' and export the result through the
# `test_config` environment variable.
# Fix: `test_config` is a dict in this notebook, so the original
# `'master' in test_config` only inspected the top-level keys (and was never true),
# dict has no `.replace()`, and os.environ only accepts strings. The config is
# therefore serialized to JSON text first; a config that is already a string is
# used unchanged.
test_config_text = test_config if isinstance(test_config, str) else json.dumps(test_config)
if test_config and 'master' in test_config_text:
    print("enter")
    os.environ['test_config'] = test_config_text.replace('master', 'chief')

In [None]:
# Display the exported value; this lookup raises KeyError if `test_config` was never set.
os.environ['test_config']

In [None]:
# Same lookup through .get(), which returns None instead of raising when the variable is unset.
test = os.environ.get('test_config')
test