# Chapter 12: Distributing TensorFlow Across Devices and Servers

This notebook is my solution to exercise 10 of chapter 12. It contains three distributed models. Each model requires you to restart the kernel and run the code in the **Installation** section.

## Exercise 10

Train a DNN using between-graph replication and data parallelism with asynchronous updates, timing how long it takes to reach a satisfying performance. Next, try again using synchronous updates. Do synchronous updates produce a better model? Does it train faster? Split the DNN vertically and place each vertical slice on a different device, and train the model again. Is training any faster? Is performance any different?

## Solution

### Installation

In [1]:
# Print the version of the installed CUDA compiler driver (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [0]:
!pip3 install --upgrade tensorflow-gpu

### Asynchronous Updates

In [0]:
# Loading MNIST and preparing the train / validation / test arrays.

import tensorflow as tf
import numpy as np

(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten each image into one row of 28 * 28 float32 values and divide by
# 255.0; labels are converted to int32.
X_train = X_train.reshape(-1, 28 * 28).astype(np.float32) / 255.0
X_test = X_test.reshape(-1, 28 * 28).astype(np.float32) / 255.0
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)

# The first 5000 training examples are held out as the validation set; the
# remainder stays as the training set.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

In [0]:
# Defining the cluster spec for the parallel model: one parameter server and
# n_dnns workers, all on localhost.

n_dnns = 3

cluster_spec = tf.train.ClusterSpec({
    'ps': ['127.0.0.1:1000'],
    # One address per worker task index 0..n_dnns-1. The worker job has to
    # list n_dnns addresses because a server is started for each of those
    # task indices further down; starting the range at 1 would leave the last
    # task index without an address.
    'worker': ['127.0.0.1:100{}'.format(i + 1) for i in range(n_dnns)]
})

In [0]:
# Abstracting the operations with the individual workers which train their own
# copy of the DNN into a class. This model uses the hyperparameters that led
# to the best performance on the validation set in exercise 8.

n_outputs = 10

class DNNTask:
  """Graph and training loop for a single worker task's copy of the DNN.

  The constructor adds this task's ops to the current default graph. The
  hidden layers are computed from the (W, b) pairs handed in through
  `parameters`; the output layer is created here, under the 'worker<task>'
  variable scope. `train_model` runs the training loop in a session supplied
  by the caller.
  """

  def __init__(self, X, y, task, parameters, activation=tf.nn.elu, n_outputs=10,
               learning_rate=0.01, momentum=0.95, n_epochs=200, batch_size=50):
    """Build the ops for worker `task`.

    Args:
      X: input tensor; also used as a feed key in `train_model`.
      y: label tensor; also used as a feed key in `train_model`.
      task: worker task index, used in the device names, the variable scope
        and the checkpoint file name.
      parameters: sequence of (W, b) pairs, one per hidden layer.
      activation: function applied to each hidden layer's `matmul(...) + b`.
      n_outputs: number of units in the output (logits) layer.
      learning_rate: learning rate passed to the MomentumOptimizer.
      momentum: momentum passed to the MomentumOptimizer.
      n_epochs: maximum number of epochs `train_model` runs.
      batch_size: number of examples requested per dequeue.
    """
    self.task = task
    self.parameters = parameters
    self.batch_size = batch_size
    # Uses the module-level X_train (not the X argument) to size an epoch.
    self.n_batches = len(X_train) // batch_size
    self.n_epochs = n_epochs

    self._X = X
    self._y = y
    self._gpu_name = '/job:worker/task:{}/gpu:0'.format(task)
    self._cpu_name = '/job:worker/task:{}/cpu:0'.format(task)
    self._model_path = 'model{}.ckpt'.format(task)

    # Forward pass, loss and training op are pinned to this worker's GPU.
    with tf.device(self._gpu_name):
      with tf.variable_scope('worker{}'.format(task)):
        self._hidden_layers = []
        # Each hidden layer feeds on the previous one; the first feeds on X.
        for i, params in enumerate(parameters):
          W, b = params
          self._hidden_layers.append(
              activation(
                  tf.matmul(
                      (X if i == 0 else self._hidden_layers[-1]), W) + b))
        self._logits = tf.layers.dense(self._hidden_layers[-1], n_outputs)
        self._xentropy = \
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                           logits=self._logits)
        self._loss = tf.reduce_mean(self._xentropy)
        self._optimizer = \
            tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=momentum)
        self._training_op = self._optimizer.minimize(self._loss)

    # Bookkeeping ops (saver, initializer, accuracy, input queue) go on the
    # worker's CPU.
    with tf.device(self._cpu_name):
      with tf.variable_scope('worker{}'.format(task)):
        self._saver = tf.train.Saver()
        # NOTE(review): global_variables_initializer() is not restricted to
        # this task's variables; running it from train_model presumably also
        # re-initializes the `parameters` variables and any other task's
        # variables already in the graph -- confirm this is intended.
        self._init = tf.global_variables_initializer()
        self._correct = tf.nn.in_top_k(self._logits, y, 1)
        self._accuracy = tf.reduce_mean(tf.cast(self._correct, tf.float32))
        # `n_inputs` is a module-level name read when the constructor runs,
        # so it must be defined before any DNNTask is created. `(n_inputs)`
        # is a plain int, not a 1-tuple. Every task passes the same
        # shared_name ('input_queue').
        # NOTE(review): presumably that makes all tasks use one queue --
        # confirm.
        self._shuffle_queue = \
            tf.RandomShuffleQueue(capacity=len(X_train), min_after_dequeue=0,
                                  dtypes=[tf.float32, tf.int32],
                                  shapes=[(n_inputs), ()], name='input_queue',
                                  shared_name='input_queue')
        self._enqueue_op = self._shuffle_queue.enqueue_many([X, y],
                                                            name='enqueue')
        self._dequeue_op = self._shuffle_queue.dequeue_up_to(batch_size,
                                                             name='dequeue')

  def train_model(self, sess):
    """Train this task's model in `sess`, then print its test-set accuracy.

    Every epoch enqueues the module-level training set (X_train, y_train) and
    then runs `n_batches` training steps on dequeued batches. The loss on the
    training set is recorded after epoch 0 and re-checked at every later
    epoch divisible by 5: an improvement saves a checkpoint, and the sixth
    check in a row without improvement stops training. Finally the checkpoint
    at `_model_path` is restored and accuracy on the module-level
    (X_test, y_test) is printed.

    Args:
      sess: session in which all ops are run.
    """
    with sess.as_default():
      sess.run(self._init)

      # Placeholder value; replaced by the real loss after epoch 0.
      best_loss = 0
      rounds_since_best_loss = 0

      for epoch in range(self.n_epochs):
        sess.run(self._enqueue_op, feed_dict={self._X: X_train,
                                               self._y: y_train})
        for _ in range(self.n_batches):
          X_batch, y_batch = sess.run(self._dequeue_op)
          sess.run(self._training_op, feed_dict={self._X: X_batch,
                                                 self._y: y_batch})
        # NOTE(review): the early-stopping metric below is the loss on the
        # training set; X_valid / y_valid are not used here -- confirm that
        # is intended.
        if epoch == 0:
          # First epoch: establish the baseline loss and checkpoint.
          best_loss = self._loss.eval(feed_dict={self._X: X_train,
                                                 self._y: y_train})
          self._saver.save(sess, self._model_path)
        elif epoch % 5 == 0:
          loss_val = self._loss.eval(feed_dict={self._X: X_train,
                                                self._y: y_train})
          if loss_val < best_loss:
            best_loss = loss_val
            rounds_since_best_loss = 0
            self._saver.save(sess, self._model_path)
          else:
            rounds_since_best_loss += 1
            if rounds_since_best_loss == 6:
              break
      else:
        # Reached only when the loop ran all n_epochs without `break`.
        # NOTE(review): this overwrites the best-loss checkpoint with the
        # final weights, so the restore below loads the final model in that
        # case -- confirm that is intended.
        self._saver.save(sess, self._model_path)

      self._saver.restore(sess, self._model_path)
      acc_val = self._accuracy.eval(feed_dict={self._X: X_test,
                                               self._y: y_test})
      print('Task {} Complete!\nTest set accuracy: {}'.format(
          self.task, acc_val))

In [8]:
# Building the graph: the placeholders and the hidden-layer parameters are
# placed on the parameter server, then one DNNTask per worker is built on top
# of them.

n_inputs = 28 * 28
n_hidden_layers = 5
n_neurons = 160
stddev = 2.0 / np.sqrt(n_inputs + n_neurons)

tf.reset_default_graph()

with tf.device('/job:ps/task:0/cpu:0'), tf.variable_scope('ps0'):
  X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
  y = tf.placeholder(tf.int32, shape=None, name='y')

  # One (W, b) pair per hidden layer; only the first layer takes n_inputs
  # columns, every later layer takes the previous layer's n_neurons outputs.
  parameters = []
  for i in range(n_hidden_layers):
    W = tf.Variable(
        tf.truncated_normal(
            (n_neurons if i > 0 else n_inputs, n_neurons),
            mean=0.0, stddev=stddev))
    b = tf.Variable(tf.zeros([n_neurons]))
    parameters.append((W, b))

  init = tf.global_variables_initializer()

# Every worker task gets its own DNNTask wired to the same X, y and
# parameters.
ensemble = [DNNTask(X, y, task, parameters) for task in range(n_dnns)]

Instructions for updating:
Use keras.layers.dense instead.


In [0]:
# Starting one tf.train.Server per cluster member: the single parameter
# server first, then one server for each worker task index.

ps = tf.train.Server(cluster_spec, job_name='ps', task_index=0)
workers = [
    tf.train.Server(cluster_spec, job_name='worker', task_index=worker_index)
    for worker_index in range(n_dnns)
]

In [0]:
# Defining the Clock class for timing training.

import time

class Clock:
  """Minimal stopwatch that reports elapsed time as '<h>h <m>m <s.sss>s'."""

  def __init__(self):
    # None means "not running"; set by start() and cleared again by stop().
    self.start_time = None

  def start(self):
    """Begin (or restart) timing and return self so calls can be chained."""
    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during a long training run.
    self.start_time = time.perf_counter()
    return self

  def stop(self):
    """Stop timing and return the elapsed time as a formatted string.

    Returns:
      A string such as '1h 26m 26.734s'.

    Raises:
      RuntimeError: if the clock is not running, i.e. start() was never
        called or stop() was already called.
    """
    if self.start_time is None:
      raise RuntimeError('Clock.stop() called before Clock.start()')
    dt = time.perf_counter() - self.start_time
    self.start_time = None
    # Round to whole milliseconds before splitting, so the seconds field can
    # never be printed as '60.000'.
    total_ms = round(dt * 1000)
    h, rem_ms = divmod(total_ms, 3600 * 1000)
    m, rem_ms = divmod(rem_ms, 60 * 1000)
    return '{}h {}m {:.3f}s'.format(h, m, rem_ms / 1000)

In [14]:
# Running the training algorithm: one thread per worker task, all sharing a
# single session connected to the parameter server, timed end to end.

from threading import Thread

clock = Clock().start()

with tf.Session(ps.target) as sess:
  sess.run(init)
  threads = []
  for task in range(n_dnns):
    # Hand the thread the bound method of this specific task. A lambda that
    # looks up `ensemble[task]` would only read `task` when the thread
    # actually runs, by which time the loop may already have moved on, so two
    # threads could end up training the same task.
    thread = Thread(target=ensemble[task].train_model, args=(sess,))
    thread.start()
    threads.append(thread)
  # Wait for every worker to finish before the session is closed.
  for thread in threads:
    thread.join()
print('Time taken to train model:', clock.stop())

INFO:tensorflow:Restoring parameters from model0.ckpt
Task 0 Complete!
Test set accuracy: 0.9850000143051147
INFO:tensorflow:Restoring parameters from model1.ckpt
Task 1 Complete!
Test set accuracy: 0.9850999712944031
INFO:tensorflow:Restoring parameters from model2.ckpt
Task 2 Complete!
Test set accuracy: 0.9847999811172485
Time taken to train model: 1h 26m 26.734s


The model performed slightly better than a single neural network with those parameters, but training took significantly longer.

### Synchronous Updates

In [0]:
# Loading MNIST and preparing the train / validation / test arrays.

import tensorflow as tf
import numpy as np

(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Flatten each image into one row of 28 * 28 float32 values and divide by
# 255.0; labels are converted to int32.
X_train = X_train.reshape(-1, 28 * 28).astype(np.float32) / 255.0
X_test = X_test.reshape(-1, 28 * 28).astype(np.float32) / 255.0
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)

# The first 5000 training examples are held out as the validation set; the
# remainder stays as the training set.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

In [0]:
# Describing the cluster: one parameter server plus n_dnns workers, all on
# localhost, with worker ports written as '100' followed by 1..n_dnns.

n_dnns = 3

cluster_spec = tf.train.ClusterSpec({
    'ps': ['127.0.0.1:1000'],
    'worker': ['127.0.0.1:100' + str(suffix)
               for suffix in range(1, n_dnns + 1)],
})

In [0]:
# Abstracting the training routine of each DNN worker task into a class.
# This class is not responsible for training the model. It receives
# parameter updates from the parameter server job and then computes the
# error gradient which it sends back to the parameter server to update.
#
# NOTE(review): the class below is still a stub -- none of the behaviour
# described above is implemented yet.

class DNNTask:
  def __init__(self, X, y, task, parameters, activation=tf.nn.elu, n_outputs=10,
               learning_rate=0.01, momentum=0.95):
    # NOTE(review): this stores the tf.placeholder function itself (it is
    # never called) and every constructor argument is ignored -- looks
    # unfinished.
    self._X = tf.placeholder

  def run_training_epoch(self, sess):
    # Not implemented yet: currently a no-op.
    pass

In [0]:
# Defining the graph for training the model.

n_inputs = 28 ** 2
n_hidden_layers = 5
n_neurons = 160
stddev = 2.0 / np.sqrt(n_inputs + n_neurons)

with tf.device('/job:ps/task:0/cpu:0'):
  with tf.variable_scope('ps0'):
    X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
    y = tf.placeholder(tf.int32, shape=(None), name='y')

    parameters = []
    for i in range(n_hidden_layers):
      W = tf.Variable(
          tf.truncated_normal(
              (n_inputs if i == 0 else n_neurons, n_neurons),
              mean=0.0, stddev=stddev))
      b = tf.Variable(tf.zeros([n_neurons]))
      parameters.append((W, b))