# Multi GPU

Keras 2.0.9 makes it really easy to use multiple GPUs for data-parallel training. Let's see how it's done!

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

from keras.datasets import fashion_mnist
from keras.utils import to_categorical

## Load data
We use the newly added Fashion-MNIST dataset from Zalando Research:

https://github.com/zalandoresearch/fashion-mnist


    Label	Description
     0	     T-shirt/top
     1	     Trouser
     2	     Pullover
     3	     Dress
     4	     Coat
     5	     Sandal
     6	     Shirt
     7	     Sneaker
     8	     Bag
     9	     Ankle boot

In [None]:
# Fetch the Fashion-MNIST train/test splits through the Keras dataset helper.
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Cast the pixels to float and append a trailing channel axis of size 1,
# matching the (28, 28, 1) input shape the convolutional model expects.
X_train = X_train.astype("float")[..., np.newaxis]
X_test = X_test.astype("float")[..., np.newaxis]

# Categorical (one-hot) targets for the categorical cross-entropy loss;
# the integer labels y_train / y_test are kept for plotting.
Y_train, Y_test = to_categorical(y_train), to_categorical(y_test)

In [None]:
# Sanity check: the last axis should have size 1 (the channel axis appended
# when the data was loaded).
X_train.shape

In [None]:
# Preview the first ten training images, each titled with its integer label.
# The grid has ten columns, so all ten panels are filled; matplotlib subplot
# indices are 1-based, hence sample i is drawn in panel i + 1.
n_preview = 10
plt.figure(figsize=(12, 4))
for i in range(n_preview):
    plt.subplot(1, n_preview, i + 1)
    plt.imshow(X_train[i].reshape(28, 28), cmap='gray')
    plt.title(y_train[i])

In [None]:
# Centre both splits on the per-pixel mean of the *training* images
# (axis 0 is the sample axis). The subtraction happens in place, so the
# arrays are modified rather than copied.
mean = X_train.mean(axis=0)
for images in (X_train, X_test):
    images -= mean

## Data Generator

We use an `ImageDataGenerator` to augment the training data with random shifts and horizontal flips.

In [None]:
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Augmentation settings: random horizontal/vertical shifts (range 0.1),
# random horizontal flips, and "nearest" filling for pixels uncovered by a shift.
augmentation = dict(
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    fill_mode="nearest",
)
datagen = ImageDataGenerator(**augmentation)

## Convolutional Model

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D
from keras.layers import Activation, Dropout, Flatten, Dense

In [None]:
def create_conv_model(input_shape=(28, 28, 1),
                      n_classes=10,
                      activation='relu',
                      kernel_initializer='glorot_normal'):
    """Build the (uncompiled) convolutional classifier.

    The network is two Conv-Conv-MaxPool stages (32 filters, then 64 filters,
    all 3x3 kernels, 2x2 pooling) followed by Flatten, a 512-unit dense layer
    and a softmax output layer.

    Args:
        input_shape: shape of one input sample, passed to the first Conv2D.
        n_classes: number of units in the softmax output layer.
        activation: activation used by every layer except the output layer.
        kernel_initializer: initializer used by every Conv2D and Dense layer.

    Returns:
        A ``Sequential`` model; compiling it is left to the caller.
    """
    # Options shared by every hidden Conv2D / Dense layer.
    hidden = dict(activation=activation,
                  kernel_initializer=kernel_initializer)

    layers = [
        # Stage 1: 32 filters. Only the first conv of a stage pads.
        Conv2D(32, (3, 3), padding='same', input_shape=input_shape, **hidden),
        Conv2D(32, (3, 3), **hidden),
        MaxPool2D(pool_size=(2, 2)),

        # Stage 2: 64 filters.
        Conv2D(64, (3, 3), padding='same', **hidden),
        Conv2D(64, (3, 3), **hidden),
        MaxPool2D(pool_size=(2, 2)),

        # Classifier head.
        Flatten(),
        Dense(512, **hidden),
        Dense(n_classes,
              activation='softmax',
              kernel_initializer=kernel_initializer),
    ]

    return Sequential(layers)

## Multi GPU

In [None]:
from keras.utils.training_utils import multi_gpu_model
import tensorflow as tf

In [None]:
# Number of GPUs to train on. With gpus <= 1 a single plain model is built;
# larger values make the model-building cell wrap it with multi_gpu_model.
gpus = 1

If more than one GPU is present on the machine, we need to create a copy of the model on each GPU and sync them on the CPU: the model is built under `tf.device("/cpu:0")` and then wrapped with `multi_gpu_model`.

In [None]:
# Several GPUs: build the template model under the CPU device scope, then let
# multi_gpu_model wrap it for `gpus` devices. Otherwise use the plain model.
if gpus > 1:
    with tf.device("/cpu:0"):
        template = create_conv_model()
    model = multi_gpu_model(template, gpus=gpus)
else:
    model = create_conv_model()

## Train

We will use a `LearningRateScheduler` callback to adjust the learning rate of SGD

In [None]:
# Total number of training epochs; also the epoch at which poly_decay reaches zero.
n_epochs = 30
# Starting learning rate: given to SGD and used as the epoch-0 value of poly_decay.
initial_lr = 5e-3

In [None]:
def poly_decay(epoch):
    """Return the learning rate for ``epoch`` under polynomial decay.

    The rate equals the global ``initial_lr`` at epoch 0 and reaches zero at
    epoch ``n_epochs``. The exponent is fixed at 1.0, so the decay is linear.
    """
    exponent = 1.0

    # Fraction of the schedule still remaining; float() keeps the division
    # fractional even where ``/`` would otherwise be integer division.
    remaining = 1 - epoch / float(n_epochs)

    return initial_lr * remaining ** exponent

# Preview the schedule: evaluate poly_decay once for every training epoch.
epochs = list(range(n_epochs))
lrs = [poly_decay(epoch) for epoch in epochs]

# Label the figure so it can be read on its own when skimming the notebook.
plt.plot(epochs, lrs)
plt.title('Learning rate schedule')
plt.xlabel('Epoch')
plt.ylabel('Learning rate');

In [None]:
from keras.callbacks import LearningRateScheduler
from keras.optimizers import SGD

In [None]:
# The scheduler callback sets the learning rate from poly_decay during training.
callbacks = [LearningRateScheduler(poly_decay)]

# SGD with momentum, starting at the same rate the schedule begins with.
opt = SGD(lr=initial_lr, momentum=0.9)

model.compile(optimizer=opt,
              loss="categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
# Base batch size; the training cell multiplies it by `gpus` when creating
# the data generator and when computing steps_per_epoch.
batch_size = 1024

In [None]:
# The generator batch is scaled by the number of GPUs in use.
global_batch_size = batch_size * gpus
train_batches = datagen.flow(X_train, Y_train, batch_size=global_batch_size)

# One epoch is as many full generator batches as fit into the training set;
# the (un-augmented) test split is used as validation data.
history = model.fit_generator(
    train_batches,
    steps_per_epoch=len(X_train) // global_batch_size,
    epochs=n_epochs,
    validation_data=(X_test, Y_test),
    callbacks=callbacks,
    verbose=1)

In [None]:
# Training and validation accuracy per epoch, drawn in legend order.
for metric in ('acc', 'val_acc'):
    plt.plot(history.history[metric])
plt.title('Accuracy')
plt.legend(['train', 'test'])
plt.xlabel('Epochs')

## Exercise 1

Your machine has 4 GPUs.
- Compare the training time with 1 GPU vs. 2 GPUs vs. 4 GPUs. Is the training time the same? Is it larger or smaller?
- Try to max out the GPU memory by increasing the batch size. Can you do it?
- Is the model overfitting? Experiment with the model architecture. Try to reduce overfitting by:
    - adding more layers
    - changing the filter size
    - adding more dropout
    - adding regularization
    - adding batch normalization

*Copyright &copy; 2017 CATALIT LLC.  All rights reserved.*