In [1]:
!nvidia-smi

Wed May 30 00:55:25 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.130                Driver Version: 384.130                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  Off  | 00000000:29:00.0  On |                  N/A |
| 18%   53C    P2    61W / 280W |    834MiB / 11169MiB |      5%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [2]:
from mxnet import nd
from mxnet import gluon

# initialize parameters
scale = .01
W1 = nd.random.normal(shape = (20, 1, 3, 3)) * scale
b1 = nd.zeros(shape = 20)
W2 = nd.random.normal(shape = (50, 20, 5, 5)) * scale
b2 = nd.zeros(shape = 50)
W3 = nd.random.normal(shape = (800, 128)) * scale
b3 = nd.zeros(128)
W4 = nd.random.normal(shape = (128, 10))
b4 = nd.zeros(shape = 10)
params = [W1, b1, W2, b2, W3, b3, W4, b4]

# network and loss
def lenet(X, params):
    # 1st conv
    h1_conv = nd.Convolution(data = X, weight = params[0], bias = params[1], 
                             kernel = (3, 3), num_filter = 20)
    h1_activation = nd.relu(h1_conv)
    h1 = nd.Pooling(data = h1_activation, pool_type = 'avg', 
                    kernel = (2, 2), stride = (2, 2))
    # 2nd conv
    h2_conv = nd.Convolution(data = h1, weight = params[2], bias = params[3], 
                             kernel = (5, 5), num_filter = 50)
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(data = h2_activation, pool_type = 'avg', 
                    kernel = (2, 2), stride = (2,2))
    h2_flat = nd.flatten(h2)
    # 1st dense
    h3_linear = nd.dot(h2_flat, params[4]) + params[5]
    h3 = nd.relu(h3_linear)
    # 2nd dense
    yhat = nd.dot(h3, params[6]) + params[7]
    return yhat

loss = gluon.loss.SoftmaxCrossEntropyLoss()

  import OpenSSL.SSL


In [3]:
from mxnet import gpu
from mxnet import cpu

def get_params(params, ctx):
    new_params = [p.copyto(ctx) for p in params]
    for p in new_params:
        p.attach_grad()
    return new_params

# copy params to GPU(0)
new_params = get_params(params, gpu(0))
print('b1 weight = ', new_params[1])
print('b1 grad = ', new_params[1].grad)

b1 weight =  
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
<NDArray 20 @gpu(0)>
b1 grad =  
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
<NDArray 20 @gpu(0)>


In [4]:
import mxnet as mx
def allreduce(data):
    # sum on data[0].context, and then broadcast
    for i in range(1, len(data)):
        data[0][:] += data[i].copyto(data[0].context)
    for i in range(1, len(data)):
        data[0].copyto(data[i])
        
# data = [nd.ones((1, 2), ctx = mx.gpu(i)) * (i + 1) for i in range(2)]
data = [nd.ones((1, 2), ctx = gpu(0)), nd.ones((1, 2), ctx = cpu(0)) * 2]
print('Before:', data)
allreduce(data)
print('After:', data)

Before: [
[[ 1.  1.]]
<NDArray 1x2 @gpu(0)>, 
[[ 2.  2.]]
<NDArray 1x2 @cpu(0)>]
After: [
[[ 3.  3.]]
<NDArray 1x2 @gpu(0)>, 
[[ 3.  3.]]
<NDArray 1x2 @cpu(0)>]


In [5]:
def split_and_load(data, ctx):
    n, k = data.shape[0], len(ctx)
    m = n // k
    assert m * k == n, '# examples is not divided by # devices'
    return [data[i * m : (i + 1) * m].as_in_context(ctx[i]) for i in range(k)]

batch = nd.arange(16).reshape((4, 4))
batch = batch.as_in_context(gpu(0))
# ctx = [gpu(0), gpu(1)]
ctx = [gpu(0), cpu(0)]
splitted = split_and_load(batch, ctx)

print('Input: ', batch)
print('Load into', ctx)
print('Output:', splitted)

Input:  
[[  0.   1.   2.   3.]
 [  4.   5.   6.   7.]
 [  8.   9.  10.  11.]
 [ 12.  13.  14.  15.]]
<NDArray 4x4 @gpu(0)>
Load into [gpu(0), cpu(0)]
Output: [
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]]
<NDArray 2x4 @gpu(0)>, 
[[  8.   9.  10.  11.]
 [ 12.  13.  14.  15.]]
<NDArray 2x4 @cpu(0)>]


In [6]:
print(len(params[0]))

20


In [7]:
from mxnet import  autograd as ag
import sys
sys.path.append('..')
import utils

def train_batch(data, label, params, ctx, lr):
    # split the data batch and load them on GPUs
    data_list = split_and_load(data, ctx)
    label_list = split_and_load(label, ctx)
    # run forward on each GPU
    with ag.record():
        losses = [loss(lenet(X, W), Y) 
                  for X, Y, W in zip(data_list, label_list, params)]
    # run backward on each GPU
    for l in losses:
        l.backward()
    # aggregate gradient over GPUs
    for i in range(len(params[0])):  ######## W1 b1 W2 b2 ... W4 b4  8个 ######
        allreduce([params[c][i].grad for c in range(len(ctx))])
    # update params with SGD on each GPU
    for p in params:
        utils.SGD(p, lr / data.shape[0])

In [8]:
from time import time

def train(num_gpus, batch_size, lr):
    train_data, test_data = utils.load_data_fashion_mnist(batch_size)
    
#     ctx = [gpu(i) for i in range(num_gpus)]
    try:
        num_gpus > 2
    except:
        print('Not more than 1 GPU and 1 CPU')
#     assert num_gpus < 3
    if num_gpus == 1:
        ctx = [gpu(0)]
    elif num_gpus == 2:
        ctx = [gpu(0), cpu(0)]
    else:
        return 0
    print('Running on', ctx)
    
    # copy params to all GPUs
    dev_params = [get_params(params, c) for c in ctx]   # shape is 2 x 8
    
    for epoch in range(5):
        # train
        start = time()
        for data, label in train_data:
            train_batch(data, label, dev_params, ctx, lr)
        nd.waitall()
        print('Epoch %d, training time = %.1f sec' % 
              (epoch, time() - start))
        
        # validating on GPU 0
        net = lambda data: lenet(data, dev_params[0])
        test_acc = utils.evaluate_accuracy(test_data, net, ctx[0])
        print('      validation accuray = %.4f' % (test_acc))

In [9]:
train(1, 256, 0.3)

Running on [gpu(0)]
Epoch 0, training time = 1.2 sec
      validation accuray = 0.1000
Epoch 1, training time = 0.9 sec
      validation accuray = 0.1002
Epoch 2, training time = 1.1 sec
      validation accuray = 0.1002
Epoch 3, training time = 0.8 sec
      validation accuray = 0.1001
Epoch 4, training time = 0.8 sec
      validation accuray = 0.1001


In [10]:
train(2, 256, 0.3)
# CPU is too slow, so total time is slow

Running on [gpu(0), cpu(0)]
Epoch 0, training time = 18.9 sec
      validation accuray = 0.6487
Epoch 1, training time = 18.9 sec
      validation accuray = 0.7831
Epoch 2, training time = 18.8 sec
      validation accuray = 0.7979
Epoch 3, training time = 18.9 sec
      validation accuray = 0.8086
Epoch 4, training time = 19.0 sec
      validation accuray = 0.8205


In [11]:
train(2, 512, 0.6)

Running on [gpu(0), cpu(0)]
Epoch 0, training time = 18.8 sec
      validation accuray = 0.0998
Epoch 1, training time = 19.3 sec
      validation accuray = 0.1007
Epoch 2, training time = 19.2 sec
      validation accuray = 0.1003
Epoch 3, training time = 19.3 sec
      validation accuray = 0.1009
Epoch 4, training time = 19.3 sec
      validation accuray = 0.1002


In [12]:
train(3, 768, 0.9)

0