## Convolutional layer

In [1]:
from mxnet import nd

# Array layouts expected by nd.Convolution:
#   data   : batch x channels x height x width             (here 1 x 1 x 4 x 4)
#   weight : out_channels x in_channels x height x width   (here 1 x 1 x 3 x 3)
#   bias   : one value per output channel                  (here a single value)
data = nd.arange(16).reshape((1, 1, 4, 4))
w = nd.arange(9).reshape((1, 1, 3, 3))
b = nd.array([1])

# Kernel size and filter count are read off the weight tensor so they can
# never disagree with it.
out = nd.Convolution(data=data, weight=w, bias=b,
                     kernel=w.shape[2:], num_filter=w.shape[0])

print('input:', data,
      '\n\nweight:', w,
      '\n\nbias:', b,
      '\n\noutput:', out)

input: 
[[[[  0.   1.   2.   3.]
   [  4.   5.   6.   7.]
   [  8.   9.  10.  11.]
   [ 12.  13.  14.  15.]]]]
<NDArray 1x1x4x4 @cpu(0)> 

weight: 
[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]]]
<NDArray 1x1x3x3 @cpu(0)> 

bias: 
[ 1.]
<NDArray 1 @cpu(0)> 

output: 
[[[[ 259.  295.]
   [ 403.  439.]]]]
<NDArray 1x1x2x2 @cpu(0)>


  import OpenSSL.SSL


In [2]:
# pad=(1, 1) surrounds each 4x4 image with a one-pixel border of zeros on every
# side (6x6 after padding). stride=(2, 2) moves the 3x3 window two positions at
# a time, so the output is 2x2 again; with this size and stride only the top
# row and left column of the padding actually fall under a window.
out = nd.Convolution(data=data, weight=w, bias=b,
                     kernel=w.shape[2:], num_filter=w.shape[0],
                     stride=(2, 2), pad=(1, 1))
print('input:', data,
      '\n\nweight:', w,
      '\n\nbias:', b,
      '\n\noutput:', out)

input: 
[[[[  0.   1.   2.   3.]
   [  4.   5.   6.   7.]
   [  8.   9.  10.  11.]
   [ 12.  13.  14.  15.]]]]
<NDArray 1x1x4x4 @cpu(0)> 

weight: 
[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]]]
<NDArray 1x1x3x3 @cpu(0)> 

bias: 
[ 1.]
<NDArray 1 @cpu(0)> 

output: 
[[[[  74.  155.]
   [ 280.  439.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [3]:
# Two input channels, still one output channel: the weight gains a matching
# in_channels axis (1 x 2 x 3 x 3). Each input channel is combined with its own
# 3x3 slice of the kernel and the per-channel results are summed before the
# bias is added, so the output keeps a single channel.
# `b` is the one-element bias created in the first convolution cell.
data = nd.arange(32).reshape((1, 2, 4, 4))
w = nd.arange(18).reshape((1, 2, 3, 3))

out = nd.Convolution(data=data, weight=w, bias=b,
                     kernel=w.shape[2:], num_filter=w.shape[0])

print('input:', data,
      '\n\nweight:', w,
      '\n\nbias:', b,
      '\n\noutput:', out)

input: 
[[[[  0.   1.   2.   3.]
   [  4.   5.   6.   7.]
   [  8.   9.  10.  11.]
   [ 12.  13.  14.  15.]]

  [[ 16.  17.  18.  19.]
   [ 20.  21.  22.  23.]
   [ 24.  25.  26.  27.]
   [ 28.  29.  30.  31.]]]]
<NDArray 1x2x4x4 @cpu(0)> 

weight: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]]
<NDArray 1x2x3x3 @cpu(0)> 

bias: 
[ 1.]
<NDArray 1 @cpu(0)> 

output: 
[[[[ 2794.  2947.]
   [ 3406.  3559.]]]]
<NDArray 1x1x2x2 @cpu(0)>


In [4]:
# Two input channels AND two output channels: the weight is now 2 x 2 x 3 x 3
# (one 2 x 3 x 3 filter per output channel) and the bias carries one entry per
# output channel. The first filter and bias value are the same as in the
# single-output example, so output channel 0 reproduces that result.
data = nd.arange(32).reshape((1, 2, 4, 4))
w = nd.arange(36).reshape((2, 2, 3, 3))
b = nd.array([1, 2])

out = nd.Convolution(data=data, weight=w, bias=b,
                     kernel=w.shape[2:], num_filter=w.shape[0])

print('input:', data,
      '\n\nweight:', w,
      '\n\nbias:', b,
      '\n\noutput:', out)

input: 
[[[[  0.   1.   2.   3.]
   [  4.   5.   6.   7.]
   [  8.   9.  10.  11.]
   [ 12.  13.  14.  15.]]

  [[ 16.  17.  18.  19.]
   [ 20.  21.  22.  23.]
   [ 24.  25.  26.  27.]
   [ 28.  29.  30.  31.]]]]
<NDArray 1x2x4x4 @cpu(0)> 

weight: 
[[[[  0.   1.   2.]
   [  3.   4.   5.]
   [  6.   7.   8.]]

  [[  9.  10.  11.]
   [ 12.  13.  14.]
   [ 15.  16.  17.]]]


 [[[ 18.  19.  20.]
   [ 21.  22.  23.]
   [ 24.  25.  26.]]

  [[ 27.  28.  29.]
   [ 30.  31.  32.]
   [ 33.  34.  35.]]]]
<NDArray 2x2x3x3 @cpu(0)> 

bias: 
[ 1.  2.]
<NDArray 2 @cpu(0)> 

output: 
[[[[ 2794.  2947.]
   [ 3406.  3559.]]

  [[ 7007.  7484.]
   [ 8915.  9392.]]]]
<NDArray 1x2x2x2 @cpu(0)>


## Pooling

In [5]:
data = nd.arange(32).reshape((1, 2, 4, 4))

# Pooling works on each channel separately, so two channels go in and two come
# out; only the spatial size changes.
#   2x2 window, stride 1 : 4x4 -> 3x3 (overlapping windows)
#   2x2 window, stride 2 : 4x4 -> 2x2 (non-overlapping windows)
max_pool = nd.Pooling(data=data, kernel=(2, 2), stride=(1, 1), pool_type='max')
avg_pool = nd.Pooling(data=data, kernel=(2, 2), stride=(2, 2), pool_type='avg')

print('data:', data,
      '\n\nmax pooling:', max_pool,
      '\n\navg pooling:', avg_pool)

data: 
[[[[  0.   1.   2.   3.]
   [  4.   5.   6.   7.]
   [  8.   9.  10.  11.]
   [ 12.  13.  14.  15.]]

  [[ 16.  17.  18.  19.]
   [ 20.  21.  22.  23.]
   [ 24.  25.  26.  27.]
   [ 28.  29.  30.  31.]]]]
<NDArray 1x2x4x4 @cpu(0)> 

max pooling: 
[[[[  5.   6.   7.]
   [  9.  10.  11.]
   [ 13.  14.  15.]]

  [[ 21.  22.  23.]
   [ 25.  26.  27.]
   [ 29.  30.  31.]]]]
<NDArray 1x2x3x3 @cpu(0)> 

avg pooling: 
[[[[  2.5   4.5]
   [ 10.5  12.5]]

  [[ 18.5  20.5]
   [ 26.5  28.5]]]]
<NDArray 1x2x2x2 @cpu(0)>


In [6]:
import sys
# Make the parent directory importable before importing from `utils`
# (presumably utils.py lives one level up -- confirm against the repo layout).
sys.path.append('..')
from utils import load_data_fashion_mnist

# batch_size is reused later when scaling the SGD learning rate.
batch_size = 256
train_data, test_data = load_data_fashion_mnist(batch_size)
# len() of each loader; the training loop divides its per-batch running sums
# by len(train_data), so this is presumably the number of mini-batches.
print(len(train_data), len(test_data))

234 39


In [7]:
# Use the GPU by default and fall back to the CPU when no usable GPU exists.
import mxnet as mx

try:
    ctx = mx.gpu()
    # mx.gpu() only builds a context object; allocating a tiny array on it is
    # the actual probe that fails when the device is not usable.
    _ = nd.zeros((1,), ctx = ctx)
except mx.base.MXNetError:
    # Catch only MXNet's own error: a bare `except:` would also swallow
    # KeyboardInterrupt and genuine bugs (e.g. a NameError) and silently
    # switch the whole notebook to the CPU.
    ctx = mx.cpu()
ctx

gpu(0)

## LeNet
A LeNet-style network: 2 convolutional layers (each followed by max pooling) and then 2 dense layers.

In [8]:
# Standard deviation of the normal distribution the weights are drawn from.
weight_scale = .01

# Convolution kernels: out_channels x in_channels x height x width.
W1 = nd.random_normal(shape=(20, 1, 5, 5), scale=weight_scale, ctx=ctx)    # 1 -> 20 channels, 5x5
W2 = nd.random_normal(shape=(50, 20, 3, 3), scale=weight_scale, ctx=ctx)   # 20 -> 50 channels, 3x3

# Dense weights: in_features x out_features.
# 1250 = 50 channels x 5 x 5, the flattened output of the second conv block.
W3 = nd.random_normal(shape=(1250, 128), scale=weight_scale, ctx=ctx)      # 1250 -> 128
W4 = nd.random_normal(shape=(128, 10), scale=weight_scale, ctx=ctx)        # 128 -> 10 outputs

# Biases start at zero, one entry per output channel / output feature.
b1 = nd.zeros(W1.shape[0], ctx=ctx)
b2 = nd.zeros(W2.shape[0], ctx=ctx)
b3 = nd.zeros(W3.shape[1], ctx=ctx)
b4 = nd.zeros(W4.shape[1], ctx=ctx)

# Allocate gradient buffers so autograd can fill in param.grad.
params = [W1, b1, W2, b2, W3, b3, W4, b4]
for param in params:
    param.attach_grad()

In [9]:
def net(X, verbose=False):
    """Forward pass of the LeNet-style model built from W1..W4 / b1..b4.

    Pipeline: (conv -> relu -> 2x2 max-pool) twice, flatten, dense -> relu,
    dense.

    Parameters
    ----------
    X : NDArray
        Batch of images in batch x channel x height x width layout. It is
        copied to the context that holds W1 before any computation.
    verbose : bool
        Debugging aid: when true, print the shape after each stage and the
        final output.

    Returns
    -------
    NDArray
        Output of the second dense layer (no softmax is applied here).
    """
    # Run on whatever device the parameters live on.
    x_dev = X.as_in_context(W1.context)

    # First conv block: convolution -> ReLU -> 2x2 max-pooling with stride 2.
    conv1 = nd.Convolution(data=x_dev, weight=W1, bias=b1,
                           kernel=W1.shape[2:], num_filter=W1.shape[0])
    block1 = nd.Pooling(data=nd.relu(conv1), pool_type='max',
                        kernel=(2, 2), stride=(2, 2))

    # Second conv block, same structure.
    conv2 = nd.Convolution(data=block1, weight=W2, bias=b2,
                           kernel=W2.shape[2:], num_filter=W2.shape[0])
    block2 = nd.Pooling(data=nd.relu(conv2), pool_type='max',
                        kernel=(2, 2), stride=(2, 2))

    # The dense layers are plain matrix products, so the 4-D feature maps must
    # first be flattened to one row per sample.
    features = nd.flatten(block2)

    hidden = nd.relu(nd.dot(features, W3) + b3)   # first dense layer
    scores = nd.dot(hidden, W4) + b4              # second dense layer

    if verbose:
        stage_shapes = (
            ('lst conv block:', block1.shape),
            ('2nd conv block:', block2.shape),
            ('1st dense:', hidden.shape),
            ('2nd dense:', scores.shape),
        )
        for tag, shape in stage_shapes:
            print(tag, shape)
        print('output:', scores)
    return scores

In [10]:
# Smoke test: push only the first mini-batch through net() with verbose output
# to check the shape produced by each stage, then stop iterating.
# Note that this rebinds the notebook-level name `data` to that batch.
for data, _ in train_data:
    net(data, verbose = True)
    break

lst conv block: (256, 20, 12, 12)
2nd conv block: (256, 50, 5, 5)
1st dense: (256, 128)
2nd dense: (256, 10)
output: 
[[ -1.25888808e-04   1.71888860e-05   1.23100952e-04 ...,   7.24935308e-05
    8.06554308e-05   1.10734043e-04]
 [ -1.70488565e-04   4.61007294e-05   1.51696251e-04 ...,   9.41563194e-05
    1.82403484e-04   1.73743480e-04]
 [ -1.24564744e-04  -9.15816781e-06   2.91839388e-05 ...,   1.68508661e-04
    6.87505744e-05   7.67239981e-05]
 ..., 
 [ -1.92659660e-04  -1.84186938e-05   4.50565203e-05 ...,   1.54259193e-04
    1.71014544e-04   1.36011484e-04]
 [ -1.53804373e-04   3.90572495e-06   6.65578118e-05 ...,   3.82953804e-05
    1.01694750e-04   9.99176846e-05]
 [ -1.39973097e-04   7.38169911e-06   1.24147031e-04 ...,   1.02233811e-04
    1.03171726e-04   1.47112325e-04]]
<NDArray 256x10 @gpu(0)>


In [11]:
from mxnet import autograd as ag
from utils import SGD, accuracy, evaluate_accuracy
from mxnet import gluon

# Loss applied to the raw output of net() and the labels.
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

learning_rate = .2

# Train for 5 passes over train_data and report per-epoch averages.
for epoch in range(5):
    # Running sums of the per-batch mean loss and per-batch accuracy.
    train_loss = 0.
    train_acc = 0.
    for data, label in train_data:
        # net() moves `data` to the parameters' context itself; the labels
        # have to be moved explicitly.
        label = label.as_in_context(ctx)
        # Only operations executed inside record() are tracked for backward().
        with ag.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # The step size is divided by batch_size -- presumably because the
        # gradients are accumulated over the whole batch and SGD (from utils)
        # does not normalise them; confirm against utils.SGD.
        SGD(params, learning_rate / batch_size)
        
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, label)
        
    test_acc = evaluate_accuracy(test_data, net, ctx)
    # Dividing by len(train_data) turns the running sums into per-batch means.
    print('Epoch %d, Loss: %f, Train acc %f, Test acc %f' % (
        epoch, train_loss / len(train_data), train_acc / len(train_data), test_acc))

Epoch 0, Loss: 2.302016, Train acc 0.109876, Test acc 0.219351
Epoch 1, Loss: 1.345695, Train acc 0.506243, Test acc 0.722356
Epoch 2, Loss: 0.658532, Train acc 0.745326, Test acc 0.781550
Epoch 3, Loss: 0.524083, Train acc 0.800114, Test acc 0.816707
Epoch 4, Loss: 0.458224, Train acc 0.829377, Test acc 0.851262


In [12]:
# SoftmaxCrossEntropyLoss combines softmax() and cross_entropy() in one loss,
# which is why net() returns the raw output of its last dense layer.
# Plain-Python alternative to the line below:
# help(gluon.loss.SoftmaxCrossEntropyLoss())
# NOTE: the trailing `??` is IPython-only syntax (show source); it only works
# inside a notebook / IPython session and should be removed before export.
gluon.loss.SoftmaxCrossEntropyLoss??

In [13]:
# Same check as right after loading: the lengths of the train and test loaders.
print(len(train_data), len(test_data))

234 39
