# 使用``gluon``实现批量归一化

使用``gluon``实现批量归一化则简单得多：我们只需要像添加``Dense``层那样添加一个``gluon.nn.BatchNorm``层，并指定对二维卷积输出的通道维（axis=1）做归一化即可。

In [1]:
import mxnet as mx
import numpy as np

from mxnet import nd
from mxnet import gluon
from mxnet import autograd

# Device context shared by the whole notebook: the parameters are initialised
# on it and every mini-batch is copied to it before the forward pass.
# NOTE(review): presumably this needs a GPU-enabled MXNet build; use mx.cpu()
# on a CPU-only machine -- confirm.
ctx = mx.gpu()

import utils

In [2]:
# Mini-batch size handed to the data loader.
batch_size = 64

# Problem dimensions.
# NOTE(review): presumably the MNIST training-set size, 28 * 28 flattened
# pixels and the 10 digit classes -- confirm against utils.load_dataset.
num_examples = 60000
num_input = 784
num_output = 10

# Training and test iterators, both batched with `batch_size`.
train_data, test_data = utils.load_dataset(batch_size, data_type='mnist')

## 定义模型

In [3]:
# Layer widths: output channels of the two conv layers, then the units of the
# hidden and the output fully connected layers.
num_output_conv1 = 20
num_output_conv2 = 50
num_output_fc1 = 128
num_output_fc2 = 10


def get_net():
    """Build the convolutional network with batch normalization.

    The network is two identical stages (3x3 ReLU convolution, batch
    normalization over the channel axis, 2x2 max pooling with stride 2)
    followed by a flatten layer, a ReLU dense layer and a linear output layer.

    Returns:
        The uninitialized ``gluon.nn.Sequential`` network.
    """
    nn = gluon.nn
    net = nn.Sequential()
    with net.name_scope():
        # The two conv stages differ only in their number of output channels.
        for channels in (num_output_conv1, num_output_conv2):
            net.add(nn.Conv2D(channels, kernel_size=3, activation="relu"))
            # axis=1 is the channel axis of the convolution output
            net.add(nn.BatchNorm(axis=1))
            net.add(nn.MaxPool2D(pool_size=2, strides=2))
        # Collapse the feature maps before the fully connected layers.
        net.add(nn.Flatten())
        net.add(nn.Dense(num_output_fc1, activation="relu"))
        # Output layer: no activation, the loss is applied to these raw scores.
        net.add(nn.Dense(num_output_fc2))
    return net

In [4]:
# Build the network, then initialise all of its parameters on `ctx` with the
# Xavier initializer.
net = get_net()
xavier_init = mx.init.Xavier(magnitude=2.24)
net.collect_params().initialize(xavier_init, ctx=ctx)

In [5]:
# Display the parameters before any data has passed through the network: the
# dimensions that depend on the input are still reported as 0 in the output.
net.collect_params()

sequential0_ (
  Parameter sequential0_conv0_weight (shape=(20, 0, 3, 3), dtype=<class 'numpy.float32'>)
  Parameter sequential0_conv0_bias (shape=(20,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_gamma (shape=(0,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_beta (shape=(0,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_running_mean (shape=(0,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_running_var (shape=(0,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_conv1_weight (shape=(50, 0, 3, 3), dtype=<class 'numpy.float32'>)
  Parameter sequential0_conv1_bias (shape=(50,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm1_gamma (shape=(0,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm1_beta (shape=(0,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm1_running_mean (shape=(0,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnor

In [6]:
# Take only the first mini-batch, move it to `ctx` and show its shape.
for first_batch, _label in train_data:
    data = first_batch.as_in_context(ctx)
    print(data.shape)
    break

# Run one forward pass on that batch and display the output shape.
net(data).shape

(64, 1, 28, 28)


(64, 10)

In [7]:
# Display the parameters again after the forward pass: the shapes that were
# reported as 0 before are now filled in.
net.collect_params()

sequential0_ (
  Parameter sequential0_conv0_weight (shape=(20, 1, 3, 3), dtype=<class 'numpy.float32'>)
  Parameter sequential0_conv0_bias (shape=(20,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_gamma (shape=(20,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_beta (shape=(20,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_running_mean (shape=(20,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm0_running_var (shape=(20,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_conv1_weight (shape=(50, 20, 3, 3), dtype=<class 'numpy.float32'>)
  Parameter sequential0_conv1_bias (shape=(50,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm1_gamma (shape=(50,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm1_beta (shape=(50,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_batchnorm1_running_mean (shape=(50,), dtype=<class 'numpy.float32'>)
  Parameter sequential0_

## 训练

In [8]:
from time import time

# Optimisation setup: plain SGD on every parameter of the network, with the
# softmax cross-entropy loss.
learning_rate = 0.01
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': learning_rate})

epochs = 10

# State of the exponentially weighted moving average of the training loss.
# `niter` counts every update across all epochs and drives the bias correction.
niter = 0
moving_loss = 0.0
smoothing_constant = 0.9
# Defined up front so the report below still works if the loader yields nothing.
estimated_loss = float("nan")

for epoch in range(epochs):
    start = time()
    for data, label in train_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # Normalise the gradient by the number of samples actually in this
        # batch: the final batch of an epoch can be smaller than `batch_size`,
        # and stepping with the nominal size would under-scale its update.
        trainer.step(data.shape[0])

        niter += 1
        curr_loss = nd.mean(loss).asscalar()
        moving_loss = smoothing_constant * moving_loss + (1 - smoothing_constant) * curr_loss
        # Bias correction: the average starts at 0, so divide by the total
        # weight accumulated so far.
        estimated_loss = moving_loss / (1 - smoothing_constant ** niter)

    # The reported time covers both the training pass and these evaluations.
    train_acc = utils.evaluate_accuracy_gluon(train_data, net, ctx)
    test_acc = utils.evaluate_accuracy_gluon(test_data, net, ctx)
    print("Epoch %d, Moving Train Avg loss %.5f, Train acc %.5f, Test acc %.5f, Time consume %.5f s."
         % (epoch, estimated_loss, train_acc, test_acc, time() - start))

Epoch 0, Moving Train Avg loss 0.09402, Train acc 0.97982, Test acc 0.97920, Time consume 25.92356 s.
Epoch 1, Moving Train Avg loss 0.05288, Train acc 0.98732, Test acc 0.98330, Time consume 25.94563 s.
Epoch 2, Moving Train Avg loss 0.03899, Train acc 0.98990, Test acc 0.98530, Time consume 25.91965 s.
Epoch 3, Moving Train Avg loss 0.03017, Train acc 0.99283, Test acc 0.98690, Time consume 25.87799 s.
Epoch 4, Moving Train Avg loss 0.02555, Train acc 0.99440, Test acc 0.98750, Time consume 25.81163 s.
Epoch 5, Moving Train Avg loss 0.02323, Train acc 0.99482, Test acc 0.98750, Time consume 25.89330 s.
Epoch 6, Moving Train Avg loss 0.01272, Train acc 0.99588, Test acc 0.98830, Time consume 25.80154 s.
Epoch 7, Moving Train Avg loss 0.01490, Train acc 0.99663, Test acc 0.98850, Time consume 25.52007 s.
Epoch 8, Moving Train Avg loss 0.01256, Train acc 0.99752, Test acc 0.98900, Time consume 25.39434 s.
Epoch 9, Moving Train Avg loss 0.01898, Train acc 0.99715, Test acc 0.98820, Time 