In [1]:
# 5.10.2 从零开始实现 
# 通过NDArray来实现批量归一化层。
import d2lzh as d2l
from mxnet import autograd,gluon,init,nd
from mxnet.gluon import nn

def batch_norm(X,gamma,beta,moving_mean,moving_var,eps,momentum):
    """Batch-normalize ``X`` and return the (possibly updated) moving statistics.

    When ``autograd.is_training()`` is true, ``X`` is standardized with the
    mean and variance of the current batch and the moving averages are
    refreshed. Otherwise ``X`` is standardized with the supplied moving
    averages, which are returned unchanged.

    Args:
        X: Input of rank 2 (samples, features) or rank 4
            (samples, channels, height, width). The rank is only asserted
            in training mode.
        gamma: Scale applied to the standardized input.
        beta: Shift added after scaling.
        moving_mean: Running mean used in prediction mode.
        moving_var: Running variance used in prediction mode.
        eps: Constant added to the variance before taking the square root.
        momentum: Weight kept on the previous moving statistics; the current
            batch statistics receive ``1.0 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where ``Y = gamma * X_hat + beta``.
    """
    if autograd.is_training():
        rank=len(X.shape)
        assert rank in (2,4)  # only dense (2-D) and convolutional (4-D) inputs are handled
        if rank==2:
            # Dense layer: one statistic per feature column.
            reduce_args={'axis':0}
        else:
            # Convolutional layer: one statistic per channel (axis 1); the
            # reduced axes are kept so the result broadcasts against X.
            reduce_args={'axis':(0,2,3),'keepdims':True}
        batch_mean=X.mean(**reduce_args)
        batch_var=((X-batch_mean)**2).mean(**reduce_args)
        # Standardize with the statistics of the current batch.
        X_hat=(X-batch_mean)/nd.sqrt(batch_var+eps)
        # Blend the batch statistics into the moving averages.
        moving_mean=momentum*moving_mean+(1.0-momentum)*batch_mean
        moving_var=momentum*moving_var+(1.0-momentum)*batch_var
    else:
        # Prediction mode: rely on the moving averages passed in.
        X_hat=(X-moving_mean)/nd.sqrt(moving_var+eps)
    # Scale and shift.
    return gamma*X_hat+beta,moving_mean,moving_var

In [5]:
# 自定义一个BatchNorm层。它保存参与求梯度和迭代的拉伸参数gamma和偏移参数beta，
# 同时也维护移动平均得到的均值和方差，以便能够在模型预测时被使用。
class BatchNorm(nn.Block):
    """Batch-normalization layer built on top of ``batch_norm``.

    The layer owns the learnable scale ``gamma`` and shift ``beta`` and keeps
    the moving mean and variance that ``batch_norm`` uses in prediction mode.

    Args:
        num_features: Number of outputs of the preceding dense layer, or
            number of output channels of the preceding convolutional layer.
        num_dims: 2 for input coming from a dense layer; any other value is
            treated as 4-D convolutional input.
        eps: Constant added to the variance before the square root.
        momentum: Weight kept on the previous moving statistics.
        **kwargs: Forwarded to ``nn.Block``.
    """
    def __init__(self,num_features,num_dims,eps=1e-5,momentum=0.9,**kwargs):
        super(BatchNorm,self).__init__(**kwargs)
        if num_dims==2:
            # Dense layer: input is (samples, num_features), so the
            # parameters broadcast along the sample axis.
            shape=(1,num_features)
        else:
            # Convolutional layer: input is (samples, channels, height, width),
            # so the parameters broadcast over every axis except channels.
            shape=(1,num_features,1,1)
        self._eps=eps
        self._momentum=momentum
        # Learnable scale and shift, initialised to 1 and 0 respectively.
        self.gamma=self.params.get('gamma',shape=shape,init=init.One())
        self.beta=self.params.get('beta',shape=shape,init=init.Zero())
        # Running statistics; they take no part in gradient computation and
        # are created on the default context. The variance starts at 1 (not 0)
        # so that prediction before the statistics have warmed up does not
        # divide by sqrt(eps) and blow the activations up.
        # NOTE: these are plain NDArray attributes, not registered parameters.
        self.moving_mean=nd.zeros(shape)
        self.moving_var=nd.ones(shape)

    def forward(self,X):
        """Normalize ``X`` and store the refreshed moving statistics."""
        # Move the running statistics to the context X lives on, if needed.
        if self.moving_mean.context!=X.context:
            self.moving_mean=self.moving_mean.copyto(X.context)
            self.moving_var=self.moving_var.copyto(X.context)
        # Keep the moving statistics returned by batch_norm for the next call.
        Y,self.moving_mean,self.moving_var=batch_norm(
            X,self.gamma.data(),self.beta.data(),
            self.moving_mean,self.moving_var,
            eps=self._eps,momentum=self._momentum)
        return Y

In [6]:
# LeNet with the hand-written batch-normalization layer.
# A BatchNorm layer follows every convolutional or dense layer (except the
# output layer) and precedes that layer's activation.
net=nn.Sequential()
# Two convolutional stages: conv -> batch norm -> sigmoid -> max pooling.
for out_channels in (6,16):
    net.add(nn.Conv2D(out_channels,kernel_size=5),
            BatchNorm(out_channels,num_dims=4),
            nn.Activation('sigmoid'),
            nn.MaxPool2D(pool_size=2,strides=2))
# Two hidden dense stages: dense -> batch norm -> sigmoid.
for hidden_units in (120,84):
    net.add(nn.Dense(hidden_units),
            BatchNorm(hidden_units,num_dims=2),
            nn.Activation('sigmoid'))
# Output layer with 10 units.
net.add(nn.Dense(10))

In [7]:
# Train the LeNet variant that uses the hand-written BatchNorm layer.
import mxnet as mx
lr=1.0
num_epochs=5
batch_size=256
ctx=mx.cpu()
# Xavier initialisation on the chosen context.
net.initialize(init=init.Xavier(),ctx=ctx)
trainer=gluon.Trainer(net.collect_params(),optimizer='sgd',
                      optimizer_params={'learning_rate':lr})
train_iter,test_iter=d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch5(net,train_iter,test_iter,batch_size,trainer,ctx,num_epochs)

training on cpu(0)
epoch 1, loss 0.6344, train acc 0.774, test acc 0.828, time 62.5 sec
epoch 2, loss 0.3898, train acc 0.858, test acc 0.824, time 61.4 sec
epoch 3, loss 0.3478, train acc 0.875, test acc 0.871, time 61.1 sec
epoch 4, loss 0.3200, train acc 0.886, test acc 0.873, time 60.8 sec
epoch 5, loss 0.3002, train acc 0.892, test acc 0.860, time 61.7 sec


In [11]:
# Scale (gamma) and shift (beta) of the first batch-normalization layer, each flattened to 1-D.
tuple(param.data().reshape((-1,)) for param in (net[1].gamma,net[1].beta))

(
 [1.8226874 1.8819977 2.0167785 1.3154042 1.6759734 1.0088483]
 <NDArray 6 @cpu(0)>,
 
 [-1.9086581   0.5927755  -0.3861179  -1.1667049   0.28886405 -0.68728215]
 <NDArray 6 @cpu(0)>)

In [20]:
# 5.10.3 Concise implementation
# The BatchNorm class in Gluon's nn module is simpler to use: it does not take
# the num_features and num_dims arguments that the hand-written BatchNorm
# needs. In Gluon these values are obtained automatically through deferred
# initialization.
net=nn.Sequential()
# Two convolutional stages: conv -> batch norm -> sigmoid -> max pooling.
for out_channels in (6,16):
    net.add(nn.Conv2D(out_channels,kernel_size=5),
            nn.BatchNorm(),
            nn.Activation('sigmoid'),
            nn.MaxPool2D(pool_size=2,strides=2))
# Two hidden dense stages: dense -> batch norm -> sigmoid.
for hidden_units in (120,84):
    net.add(nn.Dense(hidden_units),
            nn.BatchNorm(),
            nn.Activation('sigmoid'))
# Output layer with 10 units.
net.add(nn.Dense(10))

In [21]:
# Train the Gluon BatchNorm model with the settings and data iterators
# (lr, ctx, batch_size, num_epochs, train_iter, test_iter) defined by the
# earlier training cell.
net.initialize(init=init.Xavier(),ctx=ctx)
trainer=gluon.Trainer(net.collect_params(),optimizer='sgd',
                      optimizer_params={'learning_rate':lr})
d2l.train_ch5(net,train_iter,test_iter,batch_size,trainer,ctx,num_epochs)

training on cpu(0)
epoch 1, loss 0.6309, train acc 0.779, test acc 0.809, time 25.5 sec
epoch 2, loss 0.3890, train acc 0.858, test acc 0.861, time 25.0 sec
epoch 3, loss 0.3464, train acc 0.874, test acc 0.841, time 25.7 sec
epoch 4, loss 0.3215, train acc 0.882, test acc 0.880, time 25.4 sec
epoch 5, loss 0.3042, train acc 0.889, test acc 0.872, time 25.4 sec


In [None]:
# Scratch checks for the reductions and broadcasting that batch_norm relies on.
# 24 elements arranged as (samples=2, channels=3, height=2, width=2); the
# previous target shape (2,2,2,2) only holds 16 elements and cannot be
# produced from arange(24).
X=nd.arange(24).reshape(2,3,2,2)
assert len(X.shape) in (2,4)
# Per-channel mean with the reduced axes kept, i.e. shape (1,3,1,1).
# Not displayed, because it is not the last expression of the cell.
X.mean(axis=(0,2,3),keepdims=True)
# NOTE: this rebinds `net`, replacing any model assigned to that name earlier.
net=nn.Sequential()
net.add(nn.Dense(10))
net.initialize()

net[0].weight
# Broadcasting a (5,1) column against a (1,10) row yields a (5,10) result.
# The column is built from 5 elements; arange(10) cannot be reshaped to (5,1).
x=nd.arange(5).reshape(5,1)# x
w=nd.arange(10).reshape(1,-1)# w
x*w,x,w

In [None]:
# NOTE(review): the target shape (2,1,2,2) describes 8 elements, while X is
# built from nd.arange(24) in the previous cell, so the element counts do not
# match and this reshape is presumably rejected - confirm the intended shape.
X.reshape(2,1,2,2)

In [None]:
# IPython introspection (development aid, not plain Python): show the source of nd.mean.
nd.mean??

In [18]:
# IPython introspection (development aid, not plain Python): show the source of nn.BatchNorm.
nn.BatchNorm??

In [None]:

# IPython introspection (development aid, not plain Python): show the source
# of the initializers and the activation block used above.
init.Zero??
init.One??
nn.Activation??