# 批标准化

## 一. 公式及原理

batch normalization 的实现非常简单：对于给定的一个 batch 的数据 $B = \{x_1, x_2, \cdots, x_m\}$，算法的公式如下：

$$
\mu_B = \frac{1}{m} \sum_{i=1}^m x_i
$$

$$
\sigma^2_B = \frac{1}{m} \sum_{i=1}^m (x_i - \mu_B)^2
$$

$$
\hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma^2_B + \epsilon}}
$$

$$
y_i = \gamma \hat{x}_i + \beta
$$

第一个和第二个公式分别计算出一个 batch 中数据的均值和方差，接着使用第三个公式对 batch 中的每个数据点做标准化，$\epsilon$ 是为了计算稳定引入的一个小的常数，通常取 $10^{-5}$，最后利用可学习的缩放系数 $\gamma$ 和偏移 $\beta$ 修正得到最后的输出结果。整个过程非常简单，下面我们先实现最简单的一维情况（每个样本是一个特征向量），也就是全连接神经网络中的情况。

### 1. 实现BN

In [5]:
import sys
sys.path.append('..')
import torch

In [6]:
def simple_batch_norm(x, gamma, beta):
    """Standardize ``x`` along dim 0, then apply a scale and a shift.

    The mean and the biased variance (divide by the batch size) are taken
    over dim 0, so every position along the remaining dims is normalized
    with its own statistics.

    Args:
        x: input tensor; dim 0 is treated as the batch dimension.
        gamma: scale; must be viewable as the shape of the batch mean.
        beta: shift; must be viewable as the shape of the batch mean.

    Returns:
        ``gamma * x_hat + beta`` with the same shape as ``x``.
    """
    epsilon = 1e-5  # small constant that keeps the division numerically stable
    # Reduce over dim 0 only; keepdim=True keeps a leading 1 so the
    # statistics broadcast against x.
    batch_mean = x.mean(dim=0, keepdim=True)
    centered = x - batch_mean
    batch_var = centered.pow(2).mean(dim=0, keepdim=True)
    normalized = centered / (batch_var + epsilon).sqrt()
    # Reshape the affine terms to the statistics' shape before broadcasting.
    scale = gamma.view_as(batch_mean)
    shift = beta.view_as(batch_mean)
    return scale * normalized + shift

# Sanity check: the output columns should come out standardized.
x = torch.arange(15).float().view(5, 3)
gamma = torch.ones(x.size(1))   # identity scale
beta = torch.zeros(x.size(1))   # zero shift
print('before bn:\n',x)
y = simple_batch_norm(x, gamma, beta)
print('after bn:\n',y)

before bn:
 tensor([[ 0.,  1.,  2.],
        [ 3.,  4.,  5.],
        [ 6.,  7.,  8.],
        [ 9., 10., 11.],
        [12., 13., 14.]])
after bn:
 tensor([[-1.4142, -1.4142, -1.4142],
        [-0.7071, -0.7071, -0.7071],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.7071,  0.7071,  0.7071],
        [ 1.4142,  1.4142,  1.4142]])


**这个时候会出现一个问题，就是测试的时候该使用批标准化吗？**

答案是肯定的，因为训练的时候使用了，而测试的时候不使用肯定会导致结果出现偏差。但是测试的时候如果一次只输入一个样本，那么均值不就是这个样本本身，方差为 0 吗？这样标准化的结果显然没有意义，所以测试的时候不能用测试数据去算均值和方差，而是用训练的时候算出的移动平均均值和方差去代替。

### 2. 能够区分训练状态和测试状态的批标准化方法

In [4]:
def batch_norm(x, gamma, beta, is_training, moving_mean, moving_var, moving_momentum=0.1):
    """Batch normalization that distinguishes training from inference.

    In training mode ``x`` is normalized with the statistics of the current
    batch (taken over dim 0) and ``moving_mean`` / ``moving_var`` are updated
    in place. In inference mode the stored moving statistics are used and
    nothing is modified.

    Args:
        x: input tensor; dim 0 is treated as the batch dimension.
        gamma: scale; must hold one value per non-batch position of ``x``.
        beta: shift; same requirement as ``gamma``.
        is_training: True to use (and record) batch statistics.
        moving_mean: running mean, updated IN PLACE when training.
        moving_var: running variance, updated IN PLACE when training.
        moving_momentum: weight kept on the OLD running value.
            NOTE: with the default 0.1 the newest batch gets weight 0.9,
            which is the opposite of the ``momentum`` convention used by
            ``torch.nn.BatchNorm1d``. Kept unchanged for compatibility.

    Returns:
        ``gamma * x_hat + beta`` with the same shape as ``x``.
    """
    eps = 1e-5
    # Shape of a per-feature statistic that broadcasts against x.
    stat_shape = (1,) + tuple(x.shape[1:])
    if is_training:
        x_mean = torch.mean(x, dim=0, keepdim=True)
        x_var = torch.mean((x - x_mean) ** 2, dim=0, keepdim=True)
        x_hat = (x - x_mean) / torch.sqrt(x_var + eps)
        # The running statistics are bookkeeping, not part of the loss.
        # Updating them under no_grad keeps them out of the autograd graph;
        # otherwise they would start requiring grad and chain every batch's
        # graph together.
        with torch.no_grad():
            new_mean = moving_momentum * moving_mean + (1. - moving_momentum) * x_mean
            new_var = moving_momentum * moving_var + (1. - moving_momentum) * x_var
            moving_mean.copy_(new_mean.view_as(moving_mean))
            moving_var.copy_(new_var.view_as(moving_var))
    else:
        # Batch statistics are not needed at inference time.
        x_hat = (x - moving_mean) / torch.sqrt(moving_var + eps)
    return gamma.view(stat_shape) * x_hat + beta.view(stat_shape)
# Sanity check: the output columns should come out standardized.
# Use the GPU when one is present, otherwise fall back to the CPU so the
# demo does not crash on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = torch.arange(15).view(5, 3).float().to(device)
gamma = torch.ones(x.shape[1], device=device)
beta = torch.zeros(x.shape[1], device=device)
print('before bn:\n',x)
is_training = True
# Running statistics start at zero and are updated in place by batch_norm.
moving_mean = torch.zeros(x.shape[1], device=device)
moving_var = torch.zeros(x.shape[1], device=device)
print(beta.device)
y = batch_norm(x, gamma, beta, is_training, moving_mean, moving_var, moving_momentum=0.1)
print('after bn:\n',y)

before bn:
 tensor([[ 0.,  1.,  2.],
        [ 3.,  4.,  5.],
        [ 6.,  7.,  8.],
        [ 9., 10., 11.],
        [12., 13., 14.]], device='cuda:0')
cuda:0
after bn:
 tensor([[-1.4142, -1.4142, -1.4142],
        [-0.7071, -0.7071, -0.7071],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.7071,  0.7071,  0.7071],
        [ 1.4142,  1.4142,  1.4142]], device='cuda:0')


### 3. 使用上一节课讲的深度神经网络分类 mnist 数据集的例子来试验一下批标准化是否有用

In [7]:
import numpy as np
from torchvision.datasets import mnist # 导入 pytorch 内置的 mnist 数据
from torch.utils.data import DataLoader
from torch import nn
from torch.autograd import Variable

In [31]:
# Fetch the MNIST data set with torchvision's built-in loader.
# download=True lets this cell run even when '../data' does not yet
# contain the files; without it the loader raises on a fresh checkout.
train_set = mnist.MNIST('../data', train=True, download=True)
test_set = mnist.MNIST('../data', train=False, download=True)

def data_tf(x):
    """Convert one image into a flat float32 tensor.

    The input is turned into a float32 array, divided by 255, mapped through
    ``(v - 0.5) / 0.5`` and flattened to 1-D. For pixel values in 0..255 the
    result therefore lies in [-1, 1].
    """
    pixels = np.array(x, dtype='float32')
    pixels = pixels / 255
    pixels = (pixels - 0.5) / 0.5  # centre around zero
    flat = pixels.reshape((-1,))   # flatten to a single vector
    return torch.from_numpy(flat)

# Reload both splits, this time passing data_tf as the per-sample transform.
train_set, test_set = (
    mnist.MNIST('../data', train=is_train_split, transform=data_tf, download=True)
    for is_train_split in (True, False)
)
# Shuffle only the training batches; evaluation order stays fixed.
train_data = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
test_data = DataLoader(dataset=test_set, batch_size=128, shuffle=False)

In [32]:
def batch_norm(x, gamma, beta, is_training, moving_mean, moving_var, moving_momentum=0.1):
    """Device-agnostic batch normalization with running statistics.

    All arithmetic runs on ``x``'s device and the result stays there, so the
    function works with or without a GPU. In training mode the batch
    statistics (over dim 0) are used and ``moving_mean`` / ``moving_var`` are
    updated in place on whatever device they live on. In inference mode the
    moving statistics are used and nothing is modified.

    Args:
        x: input tensor; dim 0 is treated as the batch dimension.
        gamma: scale; must hold one value per non-batch position of ``x``.
        beta: shift; same requirement as ``gamma``.
        is_training: True to use (and record) batch statistics.
        moving_mean: running mean, updated IN PLACE when training.
        moving_var: running variance, updated IN PLACE when training.
        moving_momentum: weight kept on the OLD running value.
            NOTE: with the default 0.1 the newest batch gets weight 0.9,
            which is the opposite of the ``momentum`` convention used by
            ``torch.nn.BatchNorm1d``. Kept unchanged for compatibility.

    Returns:
        ``gamma * x_hat + beta``, same shape and device as ``x``.
    """
    eps = 1e-5
    device = x.device
    # Shape of a per-feature statistic that broadcasts against x.
    stat_shape = (1,) + tuple(x.shape[1:])
    gamma = gamma.to(device)
    beta = beta.to(device)
    if is_training:
        x_mean = torch.mean(x, dim=0, keepdim=True)
        x_var = torch.mean((x - x_mean) ** 2, dim=0, keepdim=True)
        x_hat = (x - x_mean) / torch.sqrt(x_var + eps)
        # Update the running statistics outside of autograd, and write the
        # result back into the caller's tensors. copy_ also works across
        # devices, so the update is never lost on a temporary copy.
        with torch.no_grad():
            new_mean = moving_momentum * moving_mean.to(device) + (1. - moving_momentum) * x_mean
            new_var = moving_momentum * moving_var.to(device) + (1. - moving_momentum) * x_var
            moving_mean.copy_(new_mean.view_as(moving_mean))
            moving_var.copy_(new_var.view_as(moving_var))
    else:
        x_hat = (x - moving_mean.to(device)) / torch.sqrt(moving_var.to(device) + eps)
    return gamma.view(stat_shape) * x_hat + beta.view(stat_shape)


# Build the model
class multi_network(nn.Module):
    """784 -> 100 -> 10 classifier with hand-written batch norm on the hidden layer.

    ``gamma`` and ``beta`` are trainable parameters. ``moving_mean`` and
    ``moving_var`` are registered as buffers, so they follow the module
    across devices and are stored in ``state_dict`` without receiving
    gradients.
    """

    def __init__(self):
        super(multi_network, self).__init__()
        self.layer1 = nn.Linear(784, 100)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(100, 10)

        self.gamma = nn.Parameter(torch.randn(100))
        self.beta = nn.Parameter(torch.randn(100))
        self.register_buffer('moving_mean', torch.zeros(100))
        self.register_buffer('moving_var', torch.zeros(100))

    def forward(self, x, is_train=None):
        """Compute class scores for a batch of 784-feature inputs.

        Args:
            x: tensor whose last dim is 784 (input size of ``layer1``).
            is_train: force batch-statistics (True) or moving-statistics
                (False) normalization. When omitted, the module's own
                ``training`` flag decides, so ``net.train()`` and
                ``net.eval()`` behave as expected.

        Returns:
            Tensor of 10 class scores per input row.
        """
        if is_train is None:
            is_train = self.training
        x = self.layer1(x)  # -> (batch, 100)
        x = batch_norm(x, self.gamma, self.beta, is_train, self.moving_mean, self.moving_var)
        x = self.relu(x)
        x = self.layer2(x)
        return x

net = multi_network()
# Loss: cross-entropy over the class scores
criterion = nn.CrossEntropyLoss()
# Optimizer: plain stochastic gradient descent, learning rate 0.1
optimizer = torch.optim.SGD(params=net.parameters(), lr=1e-1)

def get_acc(output, label):
    """Return the fraction of rows in ``output`` whose arg-max equals ``label``.

    Args:
        output: 2-D tensor of scores, one row per sample.
        label: tensor of target class indices, one per row of ``output``.

    Returns:
        Accuracy as a Python float in [0, 1].
    """
    batch_size = output.shape[0]
    # max over dim 1 yields (values, indices); the indices are the predictions
    predictions = output.max(1)[1]
    hits = (predictions == label).sum().item()
    return hits / batch_size

def train(net, train_data, valid_data, num_epochs, optimizer, criterion):
    """Train ``net`` and print the average loss/accuracy after every epoch.

    The model and every batch are moved to the GPU when CUDA is available,
    otherwise everything stays on the CPU.

    Args:
        net: module called as ``net(batch)``; switched between ``train()``
            and ``eval()`` mode by this function.
        train_data: sized iterable yielding ``(input, label)`` batches.
        valid_data: sized iterable of the same form, or None to skip
            validation.
        num_epochs: number of passes over ``train_data``.
        optimizer: optimizer already bound to ``net``'s parameters.
        criterion: callable ``criterion(output, label)`` returning the loss.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = net.to(device)
    for epoch in range(num_epochs):
        train_loss = 0
        train_acc = 0
        net.train()
        for im, label in train_data:
            im = im.to(device)
            label = label.to(device)
            # forward
            output = net(im)
            loss = criterion(output, label)
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_acc += get_acc(output, label)

        if valid_data is None:
            epoch_str = ("Epoch %d. Train Loss: %f, Train Acc: %f, " %(epoch, train_loss / len(train_data),train_acc / len(train_data)))
            print(epoch_str)
            continue

        valid_loss = 0
        valid_acc = 0
        net.eval()
        # No gradients are needed for evaluation; skipping graph
        # construction saves memory and time without changing the numbers.
        with torch.no_grad():
            for im, label in valid_data:
                im = im.to(device)
                label = label.to(device)
                output = net(im)
                loss = criterion(output, label)
                valid_loss += loss.item()
                valid_acc += get_acc(output, label)
        epoch_str = ( "Epoch %d. Train Loss: %f, Train Acc: %f, Valid Loss: %f, Valid Acc: %f, " % (epoch, train_loss / len(train_data),
               train_acc / len(train_data), valid_loss / len(valid_data),
               valid_acc / len(valid_data)))
        print(epoch_str)
        
# Train the fully connected network for 10 epochs, validating on the test split.
train(net, train_data, test_data, num_epochs=10, optimizer=optimizer, criterion=criterion)

Epoch 0. Train Loss: 0.315558, Train Acc: 0.911497, Valid Loss: 0.197100, Valid Acc: 0.941555, 
Epoch 1. Train Loss: 0.175175, Train Acc: 0.949877, Valid Loss: 0.142647, Valid Acc: 0.958366, 
Epoch 2. Train Loss: 0.135746, Train Acc: 0.961521, Valid Loss: 0.121527, Valid Acc: 0.965585, 
Epoch 3. Train Loss: 0.112858, Train Acc: 0.967917, Valid Loss: 0.110114, Valid Acc: 0.968948, 
Epoch 4. Train Loss: 0.097950, Train Acc: 0.971782, Valid Loss: 0.107418, Valid Acc: 0.968849, 
Epoch 5. Train Loss: 0.085791, Train Acc: 0.975113, Valid Loss: 0.104817, Valid Acc: 0.970332, 
Epoch 6. Train Loss: 0.076986, Train Acc: 0.977279, Valid Loss: 0.094231, Valid Acc: 0.972508, 
Epoch 7. Train Loss: 0.069275, Train Acc: 0.979544, Valid Loss: 0.097845, Valid Acc: 0.969937, 
Epoch 8. Train Loss: 0.063007, Train Acc: 0.981943, Valid Loss: 0.095536, Valid Acc: 0.971816, 
Epoch 9. Train Loss: 0.057041, Train Acc: 0.983575, Valid Loss: 0.089297, Valid Acc: 0.973596, 


从上面可以看到，我们自己实现了输入为 2 维张量（batch × 特征）情况的批标准化，对应于卷积的 4 维情况的标准化是类似的，只需要对每个通道分别计算均值和方差（在 batch 和空间维度上求平均）。但是我们自己实现批标准化是很累的，pytorch 当然也为我们内置了批标准化的函数，一维和二维分别是 `torch.nn.BatchNorm1d()` 和 `torch.nn.BatchNorm2d()`。在 pytorch 的实现中，$\gamma$ 和 $\beta$ 是可训练的参数；而移动平均的均值和方差（`running_mean` 和 `running_var`，对应我们的 `moving_mean` 和 `moving_var`）被注册为 buffer：它们会随模型一起保存、一起迁移设备，但不通过梯度训练，只在训练模式的前向过程中用移动平均更新。

### 4.卷积网络下试用一下批标准化看看效果, `torch.nn.BatchNorm1d()`和`torch.nn.BatchNorm2d()`

In [27]:
def data_tf(x):
    """Convert one image into a float tensor with a leading channel dim.

    The input is turned into a float32 array, divided by 255, mapped through
    ``(v - 0.5) / 0.5``, and a size-1 dim is inserted at position 0, so an
    ``(h, w)`` input becomes ``(1, h, w)``.
    """
    scaled = np.array(x, dtype='float32') / 255
    centered = (scaled - 0.5) / 0.5
    # unsqueeze(0) adds the single channel dimension the conv layers expect
    return torch.Tensor(centered).unsqueeze(0)
# Reload both splits so the image-shaped data_tf transform is applied.
train_set = mnist.MNIST(root='../data', train=True, transform=data_tf, download=True)
test_set = mnist.MNIST(root='../data', train=False, transform=data_tf, download=True)
# Shuffle only the training batches; evaluation order stays fixed.
train_data = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
test_data = DataLoader(dataset=test_set, batch_size=128, shuffle=False)

In [28]:
# Convolutional network that uses batch normalization
class conv_bn_net(nn.Module):
    """Two conv -> BatchNorm2d -> ReLU -> max-pool stages plus a linear classifier.

    The classifier takes 400 features, which is what ``stage1`` produces for
    a 1 x 28 x 28 input (16 channels of 5 x 5).
    """

    def __init__(self):
        super().__init__()
        first_stage = [
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(num_features=6),  # normalization follows the convolution
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        second_stage = [
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.BatchNorm2d(num_features=16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.stage1 = nn.Sequential(*first_stage, *second_stage)

        self.classfy = nn.Linear(in_features=400, out_features=10)

    def forward(self, x):
        """Return 10 class scores per sample of the input batch."""
        features = self.stage1(x)
        # Keep the batch dim and flatten everything else for the linear layer
        flat = features.view(features.size(0), -1)
        return self.classfy(flat)

net = conv_bn_net()
# Optimizer: plain stochastic gradient descent, learning rate 0.1
optimizer = torch.optim.SGD(params=net.parameters(), lr=1e-1)
train(net, train_data, test_data, num_epochs=5, optimizer=optimizer, criterion=criterion)

Epoch 0. Train Loss: 0.171464, Train Acc: 0.949627, Valid Loss: 0.065106, Valid Acc: 0.978343, 
Epoch 1. Train Loss: 0.066592, Train Acc: 0.978928, Valid Loss: 0.061266, Valid Acc: 0.981903, 
Epoch 2. Train Loss: 0.051171, Train Acc: 0.983975, Valid Loss: 0.054453, Valid Acc: 0.982694, 
Epoch 3. Train Loss: 0.043667, Train Acc: 0.986574, Valid Loss: 0.060510, Valid Acc: 0.981112, 
Epoch 4. Train Loss: 0.038681, Train Acc: 0.987906, Valid Loss: 0.037735, Valid Acc: 0.987836, 


### 5.测试`nn.Conv2d(input_channel,output_channel,size,padding=)`,`nn.MaxPool2d()`

In [45]:
a = torch.randn(64, 1, 28, 28)
# out_channels is the number of filters; kernel_size=3 means a 3x3 kernel.
# An integer kernel size gives a square kernel, and the stride defaults to 1.
conv = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=1)
b = conv(a)
print('b shape:',b.shape)
conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
c = conv2(b)
print(c.shape)
# 4x4 pooling window moved with stride 2
maxpool = nn.MaxPool2d(kernel_size=4, stride=2)
d = maxpool(c)
print(d.shape)

b shape: torch.Size([64, 6, 28, 28])
torch.Size([64, 16, 24, 24])
torch.Size([64, 16, 11, 11])


### 扩展:
1. `tensor1.view_as(tensor2)`:按照tensor2的shape,reshape一下tensor1
2. `if torch.cuda.is_available(): net = net.cuda()`:将模型放到GPU上.