In [18]:
import time
import torch
from torch import nn,optim
import sys
sys.path.append("..")
import d2lzh_pytorch as d2l
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import torch.nn.functional as F

In [19]:
def batch_norm(is_training,X,gamma,beta,moving_mean,moving_var,eps,momentum):
    """Apply batch normalization to a 2-D or 4-D input.

    In evaluation mode the input is normalized with the supplied moving
    statistics. In training mode it is normalized with the statistics of the
    current mini-batch and updated moving statistics are returned.

    Args:
        is_training: True to normalize with batch statistics and update the
            moving statistics; False to normalize with the moving statistics.
        X: Input tensor. In training mode it must have 2 dimensions
            (statistics over dim 0) or 4 dimensions (statistics over
            dims 0, 2 and 3, i.e. one value per index of dim 1).
        gamma: Scale applied to the normalized input (broadcast against it).
        beta: Shift added after scaling (broadcast against the input).
        moving_mean: Running estimate of the mean.
        moving_var: Running estimate of the variance.
        eps: Small constant added to the variance before the square root.
        momentum: Weight kept on the previous moving statistic; the current
            batch statistic receives weight ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)``. In evaluation mode the moving
        statistics are returned unchanged. In training mode they are new
        tensors that carry no autograd history.

    Raises:
        AssertionError: In training mode, if ``X`` is neither 2-D nor 4-D.
    """
    if not is_training:
        # Inference: rely on the accumulated running statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2,4)
        if len(X.shape) == 2:
            # One mean / (biased) variance per column.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # One mean / (biased) variance per index of dim 1; keepdim so the
            # results broadcast against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # The running statistics are bookkeeping, not part of the loss. Updating
        # them outside autograd prevents each iteration's graph from being kept
        # alive through the returned tensors.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var

In [20]:
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the ``batch_norm`` function.

    Holds a learnable scale (``gamma``) and shift (``beta``) together with
    running estimates of the mean and variance. The running estimates are
    registered as buffers, so they appear in ``state_dict()`` and follow the
    module when it is moved with ``.to(device)``.

    Args:
        num_features: Size of the feature/channel dimension (dim 1 of the input).
        num_dims: 2 selects parameters of shape ``(1, num_features)``; any
            other value selects ``(1, num_features, 1, 1)``.
    """
    def __init__(self,num_features,num_dims):
        super(BatchNorm,self).__init__()
        if num_dims == 2:
            shape = (1,num_features)
        else:
            shape = (1,num_features,1,1)
        # Learnable scale and shift, initialised to the identity transform.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are state, not parameters: buffers are saved in
        # checkpoints and moved by .to()/.cuda(), but never given to optimizers.
        self.register_buffer('moving_mean', torch.zeros(shape))
        # Start from unit variance (as nn.BatchNorm* does) so that evaluating
        # an untrained layer does not divide by sqrt(eps).
        self.register_buffer('moving_var', torch.ones(shape))

    def forward(self,X):
        """Normalize ``X`` and, in training mode, refresh the running statistics."""
        # Defensive: keep the running statistics on the input's device.
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        Y, new_mean, new_var = batch_norm(self.training,
                                          X, self.gamma, self.beta, self.moving_mean,
                                          self.moving_var, eps=1e-5, momentum=0.9)
        # Store the statistics without autograd history so successive
        # iterations do not chain their computation graphs together.
        self.moving_mean = new_mean.detach()
        self.moving_var = new_var.detach()
        return Y

In [21]:
# LeNet built with the hand-written BatchNorm layer
net = nn.Sequential(
    # Convolutional stage: conv -> batch norm -> sigmoid -> 2x2 max pooling, twice.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(num_features=6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(num_features=16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # d2l helper; presumably flattens each sample to a vector -- confirm in d2lzh_pytorch.
    d2l.FlattenLayer(),
    # Fully connected stage. 16*4*4 input features assumes 28x28 inputs -- TODO confirm.
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(num_features=120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(num_features=84, num_dims=2),
    nn.Sigmoid(),
    # Output layer with 10 units.
    nn.Linear(in_features=84, out_features=10),
)

In [22]:
# Hyperparameters for this run.
batch_size = 256
lr = 0.001
num_epochs = 5

# Data iterators from the d2l helper (presumably Fashion-MNIST, per its name).
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Adam over every parameter of the network defined above.
optimizer = optim.Adam(net.parameters(), lr=lr)

# d2l training loop; the recorded output shows per-epoch loss, train/test
# accuracy and elapsed time.
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9931, train acc 0.780, test acc 0.831, time 20.2 sec
epoch 2, loss 0.4576, train acc 0.863, test acc 0.819, time 19.8 sec
epoch 3, loss 0.3625, train acc 0.879, test acc 0.786, time 19.8 sec
epoch 4, loss 0.3258, train acc 0.888, test acc 0.870, time 19.9 sec
epoch 5, loss 0.3029, train acc 0.895, test acc 0.849, time 19.9 sec


In [23]:
# Show the learned scale (gamma) and shift (beta) of the first batch-norm layer as flat vectors.
tuple(param.view(-1) for param in (net[1].gamma, net[1].beta))

(tensor([1.1455, 0.8896, 1.1991, 0.9443, 1.1923, 1.0236], device='cuda:0',
        grad_fn=<ViewBackward>),
 tensor([ 0.1991, -0.4676,  0.2946,  0.4281, -0.1175, -0.6195], device='cuda:0',
        grad_fn=<ViewBackward>))

In [27]:
# Concise implementation using the batch-norm layers from PyTorch's nn module
net = nn.Sequential(
    # Convolutional stage: conv -> batch norm -> sigmoid -> 2x2 max pooling, twice.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.BatchNorm2d(num_features=6),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.BatchNorm2d(num_features=16),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # d2l helper; presumably flattens each sample to a vector -- confirm in d2lzh_pytorch.
    d2l.FlattenLayer(),
    # Fully connected stage. 16*4*4 input features assumes 28x28 inputs -- TODO confirm.
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    nn.BatchNorm1d(num_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.BatchNorm1d(num_features=84),
    nn.Sigmoid(),
    # Output layer with 10 units.
    nn.Linear(in_features=84, out_features=10),
)

In [28]:
# Same hyperparameters as the run with the hand-written layer.
batch_size = 256
lr = 0.001
num_epochs = 5

# Data iterators from the d2l helper (presumably Fashion-MNIST, per its name).
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# A fresh Adam optimizer bound to the parameters of the current `net`.
optimizer = optim.Adam(net.parameters(), lr=lr)

# d2l training loop; the recorded output shows per-epoch loss, train/test
# accuracy and elapsed time.
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9854, train acc 0.786, test acc 0.815, time 14.0 sec
epoch 2, loss 0.4604, train acc 0.861, test acc 0.839, time 14.0 sec
epoch 3, loss 0.3694, train acc 0.877, test acc 0.842, time 14.1 sec
epoch 4, loss 0.3335, train acc 0.886, test acc 0.840, time 14.1 sec
epoch 5, loss 0.3085, train acc 0.891, test acc 0.849, time 14.2 sec
