ResNet 提出了跨层连接的思想，这直接影响了随后出现的卷积网络架构，其中最有名的就是 CVPR 2017 的 best paper——DenseNet。

DenseNet 和 ResNet 的不同在于：ResNet 是跨层求和，而 DenseNet 是跨层将特征在通道维度上进行拼接。

一如既往的导入库，写训练函数，读取并预处理数据集

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from torchvision.datasets import mnist
from torch.utils.data import DataLoader

In [2]:
from datetime import datetime

def get_acc(output, label):
    """Return the fraction of samples in a batch that were predicted correctly.

    Args:
        output: Score tensor; its first dimension is used as the batch size
            and the predicted class is the index of the maximum along dim 1.
        label: Tensor of target class indices, compared element-wise with
            the predicted indices.

    Returns:
        The batch accuracy as a Python ``float``.
    """
    total = output.shape[0]
    _, pred_label = output.max(1)
    # ``.item()`` extracts a plain Python int, so the division below is an
    # ordinary float division instead of tensor arithmetic (integer tensor
    # division truncated to 0 on older PyTorch releases, and a tensor result
    # would also stay on the GPU when accumulated by the caller).
    num_correct = (pred_label == label).sum().item()
    return num_correct / total


def train(net, train_data, valid_data, num_epochs, optimizer, criterion):
    """Train ``net`` and print loss/accuracy statistics after every epoch.

    Args:
        net: The ``nn.Module`` to train. It is moved to the GPU when CUDA is
            available.
        train_data: Iterable of ``(im, label)`` batches that also supports
            ``len()`` (used to average the per-batch statistics).
        valid_data: Iterable of ``(im, label)`` batches evaluated after each
            epoch, or ``None`` to skip validation.
        num_epochs: Number of passes over ``train_data``.
        optimizer: Optimizer whose ``zero_grad()``/``step()`` are called for
            every training batch.
        criterion: Callable ``criterion(output, label)`` returning the loss;
            it must be a single-element tensor because ``.item()`` is used.

    Returns:
        None. One summary line per epoch is printed to stdout.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = net.to(device)
    prev_time = datetime.now()
    for epoch in range(num_epochs):
        train_loss = 0.0
        train_acc = 0.0
        net.train()
        for im, label in train_data:
            im = im.to(device)
            label = label.to(device)
            # forward
            output = net(im)
            loss = criterion(output, label)
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate plain Python floats so no tensors (or autograd
            # history) are kept alive across iterations.
            train_loss += loss.item()
            train_acc += float(get_acc(output, label))

        cur_time = datetime.now()
        # total_seconds() also counts whole days, unlike ``.seconds``.
        elapsed = int((cur_time - prev_time).total_seconds())
        h, remainder = divmod(elapsed, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        if valid_data is not None:
            valid_loss = 0.0
            valid_acc = 0.0
            net.eval()
            # Gradients are not needed for evaluation; disabling them avoids
            # building the autograd graph and saves memory.
            with torch.no_grad():
                for im, label in valid_data:
                    im = im.to(device)
                    label = label.to(device)
                    output = net(im)
                    loss = criterion(output, label)
                    valid_loss += loss.item()
                    valid_acc += float(get_acc(output, label))
            epoch_str = (
                "Epoch %d. Train Loss: %f, Train Acc: %f, Valid Loss: %f, Valid Acc: %f, "
                % (epoch, train_loss / len(train_data),
                   train_acc / len(train_data), valid_loss / len(valid_data),
                   valid_acc / len(valid_data)))
        else:
            epoch_str = ("Epoch %d. Train Loss: %f, Train Acc: %f, " %
                         (epoch, train_loss / len(train_data),
                          train_acc / len(train_data)))
        prev_time = cur_time
        print(epoch_str + time_str)

In [3]:
def data_tf(x):
    """Convert an image to a normalised float32 tensor with a leading axis.

    The input is converted to a float32 array, scaled by 1/255, shifted and
    scaled with ``(x - 0.5) / 0.5``, and given one extra leading dimension
    (used downstream as the channel axis) before being wrapped in a tensor.

    Args:
        x: Anything ``np.array`` can convert to a numeric array (here the
            images handed over by the dataset's ``transform`` hook).

    Returns:
        A ``torch.float32`` tensor of shape ``(1, *x.shape)``.
    """
    x = np.array(x, dtype='float32') / 255
    x = (x - 0.5) / 0.5  # normalise: standardise the data
    # Add the leading axis with a view instead of round-tripping through
    # Python lists, which was slow and silently upcast the data to float64.
    x = x[np.newaxis, ...]
    return torch.from_numpy(x)

from torchvision.datasets import mnist  # torchvision's built-in MNIST dataset

# Load both MNIST splits (downloading into ./data if needed) and apply the
# data_tf transform defined above to every image.
train_set = mnist.MNIST(root='./data', train=True, transform=data_tf, download=True)
test_set = mnist.MNIST(root='./data', train=False, transform=data_tf, download=True)

# Shuffled batches of 64 for training; ordered batches of 128 for evaluation.
train_data = DataLoader(train_set, batch_size=64, shuffle=True)
test_data = DataLoader(test_set, batch_size=128, shuffle=False)

查看一个batch数据的尺寸，便于网络的设计

In [4]:
# Pull a single batch from the training loader and show the shape of its
# images; fall back to an empty tensor if the loader yields nothing.
first_batch = next(iter(train_data), None)
if first_batch is not None:
    x, _labels = first_batch
else:
    x = torch.tensor([])
print(x.shape)

接下来设计并实现一个 dense block

In [5]:
# 首先定义一个卷积块，这个卷积块的顺序是 bn -> relu -> conv
def conv_block(in_channel, out_channel):
    """Build the basic unit of a dense block: BatchNorm -> ReLU -> 3x3 conv.

    Args:
        in_channel: Number of channels of the incoming feature map.
        out_channel: Number of channels produced by the convolution.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution uses ``padding=1`` with a 3x3 kernel and has no bias.
    """
    norm = nn.BatchNorm2d(num_features=in_channel)
    activation = nn.ReLU(inplace=True)
    conv = nn.Conv2d(
        in_channels=in_channel,
        out_channels=out_channel,
        kernel_size=3,
        padding=1,
        bias=False,
    )
    return nn.Sequential(norm, activation, conv)

In [6]:
# A dense block calls the number of channels each conv layer emits the
# `growth_rate`: with `in_channel` input channels and n layers, the block
# outputs `in_channel + n * growth_rate` channels.
class dense_block(nn.Module):
    """Stack of conv blocks whose outputs are concatenated along channels.

    Layer ``i`` receives ``in_channel + i * growth_rate`` channels and emits
    ``growth_rate`` new ones, which are concatenated in front of its input.
    """

    def __init__(self, in_channel, growth_rate, num_layers):
        super(dense_block, self).__init__()
        # Every layer sees the block input plus all earlier layer outputs.
        layers = [
            conv_block(in_channel + idx * growth_rate, growth_rate)
            for idx in range(num_layers)
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        features = x
        for layer in self.net:
            new_features = layer(features)
            # Newest features go first; the original input stays last.
            features = torch.cat((new_features, features), dim=1)
        return features

DenseNet 中还有一个模块叫过渡层（transition block）。因为 DenseNet 会不断地在通道维度上进行拼接，所以当层数很多的时候，输出的通道数就会越来越大，参数量和计算量也会越来越大。为了避免这个问题，需要引入过渡层将输出通道数降下来，同时也将特征图的长宽减半。这个过渡层可以使用 1 x 1 的卷积加上平均池化来实现。

In [7]:
def transition(in_channel, out_channel):
    """Build a transition layer: BatchNorm -> ReLU -> 1x1 conv -> 2x2 avg-pool.

    Args:
        in_channel: Number of channels of the incoming feature map.
        out_channel: Number of channels produced by the 1x1 convolution.

    Returns:
        An ``nn.Sequential`` with the four layers in that order; the final
        average pooling uses a 2x2 window with stride 2.
    """
    stages = [
        nn.BatchNorm2d(num_features=in_channel),
        nn.ReLU(inplace=True),
        nn.Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    return nn.Sequential(*stages)

In [8]:
# Stem: 7x7 stride-2 convolution on the single-channel input, followed by
# BatchNorm, ReLU and a 3x3 stride-2 max-pool.
block1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

In [9]:
# One dense block (64 input channels, 16 layers adding 21 channels each, so
# 64 + 16 * 21 = 400 output channels) followed by a transition layer that
# halves both the channel count and the spatial size.
block = [
    dense_block(64, 21, 16),
    transition(400, 400 // 2),
]
block2 = nn.Sequential(*block)

In [10]:
# Head applied before the classifier: BatchNorm -> ReLU -> 2x2 average pool,
# registered under explicit names.
block3 = nn.Sequential()
block3.add_module('bn', nn.BatchNorm2d(num_features=200))
block3.add_module('relu', nn.ReLU(inplace=True))
block3.add_module('avg_pool', nn.AvgPool2d(kernel_size=2))

In [11]:
# Push the sample batch through each stage and print the shape after every
# step, to confirm the sizes line up with the final linear layer.
print("1:", x.shape)
x = x.float()
for tag, stage in (("2:", block1), ("3:", block2), ("4:", block3)):
    x = stage(x)
    print(tag, x.shape)
# Flatten everything except the batch dimension.
x = x.view(x.size(0), -1)
print("4:", x.shape)
x = nn.Linear(in_features=200, out_features=10)(x)
print("10:", x.shape)

In [12]:
class densenet(nn.Module):
    """Simplified DenseNet: stem -> one dense block + transition -> head -> linear.

    Args:
        in_channel: Number of channels of the input images; used as the
            input channel count of the stem convolution.
        num_classes: Number of output scores produced by the classifier.

    Note:
        The classifier takes exactly 200 flattened features, so ``block3``
        must reduce the feature map to 200 x 1 x 1. NOTE(review): the
        notebook's shape check confirms this for its MNIST batches; other
        input sizes are not handled.
    """

    def __init__(self, in_channel, num_classes):
        super(densenet, self).__init__()
        # Stem. The input channel count comes from ``in_channel`` (it was
        # previously hard-coded to 1, which silently ignored the argument).
        self.block1 = nn.Sequential(
            nn.Conv2d(in_channel, 64, 7, 2, 3),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(3, 2, padding=1)
        )

        # Dense block: 64 + 16 * 21 = 400 channels; the transition layer then
        # halves both the channel count and the spatial size.
        block = []
        block.append(dense_block(64, 21, 16))
        block.append(transition(400, 400 // 2))
        self.block2 = nn.Sequential(*block)

        self.block3 = nn.Sequential(*[])
        self.block3.add_module('bn', nn.BatchNorm2d(200))
        self.block3.add_module('relu', nn.ReLU(True))
        self.block3.add_module('avg_pool', nn.AvgPool2d(2))

        self.classifier = nn.Linear(200, num_classes)

    def forward(self, x):
        """Return the class scores for a batch ``x`` (cast to float32 first)."""
        x = x.float()
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        # Flatten everything except the batch dimension for the classifier.
        x = x.view(x.shape[0], -1)
        x = self.classifier(x)
        return x

In [13]:
# Model for single-channel images with 10 output classes, trained with
# cross-entropy loss and plain SGD (learning rate 0.01).
net = densenet(in_channel=1, num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.01)

In [14]:
# Train for 20 epochs, evaluating on the test loader after every epoch.
train(
    net,
    train_data,
    test_data,
    num_epochs=20,
    optimizer=optimizer,
    criterion=criterion,
)

简化后的模型在训练 20 个 epoch 后，也能在测试集上达到 0.992682 的准确率。