In [3]:
'''ShuffleNet in PyTorch.

See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
'''
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Prepare Data

In [9]:
batch_size = 64
transform=transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))])
train_data = datasets.CIFAR10(root = '../dataset/cifar',download=True,train=True,transform=transform)
train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
test_data = datasets.CIFAR10(root='../dataset/cifar',download=True,train=False,transform=transform)
test_loader = DataLoader(test_data,batch_size=batch_size,shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


# Design Model

In [10]:
# 可以用于处理大小为32*32的彩色图片，将图片分成10类
class ShuffleBlock(nn.Module):
    def __init__(self, groups):
        super(ShuffleBlock, self).__init__()
        self.groups = groups

    def forward(self, x):
        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]'''
        N,C,H,W = x.size()
        g = self.groups
        # 维度变换之后必须要使用.contiguous()使得张量在内存连续之后才能调用view函数
        return x.view(N,g,int(C/g),H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W)


class Bottleneck(nn.Module):
    def __init__(self, in_planes, out_planes, stride, groups):
        super(Bottleneck, self).__init__()
        self.stride = stride

        # bottleneck层中间层的channel数变为输出channel数的1/4
        mid_planes = int(out_planes/4)


        g = 1 if in_planes==24 else groups
        # 作者提到不在stage2的第一个pointwise层使用组卷积,因为输入channel数量太少,只有24
        self.conv1 = nn.Conv2d(in_planes, mid_planes,
                               kernel_size=1, groups=g, bias=False)
        self.bn1 = nn.BatchNorm2d(mid_planes)
        self.shuffle1 = ShuffleBlock(groups=g)
        self.conv2 = nn.Conv2d(mid_planes, mid_planes,
                               kernel_size=3, stride=stride, padding=1,
                               groups=mid_planes, bias=False)
        self.bn2 = nn.BatchNorm2d(mid_planes)
        self.conv3 = nn.Conv2d(mid_planes, out_planes,
                               kernel_size=1, groups=groups, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.shortcut = nn.Sequential()
        if stride == 2:
            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.shuffle1(out)
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        res = self.shortcut(x)
        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
        return out


class ShuffleNet(nn.Module):
    def __init__(self, cfg):
        super(ShuffleNet, self).__init__()
        out_planes = cfg['out_planes']
        num_blocks = cfg['num_blocks']
        groups = cfg['groups']

        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(24)
        self.in_planes = 24
        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
        self.linear = nn.Linear(out_planes[2], 10)

    def _make_layer(self, out_planes, num_blocks, groups):
        layers = []
        for i in range(num_blocks):
            if i == 0:
                layers.append(Bottleneck(self.in_planes,
                                         out_planes-self.in_planes,
                                         stride=2, groups=groups))
            else:
                layers.append(Bottleneck(self.in_planes,
                                         out_planes,
                                         stride=1, groups=groups))
            self.in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ShuffleNetG2():
    cfg = {
        'out_planes': [200,400,800],
        'num_blocks': [4,8,4],
        'groups': 2
    }
    return ShuffleNet(cfg)

def ShuffleNetG3():
    cfg = {
        'out_planes': [240,480,960],
        'num_blocks': [4,8,4],
        'groups': 3
    }
    return ShuffleNet(cfg)


def test():
    net = ShuffleNetG2()
    x = torch.randn(1,3,32,32)
    y = net(x)
    print(y)

test()

tensor([[ 0.4033, -0.6807,  0.1131,  0.9165,  0.2609, -1.2319, -0.4935,  0.0389,
          2.1435, -1.2825]], grad_fn=<AddmmBackward0>)


# construct loss and optimizer

In [11]:
net = ShuffleNetG2()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Training cycle

In [12]:
def train(epoch,device):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs
        inputs, labels = data
        inputs,labels=inputs.to(device),labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 300 == 299:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 299))
            running_loss = 0.0

    print('Finished Training')

In [13]:
def test(device):
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            images,labels=images.to(device),labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network on test set: %d %%' % (
        100 * correct / total))

In [14]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# print(device) #cuda:0
net.to(device)
for epoch in range(10):
    train(epoch,device)
    test(device)

[1,   300] loss: 1.996
[1,   600] loss: 1.732
Finished Training
Accuracy of the network on test set: 40 %
[2,   300] loss: 1.558
[2,   600] loss: 1.525
Finished Training
Accuracy of the network on test set: 46 %
[3,   300] loss: 1.426
[3,   600] loss: 1.397
Finished Training
Accuracy of the network on test set: 50 %
[4,   300] loss: 1.330
[4,   600] loss: 1.288
Finished Training
Accuracy of the network on test set: 54 %
[5,   300] loss: 1.212
[5,   600] loss: 1.204
Finished Training
Accuracy of the network on test set: 55 %
[6,   300] loss: 1.123
[6,   600] loss: 1.109
Finished Training
Accuracy of the network on test set: 59 %
[7,   300] loss: 1.035
[7,   600] loss: 1.055
Finished Training
Accuracy of the network on test set: 60 %
[8,   300] loss: 0.980
[8,   600] loss: 0.977
Finished Training
Accuracy of the network on test set: 63 %
[9,   300] loss: 0.901
[9,   600] loss: 0.917
Finished Training
Accuracy of the network on test set: 62 %
[10,   300] loss: 0.853
[10,   600] loss: 0.86