# Deep Pyramidal Residual Networks

论文地址: [Deep Pyramidal Residual Networks](https://arxiv.org/abs/1610.02915)

# ShakeDrop Regularization

论文地址: [ShakeDrop Regularization](https://arxiv.org/abs/1802.02375)

#### <font color="red">二者的结合可以在CIFAR-10/CIFAR-100上取得state-of-the-art的成绩</font>
    
## PyramidNet

该网络在pre-activation版本的ResNet基础上提出了新的思想，主要包括两个方面：
* 新的Network Architecture
  + 放弃传统的仅在下采样层增加output channel、而同一阶段内每个residual unit的num_filter都相同的做法，转而使每个residual unit的output channel逐级增加，以分担仅在下采样层增加output channel的压力；
  + 两种网络结构：
  
<img src="../img/Chapter4-Convolutional-Neural-Networks/4-31.png" width="600">
  
* 新的Building Block
  + 丢弃每个residual unit中的第一个relu激活层
  + 在每个residual unit最后加上额外的BN以加快收敛
  
<img src="../img/Chapter4-Convolutional-Neural-Networks/4-32.png" width="800">

## ShakeDrop Regularization

In [4]:
import math

import mxnet as mx
import numpy as np

from mxnet import nd
from mxnet import image
from mxnet import gluon
from mxnet import autograd
mx.random.seed(1)

%matplotlib inline
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120
import matplotlib.pyplot as plt

In [123]:
def BN_ReLU_Conv(num_filter, kernel_size, strides=1, padding=0, erase_relu=False):
    '''
    Build a BatchNorm -> (ReLU) -> Conv2D stack as a HybridSequential.

    num_filter : int
        number of output channels of the convolution
    kernel_size : int or tuple
        kernel size forwarded to Conv2D
    strides : int or tuple
        strides forwarded to Conv2D
    padding : int or tuple
        padding forwarded to Conv2D
    erase_relu : Boolean
        when True the ReLU between BatchNorm and Conv2D is left out

    The convolution is created without a bias term.
    '''
    stack = gluon.nn.HybridSequential()
    with stack.name_scope():
        # Layers are created inside the name scope, in execution order.
        layers = [gluon.nn.BatchNorm(axis=1)]
        if not erase_relu:
            layers.append(gluon.nn.Activation('relu'))
        layers.append(
            gluon.nn.Conv2D(
                num_filter,
                kernel_size=kernel_size,
                strides=strides,
                padding=padding,
                use_bias=False,
            )
        )
        stack.add(*layers)
    return stack

def BN_ReLU(erase_relu=False):
    '''
    Build a BatchNorm -> (ReLU) stack as a HybridSequential.

    erase_relu : Boolean
        when True only the BatchNorm layer is added and the ReLU is left out
    '''
    stack = gluon.nn.HybridSequential()
    with stack.name_scope():
        # Layers are created inside the name scope, in execution order.
        layers = [gluon.nn.BatchNorm(axis=1)]
        if not erase_relu:
            layers.append(gluon.nn.Activation('relu'))
        stack.add(*layers)
    return stack

class BasicBlock(gluon.nn.HybridBlock):
    '''
    Pyramidal basic residual unit.

    Residual branch: BN -> Conv3x3 (strided, no ReLU before it)
                     -> BN -> ReLU -> Conv3x3 -> BN.
    The shortcut is the input itself, or ``downsample(input)`` when a
    downsample block is given. If the shortcut has fewer channels than the
    residual branch it is zero-padded along the channel axis before the add.

    out_channels : int
        base width of the unit; the last convolution emits
        ``out_channels * BasicBlock.outchannel_ratio`` channels
    strides : int
        strides of the first convolution
    downsample : Block or None
        optional block applied to the input to produce the shortcut

    NOTE: ``hybrid_forward`` reads ``.shape`` / ``.context`` and prints the
    output shape, so it only works imperatively (NDArray inputs, not
    hybridized). Inputs are expected to be 4-D with channels on axis 1.
    '''
    outchannel_ratio = 1

    def __init__(self, out_channels, strides=1, downsample=None, **kwargs):
        super().__init__(**kwargs)
        blk = self.blk = gluon.nn.HybridSequential()
        self.downsample = downsample
        self.strides = strides
        with self.name_scope():
            # The first conv has no preceding ReLU; a trailing BN closes the unit.
            blk.add(BN_ReLU_Conv(out_channels, kernel_size=3, strides=strides, padding=1, erase_relu=True))
            blk.add(BN_ReLU_Conv(out_channels * BasicBlock.outchannel_ratio, kernel_size=3, strides=1, padding=1))
            blk.add(gluon.nn.BatchNorm(axis=1))

    def hybrid_forward(self, F, X):
        out = self.blk(X)
        if self.downsample is not None:
            shortcut = self.downsample(X)
        else:
            shortcut = X

        residual_channel = out.shape[1]
        shortcut_channel = shortcut.shape[1]

        if residual_channel != shortcut_channel:
            # Zero-padded shortcut connection. The padding is built with
            # framework ops on the shortcut's own context and dtype: a
            # round-trip through numpy would drop the shortcut from the
            # autograd graph and move the data to the default context.
            batch, _, height, width = shortcut.shape
            zero_pad = F.zeros(
                (batch, residual_channel - shortcut_channel, height, width),
                ctx=shortcut.context,
                dtype=shortcut.dtype,
            )
            shortcut = F.concat(shortcut, zero_pad, dim=1)

        result = out + shortcut
        print(result.shape)
        return result
    
class Bottleneck(gluon.nn.HybridBlock):
    '''
    Pyramidal bottleneck residual unit.

    Residual branch: BN -> Conv1x1 (no ReLU before it)
                     -> BN -> ReLU -> Conv3x3 (strided)
                     -> BN -> ReLU -> Conv1x1 -> BN.
    The shortcut is the input itself, or ``downsample(input)`` when a
    downsample block is given. If the shortcut has fewer channels than the
    residual branch it is zero-padded along the channel axis before the add.

    out_channels : int
        base width of the unit; the last convolution emits
        ``out_channels * Bottleneck.outchannel_ratio`` channels
    strides : int
        strides of the 3x3 convolution
    downsample : Block or None
        optional block applied to the input to produce the shortcut

    NOTE: ``hybrid_forward`` reads ``.shape`` / ``.context`` and prints the
    output shape, so it only works imperatively (NDArray inputs, not
    hybridized). Inputs are expected to be 4-D with channels on axis 1.
    '''
    outchannel_ratio = 4

    def __init__(self, out_channels, strides=1, downsample=None, **kwargs):
        super().__init__(**kwargs)
        blk = self.blk = gluon.nn.HybridSequential()
        self.downsample = downsample
        self.strides = strides
        with self.name_scope():
            # The first conv has no preceding ReLU; a trailing BN closes the unit.
            blk.add(BN_ReLU_Conv(out_channels, kernel_size=1, erase_relu=True))
            blk.add(BN_ReLU_Conv(out_channels, kernel_size=3, strides=strides, padding=1))
            blk.add(BN_ReLU_Conv(out_channels * Bottleneck.outchannel_ratio, kernel_size=1))
            blk.add(gluon.nn.BatchNorm(axis=1))

    def hybrid_forward(self, F, X):
        out = self.blk(X)
        if self.downsample is not None:
            shortcut = self.downsample(X)
        else:
            shortcut = X

        residual_channel = out.shape[1]
        shortcut_channel = shortcut.shape[1]

        if residual_channel != shortcut_channel:
            # Zero-padded shortcut connection. The padding is built with
            # framework ops on the shortcut's own context and dtype: a
            # round-trip through numpy would drop the shortcut from the
            # autograd graph and move the data to the default context.
            batch, _, height, width = shortcut.shape
            zero_pad = F.zeros(
                (batch, residual_channel - shortcut_channel, height, width),
                ctx=shortcut.context,
                dtype=shortcut.dtype,
            )
            shortcut = F.concat(shortcut, zero_pad, dim=1)

        result = out + shortcut
        print(result.shape)
        return result

def pyramidal_residual_layer(block, block_depth, fm_dim, add_rate, strides=1):
    '''
    Build one stage of ``int(block_depth)`` residual units whose width grows
    by ``add_rate`` from one unit to the next.

    block : class
        residual unit class, called as ``block(out_channels, strides, downsample)``
        for the first unit and ``block(out_channels, 1)`` for the others
    block_depth : int or float
        number of units in the stage; truncated with ``int()``
    fm_dim : int or float
        running (unrounded) feature-map width before this stage
    add_rate : float
        amount added to the running width for every unit
    strides : int
        strides of the first unit; when it is not 1 an average-pooling
        block is handed to that unit as its shortcut downsample

    Returns ``(residual, fm_dim)``: the HybridSequential holding the units
    and the running width after the last unit. The returned width is NOT
    multiplied by ``block.outchannel_ratio``; each unit is given the rounded
    base width and applies the ratio itself.
    '''
    downsample = None
    if strides != 1:
        # The pooling window follows the stride, and ceil_mode=True makes the
        # pooled shortcut round up like the strided 3x3 / padding-1 convolution
        # of the residual branch, so both branches also agree on odd sizes.
        downsample = gluon.nn.AvgPool2D(pool_size=strides, strides=strides, ceil_mode=True)

    residual = gluon.nn.HybridSequential()
    with residual.name_scope():
        # The first unit carries the stride and the shortcut downsample.
        fm_dim = fm_dim + add_rate
        residual.add(block(int(round(fm_dim)), strides, downsample))
        # The running width stays fractional; it is rounded per unit only.
        for _ in range(1, int(block_depth)):
            fm_dim = fm_dim + add_rate
            residual.add(block(int(round(fm_dim)), 1))

    return residual, fm_dim

class PyramidNet(gluon.nn.HybridBlock):
    '''
    PyramidNet assembled from a stem, three pyramidal stages and a classifier.

    depth : int
        network depth; each stage gets ``(depth - 2) / 9`` units when
        ``bottleneck`` is True and ``(depth - 2) / 6`` units otherwise
    alpha : int or float
        total widening; every unit adds ``alpha / (3 * units_per_stage)``
        to the running width
    inchannels : int
        number of output channels of the stem convolution and initial
        running width
    num_classes : int
        number of units of the final Dense layer
    bottleneck : Boolean
        use Bottleneck units instead of BasicBlock units
    debug : Boolean
        print the output shape of every top-level block during forward
    '''
    def __init__(self, depth, alpha, inchannels, num_classes, bottleneck=False, debug=False, **kwargs):
        super().__init__(**kwargs)
        self.debug = debug

        # Unit class and the divisor that turns depth into units per stage.
        block, divisor = (Bottleneck, 9) if bottleneck else (BasicBlock, 6)
        units_per_stage = (depth - 2) / divisor
        addrate = alpha / (3 * units_per_stage * 1.0)

        self.net = gluon.nn.HybridSequential()
        with self.name_scope():
            stem = gluon.nn.HybridSequential()
            stem.add(
                gluon.nn.Conv2D(inchannels, kernel_size=3, strides=1, padding=1, use_bias=False),
                gluon.nn.BatchNorm(axis=1)
            )

            # Three stages; the running width is threaded from one to the next.
            stages = []
            width = inchannels
            for stage_strides in (1, 2, 2):
                stage, width = pyramidal_residual_layer(
                    block, units_per_stage, width, addrate, strides=stage_strides
                )
                stages.append(stage)

            head = gluon.nn.HybridSequential()
            head.add(
                gluon.nn.BatchNorm(axis=1),
                gluon.nn.Activation('relu'),
                gluon.nn.GlobalAvgPool2D(),
                gluon.nn.Dense(num_classes)
            )

        self.net.add(stem, *stages, head)

    def hybrid_forward(self, F, X):
        out = X
        for position, stage in enumerate(self.net, start=1):
            out = stage(out)
            if self.debug:
                print("blk {} : {}".format(position, out.shape))
        return out

In [124]:
# Smoke test: build a bottleneck PyramidNet and push one random
# 3x32x32 sample through it with shape printing enabled.
pyramidnet = PyramidNet(
    depth=200,
    alpha=240,
    inchannels=16,
    num_classes=10,
    bottleneck=True,
    debug=True,
)
pyramidnet.initialize()
X = nd.random.normal(shape=(1, 3, 32, 32))
y = pyramidnet(X)

blk 1 : (1, 16, 32, 32)
(1, 80, 32, 32)
(1, 92, 32, 32)
(1, 108, 32, 32)
(1, 124, 32, 32)
(1, 136, 32, 32)
(1, 152, 32, 32)
(1, 164, 32, 32)
(1, 180, 32, 32)
(1, 196, 32, 32)
(1, 208, 32, 32)
(1, 224, 32, 32)
(1, 240, 32, 32)
(1, 252, 32, 32)
(1, 268, 32, 32)
(1, 284, 32, 32)
(1, 296, 32, 32)
(1, 312, 32, 32)
(1, 324, 32, 32)
(1, 340, 32, 32)
(1, 356, 32, 32)
(1, 368, 32, 32)
(1, 384, 32, 32)
blk 2 : (1, 384, 32, 32)
(1, 400, 16, 16)
(1, 412, 16, 16)
(1, 428, 16, 16)
(1, 444, 16, 16)
(1, 456, 16, 16)
(1, 472, 16, 16)
(1, 484, 16, 16)
(1, 500, 16, 16)
(1, 516, 16, 16)
(1, 528, 16, 16)
(1, 544, 16, 16)
(1, 560, 16, 16)
(1, 572, 16, 16)
(1, 588, 16, 16)
(1, 604, 16, 16)
(1, 616, 16, 16)
(1, 632, 16, 16)
(1, 644, 16, 16)
(1, 660, 16, 16)
(1, 676, 16, 16)
(1, 688, 16, 16)
(1, 704, 16, 16)
blk 3 : (1, 704, 16, 16)
(1, 720, 8, 8)
(1, 732, 8, 8)
(1, 748, 8, 8)
(1, 764, 8, 8)
(1, 776, 8, 8)
(1, 792, 8, 8)
(1, 804, 8, 8)
(1, 820, 8, 8)
(1, 836, 8, 8)
(1, 848, 8, 8)
(1, 864, 8, 8)
(1, 880, 8, 8)


In [120]:
# Bare expression: the notebook displays the network's repr (layer tree).
pyramidnet

PyramidNet(
  (net): HybridSequential(
    (0): HybridSequential(
      (0): Conv2D(2 -> 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm(momentum=0.9, fix_gamma=False, axis=1, eps=1e-05, in_channels=16)
    )
    (1): HybridSequential(
      (0): Bottleneck(
        (blk): HybridSequential(
          (0): HybridSequential(
            (0): BatchNorm(momentum=0.9, fix_gamma=False, axis=1, eps=1e-05, in_channels=16)
            (1): Conv2D(16 -> 20, kernel_size=(1, 1), stride=(1, 1), bias=False)
          )
          (1): HybridSequential(
            (0): BatchNorm(momentum=0.9, fix_gamma=False, axis=1, eps=1e-05, in_channels=20)
            (1): Activation(relu)
            (2): Conv2D(20 -> 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          )
          (2): HybridSequential(
            (0): BatchNorm(momentum=0.9, fix_gamma=False, axis=1, eps=1e-05, in_channels=20)
            (1): Activation(relu)
            (2): Con