### Try to understand the parameters required in the neural network

In [1]:
import re
import collections
from functools import partial
import math

In [2]:
def efficientnet_params(model_name):
    """ Map EfficientNet model name to parameter coefficients. """
    params_dict = {
        # Coefficients:   width,depth,res,dropout
        'efficientnet-b0': (1.0, 1.0, 224, 0.2),
        'efficientnet-b1': (1.0, 1.1, 240, 0.2),
        'efficientnet-b2': (1.1, 1.2, 260, 0.3),
        'efficientnet-b3': (1.2, 1.4, 300, 0.3),
        'efficientnet-b4': (1.4, 1.8, 380, 0.4),
        'efficientnet-b5': (1.6, 2.2, 456, 0.4),
        'efficientnet-b6': (1.8, 2.6, 528, 0.5),
        'efficientnet-b7': (2.0, 3.1, 600, 0.5),
        'efficientnet-b8': (2.2, 3.6, 672, 0.5),
        'efficientnet-l2': (4.3, 5.3, 800, 0.5),
    }
    return params_dict[model_name]

In [3]:
class BlockDecoder(object):
    """ Block Decoder for readability, straight from the official TensorFlow repository """

    @staticmethod
    def _decode_block_string(block_string):
        """ Gets a block through a string notation of arguments. """
        assert isinstance(block_string, str)

        ops = block_string.split('_')
        options = {}
        for op in ops:
            splits = re.split(r'(\d.*)', op)
            if len(splits) >= 2:
                key, value = splits[:2]
                options[key] = value

        # Check stride
        assert (('s' in options and len(options['s']) == 1) or
                (len(options['s']) == 2 and options['s'][0] == options['s'][1]))

        return BlockArgs(
            kernel_size=int(options['k']),
            num_repeat=int(options['r']),
            input_filters=int(options['i']),
            output_filters=int(options['o']),
            expand_ratio=int(options['e']),
            id_skip=('noskip' not in block_string),
            se_ratio=float(options['se']) if 'se' in options else None,
            stride=[int(options['s'][0])])

    @staticmethod
    def _encode_block_string(block):
        """Encodes a block to a string."""
        args = [
            'r%d' % block.num_repeat,
            'k%d' % block.kernel_size,
            's%d%d' % (block.strides[0], block.strides[1]),
            'e%s' % block.expand_ratio,
            'i%d' % block.input_filters,
            'o%d' % block.output_filters
        ]
        if 0 < block.se_ratio <= 1:
            args.append('se%s' % block.se_ratio)
        if block.id_skip is False:
            args.append('noskip')
        return '_'.join(args)

    @staticmethod
    def decode(string_list):
        """
        Decodes a list of string notations to specify blocks inside the network.

        :param string_list: a list of strings, each string is a notation of block
        :return: a list of BlockArgs namedtuples of block args
        """
        assert isinstance(string_list, list)
        blocks_args = []
        for block_string in string_list:
            blocks_args.append(BlockDecoder._decode_block_string(block_string))
        return blocks_args

    @staticmethod
    def encode(blocks_args):
        """
        Encodes a list of BlockArgs to a list of strings.

        :param blocks_args: a list of BlockArgs namedtuples of block args
        :return: a list of strings, each string is a notation of block
        """
        block_strings = []
        for block in blocks_args:
            block_strings.append(BlockDecoder._encode_block_string(block))
        return block_strings

In [4]:
def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2,
                 drop_connect_rate=0.2, image_size=None, num_classes=1000):
    """ Creates a efficientnet model. """

    blocks_args = [
        'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25',
        'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25',
        'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25',
        'r1_k3_s11_e6_i192_o320_se0.25',
    ]
    blocks_args = BlockDecoder.decode(blocks_args)

    global_params = GlobalParams(
        batch_norm_momentum=0.99,
        batch_norm_epsilon=1e-3,
        dropout_rate=dropout_rate,
        drop_connect_rate=drop_connect_rate,
        # data_format='channels_last',  # removed, this is always true in PyTorch
        num_classes=num_classes,
        width_coefficient=width_coefficient,
        depth_coefficient=depth_coefficient,
        depth_divisor=8,
        min_depth=None,
        image_size=image_size,
    )

    return blocks_args, global_params

In [5]:
def get_model_params(model_name, override_params):
    """ Get the block args and global params for a given model """
    if model_name.startswith('efficientnet'):
        w, d, s, p = efficientnet_params(model_name)
        # note: all models have drop connect rate = 0.2
        blocks_args, global_params = efficientnet(
            width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
    else:
        raise NotImplementedError('model name is not pre-defined: %s' % model_name)
    if override_params:
        # ValueError will be raised here if override_params has fields not included in global_params.
        global_params = global_params._replace(**override_params)
    return blocks_args, global_params

In [6]:
# Parameters for the entire model (stem, all blocks, and head)
GlobalParams = collections.namedtuple('GlobalParams', [
    'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate',
    'num_classes', 'width_coefficient', 'depth_coefficient',
    'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size'])

# Parameters for an individual model block
BlockArgs = collections.namedtuple('BlockArgs', [
    'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
    'expand_ratio', 'id_skip', 'stride', 'se_ratio'])

# Change namedtuple defaults
GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)

In [7]:
blocks_args, global_params = get_model_params('efficientnet-b0', None)

In [9]:
blocks_args

[BlockArgs(kernel_size=3, num_repeat=1, input_filters=32, output_filters=16, expand_ratio=1, id_skip=True, stride=[1], se_ratio=0.25),
 BlockArgs(kernel_size=3, num_repeat=2, input_filters=16, output_filters=24, expand_ratio=6, id_skip=True, stride=[2], se_ratio=0.25),
 BlockArgs(kernel_size=5, num_repeat=2, input_filters=24, output_filters=40, expand_ratio=6, id_skip=True, stride=[2], se_ratio=0.25),
 BlockArgs(kernel_size=3, num_repeat=3, input_filters=40, output_filters=80, expand_ratio=6, id_skip=True, stride=[2], se_ratio=0.25),
 BlockArgs(kernel_size=5, num_repeat=3, input_filters=80, output_filters=112, expand_ratio=6, id_skip=True, stride=[1], se_ratio=0.25),
 BlockArgs(kernel_size=5, num_repeat=4, input_filters=112, output_filters=192, expand_ratio=6, id_skip=True, stride=[2], se_ratio=0.25),
 BlockArgs(kernel_size=3, num_repeat=1, input_filters=192, output_filters=320, expand_ratio=6, id_skip=True, stride=[1], se_ratio=0.25)]

In [10]:
global_params

GlobalParams(batch_norm_momentum=0.99, batch_norm_epsilon=0.001, dropout_rate=0.2, num_classes=1000, width_coefficient=1.0, depth_coefficient=1.0, depth_divisor=8, min_depth=None, drop_connect_rate=0.2, image_size=224)

In [11]:
import torch
from torch import nn
from torch.nn import functional as F

In [16]:
class SwishImplementation(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_variables[0]
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class MemoryEfficientSwish(nn.Module):
    def forward(self, x):
        return SwishImplementation.apply(x)

In [11]:
x = torch.randn(20, 24, 56, 56)
expand = nn.Conv2d(24, 144, 1)
expand_output = expand(x)
expand_output.shape

torch.Size([20, 144, 56, 56])

In [14]:
x = torch.randn(20, 144, 28, 28)
x_squeezed = F.adaptive_avg_pool2d(x, 1)

In [15]:
x_squeezed.shape

torch.Size([20, 144, 1, 1])

In [39]:
def stochastic_depth(inputs, skip_probability, training):
    if not training: return inputs
    batch_size = inputs.shape[0]
    keep_prob = 1 - skip_probability
    random_tensor = keep_prob
    random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
    binary_tensor = torch.floor(random_tensor)
    output = inputs / keep_prob * binary_tensor
    return output

In [33]:
class SqueezeExcitation(nn.Module):
    def __init__(self, channel, se_ratio, activation = None):
        super().__init__()
        self.squeezed_channel = max(1, int(channel * se_ratio))
        self.se_reduce = nn.Conv2d(channel, self.squeezed_channel, 1)
        self.se_expand = nn.Conv2d(self.squeezed_channel, channel, 1)
        self.activation = activation
    def forward(self, x):
        x_squeezed = F.adaptive_avg_pool2d(x, 1)
        x_squeezed = self.se_expand(self.activation(self.se_reduce(x_squeezed)))
        return torch.sigmoid(x_squeezed)

In [53]:
class MBConvBasicBlc(nn.Module):
    def __init__(self, in_channel, out_channel, expand_ratio, stride, kernel, 
                 se_ratio, skip_probability, bn_momentum, bn_epsilon):
        super().__init__()
        self.expand_option = (expand_ratio != 1)
        med_channel = in_channel*expand_ratio
        self.activation = MemoryEfficientSwish()
        self.skip_probability = skip_probability
        if self.expand_option:
            self.expand = nn.Conv2d(in_channel, med_channel, 1)
            self.bn_expand = nn.BatchNorm2d(num_features=med_channel, momentum=(1-bn_momentum),
                                            eps=bn_epsilon)
        self.depth_wise = nn.Conv2d(med_channel, med_channel, kernel, stride = stride, 
                                    padding=math.ceil((kernel-stride)/2), groups=med_channel)
        self.bn_depth_wise = nn.BatchNorm2d(num_features=med_channel, momentum=(1-bn_momentum),
                                            eps=bn_epsilon)
        
        if (se_ratio is not None) and (0 < se_ratio < 1):
            self.se_operation = SqueezeExcitation(med_channel, se_ratio, self.activation)
        else:
            self.se_operation = None
        self.real_out = nn.Conv2d(med_channel, out_channel, 1)
        self.bn_out = nn.BatchNorm2d(num_features=out_channel, momentum=(1-bn_momentum),
                                            eps=bn_epsilon)
        
    def forward(self, inputs):
        x = inputs
        
        if self.expand_option:
            x = self.expand(x)
            x = self.bn_expand(x)
            x = self.activation(x)
            
        x = self.depth_wise(x)
        x = self.bn_depth_wise(x)
        x = self.activation(x)
        
        if self.se_operation is not None:
            x_squeezed = self.se_operation(x)
            x = x_squeezed * x
            
        x = self.real_out(x)
        x = self.bn_out(x)
        x = self.activation(x)
        
        if x.shape == inputs.shape:
            if self.skip_probability:
                x = stochastic_depth(x, self.skip_probability, training=self.training)
            x = x + inputs
        return x

In [54]:
class MBConvBlc(nn.Module):
    def __init__(self, in_channel, out_channel, expand_ratio, stride, kernel, 
                 se_ratio, skip_probability, bn_momentum, bn_epsilon, n_repeat):
        super().__init__()
        self.blocks = nn.ModuleList([])
        for i in range(n_repeat):
            if i == 0:
                self.blocks.append(MBConvBasicBlc(in_channel, out_channel, 
                                                  expand_ratio, stride, kernel, 
                                                  se_ratio, skip_probability,
                                                  bn_momentum, bn_epsilon))
            else:
                self.blocks.append(MBConvBasicBlc(out_channel, out_channel, 
                                                  expand_ratio, 1, kernel, 
                                                  se_ratio, skip_probability,
                                                  bn_momentum, bn_epsilon))
    def forward(self, x):
        for blc in self.blocks:
            x = blc(x)
        return x

In [64]:
x = torch.randn(20, 192, 7, 7)
expand = MBConvBlc(in_channel=192, 
                   out_channel=320, 
                   expand_ratio=6, 
                   stride=1, 
                   kernel=3,
                   se_ratio=1/24,
                   skip_probability=0.2,
                   bn_momentum=0.99, 
                   bn_epsilon=0.001,
                   n_repeat=1)
expand_output = expand(x)
expand_output.shape

torch.Size([20, 320, 7, 7])

In [26]:
x = torch.randn(20, 32, 112, 112)
expand = MBConvBasicBlc(in_channel=32, 
                        out_channel=16, 
                        expand_ratio=1, 
                        stride=1, 
                        kernel=3)
expand_output = expand(x)
expand_output.shape

torch.Size([20, 16, 112, 112])

In [None]:
(20, 96, 112, 112)

In [None]:
(20, 96, 56, 56)

In [None]:
(20, 24, 56, 56)

In [16]:
depth_wise = nn.Conv2d(144, 144, 5, stride = 2, padding=2, groups=144)
output = depth_wise(expand_output)
output.shape

torch.Size([20, 144, 28, 28])

In [13]:
real_out = nn.Conv2d(96, 24, 1)
real_output = real_out(output)
real_output.shape

torch.Size([20, 24, 56, 56])

In [54]:
depth_wise.weight.shape

torch.Size([96, 1, 3, 3])

In [17]:
import math

In [18]:
math.ceil((5-2)/2)

2

In [40]:
0 < 0.25 <= 1

True

In [41]:
a = F.adaptive_avg_pool2d(output, 1)

In [42]:
a.shape

torch.Size([20, 96, 1, 1])

In [74]:
def get_same_padding_conv2d(image_size=None):
    """ Chooses static padding if you have specified an image size, and dynamic padding otherwise.
        Static padding is necessary for ONNX exporting of models. """
    if image_size is None:
        return Conv2dDynamicSamePadding
    else:
        return partial(Conv2dStaticSamePadding, image_size=image_size)


class Conv2dDynamicSamePadding(nn.Conv2d):
    """ 2D Convolutions like TensorFlow, for a dynamic image size """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
        super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

    def forward(self, x):
        ih, iw = x.size()[-2:]
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)


class Conv2dStaticSamePadding(nn.Conv2d):
    """ 2D Convolutions like TensorFlow, for a fixed image size"""

    def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs):
        super().__init__(in_channels, out_channels, kernel_size, **kwargs)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

        # Calculate padding based on image size and save it
        assert image_size is not None
        ih, iw = image_size if type(image_size) == list else [image_size, image_size]
        print(ih, iw)
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
        print(oh, ow)
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        print(pad_h, pad_w)
        if pad_h > 0 or pad_w > 0:
            self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
        else:
            self.static_padding = Identity()

    def forward(self, x):
        x = self.static_padding(x)
        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
        return x

In [75]:
global_params.image_size

224

In [76]:
Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)

In [79]:
depthwise_conv = Conv2d(
            in_channels=144, out_channels=144, groups=144,  # groups makes it depthwise
            kernel_size=3, stride=2, bias=False)

224 224
112 112
1 1


In [80]:
output = depthwise_conv(expand_output)
output.shape

torch.Size([20, 144, 28, 28])

In [64]:
depthwise_conv

Conv2dStaticSamePadding(
  96, 96, kernel_size=(3, 3), stride=(2, 2), groups=96, bias=False
  (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
)