In [None]:
# EfficientNet does not introduce a new architecture; instead it scales a baseline model using a method called compound scaling.
# Depth (number of layers), width (number of channels) and input image resolution are scaled by the factors alpha^phi, beta^phi and gamma^phi respectively,
# subject to the constraints alpha * (beta^2) * (gamma^2) = 2 (approximately) and alpha, beta, gamma >= 1.
# alpha, beta and gamma are constants determined by a grid search under these constraints.
# Several techniques are used in the model, including: mobile inverted residual layers, squeeze-and-excitation optimization, stochastic depth (with a survival probability), and MobileNet-style depthwise convolution.
# The model is a little complex.



In [38]:
import torch.nn as nn
import torch
from math import ceil
from torchsummary import summary

In [27]:
# Baseline stage configuration, with reference to the EfficientNet paper.
# One row per stage: [expand_ratio, channels, repeats, stride, kernel_size]
base_model = [
    [1, 16, 1, 1, 3],
    [6, 24, 2, 2, 3],
    [6, 40, 2, 2, 5],
    [6, 80, 3, 2, 3],
    [6, 112, 3, 1, 5],
    [6, 192, 4, 2, 5],
    [6, 320, 1, 1, 3],
]

# Per-version settings: version -> (phi_value, resolution, drop_rate)
#   phi_value  : exponent used by EfficientNet.calculate_factors
#                (depth factor = alpha**phi, width factor = beta**phi)
#   resolution : side length of the square input image used by test()
#   drop_rate  : probability passed to nn.Dropout in the classifier
phi_values = {
    "b0": (0, 224, 0.2),
    "b1": (0.5, 240, 0.2),
    "b2": (1, 260, 0.3),
    "b3": (2, 300, 0.3),
    "b4": (3, 380, 0.4),
    "b5": (4, 456, 0.4),
    "b6": (5, 528, 0.5),
    "b7": (6, 600, 0.5),
}



In [28]:
class CNNBlock(nn.Module):
  """Conv2d followed by BatchNorm2d and a SiLU activation.

  Args:
    in_channels: channels of the input tensor.
    out_channels: channels produced by the convolution.
    kernel_size: convolution kernel size.
    stride: convolution stride.
    padding: convolution padding.
    groups: convolution groups. 1 gives a normal convolution;
      ``groups == in_channels`` gives a depthwise convolution.
  """

  def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1):
    super().__init__()
    # The convolution carries no bias term of its own (bias=False).
    self.cnn = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        groups=groups,
        bias=False,
    )
    self.bn = nn.BatchNorm2d(out_channels)
    self.silu = nn.SiLU()

  def forward(self, x):
    """Apply convolution, then batch norm, then SiLU."""
    convolved = self.cnn(x)
    normalised = self.bn(convolved)
    return self.silu(normalised)



In [29]:
class SqueezeExcitation(nn.Module):
  """Channel gating: rescales each input channel by a learned weight in (0, 1).

  Args:
    in_channels: channels of the input (and of the produced gate).
    reduced_dim: channels of the intermediate bottleneck.
  """

  def __init__(self, in_channels, reduced_dim):
    super().__init__()
    squeeze = nn.AdaptiveAvgPool2d(1)  # C x H x W -> C x 1 x 1
    reduce = nn.Conv2d(in_channels, reduced_dim, 1)
    restore = nn.Conv2d(reduced_dim, in_channels, 1)
    self.se = nn.Sequential(squeeze, reduce, nn.SiLU(), restore, nn.Sigmoid())

  def forward(self, x):
    """Multiply ``x`` by its per-channel gate (broadcast over H and W)."""
    gate = self.se(x)
    return x * gate

In [34]:
class InvertedResidualBlock(nn.Module):
  """Inverted residual block with squeeze-excitation and stochastic depth.

  The block optionally expands the input to ``in_channels * expand_ratio``
  channels, applies a depthwise convolution and squeeze-excitation, then
  projects to ``out_channels`` with a 1x1 convolution and batch norm. When the
  input and output shapes match (same channels, stride 1) the input is added
  back as a residual connection.

  Args:
    in_channels: channels of the block input.
    out_channels: channels produced by the final 1x1 projection.
    kernel_size: kernel size of the depthwise convolution.
    stride: stride of the depthwise convolution.
    padding: padding of the depthwise convolution.
    expand_ratio: the hidden width is ``in_channels * expand_ratio``; a ratio
      of 1 skips the expansion convolution.
    reduction: divisor applied to ``in_channels`` to size the
      squeeze-excitation bottleneck (clamped to at least 1 channel).
    survival_prob: probability of keeping the residual branch for a sample
      while training (stochastic depth). Must lie in (0, 1].

  Raises:
    ValueError: if ``survival_prob`` is not in (0, 1].
  """

  def __init__(self, in_channels, out_channels, kernel_size, stride, padding, expand_ratio, reduction=4, survival_prob=0.8):
    super(InvertedResidualBlock, self).__init__()

    # survival_prob is used as a divisor in stochastic_depth, so 0 would
    # produce NaN and values above 1 would silently scale the branch.
    if not 0.0 < survival_prob <= 1.0:
      raise ValueError(f"survival_prob must be in (0, 1], got {survival_prob}")

    # Honour the constructor argument (previously hard-coded to 0.8).
    self.survival_prob = survival_prob
    # Skip connections can only be used when input and output shapes match.
    self.use_residual = in_channels == out_channels and stride == 1
    hidden_dim = in_channels * expand_ratio
    self.expand = in_channels != hidden_dim
    # Never let the squeeze-excitation bottleneck collapse to zero channels.
    reduced_dim = max(1, in_channels // reduction)

    if self.expand:
      # NOTE(review): MBConv implementations commonly use a 1x1 expansion conv;
      # the 3x3 kernel is kept here to preserve existing behaviour -- confirm.
      self.expand_conv = CNNBlock(in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)

    self.conv = nn.Sequential(
        # groups == hidden_dim makes this a depthwise convolution
        CNNBlock(hidden_dim, hidden_dim, kernel_size, stride, padding, groups=hidden_dim),
        SqueezeExcitation(hidden_dim, reduced_dim),
        # 1x1 projection to the requested number of output channels
        nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
        nn.BatchNorm2d(out_channels),
    )

  def stochastic_depth(self, x):
    """Randomly drop whole samples of ``x`` while training.

    In eval mode ``x`` is returned unchanged. In training mode each sample in
    the batch is kept with probability ``survival_prob`` and zeroed otherwise;
    ``x`` is divided by ``survival_prob`` before masking.
    """
    if not self.training:
      return x

    # One keep/drop decision per sample, broadcast over channels and space.
    binary_tensor = torch.rand(x.shape[0], 1, 1, 1, device=x.device) < self.survival_prob
    return torch.div(x, self.survival_prob) * binary_tensor

  def forward(self, input):
    """Run the block, adding the residual connection when shapes allow it."""
    x = self.expand_conv(input) if self.expand else input

    if self.use_residual:
      # Residual connection: stochastic depth is applied to the branch only.
      return self.stochastic_depth(self.conv(x)) + input

    return self.conv(x)


In [35]:
class EfficientNet(nn.Module):
  """EfficientNet classifier assembled from ``base_model`` and ``phi_values``.

  Args:
    version: key into ``phi_values`` (e.g. ``"b0"``) selecting the scaling
      exponent and dropout rate.
    num_classes: size of the final linear layer's output.
  """

  def __init__(self, version, num_classes):
    super().__init__()
    width_mult, depth_mult, drop_p = self.calculate_factors(version)
    head_channels = ceil(1280 * width_mult)

    self.pool = nn.AdaptiveAvgPool2d(1)
    self.features = self.create_features(width_mult, depth_mult, head_channels)
    self.classifier = nn.Sequential(
        nn.Dropout(drop_p),
        nn.Linear(head_channels, num_classes),
    )

  def calculate_factors(self, version, alpha=1.2, beta=1.1):
    """Return ``(width_factor, depth_factor, drop_rate)`` for ``version``.

    The depth factor is ``alpha ** phi`` and the width factor ``beta ** phi``,
    where ``phi`` comes from ``phi_values[version]``.
    """
    phi, _resolution, drop_rate = phi_values[version]
    return beta ** phi, alpha ** phi, drop_rate

  def create_features(self, width_factor, depth_factor, last_channels):
    """Build the convolutional trunk as a single ``nn.Sequential``.

    The trunk is a stride-2 stem, then the scaled stages listed in
    ``base_model``, then a 1x1 ``CNNBlock`` producing ``last_channels``.
    """
    stem_channels = int(32 * width_factor)
    blocks = [CNNBlock(3, stem_channels, 3, stride=2, padding=1)]
    prev_channels = stem_channels

    for expand_ratio, base_channels, num_repeats, first_stride, kernel_size in base_model:
      # Scale the stage width, then round up to a multiple of 4.
      stage_channels = 4 * ceil(int(base_channels * width_factor) / 4)

      for repeat_idx in range(ceil(num_repeats * depth_factor)):
        block = InvertedResidualBlock(
            prev_channels,
            stage_channels,
            expand_ratio=expand_ratio,
            # only the first block of a stage uses the stage's stride
            stride=first_stride if repeat_idx == 0 else 1,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        blocks.append(block)
        prev_channels = stage_channels

    blocks.append(CNNBlock(prev_channels, last_channels, kernel_size=1, stride=1, padding=0))
    return nn.Sequential(*blocks)

  def forward(self, x):
    """Extract features, pool to 1x1, flatten and classify."""
    pooled = self.pool(self.features(x))
    flat = pooled.view(pooled.shape[0], -1)
    return self.classifier(flat)





In [41]:
def test():
  """Smoke test: build one EfficientNet version, run a batch and summarise it.

  Prints the shape of the model output for a random batch, then the
  torchsummary layer table. The summary uses the same device and input
  resolution as the forward pass, so changing ``version`` keeps both in sync.
  """
  device = "cuda" if torch.cuda.is_available() else "cpu"
  version = "b0"
  _, res, _ = phi_values[version]  # only the input resolution is needed here
  num_examples, num_classes = 4, 10
  x = torch.rand((num_examples, 3, res, res)).to(device)
  model = EfficientNet(
      version = version,
      num_classes = num_classes
  ).to(device)

  print(model(x).shape) # (num_examples, num_classes)
  # summary() is expected to print its table itself, so its return value is
  # not printed -- TODO confirm for the installed torchsummary version.
  summary(model=model, input_size=(3, res, res), device=device)

In [42]:
# Run the smoke test: prints the model's output shape followed by the layer summary.
test()

torch.Size([4, 10])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]             864
       BatchNorm2d-2         [-1, 32, 112, 112]              64
              SiLU-3         [-1, 32, 112, 112]               0
          CNNBlock-4         [-1, 32, 112, 112]               0
            Conv2d-5         [-1, 32, 112, 112]             288
       BatchNorm2d-6         [-1, 32, 112, 112]              64
              SiLU-7         [-1, 32, 112, 112]               0
          CNNBlock-8         [-1, 32, 112, 112]               0
 AdaptiveAvgPool2d-9             [-1, 32, 1, 1]               0
           Conv2d-10              [-1, 8, 1, 1]             264
             SiLU-11              [-1, 8, 1, 1]               0
           Conv2d-12             [-1, 32, 1, 1]             288
          Sigmoid-13             [-1, 32, 1, 1]               0
SqueezeExcitation-1