# Different model architectures
We want to compare different architectures for a CNN model.
While keeping the number of learnable params (and maybe computation cost) constant across models,
we try to vary their structure between rather flat and deeper models.

In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from prettytable import PrettyTable

# Utility
Function to count learnable params of torch model

In [10]:
def count_parameters(model):
    """Print a per-tensor table of trainable parameters and return their total.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. a torch module).
            Only parameters with ``requires_grad`` set are counted.

    Returns:
        The total number of trainable parameter elements.
    """
    summary = PrettyTable(["Modules", "Parameters"])

    # Collect (name, element count) for every parameter that is trained.
    trainable_sizes = [
        (label, tensor.numel())
        for label, tensor in model.named_parameters()
        if tensor.requires_grad
    ]
    for label, size in trainable_sizes:
        summary.add_row([label, size])

    grand_total = sum(size for _, size in trainable_sizes)
    print(summary)
    print(f"Total Trainable Params: {grand_total}")
    return grand_total

# Model 1 - Standard
As a starting point we build the 18 layer plain model from *Deep Residual Learning for Image Recognition*.
There the input images have size 224x224 (112x112 after the first convolution) and the subsequent convolutions are done on feature maps of sizes 56, 28, 14 and 7.

Here we have input images of size 32x32 and do convolutions on feature maps of sizes 32, 16, 8 and 4 in blocks Conv2, Conv3, Conv4 and Conv5, respectively.

In [23]:
class Plain18Layer(nn.Module):
    """Plain (non-residual) convolutional stack used as the baseline model.

    The stack consists of one preparatory convolution (Conv1) followed by four
    blocks (Conv2..Conv5) of four 3x3 convolutions each, with 2x2 max pooling
    between the blocks.  All convolutions are bias-free and followed by ReLU.
    The spatial sizes noted in the comments assume 3-channel 32x32 inputs.

    Inheriting from ``nn.Module`` makes instances callable and gives access to
    the usual module API (``parameters()``, ``to()``, ``state_dict()``, ...).
    The layers remain available as the ``network`` attribute.
    """

    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            # Conv1: Prepare by mapping to 8 feature maps
            nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=False),      # 3*8*3*3 = 216
            nn.ReLU(),

            # Conv2:                                                      Learnable params
            nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=False),     # 8*16*3*3 = 1152
            nn.ReLU(),
            nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=False),    # 16*16*3*3 = 2304
            nn.ReLU(),
            nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=False),    # 16*16*3*3 = 2304
            nn.ReLU(),
            nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=False),    # 16*16*3*3 = 2304
            nn.ReLU(),                                                  # --------------------
                                                                        # conv2 total = 8064

            nn.MaxPool2d(2, 2),  # output: 16 x 16 x 16

            # Conv3:
            nn.Conv2d(16, 32, kernel_size=3, padding=1, bias=False),    # 16*32*3*3 = 4608
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),                                                  # --------------------
                                                                        # conv3 total = 32256

            nn.MaxPool2d(2, 2),  # output: 32 x 8 x 8

            # Conv4:
            nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=False),    # 32*64*3*3 = 18432
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),    # 64*64*3*3 = 36864
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),    # 64*64*3*3 = 36864
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),    # 64*64*3*3 = 36864
            nn.ReLU(),                                                  # --------------------
                                                                        # conv4 total = 129024

            nn.MaxPool2d(2, 2),  # output: 64 x 4 x 4

            # Conv5:
            nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False),   # 64*128*3*3 = 73728
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),  # 128*128*3*3 = 147456
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),  # 128*128*3*3 = 147456
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),  # 128*128*3*3 = 147456
            nn.ReLU(),                                                  # --------------------
                                                                        # conv5 total = 516096
        )
        # Classifier head, currently disabled so that only the convolutional
        # layers are compared.  To enable it, append to the Sequential above:
        #     nn.Flatten(),
        #     nn.Linear(128*4*4, 1024),
        #     nn.ReLU(),
        #     nn.Linear(1024, 512),
        #     nn.ReLU(),
        #     nn.Linear(512, 10)

    def forward(self, xb):
        """Pass the batch ``xb`` through the convolutional stack and return the result."""
        return self.network(xb)

In [24]:
# Build the standard model and tabulate the trainable parameters of its
# convolutional stack (the `network` attribute).
plain_18_layer_net = Plain18Layer().network
count_parameters(plain_18_layer_net)

+-----------+------------+
|  Modules  | Parameters |
+-----------+------------+
|  0.weight |    216     |
|  2.weight |    1152    |
|  4.weight |    2304    |
|  6.weight |    2304    |
|  8.weight |    2304    |
| 11.weight |    4608    |
| 13.weight |    9216    |
| 15.weight |    9216    |
| 17.weight |    9216    |
| 20.weight |   18432    |
| 22.weight |   36864    |
| 24.weight |   36864    |
| 26.weight |   36864    |
| 29.weight |   73728    |
| 31.weight |   147456   |
| 33.weight |   147456   |
| 35.weight |   147456   |
+-----------+------------+
Total Trainable Params: 685656


685656

# Model 2 - Deeper
In this model we skip going from 32 to 64 feature maps. Instead we stay with 32 maps for more convolutions and then directly go from 32 to 128.

In order to keep the models comparable we keep the number of trainable parameters constant. Each block of convolutions from the standard model (Conv2, ..., Conv5) is replaced by a group of convolutions chosen so that the number of learnable parameters within the block is the same as in the standard model.
Conv4 now operates on 32 instead of 64 feature maps, therefore we can introduce more convolutions in this block while keeping parameter count the same.
The two convolutions of Conv4_a together have as many learnable parameters as the first convolution of Conv4 in the standard model.
The same holds for Conv4_b, Conv4_c and Conv4_d and the second, third and fourth convolution of Conv4 in the standard model, respectively.

As the first convolution of Conv5 now goes from 32 to 128 feature maps instead of from 64 to 128, it has only half the parameters. We introduce Conv5_a before the 32 -> 128 step to make up for the missing parameters.

In [27]:
class Plain28Layer(nn.Module):
    """Deeper plain (non-residual) convolutional stack with the same parameter budget.

    Compared to the standard model, the Conv4 block stays at 32 feature maps
    and is split into sub-blocks Conv4_a..Conv4_d; Conv5 is split into Conv5_a
    (32 -> 32) and Conv5_b (32 -> 128).  Each block holds the same number of
    learnable parameters as its counterpart in the standard model.  All
    convolutions are bias-free 3x3 convolutions followed by ReLU.  The spatial
    sizes noted in the comments assume 3-channel 32x32 inputs.

    Note: the stack below contains 31 convolutions in total; the class name is
    kept unchanged for compatibility with existing code.

    Inheriting from ``nn.Module`` makes instances callable and gives access to
    the usual module API (``parameters()``, ``to()``, ``state_dict()``, ...).
    The layers remain available as the ``network`` attribute.
    """

    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            # Conv1: Prepare by mapping to 8 feature maps
            nn.Conv2d(3, 8, kernel_size=3, padding=1, bias=False),      # 3*8*3*3 = 216
            nn.ReLU(),

            # Conv2:                                                      Learnable params
            nn.Conv2d(8, 16, kernel_size=3, padding=1, bias=False),     # 8*16*3*3 = 1152
            nn.ReLU(),
            nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=False),    # 16*16*3*3 = 2304
            nn.ReLU(),
            nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=False),    # 16*16*3*3 = 2304
            nn.ReLU(),
            nn.Conv2d(16, 16, kernel_size=3, padding=1, bias=False),    # 16*16*3*3 = 2304
            nn.ReLU(),                                                  # --------------------
                                                                        # conv2 total = 8064

            nn.MaxPool2d(2, 2),  # output: 16 x 16 x 16

            # Conv3:
            nn.Conv2d(16, 32, kernel_size=3, padding=1, bias=False),    # 16*32*3*3 = 4608
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),                                                  # --------------------
                                                                        # conv3 total = 32256

            nn.MaxPool2d(2, 2),  # output: 32 x 8 x 8

            # Conv4_a:
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),                                                  # --------------------
                                                                        # conv4_a total = 18432

            # Conv4_b:
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),                                                  # --------------------
                                                                        # conv4_b total = 36864

            # Conv4_c:
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),                                                  # --------------------
                                                                        # conv4_c total = 36864

            # Conv4_d:
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),                                                  # --------------------
                                                                        # conv4_d total = 36864
                                                                        # =====================
                                                                        # conv4 total = 36864 + 36864 + 36864 + 18432 = 129024

            nn.MaxPool2d(2, 2),  # output: 32 x 4 x 4

            # Conv5_a:
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1, bias=False),    # 32*32*3*3 = 9216
            nn.ReLU(),                                                  # --------------------
                                                                        # conv5_a total = 36864

            # Conv5_b:
            nn.Conv2d(32, 128, kernel_size=3, padding=1, bias=False),   # 32*128*3*3 = 36864
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),  # 128*128*3*3 = 147456
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),  # 128*128*3*3 = 147456
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),  # 128*128*3*3 = 147456
            nn.ReLU(),                                                  # --------------------
                                                                        # conv5_b total = 479232
                                                                        # conv5 total = 36864 + 479232 = 516096
        )
        # Classifier head, currently disabled so that only the convolutional
        # layers are compared.  To enable it, append to the Sequential above:
        #     nn.Flatten(),
        #     nn.Linear(128*4*4, 1024),
        #     nn.ReLU(),
        #     nn.Linear(1024, 512),
        #     nn.ReLU(),
        #     nn.Linear(512, 10)

    def forward(self, xb):
        """Pass the batch ``xb`` through the convolutional stack and return the result."""
        return self.network(xb)

In [26]:
# Build the deeper model and tabulate the trainable parameters of its
# convolutional stack (the `network` attribute).
plain_28_layer_net = Plain28Layer().network
count_parameters(plain_28_layer_net)

+-----------+------------+
|  Modules  | Parameters |
+-----------+------------+
|  0.weight |    216     |
|  2.weight |    1152    |
|  4.weight |    2304    |
|  6.weight |    2304    |
|  8.weight |    2304    |
| 11.weight |    4608    |
| 13.weight |    9216    |
| 15.weight |    9216    |
| 17.weight |    9216    |
| 20.weight |    9216    |
| 22.weight |    9216    |
| 24.weight |    9216    |
| 26.weight |    9216    |
| 28.weight |    9216    |
| 30.weight |    9216    |
| 32.weight |    9216    |
| 34.weight |    9216    |
| 36.weight |    9216    |
| 38.weight |    9216    |
| 40.weight |    9216    |
| 42.weight |    9216    |
| 44.weight |    9216    |
| 46.weight |    9216    |
| 49.weight |    9216    |
| 51.weight |    9216    |
| 53.weight |    9216    |
| 55.weight |    9216    |
| 57.weight |   36864    |
| 59.weight |   147456   |
| 61.weight |   147456   |
| 63.weight |   147456   |
+-----------+------------+
Total Trainable Params: 685656


685656

=> The convolutional layers of both models have 685,656 learnable parameters.