In [1]:
def num_params(n,m,l,k):
    '''Number of parameters conv2d_2
        n,m = shape of kernel
        l = number of inputs
        k = number of outputs
        num_param = (n*m*l+1)*k'''
    print(f'(Kernel=({n}x{m}) * num_in={l} + 1)) * num_out={k} = {(n*m*l+1)*k}')
    return 

In [2]:
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
tf.__version__

'2.7.0'

In [3]:
# create model
model = Sequential()
model.add(Conv2D(512, (3,3), padding='same', 
                 activation='relu', input_shape=(256, 256, 3)))
# summarize model
num_params(3,3,3,512)
model.summary()

(Kernel=(3x3) * num_in=3 + 1)) * num_out=512 = 14336
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 256, 512)     14336     
                                                                 
Total params: 14,336
Trainable params: 14,336
Non-trainable params: 0
_________________________________________________________________


In [4]:
Conv2D(512, (3,3), padding='same', activation='relu', input_shape=(256, 256, 3))

<keras.layers.convolutional.Conv2D at 0x7f6939563c90>

#### 1x1, number of channels in equals number of channels out

Size of feature map doesn't cange

In [5]:
model.add(Conv2D(512, (1,1), activation='relu'))
num_params(1,1,512,512)
model.summary()

(Kernel=(1x1) * num_in=512 + 1)) * num_out=512 = 262656
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 256, 512)     14336     
                                                                 
 conv2d_2 (Conv2D)           (None, 256, 256, 512)     262656    
                                                                 
Total params: 276,992
Trainable params: 276,992
Non-trainable params: 0
_________________________________________________________________


#### 1x1, number of channels decreases

In [6]:
model.add(Conv2D(64, (1,1), activation='relu'))
num_params(1,1,512,64)
model.summary()

(Kernel=(1x1) * num_in=512 + 1)) * num_out=64 = 32832
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 256, 512)     14336     
                                                                 
 conv2d_2 (Conv2D)           (None, 256, 256, 512)     262656    
                                                                 
 conv2d_3 (Conv2D)           (None, 256, 256, 64)      32832     
                                                                 
Total params: 309,824
Trainable params: 309,824
Non-trainable params: 0
_________________________________________________________________


#### 1x1, number of channels increases

In [7]:
model.add(Conv2D(512, (1,1), activation='relu'))
# summarize model
num_params(1,1,64,512)
model.summary()

(Kernel=(1x1) * num_in=64 + 1)) * num_out=512 = 33280
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 256, 512)     14336     
                                                                 
 conv2d_2 (Conv2D)           (None, 256, 256, 512)     262656    
                                                                 
 conv2d_3 (Conv2D)           (None, 256, 256, 64)      32832     
                                                                 
 conv2d_4 (Conv2D)           (None, 256, 256, 512)     33280     
                                                                 
Total params: 343,104
Trainable params: 343,104
Non-trainable params: 0
_________________________________________________________________


####  Use 1x1 convolution to reduce number of channels before applying larger convolution

input (256 depth) -> 1x1 convolution (64 depth) -> 4x4 convolution (256 depth)

input (256 depth) -> 4x4 convolution (256 depth)

Bottom ~3.7 times slower

## ResNet Model

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

torch.__version__

'1.10.0+cu111'

In [9]:
import sys
try:
    from pytorch_model_summary import summary
except:
    if 'google.colab' in sys.modules:
        !pip install pytorch-model-summary
    else:
        !conda install -c conda-forge pytorch-model-summary
    from pytorch_model_summary import summary

Collecting pytorch-model-summary
  Downloading pytorch_model_summary-0.1.2-py3-none-any.whl (9.3 kB)
Installing collected packages: pytorch-model-summary
Successfully installed pytorch-model-summary-0.1.2


In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [12]:
class Residual(nn.Module):
  
  def __init__(self,input_channels, num_channels, use_1x1conv=False, strides=1, **kwargs):
    super(Residual, self).__init__(**kwargs)
    self.conv1 = nn.Conv2d(input_channels, num_channels,kernel_size=3, padding=1, stride=strides)
    self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
    if use_1x1conv:
      self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=strides)
    else:
      self.conv3 = None
    self.bn1 = nn.BatchNorm2d(num_channels)
    self.bn2 = nn.BatchNorm2d(num_channels)
    self.relu = nn.ReLU(inplace=True)
  
  def forward(self, X):
    Y = self.relu(self.bn1(self.conv1(X)))
    Y = self.bn2(self.conv2(Y))
    if self.conv3:
      X = self.conv3(X)
    Y += X
    Y = self.relu(Y)
    return Y

In [13]:
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
  blk = []
  for i in range(num_residuals):
    if i == 0 and not first_block:
      blk.append(Residual(input_channels, num_channels, use_1x1conv=True, strides=2))
    else:
      blk.append(Residual(num_channels, num_channels))
  return blk

In [14]:
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                    nn.BatchNorm2d(64),
                    nn.ReLU(),
                    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

In [15]:
b2=nn.Sequential(*resnet_block(64,64,2,first_block=True))
b3=nn.Sequential(*resnet_block(64,128,2))
b4=nn.Sequential(*resnet_block(128,256,2))
b5=nn.Sequential(*resnet_block(256,512,2))
net=nn.Sequential(b1,
                  b2,b3,b4,b5,
                  nn.AdaptiveMaxPool2d((1,1)),
                  nn.Flatten(),
                  nn.Linear(512, 10))
net.to(device)

Sequential(
  (0): Sequential(
    (0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (1): Sequential(
    (0): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (1): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   

In [16]:
print(summary(net,torch.zeros((1,1,28,28)).to(device))) # show_input=True

----------------------------------------------------------------------------
           Layer (type)        Output Shape         Param #     Tr. Param #
               Conv2d-1     [1, 64, 14, 14]           3,200           3,200
          BatchNorm2d-2     [1, 64, 14, 14]             128             128
                 ReLU-3     [1, 64, 14, 14]               0               0
            MaxPool2d-4       [1, 64, 7, 7]               0               0
             Residual-5       [1, 64, 7, 7]          74,112          74,112
             Residual-6       [1, 64, 7, 7]          74,112          74,112
             Residual-7      [1, 128, 4, 4]         230,272         230,272
             Residual-8      [1, 128, 4, 4]         295,680         295,680
             Residual-9      [1, 256, 2, 2]         919,296         919,296
            Residual-10      [1, 256, 2, 2]       1,181,184       1,181,184
            Residual-11      [1, 512, 1, 1]       3,673,600       3,673,600
           

In [17]:
def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)
        
def evaluate_accuracy(data_iter, net, device):
    """Evaluate accuracy of a model"""
    net.eval()  # Switch to evaluation mode for Dropout, BatchNorm etc layers.
    acc_sum, n = torch.tensor([0], dtype=torch.float32, device=device), 0
    for X, y in data_iter:
        # Copy the data to device.
        X, y = X.to(device), y.to(device)
        with torch.no_grad():
            y = y.long()
            acc_sum += torch.sum((torch.argmax(net(X), dim=1) == y))
            n += y.shape[0]
    return acc_sum.item()/n

import time
def train_resnet(net, train_iter, test_iter, num_epochs, batch_size, device, lr=None):
    print('training on', device)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        net.train() # Switch to training mode
        n, start = 0, time.time()
        train_l_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        train_acc_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        for X, y in train_iter:
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device) 
            y_hat = net(X) # Forward
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                y = y.long()
                train_l_sum += loss.float()
                train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == y))).float()
                n += y.shape[0]

        test_acc = evaluate_accuracy(test_iter, net, device) 
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'\
            % (epoch + 1, train_l_sum/n, train_acc_sum/n, test_acc, time.time() - start))


In [18]:
train_dataset = torchvision.datasets.FashionMNIST(
    root='.',
    train=True,
    transform=transforms.ToTensor(),
    download=True)
test_dataset = torchvision.datasets.FashionMNIST(
    root='.',
    train=False,
    transform=transforms.ToTensor(),
    download=True)

batch_size = 256
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./FashionMNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/26421880 [00:00<?, ?it/s]

Extracting ./FashionMNIST/raw/train-images-idx3-ubyte.gz to ./FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./FashionMNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/29515 [00:00<?, ?it/s]

Extracting ./FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./FashionMNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/4422102 [00:00<?, ?it/s]

Extracting ./FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/5148 [00:00<?, ?it/s]

Extracting ./FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./FashionMNIST/raw



In [19]:
lr, num_epochs, batch_size = 0.05, 5, 256
net.apply(init_weights)
train_resnet(net, train_loader, test_loader, num_epochs, batch_size, device, lr)

training on cuda:0
epoch 1, loss 0.0073, train acc 0.621, test acc 0.780, time 27.6 sec
epoch 2, loss 0.0019, train acc 0.828, test acc 0.782, time 27.4 sec
epoch 3, loss 0.0015, train acc 0.855, test acc 0.845, time 27.5 sec
epoch 4, loss 0.0014, train acc 0.868, test acc 0.821, time 27.6 sec
epoch 5, loss 0.0013, train acc 0.877, test acc 0.839, time 27.5 sec
