<a href="https://colab.research.google.com/github/DemoySegment/dl-miniproject-resnet/blob/main/dl_miniproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import torch
import torch.nn as nn
import torchsummary
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as functional
import torchvision.models as models
!pip install torchinfo
from torchinfo import summary
import torch.nn.functional as F
from torch.utils.data import ConcatDataset


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
# output_size = (input_size + 2*padding - kernel)/stride + 1 
class BuildingBlock(nn.Module):
   
    def __init__(self, in_channels, intermediate_channels, identity_downsample=None, kernel_size=3, stride=1, expansion=1):
      """
      This class is for building a resnet block. In each block various 
      convolution layers will be connected to each other with a batctnorm layer and a relu activation between.
      Skip connection will be built between the input of the first layer and the input of the last layer, 
      that is to add the input of the block to the output of the last batctnorm layer.
      Size of the two inputs of skip connection should be pay attention to.

      :param in_channels: the number of input channels of the whole block. Since block will repeat several times, 
      lets say a block with with a input channels of 64 and output channels of 128, the next time going
      through the block need a input channels of 128.

      :param intermediate_channels: the number of output channels of conv layers in the block. 
      Since channels always expand, the output channels of the block will be the expansion * intermediate_channels.

      :param identity_downsample: a model to deal with skip connection problem. this model should have a conv layer and
      a batchnorm layer. In the next iteration of same block, the input channels may not be consist with the output of the 
      last batchnorm output, therefore we need the parameter to help change x's channels.
      :type identity_downsample: nn.Module

      :paran stride: if stride>1 for one conv layer in each same block in iteration, then the size of the images will be 
      decreased for block_num times, which is not what we want. Therefore, for iterations of the the same block, only one layer
      in one of the block will have a stride that reduce the size of the image.
      """

      super(BuildingBlock, self).__init__()
      
      #expansion rate, the output channels of the block will be the expansion * intermediate_channels
      self.expansion = expansion
      self.conv1 = nn.Conv2d(
          in_channels,
          intermediate_channels,
          kernel_size=kernel_size,
          stride=stride,
          padding=(int)((kernel_size-1)/2),
          bias=False,
      )
      self.bn1 = nn.BatchNorm2d(intermediate_channels)
      self.conv2 = nn.Conv2d(
          intermediate_channels,
          intermediate_channels * self.expansion,
          kernel_size=kernel_size,
          stride=1,
          padding=(int)((kernel_size-1)/2),
          bias=False,
      )
      self.bn2 = nn.BatchNorm2d(intermediate_channels * self.expansion)
      self.relu = nn.ReLU()
      self.identity_downsample = identity_downsample
      self.stride = stride
      self.in_channels = in_channels
      self.intermediate_channels = intermediate_channels

    def forward(self, x):
        identity = x.clone()

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x += identity
        x = self.relu(x)
        return x


class ResNet(nn.Module):
    def __init__(self, block, layerNums, image_channels, start_channels, num_classes):
        super(ResNet, self).__init__()
        # head layers
        self.in_channels = start_channels
        self.conv1 = nn.Conv2d(
            image_channels, self.in_channels, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # recursion block layers
        # Essentially the entire ResNet architecture are in these 4 lines below
        self.layer1, self.in_channels = self._make_block(
            BuildingBlock, layerNums[0], intermediate_channels=32, in_channels=self.in_channels, stride=1, kernel_size=5
        )
        self.layer2, self.in_channels = self._make_block(
            BuildingBlock, layerNums[1], intermediate_channels=64, in_channels=self.in_channels, stride=2, kernel_size=5
        )
        self.layer3, self.in_channels = self._make_block(
            BuildingBlock, layerNums[2], intermediate_channels=128, in_channels=self.in_channels, stride=2, kernel_size=5
        )
        self.layer4, self.in_channels = self._make_block(
            BuildingBlock, layerNums[3], intermediate_channels=256, in_channels=self.in_channels, stride=2, kernel_size=3
        )


        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

        self.layerNums = layerNums

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        #x = self.maxpool(x)
        
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)
        #x = functional.softmax(x, dim=0)
        return x

    # for resnet18, expansion === 1
    def _make_block(self, block, num_layers, intermediate_channels, in_channels, stride, expansion=1, kernel_size=3):
        identity_downsample = None
        layers = []

        # Either if we half the input space for ex, 56x56 -> 28x28 (stride=2), or channels changes
        # we need to adapt the Identity (skip connection) so it will be able to be added
        # to the layer that's ahead
        # it is used at the end of first iteration of each block
        if stride != 1 or in_channels != intermediate_channels*expansion:
          identity_downsample = nn.Sequential(
                  nn.Conv2d(
                      in_channels,
                      intermediate_channels*expansion,
                      kernel_size=1,
                      stride=stride,
                      bias=False,
                  ),
                  nn.BatchNorm2d(intermediate_channels*expansion),
              )

        layers.append(
            block(in_channels, intermediate_channels, stride=stride, identity_downsample=identity_downsample, expansion=expansion, kernel_size=kernel_size)
        )

       
        in_channels = intermediate_channels*expansion

        # For example for first resnet layer: 256 will be mapped to 64 as intermediate layer,
        # then finally back to 256. Hence no identity downsample is needed, since stride = 1,
        # and also same amount of channels.
        for i in range(num_layers - 1):
            layers.append(block(in_channels, intermediate_channels, expansion=expansion, kernel_size=kernel_size))
        
        return nn.Sequential(*layers), in_channels
    
    def to_string(self):
      print('current model status:')
      print('parameter numbers: {}'.format(sum(p.numel() for p in self.parameters() if p.requires_grad)))
      print('block numbers: {}'.format(self.layerNums))

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ResNet(BuildingBlock, [3,3,2,2], 3, 32, 10)
model = model.to(device)
print(model)


ResNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU()
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BuildingBlock(
      (conv1): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
    )
    (1): BuildingBlock(
      (conv1): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(5, 5), str

In [16]:
summary(model, input_size=(256, 3, 32, 32))

Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [256, 10]                 --
├─Conv2d: 1-1                            [256, 32, 32, 32]         864
├─BatchNorm2d: 1-2                       [256, 32, 32, 32]         64
├─ReLU: 1-3                              [256, 32, 32, 32]         --
├─Sequential: 1-4                        [256, 32, 32, 32]         --
│    └─BuildingBlock: 2-1                [256, 32, 32, 32]         --
│    │    └─Conv2d: 3-1                  [256, 32, 32, 32]         25,600
│    │    └─BatchNorm2d: 3-2             [256, 32, 32, 32]         64
│    │    └─ReLU: 3-3                    [256, 32, 32, 32]         --
│    │    └─Conv2d: 3-4                  [256, 32, 32, 32]         25,600
│    │    └─BatchNorm2d: 3-5             [256, 32, 32, 32]         64
│    │    └─ReLU: 3-6                    [256, 32, 32, 32]         --
│    └─BuildingBlock: 2-2                [256, 32, 32, 32]         --
│    │

In [17]:
def accuracy(y_pred, y):
  predict = torch.argmax(y_pred, dim=1)
  acc = torch.sum(predict == y) / y.shape[0]
  return acc
  #return y_pred.argmax(dim=1).eq(y).sum().item()

In [18]:
def evaluate(model, iterator, criterion):
    

    epoch_loss = 0
    epoch_acc = 0

    #set the model in evaluation mode
    model.eval()
    with torch.no_grad():
      for(x, y) in iterator:
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        loss = criterion(y_pred, y)
        acc = accuracy(y_pred, y)
        
        epoch_loss += loss
        epoch_acc += acc
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [19]:
def train(model, iterator, optimizer, criterion, scheduler):
    
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training mode
    model.train()

    for(x, y) in iterator:
      x = x.to(device)
      y = y.to(device)

      y_pred = model(x)
      
      loss = criterion(y_pred, y)
      acc = accuracy(y_pred, y)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      if scheduler is not None:
        scheduler.step()
      
      epoch_loss += loss
      epoch_acc += acc
      

        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def run_epoches(epoch_num, model, optimizer, criterion, trainloader, testloader, scheduler=None):
  best_valid_acc = float(0)
  print("start running")
  for epoch in range(N_EPOCHS):
      print(' --Epoch {}'.format(epoch))
      print(" --start training--")
      train_loss, train_acc = train(model, trainloader, optimizer, criterion, scheduler)
      print(" --start validing--")
      valid_loss, valid_acc = evaluate(model, testloader, criterion)
      if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc

      print('Epoch:', epoch, 'LR:', scheduler.get_lr())
      print(f'  \tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
      print(f'  \t Val. Loss: {valid_loss:.3f} |  Val Acc: {valid_acc*100:.2f}%')
      print(f'  Current best Val Acc: {best_valid_acc}')
      torch.cuda.empty_cache()
  print("--end running")
  return best_valid_acc

In [22]:
best_result_SGD = (0.1, 128, 0.0005)
lr, batch_size, wd = best_result_SGD

N_EPOCHS = 20

print("--------------lr={}, batch_size={}, wd={} start-------------".format(lr, batch_size, wd))
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)


criterion = nn.CrossEntropyLoss()

criterion = criterion.to(device)

transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

transform1 = transforms.Compose([
        
        transforms.RandomResizedCrop(32),#训练模型有resize 和 翻折的操作
    transforms.RandomHorizontalFlip(),#
        transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

transform2 = transforms.Compose([
        
        transforms.RandomResizedCrop(32),#训练模型有resize 和 翻折的操作
    transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

transform3 = transforms.Compose([
        
        transforms.RandomResizedCrop(32),#训练模型有resize 和 翻折的操作
    transforms.RandomRotation(90),
        transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainset1 = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform1)
trainset2 = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform2)
trainset3 = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform3)

trainset = ConcatDataset([trainset,trainset1,trainset3,trainset2])

transform_test = transforms.Compose([
        transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])


trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0,drop_last=True)


testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=0,drop_last=True)

result = run_epoches(N_EPOCHS, model, optimizer, criterion, trainloader, testloader, cosine_scheduler)




--------------lr=0.1, batch_size=128, wd=0.0005 start-------------
Files already downloaded and verified
Files already downloaded and verified
start running
 --Epoch 0
 --start training--
 --start validing--
Epoch: 0 LR: [0.08386590697411314]
  	Train Loss: 0.627 | Train Acc: 78.11%
  	 Val. Loss: 0.464 |  Val Acc: 84.78%
  Current best Val Acc: 0.8477564454078674
 --Epoch 1
 --start training--
 --start validing--
Epoch: 1 LR: [0.053959611008137134]
  	Train Loss: 0.618 | Train Acc: 78.50%
  	 Val. Loss: 0.409 |  Val Acc: 86.83%
  Current best Val Acc: 0.8682892918586731
 --Epoch 2
 --start training--
 --start validing--
Epoch: 2 LR: [0.023872875703126455]
  	Train Loss: 0.612 | Train Acc: 78.73%
  	 Val. Loss: 0.373 |  Val Acc: 87.84%
  Current best Val Acc: 0.8784054517745972
 --Epoch 3
 --start training--
 --start validing--
Epoch: 3 LR: [0.004424211972086993]
  	Train Loss: 0.605 | Train Acc: 78.95%
  	 Val. Loss: 0.376 |  Val Acc: 87.90%
  Current best Val Acc: 0.8790064454078674


In [None]:
N_EPOCHS = 20
criterion = nn.CrossEntropyLoss()

criterion = criterion.to(device)

transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

lr_candidates = [0.1, 0.05, 0.005]
wd_candidates = [5e-4,1e-4]

batch_size_candidates = [128, 256]

best_result = (0,0,0,0)
for lr in lr_candidates:
  for wd in wd_candidates:
    for batch_size in batch_size_candidates:
      print("--------------lr={}, wd={}, batch_size={}, start-------------".format(lr, wd, batch_size))
      # optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
      model = ResNet(BuildingBlock, [3,3,2,2], 3, 32, 10)
      model = model.to(device)
      optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
      cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)

      trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
      trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)


      testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
      testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

      result = run_epoches(N_EPOCHS, model, optimizer,  criterion, trainloader, testloader， cosine_scheduler)
      print("--------------lr={}, wd={}, batch_size={}, result={}-------------".format(lr, wd, batch_size, result))
      if result > best_result[3]:
        best_result = (lr, batch_size, wd, result)
      print('current best hyperparameters:{}'.format(best_result))
    
print(best_result)

--------------lr=0.1, wd=0.0005, batch_size=128, start-------------
Files already downloaded and verified
Files already downloaded and verified
start running
 --Epoch 0
 --start training--
 --start validing--
Epoch: 0 LR: [0.004894348370484647]
  	Train Loss: 1.601 | Train Acc: 41.89%
  	 Val. Loss: 1.218 |  Val Acc: 55.99%
  Current best Val Acc: 0.559928834438324
 --Epoch 1
 --start training--
 --start validing--
Epoch: 1 LR: [0.08386590697411969]
  	Train Loss: 1.139 | Train Acc: 59.20%
  	 Val. Loss: 2.041 |  Val Acc: 36.82%
  Current best Val Acc: 0.559928834438324
 --Epoch 2
 --start training--
 --start validing--
Epoch: 2 LR: [0.04448589487622449]
  	Train Loss: 0.874 | Train Acc: 69.12%
  	 Val. Loss: 0.813 |  Val Acc: 71.81%
  Current best Val Acc: 0.7180577516555786
 --Epoch 3
 --start training--
 --start validing--
Epoch: 3 LR: [0.05395961100810772]
  	Train Loss: 0.700 | Train Acc: 75.52%
  	 Val. Loss: 0.957 |  Val Acc: 67.35%
  Current best Val Acc: 0.7180577516555786
 --