First, let us import all necessary libraries for this one

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import scipy.signal
import math
import time
from typing import List, Union, Dict

Problem A. Back propagation
==

In this problem, let us try to implement the back propagation by ourselves.

Requirement:
* You cannot reuse any PyTorch backward function
* You cannot use any auto-grad library
* You can use pytorch forward function, or numpy utility functions, unless we explicitly mention that certain functions cannot be used.

## Utility functions for verifying the gradient results

First, let us create a base class, which you will inherit from when you create your own classes later. Do not change it.

In [None]:
#@title Base class for all your own implementation.

class CustomizedLayer:
  """
  Common interface of the hand-written layers.

  Every method here raises ``NotImplementedError``; a concrete layer is
  expected to override the ones it supports.
  """

  def set_params(self, param_dict: Dict[str, torch.Tensor]) -> None:
    """
    Install the layer's parameters (e.g. weight and bias of a linear layer).

    Args:
      param_dict: Mapping from parameter name to parameter value.
    """
    raise NotImplementedError()

  def forward(self, inputs: List[np.ndarray]) -> np.ndarray:
    """
    Run the layer on its inputs.

    Args:
      inputs: The input tensors, as a list.

    Returns:
      The layer output.
    """
    raise NotImplementedError()

  def input_gradients(
      self,
      inputs: List[np.ndarray],
      outputs: np.ndarray,
      output_gradient: np.ndarray
  ) -> Union[np.ndarray, List[np.ndarray]]:
    """
    Back-propagate the output gradient to the layer inputs.

    Args:
      inputs: The input tensors that were given to ``forward``.
      outputs: The output produced by the layer.
      output_gradient: The gradient with respect to ``outputs``.
    """
    raise NotImplementedError()

  def param_gradients(
      self,
      inputs: List[np.ndarray],
      outputs: np.ndarray,
      output_gradient: np.ndarray
  ) -> Dict[str, np.ndarray]:
    """
    Back-propagate the output gradient to the layer parameters.

    Args:
      inputs: The input tensors that were given to ``forward``.
      outputs: The output produced by the layer.
      output_gradient: The gradient with respect to ``outputs``.
    """
    raise NotImplementedError()

Then, define a few utility functions for testing.

In [None]:
#@title Utility functions

def verify_output(
    actual: Union[np.ndarray, List[np.ndarray]],
    reference: Union[np.ndarray, List[np.ndarray]],
    name: Union[str, List[str]],
    atol: float = 1e-6,
    rtol: float = 1e-6
) -> bool:
  """
  Verify if the actual output is close to the reference output.

  A single array is compared with ``np.allclose`` and one line is printed
  saying whether it matched. A list (or tuple) of arrays is compared entry by
  entry; every entry is checked and reported, even after an earlier one failed.

  Args:
    actual: The actual output, one array or a list/tuple of arrays.
    reference: The reference output, structured like ``actual``.
    name: Label used in the printed message. For a list input this is either
      one label per entry, or a single string to which the entry index is
      appended.
    atol: The absolute tolerance.
    rtol: The relative tolerance.

  Returns:
    True if every compared array is close to its reference, False otherwise.

  Raises:
    TypeError: If ``actual`` is neither an array nor a list/tuple.
  """
  if isinstance(actual, np.ndarray):
    close = bool(np.allclose(actual, reference, atol=atol, rtol=rtol))
    if not close:
      print(f"{name} is not close to the reference output.")
    else:
      print(f"{name} is close to the reference output.")
    return close

  if isinstance(actual, (list, tuple)):
    if len(actual) != len(reference):
      # zip() would silently drop the unmatched entries and report success.
      print(f"{name} has {len(actual)} entries but the reference has "
            f"{len(reference)}.")
      return False
    if isinstance(name, str):
      # Indexing a plain string would label the entries with single characters.
      names = [f"{name} {i}" for i in range(len(actual))]
    else:
      names = name
    # Build the full list first: all() over a generator stops at the first
    # mismatch and would leave the remaining entries unchecked and unreported.
    results = [verify_output(a, b, names[i], atol=atol, rtol=rtol)
               for i, (a, b) in enumerate(zip(actual, reference))]
    return all(results)

  raise TypeError(
      f"{name}: expected a numpy array or a list of numpy arrays, "
      f"got {type(actual).__name__}")

def compare_with_actual_layer(
    official_layer: nn.Module,
    customized_layer: CustomizedLayer,
    input_np: List[np.ndarray],
    output_grad_np: np.ndarray,
    parameter_names: List[str],
    param_np: List[np.ndarray],
    a_tol: float = 1e-6,
    r_tol: float = 1e-6,
) -> bool:
  """
  Compare forward/backward of your customized layer with the official layer.

  The official layer gets ``param_np`` installed as its parameters, is run on
  ``input_np`` and back-propagated with ``output_grad_np``. The customized
  layer's output, input gradients and parameter gradients are then checked
  against it, in that order, stopping at the first mismatch.

  Args:
    official_layer: The official layer.
    customized_layer: The customized layer.
    input_np: The input numpy arrays, one per layer input.
    output_grad_np: The output gradient numpy array.
    parameter_names: The names of the parameters.
    param_np: The numpy array of the parameters, ordered like
      ``parameter_names``.
    a_tol: The absolute tolerance, used for every comparison.
    r_tol: The relative tolerance, used for every comparison.

  Returns:
    True if the output of the customized layer is close to the output of
    the official layer, False otherwise.
  """
  input_torch = [
      torch.tensor(x, requires_grad=True) for x in input_np
  ]
  output_grad_torch = torch.tensor(output_grad_np, requires_grad=True)
  for i, name in enumerate(parameter_names):
    setattr(official_layer, name,
            nn.Parameter(torch.tensor(param_np[i], requires_grad=True)))

  # Run forward and backward on official layers.
  output_official_torch = official_layer(*input_torch)
  output_official_torch.backward(output_grad_torch)
  output_official_np = output_official_torch.detach().numpy()

  # Check output
  output_customized_np = customized_layer.forward(input_np)
  success = verify_output(output_customized_np, output_official_np,
                          "Output", atol=a_tol, rtol=r_tol)
  if not success:
    return success

  # Check input gradient
  input_grad_customized = customized_layer.input_gradients(
      input_np, output_official_np, output_grad_np)
  if isinstance(input_grad_customized, np.ndarray):
    # A single-input layer may return the bare array. Wrap it so it is
    # compared and labelled as one gradient; len() of the array itself would
    # produce one label per row.
    input_grad_customized = [input_grad_customized]
  input_grad_official = [x.grad.numpy() for x in input_torch]
  success = verify_output(
      input_grad_customized,
      input_grad_official,
      [f'Input Gradient {i}' for i in range(len(input_grad_customized))],
      atol=a_tol,
      rtol=r_tol,
  )
  if not success:
    return success

  # Check parameter gradient
  parameter_grad_customized = customized_layer.param_gradients(
      input_np, output_official_np, output_grad_np)
  for name in parameter_names:
    success = verify_output(parameter_grad_customized[name],
                            getattr(official_layer, name).grad.numpy(),
                            f"Parameter Gradient {name}",
                            atol=a_tol, rtol=r_tol)
    if not success:
      return success

  print('All tests passed')
  return success

## Question A.1 Back-propagation of ADD

Complete the implementation of the following class below, then run the test below.

* We have provided the implementation of forward pass
* Complete the backward one `input_gradients`.
* Note that Add does not have parameters, so `param_gradients` just returns an empty result.

In [None]:
#@title CustomizedAdd

class CustomizedAdd(CustomizedLayer):
  """Element-wise addition ``x + y`` of two inputs; the layer has no parameters."""

  def __init__(self):
    pass

  def set_params(self, param_dict: Dict[str, torch.Tensor]) -> None:
    """Accept and ignore ``param_dict``: Add has nothing to configure."""
    pass

  def forward(self, inputs: List[np.ndarray]) -> np.ndarray:
    """
    Add the two inputs.

    Args:
      inputs: The two operands ``[x, y]``. NumPy broadcasting applies when
        their shapes differ.

    Returns:
      ``x + y``.
    """
    x, y = inputs
    return x + y

  @staticmethod
  def _reduce_to_shape(gradient, shape):
    """Sum `gradient` over the axes broadcasting added or stretched for `shape`."""
    if np.shape(gradient) == shape:
      return gradient
    gradient = np.asarray(gradient)
    n_leading = gradient.ndim - len(shape)
    if n_leading < 0:
      # The gradient has fewer axes than the input, so it cannot stem from
      # broadcasting this input; hand it back untouched.
      return gradient
    if n_leading > 0:
      # Axes that broadcasting prepended in front of the input's own axes.
      gradient = gradient.sum(axis=tuple(range(n_leading)))
    # Axes where the input had length 1 but the output did not.
    stretched = tuple(axis for axis, size in enumerate(shape)
                      if size == 1 and gradient.shape[axis] != 1)
    if stretched:
      gradient = gradient.sum(axis=stretched, keepdims=True)
    return gradient

  def input_gradients(
    self,
    inputs: List[np.ndarray],
    outputs: np.ndarray,
    output_gradient: np.ndarray
  ) -> Union[np.ndarray, List[np.ndarray]]:
    """
    Gradient of the sum with respect to each operand.

    Addition passes the upstream gradient through unchanged. When an operand
    was broadcast in ``forward``, its gradient is additionally summed over the
    broadcast axes so that it has the operand's shape.

    Args:
      inputs: The two operands ``[x, y]`` given to ``forward``.
      outputs: The output of ``forward`` (unused).
      output_gradient: The gradient with respect to the output.

    Returns:
      ``[grad_x, grad_y]``. For equally shaped operands both entries are
      ``output_gradient`` itself.
    """
    x, y = inputs
    return [self._reduce_to_shape(output_gradient, np.shape(x)),
            self._reduce_to_shape(output_gradient, np.shape(y))]

  def param_gradients(
      self,
      inputs: List[np.ndarray],
      outputs: np.ndarray,
      output_gradient: np.ndarray
  ) -> Dict[str, np.ndarray]:
    """Return an empty dict: Add has no parameters."""
    return {}

Run the test below. Do not change it.

In [None]:
#@title Run test

def verify_add():
  """Check CustomizedAdd against a PyTorch addition on random inputs."""
  for width, height in [(3, 3), (10, 1), (100, 300)]:
    print(f'=== Testing with width={width} and height={height} ===')
    if width == 1:
      x1 = np.random.randn(height)
      x2 = np.random.randn(height)
    else:
      x1 = np.random.randn(height, width)
      x2 = np.random.randn(height, width)
    # The upstream gradient must have the shape of the output x1 + x2. A fixed
    # (height, width) shape does not match the 1-D inputs built for width == 1.
    output_gradient = np.random.randn(*x1.shape)
    customized_add = CustomizedAdd()
    customized_add.set_params({})
    class TorchAdd(nn.Module):
      def __init__(self):
        super(TorchAdd, self).__init__()
      def forward(self, x1, x2):
        return x1 + x2
    torch_add = TorchAdd()

    compare_with_actual_layer(
        torch_add,
        customized_add,
        [x1, x2],
        output_gradient,
        param_np=[],
        parameter_names=[],
    )

verify_add()

=== Testing with width=3 and height=3 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
Input Gradient 1 is close to the reference output.
All tests passed
=== Testing with width=10 and height=1 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
Input Gradient 1 is close to the reference output.
All tests passed
=== Testing with width=100 and height=300 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
Input Gradient 1 is close to the reference output.
All tests passed


## Question A.2 Back-propagation of ReLU

Similarly, complete the implementation of the following class below, then run the test below.

* We have provided the implementation of forward pass
* Complete the backward one `input_gradients`.
* Note that ReLU does not have parameters, so `param_gradients` just returns an empty result.

Hint: for x=0, you can assume gradient of relu is 0.

In [None]:
#@title CustomizedReLU

class CustomizedReLU(CustomizedLayer):
  """Rectified linear unit ``max(x, 0)``; the layer has no parameters."""

  def __init__(self):
    pass

  def set_params(self, param_dict: Dict[str, torch.Tensor]) -> None:
    """Accept and ignore ``param_dict``: ReLU has nothing to configure."""
    pass

  def forward(self, inputs: List[np.ndarray]) -> np.ndarray:
    """Clamp the negative entries of the single input ``inputs[0]`` to zero."""
    return np.maximum(inputs[0], 0)

  def input_gradients(
    self,
    inputs: List[np.ndarray],
    outputs: np.ndarray,
    output_gradient: np.ndarray
  ) -> Union[np.ndarray, List[np.ndarray]]:
    """
    Gradient of ReLU with respect to its input.

    The upstream gradient passes where the input is strictly positive and is
    zeroed elsewhere (so the gradient at exactly 0 is taken to be 0).

    Args:
      inputs: One-element list holding the input given to ``forward``.
      outputs: The output of ``forward`` (unused).
      output_gradient: The gradient with respect to the output.

    Returns:
      A one-element list with the input gradient.
    """
    # 1.0 where the unit was active, 0.0 otherwise.
    active = np.greater(inputs[0], 0).astype(np.float32)
    return [output_gradient * active]

  def param_gradients(
      self,
      inputs: List[np.ndarray],
      outputs: np.ndarray,
      output_gradient: np.ndarray
  ) -> Dict[str, torch.Tensor]:
    """Return an empty dict: ReLU has no parameters."""
    return {}

Run the test below. Do not change it.

In [None]:
#@title Run test

def verify_relu():
  """Check CustomizedReLU against ``nn.ReLU`` on random inputs."""
  for width, height in [(3, 3), (10, 1), (100, 300)]:
    print(f'=== Testing with width={width} and height={height} ===')
    if width == 1:
      x = np.random.randn(height)
    else:
      x = np.random.randn(height, width)
    # The upstream gradient must have the shape of the output, which equals
    # the input shape. A fixed (height, width) shape does not match the 1-D
    # input built for width == 1.
    output_gradient = np.random.randn(*x.shape)
    customized_relu = CustomizedReLU()
    customized_relu.set_params({})
    torch_relu = nn.ReLU()

    compare_with_actual_layer(
        torch_relu,
        customized_relu,
        [x],
        output_gradient,
        param_np=[],
        parameter_names=[],
    )

verify_relu()

=== Testing with width=3 and height=3 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
All tests passed
=== Testing with width=10 and height=1 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
All tests passed
=== Testing with width=100 and height=300 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
All tests passed


## Question A.3 Back-propagation of Linear

Similarly, complete the implementation of the following class below, then run the test below.

* This time, we also ask you to implement the forward pass.
* And complete the backward pass `input_gradients` and `param_gradients`.

In [None]:
#@title CustomizedLinear

class CustomizedLinear(CustomizedLayer):
  """
  Fully connected layer ``y = x @ weight.T + bias`` with hand-written gradients.

  ``weight`` is stored as (out_features, in_features) and ``bias`` as
  (out_features,).
  """

  def __init__(self, in_features, out_features):
    self.in_features = in_features
    self.out_features = out_features
    # forward() multiplies by weight.T, so the weight has to be stored as
    # (out_features, in_features). The transposed allocation only worked when
    # both sizes happened to be equal.
    self.weight = np.zeros((out_features, in_features))
    self.bias = np.zeros(out_features)

  def set_params(self, param_dict: Dict[str, torch.Tensor]):
    """
    Replace the layer parameters.

    Args:
      param_dict: Must contain ``'weight'`` of shape
        (out_features, in_features) and ``'bias'`` of shape (out_features,).
        The values are stored as given, without copying.
    """
    self.weight = param_dict['weight']
    self.bias = param_dict['bias']

  def forward(self, inputs: List[np.ndarray]) -> np.ndarray:
    """
    Apply the affine map to ``inputs[0]``.

    Args:
      inputs: One-element list holding ``x`` of shape
        [batch_size, in_features].

    Returns:
      ``x @ weight.T + bias`` of shape [batch_size, out_features].
    """
    x = inputs[0]  # [batch_size, in_features]
    return x @ self.weight.T + self.bias

  def input_gradients(
    self,
    inputs: List[np.ndarray],
    outputs: np.ndarray,
    output_gradient: np.ndarray
  ) -> Union[np.ndarray, List[np.ndarray]]:
    """
    Gradient with respect to the input.

    Args:
      inputs: The input given to ``forward`` (unused).
      outputs: The output of ``forward`` (unused).
      output_gradient: The gradient with respect to the output, of shape
        [batch_size, out_features].

    Returns:
      ``output_gradient @ weight`` as a single array (not wrapped in a list).
    """
    input_gradient = output_gradient @ self.weight
    return input_gradient

  def param_gradients(
      self,
      inputs: List[np.ndarray],
      outputs: np.ndarray,
      output_gradient: np.ndarray
  ) -> Dict[str, np.ndarray]:
    """
    Gradients with respect to ``weight`` and ``bias``.

    Args:
      inputs: One-element list holding the input given to ``forward``.
      outputs: The output of ``forward`` (unused).
      output_gradient: The gradient with respect to the output, of shape
        [batch_size, out_features].

    Returns:
      ``{'weight': output_gradient.T @ x, 'bias': sum of output_gradient over
      the batch axis}``.
    """
    x = inputs[0]
    weight_gradient = output_gradient.T @ x
    bias_gradient = np.sum(output_gradient, axis=0)
    return {'weight': weight_gradient,
            'bias': bias_gradient}

Run the test below. Do not change it.

In [None]:
#@title Run test

def verify_linear():
  """Check CustomizedLinear against ``nn.Linear`` on random data."""
  cases = [(3, 4, 8), (5, 10, 20), (2, 100, 300)]
  for batch_size, in_features, out_features in cases:
    print(f'=== Testing with in_features={in_features} '
          f'and out_features={out_features} ===')
    # Draw order (input, output gradient, weight, bias) is kept so a seeded
    # run produces the same data as before.
    input_np = np.random.randn(batch_size, in_features)
    output_gradient = np.random.randn(batch_size, out_features)
    params = {
        'weight': np.random.randn(out_features, in_features),
        'bias': np.random.randn(out_features),
    }
    customized_linear = CustomizedLinear(in_features, out_features)
    customized_linear.set_params(
        {'weight': params['weight'], 'bias': params['bias']})
    torch_linear = nn.Linear(in_features, out_features)

    compare_with_actual_layer(
        torch_linear,
        customized_linear,
        [input_np],
        output_gradient,
        param_np=[params['weight'], params['bias']],
        parameter_names=['weight', 'bias'],
    )

verify_linear()

=== Testing with in_features=4 and out_features=8 ===
Output is close to the reference output.
['Input Gradient 0', 'Input Gradient 1', 'Input Gradient 2'] is close to the reference output.
Parameter Gradient weight is close to the reference output.
Parameter Gradient bias is close to the reference output.
All tests passed
=== Testing with in_features=10 and out_features=20 ===
Output is close to the reference output.
['Input Gradient 0', 'Input Gradient 1', 'Input Gradient 2', 'Input Gradient 3', 'Input Gradient 4'] is close to the reference output.
Parameter Gradient weight is close to the reference output.
Parameter Gradient bias is close to the reference output.
All tests passed
=== Testing with in_features=100 and out_features=300 ===
Output is close to the reference output.
['Input Gradient 0', 'Input Gradient 1'] is close to the reference output.
Parameter Gradient weight is close to the reference output.
Parameter Gradient bias is close to the reference output.
All tests passed

## Question A.4 Back-propagation of Conv

At last, let us try convolutional layer. To make your life easier, let us assume:
* Padding is always 0
* No striding
* Kernel is always squared

Similarly, complete the implementation of the following class below, then run the test below.

* For your reference, we have provided the forward pass.
* Please complete the backward pass `input_gradients` and `param_gradients`.

Also, we have additional requirement for this question:
* You cannot use any library functions that can do convolution, either from numpy, pytorch, or other libraries

In [None]:
#@title CustomizedConv2d

class CustomizedConv2d(CustomizedLayer):
  """
  2-D convolution without padding or striding, using a square kernel.

  ``weight`` has shape [out_channels, in_channels, kernel_size, kernel_size]
  and ``bias`` has shape [out_channels]. All three passes walk over the
  kernel taps and accumulate one shifted window per tap.
  """

  def __init__(self, in_channels, out_channels, kernel_size):
    if kernel_size % 2 == 0:
      raise ValueError(f"Kernel size {kernel_size} must be odd")
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = kernel_size
    weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
    self.weight = np.zeros(weight_shape)
    self.bias = np.zeros(out_channels)

  def set_params(self, param_dict: Dict[str, torch.Tensor]):
    """Store ``param_dict['weight']`` and ``param_dict['bias']`` as given."""
    self.weight = param_dict['weight']
    self.bias = param_dict['bias']

  def _taps(self, out_height, out_width):
    """Yield (dy, dx, rows, cols) for each kernel tap, dx outermost."""
    for dx in range(self.kernel_size):
      for dy in range(self.kernel_size):
        yield dy, dx, slice(dy, dy + out_height), slice(dx, dx + out_width)

  def forward(self, inputs: List[np.ndarray]) -> np.ndarray:
    """
    Convolve ``inputs[0]`` with the kernel and add the bias.

    Args:
      inputs: One-element list holding an array of shape
        [batch_size, in_channels, height, width].

    Returns:
      A float64 array of shape [batch_size, out_channels,
      height - kernel_size + 1, width - kernel_size + 1].

    Raises:
      ValueError: If the channel count of the input differs from
        ``in_channels``.
    """
    image = inputs[0]  # [batch_size, in_channels, height, width]
    n_batch, n_in, height, width = image.shape
    if self.in_channels != n_in:
      raise ValueError(f'Input channels {n_in} do not '
                       f'match expected {self.in_channels}')
    out_height = height - self.kernel_size + 1
    out_width = width - self.kernel_size + 1
    result = np.zeros((n_batch, self.out_channels, out_height, out_width),
                      np.float64)
    for dy, dx, rows, cols in self._taps(out_height, out_width):
      # Broadcast to [B, out_channels, in_channels, H, W], then contract the
      # input-channel axis.
      window = image[:, np.newaxis, :, rows, cols]
      tap = self.weight[np.newaxis, :, :, np.newaxis, np.newaxis, dy, dx]
      result += (window * tap).sum(axis=2)
    result += self.bias[np.newaxis, :, np.newaxis, np.newaxis]
    return result

  def input_gradients(
    self,
    inputs: List[np.ndarray],
    outputs: np.ndarray,
    output_gradient: np.ndarray
  ) -> Union[np.ndarray, List[np.ndarray]]:
    """
    Gradient with respect to the input image.

    Each kernel tap scatters the upstream gradient, weighted by that tap and
    summed over the output channels, back onto the window it read in
    ``forward``.

    Args:
      inputs: One-element list holding the input given to ``forward``.
      outputs: The output of ``forward`` (unused).
      output_gradient: The gradient with respect to the output.

    Returns:
      A one-element list with an array shaped (and typed) like the input.
    """
    image = inputs[0]
    _, _, height, width = image.shape
    image_gradient = np.zeros_like(image)
    out_height = height - self.kernel_size + 1
    out_width = width - self.kernel_size + 1
    for dy, dx, rows, cols in self._taps(out_height, out_width):
      # Broadcast to [B, out_channels, in_channels, H, W], then contract the
      # output-channel axis.
      upstream = output_gradient[:, :, np.newaxis, :, :]
      tap = self.weight[:, :, dy, dx][np.newaxis, :, :, np.newaxis, np.newaxis]
      image_gradient[:, :, rows, cols] += (upstream * tap).sum(axis=1)
    return [image_gradient]

  def param_gradients(
    self,
    inputs: List[np.ndarray],
    outputs: np.ndarray,
    output_gradient: np.ndarray
  ) -> Dict[str, np.ndarray]:
    """
    Gradients with respect to ``weight`` and ``bias``.

    Args:
      inputs: One-element list holding the input given to ``forward``.
      outputs: The output of ``forward`` (unused).
      output_gradient: The gradient with respect to the output.

    Returns:
      ``{'weight': ..., 'bias': ...}``; the weight gradient is shaped like
      ``weight`` and the bias gradient is the upstream gradient summed over
      the batch and both spatial axes.
    """
    image = inputs[0]
    _, _, height, width = image.shape
    out_height = height - self.kernel_size + 1
    out_width = width - self.kernel_size + 1
    kernel_gradient = np.zeros_like(self.weight)
    for dy, dx, rows, cols in self._taps(out_height, out_width):
      # Correlate the window seen by this tap with the upstream gradient and
      # reduce over batch and spatial axes, leaving [out_channels, in_channels].
      window = image[:, np.newaxis, :, rows, cols]
      upstream = output_gradient[:, :, np.newaxis, :, :]
      kernel_gradient[:, :, dy, dx] += (window * upstream).sum(axis=(0, 3, 4))
    offset_gradient = np.sum(output_gradient, axis=(0, 2, 3))
    return {'weight': kernel_gradient, 'bias': offset_gradient}


Run the test below. Do not change it.

In [None]:
#@title Run test

def verify_conv2d():
  """Check CustomizedConv2d against ``nn.Conv2d`` on random data."""
  # (batch_size, in_channels, out_channels, kernel_size, height, width)
  cases = [
      (1, 1, 1, 3, 5, 5),
      (1, 4, 5, 3, 11, 11),
      (1, 4, 7, 5, 12, 12),
      (3, 4, 5, 3, 20, 30),
      (1, 5, 7, 5, 256, 256),
  ]
  for case in cases:
    batch_size, in_channels, out_channels, kernel_size, height, width = case
    print(f'=== Testing with in_channels={in_channels} '
          f'and out_channels={out_channels} and kernel_size={kernel_size} ===')
    # Without padding or stride the output shrinks by kernel_size - 1.
    out_shape = (batch_size, out_channels,
                 height - kernel_size + 1, width - kernel_size + 1)
    # Draw order (input, output gradient, weight, bias) is kept so a seeded
    # run produces the same data as before.
    input_np = np.random.randn(batch_size, in_channels, height, width)
    output_gradient = np.random.randn(*out_shape)
    weight = np.random.randn(out_channels, in_channels, kernel_size, kernel_size)
    bias = np.random.randn(out_channels)
    customized_conv2d = CustomizedConv2d(in_channels, out_channels, kernel_size)
    customized_conv2d.set_params({'weight': weight, 'bias': bias})
    torch_conv2d = nn.Conv2d(in_channels, out_channels, kernel_size)

    compare_with_actual_layer(
        torch_conv2d,
        customized_conv2d,
        [input_np],
        output_gradient,
        param_np=[weight, bias],
        parameter_names=['weight', 'bias'],
    )

verify_conv2d()

=== Testing with in_channels=1 and out_channels=1 and kernel_size=3 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
Parameter Gradient weight is close to the reference output.
Parameter Gradient bias is close to the reference output.
All tests passed
=== Testing with in_channels=4 and out_channels=5 and kernel_size=3 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
Parameter Gradient weight is close to the reference output.
Parameter Gradient bias is close to the reference output.
All tests passed
=== Testing with in_channels=4 and out_channels=7 and kernel_size=5 ===
Output is close to the reference output.
Input Gradient 0 is close to the reference output.
Parameter Gradient weight is close to the reference output.
Parameter Gradient bias is close to the reference output.
All tests passed
=== Testing with in_channels=4 and out_channels=5 and kernel_size=3 ===
Output is close to the referenc

Problem B. ResNet
==

For the ease of network design, we resize the image to 32x32.

In [None]:
#@title Load the training dataset

# Define the transformations for the training and test sets.

# Statistics passed to Normalize below (presumably the pixel mean/std of the
# FashionMNIST training split -- TODO confirm how they were computed).
mean_train = 0.2860405743122101
std_train = 0.3530242443084717
transform_train = transforms.Compose([
    # For the ease of network design, we resize the image to 32x32.
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((mean_train,), (std_train,))
])

# The transformation applied to the test set is the same as for training,
# including the normalization statistics.
transform_test = transforms.Compose([
    # For the ease of network design, we resize the image to 32x32.
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((mean_train,), (std_train,))
])

# Load the FashionMNIST dataset (downloaded into ./data when missing).
train_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform_train
)

test_dataset = torchvision.datasets.FashionMNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform_test
)

100%|██████████| 26.4M/26.4M [00:03<00:00, 8.08MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 154kB/s]
100%|██████████| 4.42M/4.42M [00:01<00:00, 2.86MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 24.0MB/s]


Also, here are the same utility functions as in assignment 2. If necessary, feel free to change them.

In [None]:
#@title Utility functions for training

def train_one_epoch(
    dataloader: torch.utils.data.DataLoader,
    model: nn.Module,
    loss_fn: nn.Module,
    optimizer: optim.Optimizer,
    device: str='cpu',
    loss_print_iter: int=100
  ):
  """
  Train `model` for one pass over `dataloader`.

  Args:
    dataloader: Yields (image, label) batches.
    model: The network to train; it is switched to training mode.
    loss_fn: Maps (prediction, label) to a scalar loss.
    optimizer: Updates the model parameters after every batch.
    device: Device the batches are moved to.
    loss_print_iter: The loss is printed every this many batches.

  Returns:
    A pair (losses, accuracies) with one entry per batch.
  """
  num_train_samples = len(dataloader.dataset)

  # Set the model to the training mode.
  model.train()
  batch_losses = []
  batch_accuracies = []

  for step, (image, label) in enumerate(dataloader):
    image = image.to(device)
    label = label.to(device)
    n_in_batch = image.shape[0]

    pred = model(image)
    batch_loss = loss_fn(pred, label)
    batch_loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Record the loss and the fraction of correct predictions of this batch.
    loss = batch_loss.item()
    batch_losses.append(loss)
    n_correct = (pred.argmax(1) == label).type(torch.float).sum().item()
    batch_accuracies.append(n_correct / n_in_batch)

    if step % loss_print_iter == 0:
      # Estimated from the current batch size, so only exact for full batches.
      trained_samples = (step + 1) * n_in_batch
      print(f'loss: {loss:>7f} '
            f'[{trained_samples:>5d}/{num_train_samples:>5d}] ')

  return batch_losses, batch_accuracies

def test_all_samples(
    dataloader: torch.utils.data.DataLoader,
    model: nn.Module,
    loss_fn: nn.Module,
    device: Union[str, torch.device]='cpu'
) -> tuple:
  """
  Evaluate `model` on every batch of `dataloader` and print a summary.

  Args:
    dataloader: Yields (image, label) batches.
    model: The network to evaluate; it is switched to evaluation mode.
    loss_fn: Maps (prediction, label) to a scalar loss.
    device: Device the batches are moved to.

  Returns:
    A pair (test_loss, acc): the mean of the per-batch losses and the
    fraction of correctly classified samples.
  """
  model.eval()  # Set the model to evaluation mode
  num_testing_samples = len(dataloader.dataset)
  num_batches = len(dataloader)
  test_loss, correct = 0, 0

  # Disable gradient calculation for inference
  with torch.no_grad():
    for image, label in dataloader:
      image, label = image.to(device), label.to(device)
      pred = model(image)
      test_loss += loss_fn(pred, label).item()
      correct += (pred.argmax(1) == label).type(torch.float).sum().item()

  # Every batch counts equally in the loss, whatever its size; the accuracy
  # is taken over the number of samples.
  test_loss /= num_batches
  acc = correct / num_testing_samples
  print(f'Test Error: \n Accuracy: {(100*acc):>0.1f}%, '
        f'Avg loss: {test_loss:>8f} \n')
  return test_loss, acc

And let us define the running devices.

GPU devices are suggested.

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In this question, let us verify why resnet structure is important for deep network.

First, let us create the data loader. Feel free to change the batch size.

In [None]:
# Create the data loaders. Only the training data is reshuffled; the test
# data is read in a fixed order.

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Problem B.1 A simple CNN.

No coding is required. Just run the following blocks.

In [None]:
#@title An utility function to create network

def gen_double_conv(in_channels, out_channels):
  """
  Build two 3x3 convolutions (padding 1), each followed by a ReLU.

  Args:
    in_channels: Number of channels entering the first convolution.
    out_channels: Number of channels produced by both convolutions.

  Returns:
    The four layers as a list: [conv, relu, conv, relu].
  """
  layers = []
  # The first conv changes the channel count, the second one keeps it.
  for conv_in in (in_channels, out_channels):
    layers.append(nn.Conv2d(conv_in, out_channels, 3, padding=1))
    layers.append(nn.ReLU())
  return layers

First, we will define a CNN with 11 convolution/linear layers, and try it on FashionMNIST.

In [None]:
#@title SimpleCNN

class SimpleCNN(nn.Module):
  """
  A plain (non-residual) CNN classifier with 11 conv/linear layers.

  Args:
    img_size: Side length of the square input images.
    in_channels: Number of channels of the input images.
    num_classes: Number of output logits.
  """

  def __init__(
    self,
    img_size = 32,
    in_channels = 1,
    num_classes = 10,
  ):

    super().__init__()
    base_channel = 16

    # Starting block. One 2x downsampling.
    # The first conv takes in_channels, so non-grayscale inputs work too.
    self.start_conv = nn.Sequential(
        *gen_double_conv(in_channels, base_channel),
    )

    self.start_ds = nn.Sequential(
        nn.AvgPool2d(2),
    )

    # Mid block. No net downsampling: the 2x pooling is undone by the
    # transposed convolution at the end.
    self.mid_block = nn.Sequential(
        nn.Conv2d(base_channel, base_channel * 2, 3, padding=1),
        nn.ReLU(),
        nn.AvgPool2d(2),
        nn.Conv2d(base_channel * 2, base_channel * 2, 3, padding=1),
        nn.ReLU(),
        # We use ConvTranspose2d to increase the resolution of an image by 2.
        nn.ConvTranspose2d(base_channel * 2, base_channel, 2, stride=2),
        nn.ReLU(),
    )

    # Last block. Two 2x downsampling.
    self.end_conv1 = nn.Sequential(
        *gen_double_conv(base_channel, base_channel * 2)
    )
    self.end_ds1 = nn.Sequential(
        nn.AvgPool2d(2),
    )
    self.end_conv2 = nn.Sequential(
        *gen_double_conv(base_channel * 2, base_channel * 4)
    )

    # Three poolings survive to the output (start_ds, end_ds1 and the one in
    # end_ds2_linear); the one in mid_block is undone by the upsampling.
    n_pooling_layer = 3
    last_layer_size = img_size // (2 ** n_pooling_layer)
    self.end_ds2_linear = nn.Sequential(
        nn.AvgPool2d(2),
        nn.Flatten(),
        nn.Linear(last_layer_size * last_layer_size *
                  base_channel * 4, base_channel * 8),
        nn.ReLU(),
        nn.Linear(base_channel * 8, num_classes)
    )

  def forward(self, x):
    """Map a batch of images to class logits."""
    after_start = self.start_ds(self.start_conv(x))
    after_mid = self.mid_block(after_start)
    output = self.end_ds2_linear(
        self.end_conv2(self.end_ds1(self.end_conv1(after_mid))))
    return output

  def get_n_conv_linear_layers(self) -> int:
    """Count the Conv2d, ConvTranspose2d and Linear layers of the network."""
    all_blocks = [self.start_conv, self.start_ds,
                  self.mid_block,
                  self.end_conv1, self.end_ds1,
                  self.end_conv2, self.end_ds2_linear]
    module_list = (nn.ConvTranspose2d, nn.Conv2d, nn.Linear)
    nlayer = 0
    for block in all_blocks:
      nlayer += len([mod for mod in block.modules()
                    if isinstance(mod, module_list)])
    return nlayer

In [None]:
#@title Create a model and corresponding optmizer

# Build the network on the selected device, with a cross-entropy loss.
model = SimpleCNN()
model.to(device)
loss_fn = nn.CrossEntropyLoss()

epochs = 20   # Number of training epochs
optimizer = torch.optim.Adam(
  model.parameters(),    # All trainable parameters.
  lr=1e-4                # Learning rate
)

print(f'The model contains {model.get_n_conv_linear_layers()} conv or '
      f'linear layers')

The model contains 11 conv or linear layers


Running the training below.

After training, you should be able to get an accuracy of about 89% on the test set.

In [None]:
#@title Run training

# Train for `epochs` passes over the training set, timing each pass and
# evaluating on the test set after it.
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  start_time =time.time()
  _, _ = train_one_epoch(train_loader, model, loss_fn, optimizer, device=device)
  print(f'One epoch takes {time.time() - start_time}')
  _, _ = test_all_samples(test_loader, model, loss_fn, device=device)

# NOTE(review): this repeats the evaluation already run at the end of the
# last epoch, so the final test summary is printed twice.
test_all_samples(test_loader, model, loss_fn, device=device)

print("Training done!")

Epoch 1
-------------------------------
loss: 2.309474 [   32/60000] 
loss: 2.284549 [ 3232/60000] 
loss: 1.244536 [ 6432/60000] 
loss: 1.264719 [ 9632/60000] 
loss: 0.838336 [12832/60000] 
loss: 0.664978 [16032/60000] 
loss: 0.658524 [19232/60000] 
loss: 0.671320 [22432/60000] 
loss: 0.725209 [25632/60000] 
loss: 0.849925 [28832/60000] 
loss: 0.562990 [32032/60000] 
loss: 0.556036 [35232/60000] 
loss: 0.683814 [38432/60000] 
loss: 0.513558 [41632/60000] 
loss: 0.832027 [44832/60000] 
loss: 0.769928 [48032/60000] 
loss: 0.446838 [51232/60000] 
loss: 0.703982 [54432/60000] 
loss: 0.556531 [57632/60000] 
One epoch takes 25.626473903656006
Test Error: 
 Accuracy: 76.4%, Avg loss: 0.629762 

Epoch 2
-------------------------------
loss: 0.552083 [   32/60000] 
loss: 0.463386 [ 3232/60000] 
loss: 0.606315 [ 6432/60000] 
loss: 0.496216 [ 9632/60000] 
loss: 0.421425 [12832/60000] 
loss: 0.436497 [16032/60000] 
loss: 0.591903 [19232/60000] 
loss: 0.519413 [22432/60000] 
loss: 0.376786 [25632/6

## Problem B.2. Deeper CNN

Now, please change the network we have provided by duplicating some of the conv parts. For example, you can make multiple copies of self.mid_block. The network you create should have at least 30 conv/linear layers.

In [None]:
class DeeperCNN(nn.Module):
  """Deep CNN classifier without skip connections.

  Layout: a starting conv block followed by a 2x average pooling, a stack of
  mid blocks that each preserve channel count and resolution, two end conv
  blocks separated by a 2x average pooling, and a final 2x average pooling
  followed by a three-layer linear head with `num_classes` outputs.

  Args:
    img_size: Height/width of the (square) input image. Used to size the
      first linear layer after the three 2x poolings.
    in_channels: Number of channels of the input image.
    num_classes: Number of output classes.
    n_mid_blocks: Number of mid blocks stacked in `self.deepmid`.
  """

  def __init__(
    self,
    img_size: int = 32,
    in_channels: int = 1,
    num_classes: int = 10,
    n_mid_blocks: int = 5,
  ):
    #####################
    # Your code goes here
    #####################
    super().__init__()
    base_channel = 16

    # Starting block. One 2x downsampling.
    # The first conv consumes `in_channels` (previously hard-coded to 1).
    self.start_conv = nn.Sequential(
        *gen_double_conv(in_channels, base_channel),
        *gen_double_conv(base_channel, base_channel),
    )

    self.start_ds = nn.Sequential(
        nn.AvgPool2d(2),
    )

    # Stack of identical mid blocks; each maps base_channel -> base_channel.
    self.deepmid = nn.Sequential(
        *[self._make_mid_block(base_channel) for _ in range(n_mid_blocks)]
    )

    # Last block. Two 2x downsampling.
    self.end_conv1 = nn.Sequential(
        *gen_double_conv(base_channel, base_channel * 2),
        *gen_double_conv(base_channel * 2, base_channel * 2),
    )
    self.end_ds1 = nn.Sequential(
        nn.AvgPool2d(2),
    )
    self.end_conv2 = nn.Sequential(
        *gen_double_conv(base_channel * 2, base_channel * 4),
        *gen_double_conv(base_channel * 4, base_channel * 4),
    )

    # Three 2x poolings in total: start_ds, end_ds1 and the one below.
    n_pooling_layer = 3
    last_layer_size = img_size // (2 ** n_pooling_layer)
    self.end_ds2_linear = nn.Sequential(
        nn.AvgPool2d(2),
        nn.Flatten(),
        nn.Linear(last_layer_size * last_layer_size *
                  base_channel * 4, base_channel * 16),
        nn.ReLU(),
        nn.Linear(base_channel * 16, base_channel * 8),
        nn.ReLU(),
        nn.Linear(base_channel * 8, num_classes),
    )

  def _make_mid_block(self, base_channel: int) -> nn.Sequential:
    """Build one mid block.

    The block halves the resolution internally (AvgPool2d) and restores it
    with a stride-2 ConvTranspose2d, and maps `base_channel` channels back to
    `base_channel` channels, so blocks can be chained.

    Args:
      base_channel: Number of input and output channels of the block.

    Returns:
      The block as an `nn.Sequential`.
    """
    return nn.Sequential(
        nn.Conv2d(base_channel, base_channel * 2, 3, padding=1),
        nn.ReLU(),
        nn.AvgPool2d(2),
        nn.Conv2d(base_channel * 2, base_channel * 2, 3, padding=1),
        nn.ReLU(),
        # We use ConvTranspose2d to increase the resolution of an image by 2.
        nn.ConvTranspose2d(base_channel * 2, base_channel, 2, stride=2),
        nn.ReLU(),
    )

  def forward(self, x):
    """Run the network on a batch of images.

    Args:
      x: Input batch; presumably shaped
        (batch, in_channels, img_size, img_size) -- TODO confirm with caller.

    Returns:
      The output of the final linear layer (`num_classes` values per sample).
    """
    #####################
    # Your code goes here
    #####################
    after_start = self.start_ds(self.start_conv(x))
    after_mid = self.deepmid(after_start)
    after_end1 = self.end_ds1(self.end_conv1(after_mid))
    output = self.end_ds2_linear(self.end_conv2(after_end1))
    return output

  def get_n_conv_linear_layers(self) -> int:
    """Return the number of Conv2d, ConvTranspose2d and Linear layers."""
    #####################
    # Your code goes here
    #####################
    # Walk every registered submodule instead of a hand-maintained block
    # list, so the count cannot drift when blocks are added or renamed.
    layer_types = (nn.ConvTranspose2d, nn.Conv2d, nn.Linear)
    return sum(isinstance(mod, layer_types) for mod in self.modules())

In [None]:
#@title Create a model and corresponding optimizer

# Build the network and move it to the target device in one step.
model = DeeperCNN().to(device)
loss_fn = nn.CrossEntropyLoss()

epochs = 20   # Number of training epochs
# Adam over all model parameters with a learning rate of 1e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

print(f'The model contains {model.get_n_conv_linear_layers()} conv or '
      f'linear layers')

The model contains 30 conv or linear layers


Running the training below.

Interestingly, you will find that with a deeper network, the accuracy actually goes down.

In [None]:
#@title Run training

# Train for `epochs` epochs, reporting the duration of each training epoch and
# the test-set metrics after every epoch.
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  # time.perf_counter() is the clock intended for measuring durations; unlike
  # time.time() it is not tied to the (adjustable) wall clock.
  start_time = time.perf_counter()
  _, _ = train_one_epoch(train_loader, model, loss_fn, optimizer, device=device)
  print(f'One epoch takes {time.perf_counter() - start_time}')
  # Evaluate on the test set after every epoch to track progress.
  _, _ = test_all_samples(test_loader, model, loss_fn, device=device)

# Final evaluation of the trained model on the test set.
test_all_samples(test_loader, model, loss_fn, device=device)

print("Training done!")

Epoch 1
-------------------------------
loss: 2.301057 [   32/60000] 
loss: 2.303802 [ 3232/60000] 
loss: 2.298149 [ 6432/60000] 
loss: 2.307278 [ 9632/60000] 
loss: 2.303458 [12832/60000] 
loss: 2.304575 [16032/60000] 
loss: 2.303729 [19232/60000] 
loss: 2.306975 [22432/60000] 
loss: 2.300779 [25632/60000] 
loss: 2.298641 [28832/60000] 
loss: 2.304524 [32032/60000] 
loss: 2.297641 [35232/60000] 
loss: 2.301429 [38432/60000] 
loss: 2.307261 [41632/60000] 
loss: 2.300042 [44832/60000] 
loss: 2.304665 [48032/60000] 
loss: 2.303902 [51232/60000] 
loss: 2.302754 [54432/60000] 
loss: 2.301396 [57632/60000] 
One epoch takes 30.39143991470337
Test Error: 
 Accuracy: 10.0%, Avg loss: 2.302640 

Epoch 2
-------------------------------
loss: 2.301344 [   32/60000] 
loss: 2.301189 [ 3232/60000] 
loss: 2.303574 [ 6432/60000] 
loss: 2.302982 [ 9632/60000] 
loss: 2.304130 [12832/60000] 
loss: 2.302846 [16032/60000] 
loss: 2.303244 [19232/60000] 
loss: 2.300157 [22432/60000] 
loss: 2.298765 [25632/60

## Problem B.3. ResNet

Now, keep the general structure from B.2 almost unchanged, but add some skip connections (ResNet structure), and try to achieve 89% accuracy again.

Hint: to define a ResNet, when you run a module in `forward`, instead of calling it as:
```
y = net1(x)
```
Call it as:
```
y = net1(x) + x
```

The only requirement is that the input and output of `net1` have the same dimensions. For example, you can add a skip connection around `self.mid_block` in the SimpleCNN. Try to find where else you can add such a skip connection.

In [None]:
class ResCNN(nn.Module):
  """Deep CNN classifier with a skip connection around the mid-block stack.

  Layout: a starting conv block followed by a 2x average pooling, a stack of
  mid blocks bypassed by a residual (identity) connection, two end conv
  blocks separated by a 2x average pooling, and a final 2x average pooling
  followed by a three-layer linear head with `num_classes` outputs.

  The residual addition requires the mid-block stack to preserve the spatial
  size, which holds when the feature map entering it has even height/width
  (each mid block pools by 2 and then upsamples by exactly 2).

  Args:
    img_size: Height/width of the (square) input image. Used to size the
      first linear layer after the three 2x poolings.
    in_channels: Number of channels of the input image.
    num_classes: Number of output classes.
    n_mid_blocks: Number of mid blocks stacked in `self.deepmid`.
  """

  def __init__(
    self,
    img_size: int = 32,
    in_channels: int = 1,
    num_classes: int = 10,
    n_mid_blocks: int = 5,
  ):
    #####################
    # Your code goes here
    #####################
    super().__init__()
    base_channel = 16

    # Starting block. One 2x downsampling.
    # The first conv consumes `in_channels` (previously hard-coded to 1).
    self.start_conv = nn.Sequential(
        *gen_double_conv(in_channels, base_channel),
        *gen_double_conv(base_channel, base_channel),
    )

    self.start_ds = nn.Sequential(
        nn.AvgPool2d(2),
    )

    # Stack of identical mid blocks; each maps base_channel -> base_channel.
    self.deepmid = nn.Sequential(
        *[self._make_mid_block(base_channel) for _ in range(n_mid_blocks)]
    )

    # Last block. Two 2x downsampling.
    self.end_conv1 = nn.Sequential(
        *gen_double_conv(base_channel, base_channel * 2),
        *gen_double_conv(base_channel * 2, base_channel * 2),
    )
    self.end_ds1 = nn.Sequential(
        nn.AvgPool2d(2),
    )
    self.end_conv2 = nn.Sequential(
        *gen_double_conv(base_channel * 2, base_channel * 4),
        *gen_double_conv(base_channel * 4, base_channel * 4),
    )

    # Three 2x poolings in total: start_ds, end_ds1 and the one below.
    n_pooling_layer = 3
    last_layer_size = img_size // (2 ** n_pooling_layer)
    self.end_ds2_linear = nn.Sequential(
        nn.AvgPool2d(2),
        nn.Flatten(),
        nn.Linear(last_layer_size * last_layer_size *
                  base_channel * 4, base_channel * 16),
        nn.ReLU(),
        nn.Linear(base_channel * 16, base_channel * 8),
        nn.ReLU(),
        nn.Linear(base_channel * 8, num_classes),
    )

  def _make_mid_block(self, base_channel: int) -> nn.Sequential:
    """Build one mid block.

    The block halves the resolution internally (AvgPool2d) and restores it
    with a stride-2 ConvTranspose2d, and maps `base_channel` channels back to
    `base_channel` channels, so blocks can be chained.

    Args:
      base_channel: Number of input and output channels of the block.

    Returns:
      The block as an `nn.Sequential`.
    """
    return nn.Sequential(
        nn.Conv2d(base_channel, base_channel * 2, 3, padding=1),
        nn.ReLU(),
        nn.AvgPool2d(2),
        nn.Conv2d(base_channel * 2, base_channel * 2, 3, padding=1),
        nn.ReLU(),
        # We use ConvTranspose2d to increase the resolution of an image by 2.
        nn.ConvTranspose2d(base_channel * 2, base_channel, 2, stride=2),
        nn.ReLU(),
    )

  def forward(self, x):
    """Run the network on a batch of images.

    Args:
      x: Input batch; presumably shaped
        (batch, in_channels, img_size, img_size) -- TODO confirm with caller.

    Returns:
      The output of the final linear layer (`num_classes` values per sample).
    """
    #####################
    # Your code goes here
    #####################
    after_start = self.start_ds(self.start_conv(x))
    # Skip connection: add the input of the mid stack to its output.
    after_mid = self.deepmid(after_start) + after_start
    after_end1 = self.end_ds1(self.end_conv1(after_mid))
    output = self.end_ds2_linear(self.end_conv2(after_end1))
    return output

  def get_n_conv_linear_layers(self) -> int:
    """Return the number of Conv2d, ConvTranspose2d and Linear layers."""
    #####################
    # Your code goes here
    #####################
    # Walk every registered submodule instead of a hand-maintained block
    # list, so the count cannot drift when blocks are added or renamed.
    layer_types = (nn.ConvTranspose2d, nn.Conv2d, nn.Linear)
    return sum(isinstance(mod, layer_types) for mod in self.modules())

In [None]:
#@title Create a model and corresponding optimizer

# Build the network and move it to the target device in one step.
model = ResCNN().to(device)
loss_fn = nn.CrossEntropyLoss()

epochs = 20   # Number of training epochs
# Adam over all model parameters with a learning rate of 1e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

print(f'The model contains {model.get_n_conv_linear_layers()} conv or '
      f'linear layers')

The model contains 30 conv or linear layers


Running the training below.

Now, with the ResNet design, you should reach 89% again.

In [None]:
#@title Run training

# Train for `epochs` epochs, reporting the duration of each training epoch and
# the test-set metrics after every epoch.
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  # time.perf_counter() is the clock intended for measuring durations; unlike
  # time.time() it is not tied to the (adjustable) wall clock.
  start_time = time.perf_counter()
  _, _ = train_one_epoch(train_loader, model, loss_fn, optimizer, device=device)
  print(f'One epoch takes {time.perf_counter() - start_time}')
  # Evaluate on the test set after every epoch to track progress.
  _, _ = test_all_samples(test_loader, model, loss_fn, device=device)

# Final evaluation of the trained model on the test set.
test_all_samples(test_loader, model, loss_fn, device=device)

print("Training done!")

Epoch 1
-------------------------------
loss: 2.292154 [   32/60000] 
loss: 2.290864 [ 3232/60000] 
loss: 0.812189 [ 6432/60000] 
loss: 0.828353 [ 9632/60000] 
loss: 0.847551 [12832/60000] 
loss: 0.729506 [16032/60000] 
loss: 0.762317 [19232/60000] 
loss: 0.454517 [22432/60000] 
loss: 0.544702 [25632/60000] 
loss: 0.844015 [28832/60000] 
loss: 0.700097 [32032/60000] 
loss: 0.773231 [35232/60000] 
loss: 0.516823 [38432/60000] 
loss: 0.714197 [41632/60000] 
loss: 0.734287 [44832/60000] 
loss: 0.598461 [48032/60000] 
loss: 0.630968 [51232/60000] 
loss: 0.743768 [54432/60000] 
loss: 0.718938 [57632/60000] 
One epoch takes 30.364739656448364
Test Error: 
 Accuracy: 76.6%, Avg loss: 0.616047 

Epoch 2
-------------------------------
loss: 0.786737 [   32/60000] 
loss: 0.782411 [ 3232/60000] 
loss: 0.387957 [ 6432/60000] 
loss: 0.642474 [ 9632/60000] 
loss: 0.811866 [12832/60000] 
loss: 0.370436 [16032/60000] 
loss: 0.713840 [19232/60000] 
loss: 0.510155 [22432/60000] 
loss: 0.475609 [25632/6