In [None]:
# Install the course starter-code helper package straight from GitHub
# (presumably the source of the `coutils` module imported below -- confirm).
!pip install git+https://github.com/deepvision-class/starter-code

Collecting git+https://github.com/deepvision-class/starter-code
  Cloning https://github.com/deepvision-class/starter-code to /tmp/pip-req-build-065hfv_o
  Running command git clone --filter=blob:none --quiet https://github.com/deepvision-class/starter-code /tmp/pip-req-build-065hfv_o
  Resolved https://github.com/deepvision-class/starter-code to commit e8d9fe711870a39796a2f8ad95538e57942d756f
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: Colab-Utils
  Building wheel for Colab-Utils (setup.py) ... [?25l[?25hdone
  Created wheel for Colab-Utils: filename=Colab_Utils-0.1.dev0-py3-none-any.whl size=10267 sha256=2043b778a699413fd096877e165ba4a9b56417905bed0d163a4f19ce4e34848d
  Stored in directory: /tmp/pip-ephem-wheel-cache-dt13fr2t/wheels/c9/fa/82/64b6c443adccf6982835123e4300fc4420373c553439ff0ad5
Successfully built Colab-Utils
Installing collected packages: Colab-Utils
Successfully installed Colab-Utils-0.1.dev0


In [None]:
import coutils
from coutils import fix_random_seed

from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plotting defaults, applied in a single call.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),     # default size of plots
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

In [None]:
NUM_TRAIN = 49000   # images from the CIFAR-10 training split used for training
BATCH_SIZE = 64     # minibatch size shared by all three loaders

# The torchvision.transforms package provides tools for preprocessing data
# and for performing data augmentation; here we set up a transform to
# preprocess the data by subtracting the mean RGB value and dividing by the
# standard deviation of each RGB value; we've hardcoded the mean and std.
transform = T.Compose([
                T.ToTensor(),
                T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
            ])

# We set up a Dataset object for each split (train / val / test); Datasets load
# training examples one at a time, so we wrap each Dataset in a DataLoader which
# iterates through the Dataset and forms minibatches. We divide the CIFAR-10
# training set into train and val sets by passing a Sampler object to the
# DataLoader telling how it should sample from the underlying Dataset.
cifar10_train = dset.CIFAR10('./datasets', train=True, download=True,
                             transform=transform)
loader_train = DataLoader(cifar10_train, batch_size=BATCH_SIZE,
                          sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

# The validation split is every training-set image after the first NUM_TRAIN.
# The upper bound comes from the dataset itself instead of a hard-coded 50000.
cifar10_val = dset.CIFAR10('./datasets', train=True, download=True,
                           transform=transform)
loader_val = DataLoader(cifar10_val, batch_size=BATCH_SIZE,
                        sampler=sampler.SubsetRandomSampler(
                            range(NUM_TRAIN, len(cifar10_val))))

# The test split is iterated in dataset order (no sampler).
cifar10_test = dset.CIFAR10('./datasets', train=False, download=True,
                            transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=BATCH_SIZE)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./datasets/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:02<00:00, 79940281.97it/s]


Extracting ./datasets/cifar-10-python.tar.gz to ./datasets
Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Tensor dtypes used by the cells below: `dtype` for image data and weights,
# `ltype` for integer class labels.
dtype, ltype = torch.float, torch.long

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Constant to control how frequently we print train loss
print_every = 100

print('using device:', device)

using device: cpu


In [None]:
def flatten(x, start_dim=1, end_dim=-1):
  """
  Collapse the dimensions start_dim..end_dim (inclusive) of x into one.

  With the defaults every dimension except the first is merged, so a batch of
  shape (N, C, H, W) becomes (N, C*H*W).
  """
  return x.flatten(start_dim, end_dim)

When we create a PyTorch Tensor with **requires_grad=True**, then operations involving that Tensor will not just compute values; they will also build up a computational graph in the background, allowing us to easily backpropagate through the graph to compute gradients of some Tensors with respect to a downstream loss. Concretely, if x is a Tensor with **x.requires_grad == True** then after backpropagation **x.grad** will be another Tensor holding the gradient of x with respect to the scalar loss at the end.

In [None]:
# Only the forward pass is written by hand; autograd derives the backward pass.
def two_layer_fc(x, params):
  """
  Forward pass of a two-layer network: fully connected -> ReLU -> fully connected.

  Inputs:
  - x: Tensor of input data; everything after the first dimension is flattened
  - params: sequence [w1, b1, w2, b2] holding the weight and bias of each layer

  Returns: Tensor of scores produced by the second linear layer.
  """
  flat = flatten(x)
  w1, b1, w2, b2 = params
  hidden = F.relu(F.linear(flat, w1, b1))
  return F.linear(hidden, w2, b2)

def two_layer_fc_test():
  """Run two_layer_fc on an all-zero batch and print the shape of the scores."""
  hidden, feat, classes = 42, 3*16*16, 10
  x = torch.zeros((64, 3, 16, 16), dtype=dtype)  # minibatch size 64, feature dimension 3*16*16
  params = [
      torch.zeros((hidden, feat), dtype=dtype),     # w1
      torch.zeros((hidden,), dtype=dtype),          # b1
      torch.zeros((classes, hidden), dtype=dtype),  # w2
      torch.zeros((classes,), dtype=dtype),         # b2
  ]
  scores = two_layer_fc(x, params)
  print('Output size:', list(scores.size()))  # you should see [64, 10]

two_layer_fc_test()

Output size: [64, 10]


Next up, we build a convolution network that has the following structure:
<br> 1. A convolutional layer (with bias) with channel_1 filters, each with shape KW1 x KH1, and zero-padding of two
<br> 2. ReLU nonlinearity
<br> 3. A convolutional layer (with bias) with channel_2 filters, each with shape KW2 x KH2, and zero-padding of one
<br> 4. ReLU nonlinearity
<br> 5. Fully-connected layer with bias, producing scores for C classes.

In [None]:
def three_layer_convnet(x, params):
  """
  Forward pass of conv -> ReLU -> conv -> ReLU -> fully connected.

  Inputs:
  - x: Tensor of input images
  - params: sequence [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]

  The first convolution zero-pads by 2 and the second by 1. The activations
  are flattened per example before the final linear layer.

  Returns: Tensor of scores produced by the linear layer.
  """
  conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b = params
  h1 = F.relu(F.conv2d(x, conv_w1, conv_b1, padding=2))
  h2 = F.relu(F.conv2d(h1, conv_w2, conv_b2, padding=1))
  return F.linear(flatten(h2), fc_w, fc_b)

def three_layer_convnet_test():
  """
  Run three_layer_convnet on an all-zero batch and print the shape of the scores.

  Every tensor is created with the notebook-wide `dtype` so the check keeps
  working if that dtype is changed (previously the biases and the fc tensors
  silently used the default dtype).
  """
  x = torch.zeros((64, 3, 32, 32), dtype=dtype)  # minibatch size 64, image size [3, 32, 32]

  conv_w1 = torch.zeros((6, 3, 5, 5), dtype=dtype)  # [out_channel, in_channel, kernel_H, kernel_W]
  conv_b1 = torch.zeros((6,), dtype=dtype)  # out_channel
  conv_w2 = torch.zeros((9, 6, 3, 3), dtype=dtype)  # [out_channel, in_channel, kernel_H, kernel_W]
  conv_b2 = torch.zeros((9,), dtype=dtype)  # out_channel

  # With 5x5/pad-2 and 3x3/pad-1 convolutions the 32x32 spatial size is kept,
  # so the fully-connected layer sees 9 * 32 * 32 features per example.
  fc_w = torch.zeros((10, 9 * 32 * 32), dtype=dtype)
  fc_b = torch.zeros(10, dtype=dtype)

  scores = three_layer_convnet(x, [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b])
  print('Output size:', list(scores.size()))  # you should see [64, 10]
three_layer_convnet_test()

Output size: [64, 10]


In [None]:
fix_random_seed(0)

# Demonstrate the two initializers on a fresh [3 x 5] weight: Kaiming-normal
# first (it draws random numbers), then all zeros.
demo_shape = (3, 5)
print(nn.init.kaiming_normal_(torch.empty(demo_shape, dtype=dtype, device=device)))
print(nn.init.zeros_(torch.empty(demo_shape, dtype=dtype, device=device)))

tensor([[ 0.9746, -0.1856, -1.3780,  0.3595, -0.6859],
        [-0.8845,  0.2551,  0.5300, -0.4549, -0.2551],
        [-0.3773,  0.1151, -0.5418,  0.6961, -0.6775]])
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])


When checking accuracy we don't need to compute any gradients; as a result we don't need PyTorch to build a computational graph for us when we compute scores. To prevent a graph from being built we scope our computation under a **torch.no_grad()** context manager.

In [None]:
def check_accuracy_part2(loader, model_fn, params):
  """
  Print the classification accuracy of a functional model over one data split.

  Inputs:
  - loader: DataLoader for the split to evaluate; the split is reported as
    'val' when loader.dataset.train is truthy and as 'test' otherwise
  - model_fn: forward-pass function with signature scores = model_fn(x, params)
  - params: list of PyTorch Tensors holding the model parameters

  Returns: Nothing; the result is printed.
  """
  if loader.dataset.train:
    split = 'val'
  else:
    split = 'test'
  print('Checking accuracy on the %s set' % split)

  num_correct = 0
  num_samples = 0
  # Evaluation needs no gradients, so no computational graph is recorded.
  with torch.no_grad():
    for images, labels in loader:
      images = images.to(device=device, dtype=dtype)  # move to device, e.g. GPU
      labels = labels.to(device=device, dtype=ltype)
      # Predicted class = index of the highest score in each row.
      preds = model_fn(images, params).max(1)[1]
      num_correct += (preds == labels).sum()
      num_samples += preds.size(0)
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))


In [None]:
def train_part2(model_fn, params, learning_rate):
  """
  Run one pass over loader_train, updating params with plain SGD.

  Inputs:
  - model_fn: forward-pass function with signature scores = model_fn(x, params),
    where x is a Tensor of image data and scores has shape (N, C)
  - params: list of PyTorch Tensors giving the model weights; only those with
    requires_grad=True are updated
  - learning_rate: Python scalar step size for SGD

  Returns: Nothing. The loss and the validation accuracy are printed every
  print_every iterations and on the final iteration.
  """
  for t, (x, y) in enumerate(loader_train):
    # Put the minibatch on the same device / dtypes as the parameters.
    x = x.to(device=device, dtype=dtype)
    y = y.to(device=device, dtype=ltype)

    # Forward pass and loss.
    loss = F.cross_entropy(model_fn(x, params), y)

    # Backward pass: autograd fills .grad on every Tensor in the graph that
    # has requires_grad=True.
    loss.backward()

    # The update itself must not be tracked by autograd.
    with torch.no_grad():
      for w in params:
        if not w.requires_grad:
          continue
        w.sub_(learning_rate * w.grad)
        # Gradients accumulate across backward() calls, so clear them by hand
        # before the next iteration.
        w.grad.zero_()

    is_last = t == len(loader_train) - 1
    if is_last or t % print_every == 0:
      print('Iteration %d, loss = %.4f' % (t, loss.item()))
      check_accuracy_part2(loader_val, model_fn, params)
      print()

Now we are ready to run the training loop. We need to explicitly allocate tensors for the fully connected weights, w1 and w2.

Each minibatch of CIFAR has 64 examples, so the tensor shape is [64, 3, 32, 32].

After flattening, x has shape [64, 3 * 32 * 32]. The flattened feature size, 3 * 32 * 32, is the size of the second dimension of w1. The first dimension of w1 is the hidden layer size, which is also the size of the second dimension of w2.

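Finally, the output of the network is, for each example, a 10-dimensional vector of unnormalized class scores (logits); `F.cross_entropy` applies the softmax that turns these scores into a probability distribution over the 10 classes.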

You don't need to tune any hyperparameters but you should see accuracies above 40% after training for one epoch.

In [None]:
# Train the functional two-layer network for one pass over loader_train.
# Seed first so the random initialization below is repeatable.
fix_random_seed(0)

# Input image dimensions and number of target classes.
C, H, W = 3, 32, 32
num_classes = 10

hidden_layer_size = 4000
learning_rate = 1e-2

# Weights follow the F.linear layout [out_features, in_features]: Kaiming-normal
# for the weight matrices, zeros for the biases. requires_grad is switched on
# after initialization so that train_part2 will update each tensor.
w1 = nn.init.kaiming_normal_(torch.empty(hidden_layer_size, C*H*W, dtype=dtype, device=device))
w1.requires_grad = True
b1 = nn.init.zeros_(torch.empty(hidden_layer_size, dtype=dtype, device=device))
b1.requires_grad = True
w2 = nn.init.kaiming_normal_(torch.empty(num_classes, hidden_layer_size, dtype=dtype, device=device))
w2.requires_grad = True
b2 = nn.init.zeros_(torch.empty(num_classes, dtype=dtype, device=device))
b2.requires_grad = True

train_part2(two_layer_fc, [w1, b1, w2, b2], learning_rate)

Iteration 0, loss = 3.9070
Checking accuracy on the val set
Got 148 / 1000 correct (14.80%)

Iteration 100, loss = 1.9613
Checking accuracy on the val set
Got 356 / 1000 correct (35.60%)

Iteration 200, loss = 2.1789
Checking accuracy on the val set
Got 381 / 1000 correct (38.10%)

Iteration 300, loss = 1.9420
Checking accuracy on the val set
Got 394 / 1000 correct (39.40%)

Iteration 400, loss = 2.0683
Checking accuracy on the val set
Got 409 / 1000 correct (40.90%)

Iteration 500, loss = 1.7853
Checking accuracy on the val set
Got 415 / 1000 correct (41.50%)

Iteration 600, loss = 1.6518
Checking accuracy on the val set
Got 438 / 1000 correct (43.80%)

Iteration 700, loss = 2.1798
Checking accuracy on the val set
Got 413 / 1000 correct (41.30%)

Iteration 765, loss = 1.2517
Checking accuracy on the val set
Got 415 / 1000 correct (41.50%)



In [None]:
# Train the functional three-layer ConvNet for one pass over loader_train.
# Seed first so the random initialization below is repeatable.
fix_random_seed(0)

# Input image dimensions and number of target classes.
C, H, W = 3, 32, 32
num_classes = 10

channel_1 = 32
channel_2 = 16
kernel_size_1 = 5
kernel_size_2 = 3

learning_rate = 3e-3

# Convolution weights are laid out [out_channel, in_channel, kernel_H, kernel_W].
conv_w1 = nn.init.kaiming_normal_(torch.empty((channel_1, C, kernel_size_1, kernel_size_1), device = device, dtype = dtype))
conv_w1.requires_grad = True
conv_b1 = torch.zeros(size = (channel_1,), device = device, dtype = dtype)
conv_b1.requires_grad = True
conv_w2 = nn.init.kaiming_normal_(torch.empty((channel_2, channel_1, kernel_size_2, kernel_size_2), device = device, dtype = dtype))
conv_w2.requires_grad = True
conv_b2 = torch.zeros(size = (channel_2,), device = device, dtype = dtype)
conv_b2.requires_grad = True
# three_layer_convnet pads by 2 for the 5x5 kernel and by 1 for the 3x3 kernel,
# so the H x W spatial size is preserved and the linear layer receives
# channel_2 * H * W features. The tensor must be created on `device` with
# `dtype` like every other parameter; otherwise it stays on the CPU and the
# forward pass fails with a device mismatch when running on a GPU.
fc_w = nn.init.kaiming_normal_(torch.empty((num_classes, channel_2 * H * W), device = device, dtype = dtype))
fc_w.requires_grad = True
fc_b = torch.zeros(size = (num_classes,), device = device, dtype = dtype)
fc_b.requires_grad = True

params = [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]
train_part2(three_layer_convnet, params, learning_rate)


Iteration 0, loss = 3.7824
Checking accuracy on the val set
Got 98 / 1000 correct (9.80%)

Iteration 100, loss = 1.8613
Checking accuracy on the val set
Got 350 / 1000 correct (35.00%)

Iteration 200, loss = 1.9013
Checking accuracy on the val set
Got 385 / 1000 correct (38.50%)

Iteration 300, loss = 1.7742
Checking accuracy on the val set
Got 424 / 1000 correct (42.40%)

Iteration 400, loss = 1.6233
Checking accuracy on the val set
Got 444 / 1000 correct (44.40%)

Iteration 500, loss = 1.5593
Checking accuracy on the val set
Got 468 / 1000 correct (46.80%)

Iteration 600, loss = 1.5191
Checking accuracy on the val set
Got 476 / 1000 correct (47.60%)

Iteration 700, loss = 1.5770
Checking accuracy on the val set
Got 458 / 1000 correct (45.80%)

Iteration 765, loss = 1.3101
Checking accuracy on the val set
Got 487 / 1000 correct (48.70%)



In [None]:
class TwoLayerFC(nn.Module):
  """
  Two-layer fully connected classifier: Linear -> ReLU -> Linear.

  Inputs to the constructor:
  - input_size: number of features per example after flattening
  - hidden_size: width of the hidden layer
  - num_classes: number of output scores
  """
  def __init__(self, input_size, hidden_size, num_classes):
    super().__init__()
    # Layers are stored as attributes so nn.Module registers their parameters.
    self.fc1 = nn.Linear(input_size, hidden_size)
    self.fc2 = nn.Linear(hidden_size, num_classes)

    # Replace the default initialization: Kaiming-normal weights, zero biases.
    # https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_normal_
    layers = (self.fc1, self.fc2)
    for layer in layers:
      nn.init.kaiming_normal_(layer.weight)
    for layer in layers:
      nn.init.zeros_(layer.bias)

  def forward(self, x):
    """Flatten each example, then apply fc1 -> ReLU -> fc2 and return the scores."""
    hidden = F.relu(self.fc1(flatten(x)))
    return self.fc2(hidden)

def test_TwoLayerFC():
  """
  Build a small TwoLayerFC, run an all-zero batch through it, and print the
  architecture and the shape of the scores.

  The batch size is 64 so that the printed shape matches the documented
  expectation of [64, 10] (it was 42, which printed [42, 10]).
  """
  input_size = 3*16*16
  x = torch.zeros((64, input_size), dtype=dtype)  # minibatch size 64, feature dimension 3*16*16
  model = TwoLayerFC(input_size, 42, 10)
  scores = model(x)
  print('Architecture:')
  print(model) # printing `nn.Module` shows the architecture of the module.
  print('Output size:', list(scores.size()))  # you should see [64, 10]
test_TwoLayerFC()

Architecture:
TwoLayerFC(
  (fc1): Linear(in_features=768, out_features=42, bias=True)
  (fc2): Linear(in_features=42, out_features=10, bias=True)
)
Output size: [42, 10]


In [None]:
class ThreeLayerConvNet(nn.Module):
  """
  Conv(5x5, pad 2) -> ReLU -> Conv(3x3, pad 1) -> ReLU -> Linear classifier.

  Inputs to the constructor:
  - in_channel: number of channels of the input images
  - channel_1: number of filters in the first convolution
  - channel_2: number of filters in the second convolution
  - num_classes: number of output scores
  - H, W: (Optional) spatial size of the input images, 32 x 32 by default.
    Both convolutions preserve the spatial size, so the linear layer expects
    channel_2 * H * W features.
  """
  def __init__(self, in_channel, channel_1, channel_2, num_classes, H = 32, W = 32):
    super().__init__()
    # Layers are stored as attributes so nn.Module registers their parameters.
    self.cnn1 = nn.Conv2d(in_channel, channel_1, kernel_size = 5, padding = 2)
    self.cnn2 = nn.Conv2d(channel_1, channel_2, kernel_size = 3, padding = 1)
    self.fc1 = nn.Linear(channel_2 * H * W, num_classes)

    # Replace the default initialization: Kaiming-normal weights, zero biases.
    nn.init.kaiming_normal_(self.cnn1.weight)
    nn.init.kaiming_normal_(self.cnn2.weight)
    nn.init.kaiming_normal_(self.fc1.weight)

    nn.init.zeros_(self.cnn1.bias)
    nn.init.zeros_(self.cnn2.bias)
    nn.init.zeros_(self.fc1.bias)

  def forward(self, x):
    """Apply both conv+ReLU stages, flatten per example, and return the scores."""
    x = F.relu(self.cnn1(x))
    x = F.relu(self.cnn2(x))
    return self.fc1(flatten(x))

def test_ThreeLayerConvNet():
  """
  Build a small ThreeLayerConvNet, run an all-zero batch through it, and print
  the architecture and the shape of the scores.
  """
  net = ThreeLayerConvNet(in_channel=3, channel_1=12, channel_2=8, num_classes=10)
  # nn.Module layers operate on whole minibatches: 64 images of size [3, 32, 32].
  batch = torch.zeros((64, 3, 32, 32), dtype=dtype)
  out = net(batch)
  print(net) # printing `nn.Module` shows the architecture of the module.
  print('Output size:', list(out.size()))  # you should see [64, 10]
test_ThreeLayerConvNet()

ThreeLayerConvNet(
  (cnn1): Conv2d(3, 12, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (cnn2): Conv2d(12, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=8192, out_features=10, bias=True)
)
Output size: [64, 10]


In [None]:
def check_accuracy_part34(loader, model):
  """
  Evaluate an nn.Module classifier on one data split.

  Inputs:
  - loader: DataLoader for the split to evaluate; reported as the validation
    set when loader.dataset.train is truthy and as the test set otherwise
  - model: the PyTorch Module to evaluate; it is switched to eval mode and left
    in that mode

  Returns: the fraction of correctly classified examples, as a Python float.
  """
  if not loader.dataset.train:
    print('Checking accuracy on test set')
  else:
    print('Checking accuracy on validation set')

  correct, total = 0, 0
  model.eval()  # set model to evaluation mode
  # Evaluation needs no gradients, so no computational graph is recorded.
  with torch.no_grad():
    for images, labels in loader:
      images = images.to(device=device, dtype=dtype)  # move to device, e.g. GPU
      labels = labels.to(device=device, dtype=ltype)
      # Predicted class = index of the highest score in each row.
      preds = model(images).max(1)[1]
      correct += (preds == labels).sum()
      total += preds.size(0)
    acc = float(correct) / total
    print('Got %d / %d correct (%.2f)' % (correct, total, 100 * acc))
  return acc

In [None]:
def adjust_learning_rate(optimizer, lrd, epoch, schedule):
  """
  Scale the learning rate of every parameter group by lrd on scheduled epochs.

  Inputs:
  - optimizer: An Optimizer object whose param_groups hold the learning rates
  - lrd: learning rate decay; the factor applied at scheduled epochs
  - epoch: the current epoch number
  - schedule: the collection of epochs at which the learning rate is decayed

  Returns: Nothing; optimizer.param_groups is modified in place and each
  change is printed.
  """
  if epoch not in schedule:
    return
  for group in optimizer.param_groups:
    print('lr decay from {} to {}'.format(group['lr'], group['lr'] * lrd))
    group['lr'] *= lrd

def train_part345(model, optimizer, epochs=1, learning_rate_decay=.1, schedule=[], verbose=True):
  """
  Train a model on CIFAR-10 using the PyTorch Module API.

  Inputs:
  - model: A PyTorch Module giving the model to train.
  - optimizer: An Optimizer object we will use to train the model
  - epochs: (Optional) A Python integer giving the number of epochs to train for
  - learning_rate_decay: (Optional) factor passed to adjust_learning_rate at the
    start of every epoch; it is applied only on epochs listed in schedule
  - schedule: (Optional) epochs at which the learning rate is decayed. The
    default list is shared between calls but is only read here, never mutated.
  - verbose: (Optional) if True, report every print_every iterations and on the
    very last iteration; if False, report once at the end of each epoch

  Returns: A tuple (acc_history, iter_history) of 1-D tensors holding the
  validation accuracy at each report and the global iteration index of that
  report. Losses and accuracies are also printed during training.
  """
  model = model.to(device=device)  # move the model parameters to CPU/GPU
  num_iters = epochs * len(loader_train)
  # One history slot per report: iteration-based when verbose, else per epoch.
  if verbose:
    num_prints = num_iters // print_every + 1
  else:
    num_prints = epochs
  acc_history = torch.zeros(num_prints, dtype=torch.float)
  iter_history = torch.zeros(num_prints, dtype=torch.long)
  for e in range(epochs):

    adjust_learning_rate(optimizer, learning_rate_decay, e, schedule)

    for t, (x, y) in enumerate(loader_train):
      # check_accuracy_part34 leaves the model in eval mode, so training mode
      # is re-enabled at the start of every iteration.
      model.train()  # put model to training mode
      x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
      y = y.to(device=device, dtype=ltype)

      scores = model(x)
      loss = F.cross_entropy(scores, y)

      # Zero out all of the gradients for the variables which the optimizer
      # will update.
      optimizer.zero_grad()

      # This is the backwards pass: compute the gradient of the loss with
      # respect to each  parameter of the model.
      loss.backward()

      # Actually update the parameters of the model using the gradients
      # computed by the backwards pass.
      optimizer.step()

      # Global iteration index across all epochs.
      tt = t + e * len(loader_train)

      if verbose and (tt % print_every == 0 or (e == epochs - 1 and t == len(loader_train) - 1)):
        print('Epoch %d, Iteration %d, loss = %.4f' % (e, tt, loss.item()))
        acc = check_accuracy_part34(loader_val, model)
        # NOTE(review): the slot index is tt // print_every, so when the final
        # iteration is not a multiple of print_every it lands in the same slot
        # as the preceding report and overwrites it.
        acc_history[tt // print_every] = acc
        iter_history[tt // print_every] = tt
        print()
      elif not verbose and (t == len(loader_train) - 1):
        print('Epoch %d, Iteration %d, loss = %.4f' % (e, tt, loss.item()))
        acc = check_accuracy_part34(loader_val, model)
        acc_history[e] = acc
        iter_history[e] = tt
        print()
  return acc_history, iter_history

In [None]:
# Train the Module-API two-layer network for one epoch with SGD + weight decay.
# Seed first so the random initialization inside TwoLayerFC is repeatable.
fix_random_seed(0)

# Input image dimensions and number of target classes.
C, H, W = 3, 32, 32
num_classes = 10

hidden_layer_size = 4000
learning_rate = 1e-2
weight_decay = 1e-4

# The model consumes flattened images, hence C*H*W input features.
model = TwoLayerFC(C*H*W, hidden_layer_size, num_classes)

optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                      weight_decay=weight_decay)

# The returned accuracy / iteration histories are not needed here.
_ = train_part345(model, optimizer)

Epoch 0, Iteration 0, loss = 3.3988
Checking accuracy on validation set
Got 139 / 1000 correct (13.90)

Epoch 0, Iteration 100, loss = 2.9728
Checking accuracy on validation set
Got 328 / 1000 correct (32.80)

Epoch 0, Iteration 200, loss = 2.1082
Checking accuracy on validation set
Got 335 / 1000 correct (33.50)

Epoch 0, Iteration 300, loss = 2.1665
Checking accuracy on validation set
Got 433 / 1000 correct (43.30)

Epoch 0, Iteration 400, loss = 1.9812
Checking accuracy on validation set
Got 433 / 1000 correct (43.30)

Epoch 0, Iteration 500, loss = 1.7855
Checking accuracy on validation set
Got 447 / 1000 correct (44.70)

Epoch 0, Iteration 600, loss = 2.1078
Checking accuracy on validation set
Got 471 / 1000 correct (47.10)

Epoch 0, Iteration 700, loss = 1.6742
Checking accuracy on validation set
Got 469 / 1000 correct (46.90)

Epoch 0, Iteration 765, loss = 1.6839
Checking accuracy on validation set
Got 408 / 1000 correct (40.80)



In [None]:
# Train the Module-API three-layer ConvNet for one epoch with SGD + weight decay.
# Seed first so the random initialization inside ThreeLayerConvNet is repeatable.
fix_random_seed(0)

# Number of input channels and number of target classes.
C = 3
num_classes = 10

# Number of filters in the first and second convolution.
channel_1 = 32
channel_2 = 16

learning_rate = 3e-3
weight_decay = 1e-4

model = ThreeLayerConvNet(C, channel_1, channel_2, num_classes)

optimizer = optim.SGD(model.parameters(), lr = learning_rate,
                      weight_decay = weight_decay)

# The returned accuracy / iteration histories are not needed here.
_ = train_part345(model, optimizer)

Epoch 0, Iteration 0, loss = 3.5309
Checking accuracy on validation set
Got 120 / 1000 correct (12.00)

Epoch 0, Iteration 100, loss = 1.9052
Checking accuracy on validation set
Got 354 / 1000 correct (35.40)

Epoch 0, Iteration 200, loss = 1.7804
Checking accuracy on validation set
Got 410 / 1000 correct (41.00)

Epoch 0, Iteration 300, loss = 1.7490
Checking accuracy on validation set
Got 459 / 1000 correct (45.90)

Epoch 0, Iteration 400, loss = 1.3964
Checking accuracy on validation set
Got 456 / 1000 correct (45.60)

Epoch 0, Iteration 500, loss = 1.6616
Checking accuracy on validation set
Got 480 / 1000 correct (48.00)

Epoch 0, Iteration 600, loss = 1.3306
Checking accuracy on validation set
Got 488 / 1000 correct (48.80)

Epoch 0, Iteration 700, loss = 1.6037
Checking accuracy on validation set
Got 505 / 1000 correct (50.50)

Epoch 0, Iteration 765, loss = 1.5525
Checking accuracy on validation set
Got 499 / 1000 correct (49.90)



In [None]:
# Train a two-layer network built with nn.Sequential, using SGD with Nesterov
# momentum. Seed first so the random initialization is repeatable.
fix_random_seed(0)

# Input image dimensions and number of target classes.
C, H, W = 3, 32, 32
num_classes = 10

hidden_layer_size = 4000
learning_rate = 1e-2
momentum = 0.5
# Defined here explicitly so the cell does not depend on a value left behind
# by an earlier cell; 1e-4 is the value the other training cells use.
weight_decay = 1e-4

# To give a specific name to each module, use OrderedDict.
model = nn.Sequential(OrderedDict([
  ('flatten', nn.Flatten()),
  ('fc1', nn.Linear(C*H*W, hidden_layer_size)),
  ('relu1', nn.ReLU()),
  ('fc2', nn.Linear(hidden_layer_size, num_classes)),
]))

print('Architecture:')
print(model) # printing `nn.Module` shows the architecture of the module.

# you can use Nesterov momentum in optim.SGD
optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                      weight_decay=weight_decay,
                      momentum=momentum, nesterov=True)

# The returned accuracy / iteration histories are not needed here.
_ = train_part345(model, optimizer)

In [None]:
# Train a three-layer ConvNet built with nn.Sequential, using SGD with Nesterov
# momentum. Seed first so the random initialization is repeatable.
fix_random_seed(0)

# Input image dimensions and number of target classes.
C, H, W = 3, 32, 32
num_classes = 10

channel_1 = 32
channel_2 = 16
kernel_size_1 = 5
pad_size_1 = 2
kernel_size_2 = 3
pad_size_2 = 1

learning_rate = 1e-2
momentum = 0.5
weight_decay = 1e-4

# The layers are built from the hyperparameters declared above rather than
# from repeated literals, so changing a value there actually changes the model.
# With these kernel / padding pairs each convolution keeps the H x W spatial
# size, which is why the linear layer takes channel_2 * H * W features.
model = nn.Sequential(OrderedDict([
   ('conv1', nn.Conv2d(C, channel_1, kernel_size = kernel_size_1, padding = pad_size_1)),
   ('relu1', nn.ReLU()),
   ('conv2', nn.Conv2d(channel_1, channel_2, kernel_size = kernel_size_2, padding = pad_size_2)),
   ('relu2', nn.ReLU()),
   ('flatten', nn.Flatten()),
   ('linear' , nn.Linear(channel_2 * H * W, num_classes))
]))
optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                      weight_decay=weight_decay,
                      momentum=momentum, nesterov=True)
# The returned accuracy / iteration histories are not needed here.
_ = train_part345(model, optimizer)

Epoch 0, Iteration 0, loss = 2.2960
Checking accuracy on validation set
Got 136 / 1000 correct (13.60)

Epoch 0, Iteration 100, loss = 1.6669
Checking accuracy on validation set
Got 382 / 1000 correct (38.20)

Epoch 0, Iteration 200, loss = 1.4600
Checking accuracy on validation set
Got 479 / 1000 correct (47.90)

Epoch 0, Iteration 300, loss = 1.7128
Checking accuracy on validation set
Got 482 / 1000 correct (48.20)

Epoch 0, Iteration 400, loss = 1.6087
Checking accuracy on validation set
Got 490 / 1000 correct (49.00)

Epoch 0, Iteration 500, loss = 1.4321
Checking accuracy on validation set
Got 499 / 1000 correct (49.90)

Epoch 0, Iteration 600, loss = 1.5028
Checking accuracy on validation set
Got 543 / 1000 correct (54.30)

Epoch 0, Iteration 700, loss = 1.3407
Checking accuracy on validation set
Got 543 / 1000 correct (54.30)

Epoch 0, Iteration 765, loss = 1.3140
Checking accuracy on validation set
Got 533 / 1000 correct (53.30)



In [None]:
class PlainBlock(nn.Module):
  """
  Plain (non-residual) block: BatchNorm -> ReLU -> Conv3x3 -> BatchNorm -> ReLU -> Conv3x3.

  Inputs to the constructor:
  - Cin: number of input channels
  - Cout: number of output channels
  - downsample: if truthy, the first convolution uses stride 2; otherwise both
    convolutions use stride 1
  """
  def __init__(self, Cin, Cout, downsample = False):
    super().__init__()

    # Only the first convolution may reduce the spatial resolution.
    stride = 2 if downsample else 1
    self.net = nn.Sequential(
        nn.BatchNorm2d(num_features = Cin),
        nn.ReLU(),
        nn.Conv2d(Cin, Cout, kernel_size = 3, padding = 1, stride = stride),
        nn.BatchNorm2d(num_features = Cout),
        nn.ReLU(),
        nn.Conv2d(Cout, Cout, kernel_size = 3, padding = 1)
    )

  def forward(self, x):
    """Apply the block to x and return the result."""
    return self.net(x)


In [None]:
# Shape check for PlainBlock: stride 1 keeps the 5 x 6 spatial size.
data = torch.zeros(2, 3, 5, 6)
model = PlainBlock(3, 10)
expected, actual = [2, 10, 5, 6], list(model(data).shape)
if actual != expected:
  print('The output of PlainBlock without downsampling has an *incorrect* dimension! expected:', expected, 'got:', actual)
else:
  print('The output of PlainBlock without downsampling has a *correct* dimension!')

# With downsampling the stride-2 convolution reduces 5 x 6 to 3 x 3.
data = torch.zeros(2, 3, 5, 6)
model = PlainBlock(3, 10, downsample=True)
expected, actual = [2, 10, 3, 3], list(model(data).shape)
if actual != expected:
  print('The output of PlainBlock with downsampling has an *incorrect* dimension! expected:', expected, 'got:', actual)
else:
  print('The output of PlainBlock with downsampling has a *correct* dimension!')

The output of PlainBlock without downsampling has a *correct* dimension!
The output of PlainBlock with downsampling has a *correct* dimension!


In [None]:
class ResidualBlock(nn.Module):
  """
  Residual block: the output is block(x) + shortcut(x).

  The main path is BatchNorm -> ReLU -> Conv3x3 -> BatchNorm -> ReLU -> Conv3x3.
  The shortcut is chosen so that its output shape matches the main path:
  - downsample: 1x1 convolution with stride 2 (also when Cin == Cout, because
    the main path halves the spatial size and an identity would not match)
  - no downsample, Cin != Cout: 1x1 convolution with stride 1
  - no downsample, Cin == Cout: identity

  Inputs to the constructor:
  - Cin: number of input channels
  - Cout: number of output channels
  - downsample: if truthy, the block reduces the spatial resolution with stride 2
  """
  def __init__(self, Cin, Cout, downsample = False):
    super().__init__()

    # Only the first convolution of the main path may reduce the resolution.
    stride = 2 if downsample else 1

    self.block = nn.Sequential(
        nn.BatchNorm2d(num_features = Cin),
        nn.ReLU(),
        nn.Conv2d(Cin, Cout, kernel_size = 3, padding = 1, stride = stride),
        nn.BatchNorm2d(num_features = Cout),
        nn.ReLU(),
        nn.Conv2d(Cout, Cout, kernel_size = 3, padding = 1))

    # The downsample case is checked first: a strided main path always needs a
    # strided shortcut, regardless of whether the channel count changes.
    if downsample:
      self.shortcut = nn.Conv2d(Cin, Cout, kernel_size = 1, stride = 2)
    elif Cin != Cout:
      self.shortcut = nn.Conv2d(Cin, Cout, kernel_size = 1, stride = 1)
    else:
      self.shortcut = nn.Identity()

  def forward(self, x):
    """Return the sum of the main path and the shortcut applied to x."""
    return self.block(x) + self.shortcut(x)

In [None]:
# Shape check for ResidualBlock: stride 1 keeps the 5 x 6 spatial size.
data = torch.zeros(2, 3, 5, 6)
model = ResidualBlock(3, 10)
expected, actual = [2, 10, 5, 6], list(model(data).shape)
if actual != expected:
  print('The output of ResidualBlock without downsampling has an *incorrect* dimension! expected:', expected, 'got:', actual)
else:
  print('The output of ResidualBlock without downsampling has a *correct* dimension!')

# With downsampling both the main path and the shortcut reduce 5 x 6 to 3 x 3.
data = torch.zeros(2, 3, 5, 6)
model = ResidualBlock(3, 10, downsample=True)
expected, actual = [2, 10, 3, 3], list(model(data).shape)
if actual != expected:
  print('The output of ResidualBlock with downsampling has an *incorrect* dimension! expected:', expected, 'got:', actual)
else:
  print('The output of ResidualBlock with downsampling has a *correct* dimension!')

The output of ResidualBlock without downsampling has a *correct* dimension!
The output of ResidualBlock with downsampling has a *correct* dimension!


In [None]:
class ResNetStage(nn.Module):
  """
  A stack of num_blocks blocks of the same type.

  Inputs to the constructor:
  - Cin: number of input channels of the stage
  - Cout: number of output channels of the stage
  - num_blocks: number of blocks in the stage
  - downsample: forwarded to the first block only
  - block: class used to build each block, called as block(Cin, Cout, downsample)
  """
  def __init__(self, Cin, Cout, num_blocks, downsample=True,
               block=ResidualBlock):
    super().__init__()
    # Only the first block changes the channel count (and optionally the
    # resolution); the remaining blocks map Cout -> Cout with default settings.
    first = block(Cin, Cout, downsample)
    rest = [block(Cout, Cout) for _ in range(num_blocks - 1)]
    self.net = nn.Sequential(first, *rest)

  def forward(self, x):
    """Apply every block of the stage in order."""
    return self.net(x)

In [None]:
# Print the module tree of a two-block stage (3 -> 4 channels) per block type.
for title, block_cls in (('Plain block stage:', PlainBlock),
                         ('Residual block stage:', ResidualBlock)):
  print(title)
  print(ResNetStage(3, 4, 2, block=block_cls))

Plain block stage:
ResNetStage(
  (net): Sequential(
    (0): PlainBlock(
      (net): Sequential(
        (0): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): ReLU()
        (2): Conv2d(3, 4, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (3): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (4): ReLU()
        (5): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
    (1): PlainBlock(
      (net): Sequential(
        (0): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): ReLU()
        (2): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (4): ReLU()
        (5): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
    )
  )
)
Residual block stage:
ResNetStage(
  (net): Sequential(
    (0): 

In [None]:
class ResNetStem(nn.Module):
  """
  Network stem: a single 3x3 convolution (stride 1, padding 1) followed by ReLU.

  Inputs to the constructor:
  - Cin: number of input channels (default 3)
  - Cout: number of output channels (default 8)
  """
  def __init__(self, Cin = 3, Cout = 8):
    super().__init__()
    self.net = nn.Sequential(
        nn.Conv2d(Cin, Cout, kernel_size = 3, padding = 1, stride = 1),
        nn.ReLU(),
    )

  def forward(self, x):
    """Apply the convolution and the ReLU to x."""
    return self.net(x)

In [None]:
# example of specifications
# Each entry holds the keyword arguments that get_resnet passes to ResNet:
# - 'block': the block class used to build every stage
# - 'stage_args': one tuple per stage, in the positional order of ResNetStage:
#   (Cin, Cout, num_blocks, downsample)
networks = {
  'plain32': {
    'block': PlainBlock,
    'stage_args': [
      (8, 8, 5, False),
      (8, 16, 5, True),
      (16, 32, 5, True),
    ]
  },
  'resnet32': {
    'block': ResidualBlock,
    'stage_args': [
      (8, 8, 5, False),
      (8, 16, 5, True),
      (16, 32, 5, True),
    ]
  },
}

In [None]:
class ResNet(nn.Module):
  """
  Stem -> stages -> global average pooling -> linear classifier.

  Inputs to the constructor:
  - stage_args: list of tuples (Cin, Cout, num_blocks, downsample), one per
    stage, passed positionally to ResNetStage
  - Cin: number of channels of the input images
  - block: block class used inside every stage (e.g. ResidualBlock or PlainBlock)
  - num_classes: number of output scores
  """
  def __init__(self, stage_args, Cin = 3, block = ResidualBlock, num_classes = 10):
    super().__init__()
    # The stem maps the image channels to the width expected by the first stage.
    layers = [ResNetStem(Cin, stage_args[0][0])]
    # Number of stages that halve the resolution; no longer needed by forward,
    # kept as an attribute for any code that reads it.
    self.downsample_count = sum(1 for stage in stage_args if stage[3])
    # Every block type is built through ResNetStage, so only the first block
    # of a stage changes the channel count / resolution. Building each block
    # as block(Cin, Cout, downsample) breaks any stage with Cin != Cout.
    for stage in stage_args:
      layers.append(ResNetStage(*stage, block=block))
    self.cnn = nn.Sequential(*layers)
    self.fc = nn.Linear(stage_args[-1][1], num_classes)

  def forward(self, x):
    """Return class scores of shape (N, num_classes) for a batch of images x."""
    x = self.cnn(x)
    # Global average pooling to 1 x 1 per channel. Pooling adaptively avoids
    # deriving a kernel size from the input height, which assumed square
    # inputs whose size halves exactly at every downsampling stage.
    x = F.adaptive_avg_pool2d(x, 1)
    return self.fc(flatten(x))

def get_resnet(name):
  """Build the ResNet described by the `networks` entry registered as `name`.

  The entry's items are forwarded to ResNet as keyword arguments. An unknown
  name raises KeyError from the dictionary lookup.
  """
  spec = networks[name]
  return ResNet(**spec)


In [None]:
# Optional explicit weight re-initialization. Currently disabled: both this
# definition and its call inside the training loop are commented out.
# def init_module(model):
#   for m in model.modules():
#     if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
#       nn.init.kaiming_normal_(m.weight.data)
#       if m.bias is not None: nn.init.zeros_(m.bias.data)
#     elif isinstance(m, nn.BatchNorm2d):
#       nn.init.ones_(m.weight.data)
#       if m.bias is not None: nn.init.zeros_(m.bias.data)

# Architectures to train; each name is looked up through get_resnet().
names = ['resnet32']
# Values returned by train_part345 for each architecture, keyed by name.
acc_history_dict = {}
iter_history_dict = {}
for name in names:
  # Re-seed before building each model (presumably so runs are repeatable;
  # fix_random_seed comes from coutils -- confirm what it seeds).
  fix_random_seed(0)
  print(name, '\n')
  model = get_resnet(name)
#   init_module(model)

  # SGD with momentum 0.9 and weight decay 1e-4, starting at lr = 1e-2.
  optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=.9, weight_decay=1e-4)

  # NOTE(review): train_part345 is defined in another cell. schedule=[6, 8]
  # appears to list the epochs at which the learning rate is decayed (the
  # recorded output shows 10x decays at epochs 6 and 8) -- confirm.
  acc_history, iter_history = train_part345(model, optimizer, epochs=10, schedule=[6, 8], verbose=False)
  acc_history_dict[name] = acc_history
  iter_history_dict[name] = iter_history

resnet32 

(8, 8, 5, False)
(8, 16, 5, True)
(16, 32, 5, True)
Epoch 0, Iteration 765, loss = 1.1459
Checking accuracy on validation set
Got 559 / 1000 correct (55.90)

Epoch 1, Iteration 1531, loss = 1.1654
Checking accuracy on validation set
Got 592 / 1000 correct (59.20)

Epoch 2, Iteration 2297, loss = 1.0086
Checking accuracy on validation set
Got 656 / 1000 correct (65.60)

Epoch 3, Iteration 3063, loss = 0.5333
Checking accuracy on validation set
Got 658 / 1000 correct (65.80)

Epoch 4, Iteration 3829, loss = 0.7034
Checking accuracy on validation set
Got 712 / 1000 correct (71.20)

Epoch 5, Iteration 4595, loss = 0.7803
Checking accuracy on validation set
Got 708 / 1000 correct (70.80)

lr decay from 0.01 to 0.001
Epoch 6, Iteration 5361, loss = 0.5074
Checking accuracy on validation set
Got 798 / 1000 correct (79.80)

Epoch 7, Iteration 6127, loss = 0.5070
Checking accuracy on validation set
Got 805 / 1000 correct (80.50)

lr decay from 0.001 to 0.0001
Epoch 8, Iteration 6893,

In [None]:
class ResidualBottleneckBlock(nn.Module):
  """Bottleneck residual block built from BatchNorm -> ReLU -> Conv triples.

  The residual branch (self.net) is
    BN, ReLU, 1x1 conv (Cin -> Cout // 4, stride 2 when downsampling),
    BN, ReLU, 3x3 conv (Cout // 4 -> Cout // 4, padding 1),
    BN, ReLU, 1x1 conv (Cout // 4 -> Cout),
  and its output is added to self.shortcut(x).

  Args:
    Cin: number of input channels.
    Cout: number of output channels; the inner convolutions use Cout // 4.
    downsample: if True, the first 1x1 convolution and the shortcut use
      stride 2, so H and W become ceil(H / 2) and ceil(W / 2).
  """
  def __init__(self, Cin, Cout, downsample = False):
    super().__init__()

    # Unused placeholder, retained only so the attribute keeps existing;
    # the layers of the residual branch live in self.net.
    self.block = None

    stride = 2 if downsample else 1
    mid = Cout // 4
    self.net = nn.Sequential(
        nn.BatchNorm2d(Cin),
        nn.ReLU(),
        nn.Conv2d(Cin, mid, kernel_size = 1, stride = stride),
        nn.BatchNorm2d(mid),
        nn.ReLU(),
        nn.Conv2d(mid, mid, kernel_size = 3, padding = 1),
        nn.BatchNorm2d(mid),
        nn.ReLU(),
        nn.Conv2d(mid, Cout, kernel_size = 1),
    )
    # The shortcut must produce exactly the shape of self.net(x). Identity is
    # only valid when neither the channel count nor the resolution changes;
    # with Cin == Cout but downsample=True a strided 1x1 conv is still needed.
    if Cin == Cout and not downsample:
      self.shortcut = nn.Identity()
    else:
      self.shortcut = nn.Conv2d(Cin, Cout, kernel_size = 1, stride = stride)

  def forward(self, x):
    """Return shortcut(x) + net(x)."""
    return self.shortcut(x) + self.net(x)

In [None]:
# Shape checks for ResidualBottleneckBlock on a (2, 3, 5, 6) input.
# Without downsampling only the channel count changes (3 -> 10).
data = torch.zeros(2, 3, 5, 6)
model = ResidualBottleneckBlock(3, 10)
out_shape = list(model(data).shape)  # run the forward pass once and reuse the shape
if out_shape == [2, 10, 5, 6]:
  print('The output of ResidualBottleneckBlock without downsampling has a *correct* dimension!')
else:
  print('The output of ResidualBottleneckBlock without downsampling has an *incorrect* dimension! expected:', [2, 10, 5, 6], 'got:', out_shape)

# With downsampling the stride-2 convolutions also shrink H and W: 5x6 -> 3x3.
data = torch.zeros(2, 3, 5, 6)
model = ResidualBottleneckBlock(3, 10, downsample=True)
out_shape = list(model(data).shape)
if out_shape == [2, 10, 3, 3]:
  print('The output of ResidualBottleneckBlock with downsampling has a *correct* dimension!')
else:
  print('The output of ResidualBottleneckBlock with downsampling has an *incorrect* dimension! expected:', [2, 10, 3, 3], 'got:', out_shape)

The output of ResidualBlock without downsampling has a *correct* dimension!
The output of ResidualBlock with downsampling has a *correct* dimension!


In [None]:
# example of specification
# Register the bottleneck variant next to the existing entries. Its stage
# tuples follow the same (Cin, Cout, num_blocks, downsample) layout.
networks['resnet47'] = {
  'block': ResidualBottleneckBlock,
  'stage_args': [
    (32, 32, 5, False),
    (32, 64, 5, True),
    (64, 128, 5, True),
  ],
}

print(get_resnet('resnet47'))

ResNet(
  (cnn): Sequential(
    (0): ResNetStem(
      (net): Sequential(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU()
      )
    )
    (1): ResidualBottleneckBlock(
      (net): Sequential(
        (0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): ReLU()
        (2): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
        (3): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (4): ReLU()
        (5): Conv2d(8, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (6): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (7): ReLU()
        (8): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
      )
      (shortcut): Identity()
    )
    (2): ResidualBottleneckBlock(
      (net): Sequential(
        (0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): ReLU()
 