In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import numpy as np

In [2]:
# Number of images from the CIFAR-10 training split that are used for training;
# the remaining images (indices NUM_TRAIN .. 49999) form the validation set.
NUM_TRAIN = 49000

# Preprocessing applied to every image: convert it to a tensor, then normalize
# each RGB channel with the hardcoded per-channel mean and standard deviation.
transform = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# A Dataset yields one example at a time; the DataLoader wrapped around it
# groups examples into minibatches. The train and val loaders both read the
# CIFAR-10 training split and are kept disjoint by the index ranges given to
# their samplers.
cifar10_train = dset.CIFAR10(root='./cs231n/datasets', train=True,
                             download=True, transform=transform)
loader_train = DataLoader(
    dataset=cifar10_train,
    batch_size=64,
    sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)),
)

cifar10_val = dset.CIFAR10(root='./cs231n/datasets', train=True,
                           download=True, transform=transform)
loader_val = DataLoader(
    dataset=cifar10_val,
    batch_size=64,
    sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, 50000)),
)

# The test split is iterated in its stored order (no sampler).
cifar10_test = dset.CIFAR10(root='./cs231n/datasets', train=False,
                            download=True, transform=transform)
loader_test = DataLoader(dataset=cifar10_test, batch_size=64)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cs231n/datasets/cifar-10-python.tar.gz
Files already downloaded and verified
Files already downloaded and verified


In [3]:
# Set to False to force CPU execution even when a GPU is present.
USE_GPU = True

# All floating-point tensors in this notebook use 32-bit floats.
dtype = torch.float32

# CUDA is selected only when it is both requested and actually available.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# Number of iterations between training-loss printouts.
print_every = 100

print('using device:', device)

using device: cuda


In [5]:
def flatten(x):
  """
  Collapse every dimension after the first into a single dimension.

  Inputs:
  - x: A Tensor whose first dimension is the batch dimension, e.g. [N, C, H, W].

  Returns: A Tensor of shape [N, D], where D is the product of the remaining
  dimensions of x (e.g. [N, C*H*W]).
  """
  N = x.shape[0]
  # reshape() returns a view whenever view() would have succeeded and falls
  # back to a copy otherwise, so non-contiguous inputs (e.g. the result of
  # permute/transpose) no longer raise a RuntimeError.
  return x.reshape(N, -1)

# Quick check: a [2, 1, 3, 2] tensor should come out of flatten() as [2, 6].
x = torch.arange(12).reshape(2, 1, 3, 2)
print(x)
flatten_x = flatten(x)
print(flatten_x)

tensor([[[[ 0,  1],
          [ 2,  3],
          [ 4,  5]]],


        [[[ 6,  7],
          [ 8,  9],
          [10, 11]]]])
tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11]])


In [7]:

import torch.nn.functional as F
def two_layer_fc(x, params):
  """
  Forward pass of a two-layer fully-connected network without biases:
  flatten -> matrix multiply -> ReLU -> matrix multiply.

  Inputs:
  - x: A minibatch Tensor; it is flattened to [N, D] before the first multiply.
  - params: A sequence [w1, w2] of weight Tensors. w1 is right-multiplied onto
    the flattened input and w2 onto the hidden activations, so presumably
    w1 is [D, H] and w2 is [H, C] -- confirm against the caller.

  Returns: The Tensor of scores produced by the second matrix multiply.
  """
  flat = flatten(x)
  w1, w2 = params

  hidden = F.relu(torch.mm(flat, w1))
  return torch.mm(hidden, w2)

# Shape check for two_layer_fc using all-zero inputs and weights.
h = 42  # hidden layer width
x = torch.zeros(64, 50, dtype=dtype)  # minibatch of 64 examples, 50 features each
w1 = torch.zeros(50, h, dtype=dtype)
w2 = torch.zeros(h, 10, dtype=dtype)
scores = two_layer_fc(x, params=[w1, w2])
print(scores.size())

torch.Size([64, 10])


In [0]:
def three_layer_convnet(x, params):
  """
  Forward pass of a three-layer ConvNet:
  conv (padding 2) -> ReLU -> conv (padding 1) -> ReLU -> flatten -> affine.

  Inputs:
  - x: A minibatch of images, passed directly to F.conv2d.
  - params: A sequence [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b] holding
    the weight and bias of the two conv layers followed by the weight and bias
    of the fully-connected layer. fc_w is right-multiplied onto the flattened
    activations.

  Returns: The Tensor of class scores from the fully-connected layer.
  """
  conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b = params

  hidden = F.relu(F.conv2d(x, conv_w1, conv_b1, padding=2))
  hidden = F.relu(F.conv2d(hidden, conv_w2, conv_b2, padding=1))
  return flatten(hidden).mm(fc_w) + fc_b

In [12]:
def three_layer_convnet_test():
    """Run three_layer_convnet on all-zero data and parameters and print the shape of the scores."""
    # Minibatch of 64 images, each of size [3, 32, 32].
    images = torch.zeros(64, 3, 32, 32, dtype=dtype)

    # Conv weights are laid out [out_channel, in_channel, kernel_H, kernel_W];
    # each bias has one entry per output channel.
    params = [
        torch.zeros(6, 3, 5, 5, dtype=dtype),
        torch.zeros(6),
        torch.zeros(9, 6, 3, 3, dtype=dtype),
        torch.zeros(9),
        # The FC input size is the flattened activation size after both conv
        # layers: 9 channels at 32 x 32.
        torch.zeros(9 * 32 * 32, 10),
        torch.zeros(10),
    ]

    print(three_layer_convnet(images, params).size())  # you should see [64, 10]


three_layer_convnet_test()

torch.Size([64, 10])


In [13]:


def random_weight(shape):
    """
    Draw a weight Tensor with Kaiming-normal scaling, i.e. standard-normal
    samples multiplied by sqrt(2 / fan_in), and mark it as requiring gradients.

    Inputs:
    - shape: The shape of the weight. A 2-element shape is treated as an FC
      weight whose fan_in is shape[0]; anything else is treated as a conv
      weight [out_channel, in_channel, kH, kW] whose fan_in is the product of
      all dimensions after the first.

    Returns: A Tensor of the given shape on the notebook's device / dtype with
    requires_grad=True.
    """
    is_fc_weight = len(shape) == 2
    fan_in = shape[0] if is_fc_weight else np.prod(shape[1:])
    scale = np.sqrt(2. / fan_in)
    w = torch.randn(shape, device=device, dtype=dtype) * scale
    w.requires_grad_(True)
    return w

def zero_weight(shape):
    """Return an all-zero Tensor of the given shape on the notebook's device / dtype with requires_grad=True."""
    w = torch.zeros(shape, device=device, dtype=dtype)
    return w.requires_grad_(True)

# Build a [3 x 5] weight as a quick check. When the GPU is in use the
# displayed tensor reports a CUDA device; on CPU no device is shown.
random_weight(shape=(3, 5))



tensor([[-1.4078, -0.7440,  0.4925, -1.2667, -0.2544],
        [-0.2470,  1.1927, -0.0288,  1.3423,  0.1408],
        [ 1.2643, -1.5319,  0.1252, -1.2128,  0.6428]], device='cuda:0',
       requires_grad=True)

In [0]:
def check_accuracy(loader,model,params):
  """
  Evaluate a functional-style model on one data split and print its accuracy.

  Inputs:
  - loader: A DataLoader (or any iterable of (x, y) minibatches) for the data
    split we want to check.
  - model: A function that performs the forward pass; it is called as
    model(x, params) and must return a Tensor of scores whose dimension 1
    indexes the classes.
  - params: The model parameters, forwarded unchanged to model.

  Returns: Nothing, but prints the number and percentage of correct predictions.
  """
  num_correct, num_samples = 0, 0
  with torch.no_grad():  # evaluation only, so no autograd graph is needed
    for x, y in loader:
      x = x.to(device=device, dtype=dtype)
      y = y.to(device=device, dtype=torch.int64)
      scores = model(x, params)
      _, preds = scores.max(1)  # predicted class = index of the highest score
      num_correct += (preds == y).sum()
      num_samples += preds.size(0)
  num_correct = int(num_correct)
  # An empty loader would otherwise raise ZeroDivisionError; report 0% instead.
  acc = float(num_correct) / num_samples if num_samples > 0 else 0.0
  print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))
  

In [0]:
def train(model,params,lr):
  """
  Train a functional-style model for one pass over loader_train with plain SGD.

  Inputs:
  - model: A function that performs the forward pass; called as model(x, params).
  - params: A list of Tensors with requires_grad=True; they are updated in place.
  - lr: The learning rate used for every SGD step.

  Returns: Nothing, but prints the loss and the validation accuracy every
  print_every iterations.
  """
  for t, (x, y) in enumerate(loader_train):
    x = x.to(device=device, dtype=dtype)
    y = y.to(device=device, dtype=torch.long)

    # Forward pass and loss.
    scores = model(x, params)
    loss = F.cross_entropy(scores, y)

    # Backward pass: populates w.grad for every parameter.
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
      for w in params:
        # Use the lr argument; the original read the notebook-level
        # learning_rate global here and silently ignored lr.
        w -= lr * w.grad

        # Gradients accumulate across backward() calls, so clear them.
        w.grad.zero_()

    if t % print_every == 0:
      print('Iteration %d, loss = %.4f' % (t, loss.item()))
      check_accuracy(loader_val, model, params)
      print()

        

In [25]:


# Train the two-layer fully-connected net: a flattened 3 x 32 x 32 image goes
# through a hidden layer and then to 10 class scores.
hidden_layer_size = 4000
learning_rate = 1e-2

w1 = random_weight(shape=(3 * 32 * 32, hidden_layer_size))
w2 = random_weight(shape=(hidden_layer_size, 10))

train(model=two_layer_fc, params=[w1, w2], lr=learning_rate)



Iteration 0, loss = 3.2052
Got 156 / 1000 correct (15.60%)

Iteration 100, loss = 2.1699
Got 363 / 1000 correct (36.30%)

Iteration 200, loss = 2.4274
Got 294 / 1000 correct (29.40%)

Iteration 300, loss = 2.1906
Got 386 / 1000 correct (38.60%)

Iteration 400, loss = 2.0002
Got 409 / 1000 correct (40.90%)

Iteration 500, loss = 1.6979
Got 385 / 1000 correct (38.50%)

Iteration 600, loss = 1.7553
Got 418 / 1000 correct (41.80%)

Iteration 700, loss = 1.2617
Got 434 / 1000 correct (43.40%)



In [32]:
learning_rate = 3e-3

channel_1 = 32  # number of filters in the first conv layer
channel_2 = 16  # number of filters in the second conv layer

# Initialize the parameters of the three-layer ConvNet.
# Conv weights are laid out [out_channel, in_channel, kH, kW].
conv_w1 = random_weight((channel_1, 3, 5, 5))
conv_b1 = zero_weight((channel_1,))
# The second conv layer consumes the first layer's output, so its in_channel
# must follow channel_1 (this was a hard-coded 32 that would break as soon as
# channel_1 is changed).
conv_w2 = random_weight((channel_2, channel_1, 3, 3))
conv_b2 = zero_weight((channel_2,))
# FC weight is laid out [in_features, out_features]. The paddings used by
# three_layer_convnet (2 for the 5x5 kernel, 1 for the 3x3 kernel) keep the
# 32 x 32 spatial size, hence channel_2 * 32 * 32 input features.
fc_w = random_weight((channel_2 * 32 * 32, 10))
fc_b = zero_weight((10,))

params = [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]
train(three_layer_convnet, params, learning_rate)

Iteration 0, loss = 2.8519
Got 101 / 1000 correct (10.10%)

Iteration 100, loss = 1.8218
Got 342 / 1000 correct (34.20%)

Iteration 200, loss = 1.8742
Got 374 / 1000 correct (37.40%)

Iteration 300, loss = 1.6840
Got 405 / 1000 correct (40.50%)

Iteration 400, loss = 1.4674
Got 429 / 1000 correct (42.90%)

Iteration 500, loss = 1.8531
Got 449 / 1000 correct (44.90%)

Iteration 600, loss = 1.3403
Got 443 / 1000 correct (44.30%)

Iteration 700, loss = 1.7381
Got 462 / 1000 correct (46.20%)



In [39]:
class ThreeLayerConvNet(nn.Module):
    """
    Three-layer ConvNet built with the Module API:
    5x5 conv (padding 2) -> ReLU -> 3x3 conv (padding 1) -> ReLU -> flatten -> linear.

    Every layer's weight is initialized with nn.init.kaiming_normal_ and every
    bias is set to zero. The linear layer expects channel_2 * 32 * 32 input
    features.
    """

    def __init__(self, in_channel, channel_1, channel_2, num_classes):
        """
        Inputs:
        - in_channel: Number of channels of the input images.
        - channel_1: Number of output channels of the first conv layer.
        - channel_2: Number of output channels of the second conv layer.
        - num_classes: Number of scores produced per example.
        """
        super().__init__()
        # Each layer is created and initialized before the next one is built,
        # which keeps the order of random draws fixed.
        self.conv1 = self._kaiming_init(
            nn.Conv2d(in_channel, channel_1, kernel_size=5, padding=2, bias=True))
        self.conv2 = self._kaiming_init(
            nn.Conv2d(channel_1, channel_2, kernel_size=3, padding=1, bias=True))
        self.fc = self._kaiming_init(nn.Linear(channel_2 * 32 * 32, num_classes))

    @staticmethod
    def _kaiming_init(layer):
        """Apply Kaiming-normal weights and a zero bias to layer in place, then return it."""
        nn.init.kaiming_normal_(layer.weight)
        nn.init.constant_(layer.bias, 0)
        return layer

    def forward(self, x):
        """Compute class scores for a minibatch of images x."""
        hidden = F.relu(self.conv1(x))
        hidden = F.relu(self.conv2(hidden))
        return self.fc(flatten(hidden))


def test_ThreeLayerConvNet():
    """Push an all-zero minibatch through a small ThreeLayerConvNet and print the shape of the scores."""
    net = ThreeLayerConvNet(in_channel=3, channel_1=12, channel_2=8, num_classes=10)
    # Minibatch of 64 images, each of size [3, 32, 32].
    images = torch.zeros(64, 3, 32, 32, dtype=dtype)
    print(net(images).size())  # you should see [64, 10]


test_ThreeLayerConvNet()

torch.Size([64, 10])


In [0]:
def check_accuracy_part34(loader, model):
    """
    Evaluate a Module-API model on one data split and print its accuracy.

    Inputs:
    - loader: An iterable of (x, y) minibatches that exposes loader.dataset.train;
      a true value is reported as the validation set, a false one as the test set.
    - model: A PyTorch Module; it is switched to evaluation mode and left there.

    Returns: Nothing, but prints which split is checked and how many
    predictions were correct.
    """
    if loader.dataset.train:
        banner = 'Checking accuracy on validation set'
    else:
        banner = 'Checking accuracy on test set'
    print(banner)

    correct_total, sample_total = 0, 0
    model.eval()  # evaluation mode
    with torch.no_grad():
        for x, y in loader:
            # Move the batch to the notebook's device, e.g. the GPU.
            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)
            # max(1) returns (values, indices); the indices are the predictions.
            preds = model(x).max(1)[1]
            correct_total += (preds == y).sum()
            sample_total += preds.size(0)
        acc = float(correct_total) / sample_total
        print('Got %d / %d correct (%.2f)' % (correct_total, sample_total, 100 * acc))

In [0]:
def train_part34(model, optimizer, epochs=1):
    """
    Train a Module-API model on the minibatches of loader_train.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object that updates the model's parameters.
    - epochs: (Optional) A Python integer giving the number of passes over
      loader_train.

    Returns: Nothing, but prints the loss and the validation accuracy every
    print_every iterations of each epoch.
    """
    model = model.to(device=device)  # parameters live on the CPU/GPU device
    for _ in range(epochs):
        for step, (inputs, targets) in enumerate(loader_train):
            # Re-enter training mode on every step because the accuracy check
            # below switches the model to evaluation mode.
            model.train()
            inputs = inputs.to(device=device, dtype=dtype)
            targets = targets.to(device=device, dtype=torch.long)

            loss = F.cross_entropy(model(inputs), targets)

            # Clear stale gradients, backpropagate the loss, then let the
            # optimizer apply the update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % print_every != 0:
                continue
            print('Iteration %d, loss = %.4f' % (step, loss.item()))
            check_accuracy_part34(loader_val, model)
            print()

In [43]:
# Train the Module-API three-layer ConvNet with plain SGD.
learning_rate = 3e-3
channel_1 = 32
channel_2 = 16

model = ThreeLayerConvNet(in_channel=3, channel_1=channel_1,
                          channel_2=channel_2, num_classes=10)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

train_part34(model, optimizer)

Iteration 0, loss = 3.4384
Checking accuracy on validation set
Got 147 / 1000 correct (14.70)

Iteration 100, loss = 1.6623
Checking accuracy on validation set
Got 351 / 1000 correct (35.10)

Iteration 200, loss = 1.8252
Checking accuracy on validation set
Got 394 / 1000 correct (39.40)

Iteration 300, loss = 1.4923
Checking accuracy on validation set
Got 421 / 1000 correct (42.10)

Iteration 400, loss = 1.5491
Checking accuracy on validation set
Got 439 / 1000 correct (43.90)

Iteration 500, loss = 1.5489
Checking accuracy on validation set
Got 477 / 1000 correct (47.70)

Iteration 600, loss = 1.4089
Checking accuracy on validation set
Got 476 / 1000 correct (47.60)

Iteration 700, loss = 1.4539
Checking accuracy on validation set
Got 488 / 1000 correct (48.80)



In [0]:
### Sequential API

def kaiming_normal(shape):
    """
    Draw a weight Tensor with Kaiming-normal scaling, i.e. standard-normal
    samples multiplied by sqrt(2 / fan_in), and mark it as requiring gradients.

    Inputs:
    - shape: The shape of the weight. A 2-element shape is treated as an
      nn.Linear weight [out_feature, in_feature], so fan_in is shape[1] (unlike
      random_weight()); anything else is treated as a conv weight
      [out_channel, in_channel, kH, kW] whose fan_in is the product of all
      dimensions after the first.

    Returns: A Tensor of the given shape on the notebook's device / dtype with
    requires_grad=True.
    """
    is_linear_weight = len(shape) == 2
    fan_in = shape[1] if is_linear_weight else np.prod(shape[1:])
    scale = np.sqrt(2. / fan_in)
    w = torch.randn(shape, device=device, dtype=dtype) * scale
    w.requires_grad_(True)
    return w

def xavier_normal(shape):
    """
    Draw a weight Tensor with Xavier-normal scaling, i.e. standard-normal
    samples multiplied by sqrt(2 / (fan_in + fan_out)), and mark it as
    requiring gradients.

    Inputs:
    - shape: The shape of the weight. A 2-element shape is treated as an
      nn.Linear weight [out_feature, in_feature]; anything else is treated as a
      conv weight [out_channel, in_channel, kH, kW], with fan_in the product of
      all dimensions after the first and fan_out = out_channel * kH * kW.

    Returns: A Tensor of the given shape on the notebook's device / dtype with
    requires_grad=True.
    """
    if len(shape) == 2:
        fan_out, fan_in = shape[0], shape[1]
    else:
        fan_in = np.prod(shape[1:])
        fan_out = shape[0] * shape[2] * shape[3]
    scale = np.sqrt(2. / (fan_in + fan_out))
    w = torch.randn(shape, device=device, dtype=dtype) * scale
    w.requires_grad_(True)
    return w

In [48]:
# Hyperparameters for the Sequential-API three-layer ConvNet.
channel_1, channel_2 = 32, 16
learning_rate = 1e-2

# Placeholders; both are assigned further down in this cell.
model = optimizer = None


class Flatten(nn.Module):
  """Module wrapper around flatten() so that it can be used as a layer inside nn.Sequential."""

  def forward(self, X):
    """Return X with every dimension after the first collapsed into one."""
    flat = flatten(X)
    return flat

# Same three-layer architecture expressed with the Sequential API:
# 5x5 conv -> ReLU -> 3x3 conv -> ReLU -> flatten -> linear.
model = nn.Sequential(
  nn.Conv2d(in_channels=3, out_channels=channel_1, kernel_size=5, padding=2),
  nn.ReLU(),
  nn.Conv2d(in_channels=channel_1, out_channels=channel_2, kernel_size=3, padding=1),
  nn.ReLU(),
  Flatten(),
  nn.Linear(in_features=channel_2 * 32 * 32, out_features=10),
)

# SGD with Nesterov momentum.
optimizer = optim.SGD(
  model.parameters(),
  lr=learning_rate,
  momentum=0.9,
  nesterov=True,
)

def init_weights(m):
    """
    Re-initialize a single module in place; intended for use with model.apply().

    Conv2d and Linear modules get weights drawn by xavier_normal() and, when
    they have a bias, a zero bias from zero_weight(). Every other module is
    left untouched.

    Inputs:
    - m: The module to (possibly) re-initialize.

    Returns: Nothing.
    """
    if not isinstance(m, (nn.Conv2d, nn.Linear)):
        return
    m.weight.data = xavier_normal(m.weight.size())
    # Layers built with bias=False expose m.bias as None; skipping them avoids
    # an AttributeError on m.bias.size().
    if m.bias is not None:
        m.bias.data = zero_weight(m.bias.size())

# Run init_weights on every submodule of the model (and on the model itself),
# then train.
model.apply(fn=init_weights)

train_part34(model=model, optimizer=optimizer)

Iteration 0, loss = 2.3894
Checking accuracy on validation set
Got 113 / 1000 correct (11.30)

Iteration 100, loss = 1.4601
Checking accuracy on validation set
Got 448 / 1000 correct (44.80)

Iteration 200, loss = 1.4637
Checking accuracy on validation set
Got 496 / 1000 correct (49.60)

Iteration 300, loss = 1.3498
Checking accuracy on validation set
Got 529 / 1000 correct (52.90)

Iteration 400, loss = 1.1211
Checking accuracy on validation set
Got 521 / 1000 correct (52.10)

Iteration 500, loss = 1.0137
Checking accuracy on validation set
Got 526 / 1000 correct (52.60)

Iteration 600, loss = 1.1234
Checking accuracy on validation set
Got 568 / 1000 correct (56.80)

Iteration 700, loss = 1.1170
Checking accuracy on validation set
Got 569 / 1000 correct (56.90)



In [60]:
def conv_block(in_channels, out_channels, kernel_size, padding):
    """Build one stage: Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2), wrapped in nn.Sequential."""
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(),
        nn.MaxPool2d(2),
    )


# Four conv stages, each doubling the channel count.
layer1 = conv_block(3, 16, kernel_size=5, padding=2)
layer2 = conv_block(16, 32, kernel_size=3, padding=1)
layer3 = conv_block(32, 64, kernel_size=3, padding=1)
layer4 = conv_block(64, 128, kernel_size=3, padding=1)

# 128 channels times 4 spatial positions; assuming 32 x 32 inputs, the four
# MaxPool2d(2) layers leave a 2 x 2 map -- TODO confirm if the input size changes.
fc = nn.Linear(128 * 4, 10)

model = nn.Sequential(layer1, layer2, layer3, layer4, Flatten(), fc)

learning_rate = 1e-3
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# NOTE: this rebinds the notebook-wide print_every. With a value this large
# only iteration 0 of each epoch is reported, i.e. one report per epoch.
print_every = 10000

train_part34(model, optimizer, epochs=10)


Iteration 0, loss = 2.3082
Checking accuracy on validation set
Got 105 / 1000 correct (10.50)

Iteration 0, loss = 1.1285
Checking accuracy on validation set
Got 663 / 1000 correct (66.30)

Iteration 0, loss = 0.7189
Checking accuracy on validation set
Got 706 / 1000 correct (70.60)

Iteration 0, loss = 0.6058
Checking accuracy on validation set
Got 731 / 1000 correct (73.10)

Iteration 0, loss = 0.6886
Checking accuracy on validation set
Got 723 / 1000 correct (72.30)

Iteration 0, loss = 0.4477
Checking accuracy on validation set
Got 725 / 1000 correct (72.50)

Iteration 0, loss = 0.4083
Checking accuracy on validation set
Got 757 / 1000 correct (75.70)

Iteration 0, loss = 0.4539
Checking accuracy on validation set
Got 752 / 1000 correct (75.20)

Iteration 0, loss = 0.4746
Checking accuracy on validation set
Got 761 / 1000 correct (76.10)

Iteration 0, loss = 0.2062
Checking accuracy on validation set
Got 759 / 1000 correct (75.90)



In [61]:
# Final evaluation: report the test-set accuracy of the most recently trained model.
best_model = model
check_accuracy_part34(loader=loader_test, model=best_model)

Checking accuracy on test set
Got 7509 / 10000 correct (75.09)
