In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import torchvision.datasets as dset
import torchvision.transforms as T
import numpy as np

In [3]:
NUM_TRAIN = 49000
transform = T.Compose([T.ToTensor(), T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

cifar10_train = dset.CIFAR10('./cs231n/datasets/', train=True, download=True, transform=transform)
loader_train = DataLoader(cifar10_train, batch_size=64, sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

cifar10_val = dset.CIFAR10('./cs231n/datasets/', train=True, download=True, transform=transform)
loader_val = DataLoader(cifar10_val, batch_size=64, sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, 50000)))

cifar10_test = dset.CIFAR10('./cs231n/datasets/', train=False, download=True, transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=64)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [5]:
USE_GPU = True
dtype = torch.float32
print_every = 100

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    
print('using device:', device)

using device: cpu


In [7]:
def flatten(x):
    N = x.shape[0]
    return x.view(N, -1)

def test_flatten():
    x = torch.arange(12).view(2, 1, 3, 2)
    print('Before flattening: ', x.shape)
    print('After flattening: ', flatten(x).shape)
    
test_flatten()

Before flattening:  torch.Size([2, 1, 3, 2])
After flattening:  torch.Size([2, 6])


Barebones PyTorch
---

### Two-Layer Network


In [10]:
import torch.nn.functional as F

In [12]:
def two_layer_fc(x, params):
    x = flatten(x)
    w1, w2 = params
    
    x = F.relu(x.mm(w1))
    x = x.mm(w2)
    
    return x

def two_layer_fc_test():
    hidden_layer_size = 42
    x = torch.zeros((64, 50), dtype=dtype)
    w1 = torch.zeros((50, hidden_layer_size), dtype=dtype)
    w2 = torch.zeros((hidden_layer_size, 10), dtype=dtype)
    params = (w1, w2)
    scores = two_layer_fc(x, params)
    print(scores.shape)
    
two_layer_fc_test()

torch.Size([64, 10])


### Three-Layer ConvNet

In [14]:
def three_layer_convnet(x, params):
    w1, b1, w2, b2, w3, b3 = params
    scores = None
    
    out1 = F.conv2d(x, w1, bias=b1, stride=1, padding=(2, 2))
    relu1 = F.relu(out1)
    out2 = F.conv2d(relu1, w2, bias=b2, stride=1, padding=(1, 1))
    relu2 = F.relu(out2)
    scores = torch.mm(flatten(relu2), w3) + b3
    
    return scores

In [16]:
def three_layer_convnet_test():
    x = torch.zeros((64, 3, 32, 32), dtype=dtype)
    
    w1 = torch.zeros((6, 3, 5, 5), dtype=dtype)
    b1 = torch.zeros((6, ), dtype=dtype)
    w2 = torch.zeros((9, 6, 3, 3), dtype=dtype)
    b2 = torch.zeros((9, ), dtype=dtype)
    w3 = torch.zeros((32*32*9, 10), dtype=dtype)
    b3 = torch.zeros((10, ), dtype=dtype)
    
    params = (w1, b1, w2, b2, w3, b3)
    scores = three_layer_convnet(x, params)
    print(scores.shape)
    
three_layer_convnet_test()

torch.Size([64, 10])


### Initialization

In [137]:
def random_weight(shape):
    if len(shape) == 2:
        mode = 'fan_out'
    else:
        mode = 'fan_in'
    weight = torch.empty(shape)
    nn.init.kaiming_normal_(weight, mode=mode)
    weight.requires_grad = True
    return weight

def zero_weight(shape):
    return torch.zeros(shape, device=device, dtype=dtype, requires_grad=True)

# def random_weight(shape):
#     if len(shape) == 2:
#         fan_in = shape[0]
#     else:
#         fan_in = np.prod(shape[1:])
#     w = torch.randn(shape, device=device, dtype=dtype) * np.sqrt(2. / fan_in)
#     w.requires_grad = True
#     return w

random_weight((3, 5))

tensor([[-1.5531,  1.6935,  0.9294,  0.5106, -1.2486],
        [-0.2500,  1.1149, -0.2589, -0.5693,  0.5004],
        [-1.2586, -0.0858, -0.2165,  0.1567,  1.4835]])

### Check Accuracy

In [138]:
def check_accuracy(loader, model_fn, params):
    split = 'val' if loader.dataset.train else 'test'
    print('Checking accuracy on the %s set' % split)
    num_correct, num_samples = 0, 0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)
            scores = model_fn(x, params)
            max_scores, preds = torch.max(scores, dim=1)
            num_correct += (preds == y).sum()
            num_samples += x.shape[0]

        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))

### Training Loop

In [139]:
def train(model_fn, params, learning_rate):
    for t, (x, y) in enumerate(loader_train):
        x = x.to(device=device, dtype=dtype)
        y = y.to(device=device, dtype=torch.long)
        
        scores = model_fn(x, params)
        loss = F.cross_entropy(scores, y)
        
        loss.backward()
        
        with torch.no_grad():
            for w in params:
                w -= learning_rate * w.grad
                
                w.grad.zero_()
                
        if t % print_every == 0:
            print('Iteration %d, loss = %.4f' % (t, loss.item()))
            check_accuracy(loader_val, model_fn, params)
            print()

### Train a Two-Layer Network

In [140]:
hidden_layer_size = 4000
learning_rate = 1e-2

w1 = random_weight((3 * 32 * 32, hidden_layer_size))
w2 = random_weight((hidden_layer_size, 10))

train(two_layer_fc, [w1, w2], learning_rate)

Iteration 0, loss = 4.0419
Checking accuracy on the val set
Got 144 / 1000 correct (14.40%)

Iteration 100, loss = 1.9465
Checking accuracy on the val set
Got 343 / 1000 correct (34.30%)

Iteration 200, loss = 2.5488
Checking accuracy on the val set
Got 362 / 1000 correct (36.20%)

Iteration 300, loss = 2.1277
Checking accuracy on the val set
Got 365 / 1000 correct (36.50%)

Iteration 400, loss = 2.3016
Checking accuracy on the val set
Got 395 / 1000 correct (39.50%)

Iteration 500, loss = 1.6212
Checking accuracy on the val set
Got 426 / 1000 correct (42.60%)

Iteration 600, loss = 1.6771
Checking accuracy on the val set
Got 421 / 1000 correct (42.10%)

Iteration 700, loss = 1.7955
Checking accuracy on the val set
Got 430 / 1000 correct (43.00%)



### Training a ConvNet

In [141]:
learning_rate = 3e-3
channel_1 = 32
channel_2 = 16

w1 = random_weight((channel_1, 3, 5, 5))
b1 = zero_weight((channel_1, ))
w2 = random_weight((channel_2, channel_1, 3, 3))
b2 = zero_weight((channel_2, ))
w3 = random_weight((32*32*channel_2, 10))
b3 = zero_weight((10, ))

params = (w1, b1, w2, b2, w3, b3)
train(three_layer_convnet, params, learning_rate)

Iteration 0, loss = 3.2404
Checking accuracy on the val set
Got 103 / 1000 correct (10.30%)

Iteration 100, loss = 2.0333
Checking accuracy on the val set
Got 333 / 1000 correct (33.30%)

Iteration 200, loss = 1.8083
Checking accuracy on the val set
Got 383 / 1000 correct (38.30%)

Iteration 300, loss = 1.6545
Checking accuracy on the val set
Got 418 / 1000 correct (41.80%)

Iteration 400, loss = 1.7084
Checking accuracy on the val set
Got 450 / 1000 correct (45.00%)

Iteration 500, loss = 1.4164
Checking accuracy on the val set
Got 462 / 1000 correct (46.20%)

Iteration 600, loss = 1.3614
Checking accuracy on the val set
Got 460 / 1000 correct (46.00%)

Iteration 700, loss = 1.5666
Checking accuracy on the val set
Got 455 / 1000 correct (45.50%)



PyTorch Module API
---

### Two-Layer Network

In [147]:
class TwoLayerFC(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size, bias=True)
        nn.init.kaiming_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(hidden_size, num_classes, bias=True)
        nn.init.kaiming_normal_(self.fc2.weight)
        
    def forward(self, x):
        x = flatten(x)
        scores = self.fc2(F.relu(self.fc1(x)))
        return scores

def test_TwoLayerFC():
    input_size = 50
    x = torch.zeros((64, input_size), dtype=dtype)
    model = TwoLayerFC(input_size, 42, 10)
    scores = model.forward(x)
    print(scores.shape)

test_TwoLayerFC()

torch.Size([64, 10])


### Three-Layer ConvNet

In [154]:
class ThreeLayerConvNet(nn.Module):
    def __init__(self, params):
        super().__init__()
        in_channel = params.get('in_channel')
        channel_1 = params.get('channel_1')
        channel_2 = params.get('channel_2')
        num_classes = params.get('num_classes')

        self.conv1 = nn.Conv2d(in_channel, channel_1, kernel_size=(5, 5), stride=1, padding=(2, 2))
        nn.init.kaiming_normal_(self.conv1.weight)
        self.conv2 = nn.Conv2d(channel_1, channel_2, kernel_size=(3, 3), stride=1, padding=(1, 1))
        nn.init.kaiming_normal_(self.conv2.weight)
        self.fc = nn.Linear(channel_2*32*32, num_classes)
        nn.init.kaiming_normal_(self.fc.weight)

    def forward(self, x):
        out_1 = F.relu(self.conv1(x))
        out_2 = F.relu(self.conv2(out_1))
        scores = self.fc(flatten(out_2))
        return scores

def test_ThreeLayerConvNet():
    x = torch.zeros((64, 3, 32, 32), dtype=dtype)
    kwargs = {'in_channel': 3, 'channel_1': 12, 'channel_2': 8, 'num_classes':10}
    model = ThreeLayerConvNet(kwargs)
    scores = model.forward(x)
    print(scores.shape)

test_ThreeLayerConvNet()

torch.Size([64, 10])


In [155]:
def check_accuracy_2(loader, model):
    if loader.dataset.train:
        print('Checking accuracy on validation set')
    else:
        print('Checking accuracy on test set')
        
    num_samples = 0
    num_correct = 0
    model.eval()
    for x, y in loader:
        x = x.to(device=device, dtype=dtype)
        y = y.to(device=device, dtype=torch.long)
        scores = model(x)
        _, preds = torch.max(scores, dim=1)
        num_correct += (preds == y).sum()
        num_samples += x.shape[0]
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))

### Training Loop

In [158]:
def train2(model, optimizer, epochs=1):
    model = model.to(device=device)
    for e in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            model.train()
            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)
            
            scores = model(x)
            loss = F.cross_entropy(scores, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                check_accuracy_2(loader_val, model)
                print()

### Train a Two-Layer Network

In [159]:
hidden_layer_size = 4000
learning_rate = 1e-2
model = TwoLayerFC(3*32*32, hidden_layer_size, 10)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

train2(model, optimizer)

Iteration 0, loss = 2.8094
Checking accuracy on validation set
Got 161 / 1000 correct (16.10)

Iteration 100, loss = 2.2876
Checking accuracy on validation set
Got 335 / 1000 correct (33.50)

Iteration 200, loss = 1.7721
Checking accuracy on validation set
Got 415 / 1000 correct (41.50)

Iteration 300, loss = 1.9384
Checking accuracy on validation set
Got 392 / 1000 correct (39.20)

Iteration 400, loss = 2.2530
Checking accuracy on validation set
Got 316 / 1000 correct (31.60)

Iteration 500, loss = 2.0894
Checking accuracy on validation set
Got 390 / 1000 correct (39.00)

Iteration 600, loss = 1.7047
Checking accuracy on validation set
Got 424 / 1000 correct (42.40)

Iteration 700, loss = 1.6730
Checking accuracy on validation set
Got 414 / 1000 correct (41.40)



### Train a Three-Layer ConvNet

In [162]:
learning_rate = 3e-3
channel_1 = 32
channel_2 = 16

params = {'in_channel': 3, 'channel_1': channel_1, 'channel_2': channel_1, 'num_classes': 10}
model = ThreeLayerConvNet(params)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

train2(model, optimizer)

Iteration 0, loss = 3.0832
Checking accuracy on validation set
Got 110 / 1000 correct (11.00)

Iteration 100, loss = 1.7194
Checking accuracy on validation set
Got 384 / 1000 correct (38.40)

Iteration 200, loss = 1.7438
Checking accuracy on validation set
Got 417 / 1000 correct (41.70)

Iteration 300, loss = 1.5973
Checking accuracy on validation set
Got 423 / 1000 correct (42.30)

Iteration 400, loss = 1.3857
Checking accuracy on validation set
Got 461 / 1000 correct (46.10)

Iteration 500, loss = 1.4872
Checking accuracy on validation set
Got 465 / 1000 correct (46.50)

Iteration 600, loss = 1.4933
Checking accuracy on validation set
Got 478 / 1000 correct (47.80)

Iteration 700, loss = 1.3025
Checking accuracy on validation set
Got 485 / 1000 correct (48.50)



PyTorch Sequential API
---

### Two-Layer Network

In [163]:
class Flatten(nn.Module):
    def forward(self, x):
        return flatten(x)
    
hidden_layer_size = 4000
learning_rate = 1e-2

model = nn.Sequential(
    Flatten(),
    nn.Linear(3*32*32, hidden_layer_size),
    nn.ReLU(),
    nn.Linear(hidden_layer_size, 10))

optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
train2(model, optimizer)

Iteration 0, loss = 2.3646
Checking accuracy on validation set
Got 161 / 1000 correct (16.10)

Iteration 100, loss = 1.9941
Checking accuracy on validation set
Got 380 / 1000 correct (38.00)

Iteration 200, loss = 1.8165
Checking accuracy on validation set
Got 398 / 1000 correct (39.80)

Iteration 300, loss = 1.5029
Checking accuracy on validation set
Got 424 / 1000 correct (42.40)

Iteration 400, loss = 1.8718
Checking accuracy on validation set
Got 407 / 1000 correct (40.70)

Iteration 500, loss = 1.4071
Checking accuracy on validation set
Got 427 / 1000 correct (42.70)

Iteration 600, loss = 1.6137
Checking accuracy on validation set
Got 437 / 1000 correct (43.70)

Iteration 700, loss = 1.6795
Checking accuracy on validation set
Got 445 / 1000 correct (44.50)



### Three-Layer ConvNet

In [165]:
channel_1 = 32
channel_2 = 16
learning_rate = 1e-2

model = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=channel_1, kernel_size=(5, 5), stride=1, padding=(2, 2), bias=True),
    nn.ReLU(),
    nn.Conv2d(in_channels=channel_1, out_channels=channel_2, kernel_size=(3, 3), stride=1, padding=(1, 1), bias=True),
    nn.ReLU(),
    Flatten(),
    nn.Linear(channel_2*32*32, 10)
)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)

train2(model, optimizer)

Iteration 0, loss = 2.3005
Checking accuracy on validation set
Got 105 / 1000 correct (10.50)

Iteration 100, loss = 1.6214
Checking accuracy on validation set
Got 423 / 1000 correct (42.30)

Iteration 200, loss = 1.5319
Checking accuracy on validation set
Got 520 / 1000 correct (52.00)

Iteration 300, loss = 1.5658
Checking accuracy on validation set
Got 501 / 1000 correct (50.10)

Iteration 400, loss = 1.2787
Checking accuracy on validation set
Got 535 / 1000 correct (53.50)

Iteration 500, loss = 1.3388
Checking accuracy on validation set
Got 546 / 1000 correct (54.60)

Iteration 600, loss = 1.1717
Checking accuracy on validation set
Got 575 / 1000 correct (57.50)

Iteration 700, loss = 1.3267
Checking accuracy on validation set
Got 532 / 1000 correct (53.20)

