In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
import torchvision as tv
import os
import numpy as np

# Expose only GPU index 0 to CUDA, then select the GPU when one is usable
# and fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
class Residual(nn.Module):
    """Bottleneck residual unit: 1x1 -> 3x3 -> 1x1 convolutions plus a skip path.

    The first two convolutions work on ``num_channels // 4`` channels and the
    last 1x1 convolution expands to ``num_channels``. Every convolution on
    the main path is followed by batch normalisation.

    Args:
        input_channels: Channel count of the incoming tensor.
        num_channels: Channel count of the unit's output.
        use_1x1conv: When True the skip path goes through a 1x1 convolution
            (with stride ``strides``) mapping ``input_channels`` to
            ``num_channels``; otherwise the input is added unchanged.
        strides: Stride of the 3x3 convolution and of the optional skip
            convolution.
    """

    def __init__(self, input_channels, num_channels,
                 use_1x1conv=False, strides=1):
        super().__init__()
        mid_channels = num_channels // 4

        # Attribute names and registration order define the state_dict
        # layout, so they are kept fixed: conv1..conv4, then bn1..bn3.
        self.conv1 = nn.Conv2d(input_channels, mid_channels,
                               kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(mid_channels, mid_channels,
                               kernel_size=3, padding=1, stride=strides)
        self.conv3 = nn.Conv2d(mid_channels, num_channels,
                               kernel_size=1, padding=0)
        self.conv4 = (
            nn.Conv2d(input_channels, num_channels,
                      kernel_size=1, stride=strides)
            if use_1x1conv else None
        )

        self.bn1 = nn.BatchNorm2d(mid_channels)
        self.bn2 = nn.BatchNorm2d(mid_channels)
        self.bn3 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        """Apply the three conv/BN stages, add the skip path, then ReLU."""
        out = F.relu(self.bn1(self.conv1(X)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Identity skip unless a projection convolution was configured.
        shortcut = X if self.conv4 is None else self.conv4(X)

        out += shortcut
        return F.relu(out)

In [3]:
# Stem: 3x3 convolution (3 -> 64 channels), batch norm, ReLU and a 3x3 max
# pool. Stride 1 with padding 1 throughout, so the spatial size is kept.
b1 = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
)

In [4]:
def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """Build the list of ``Residual`` units that make up one stage.

    The first unit maps ``input_channels`` to ``num_channels`` through a 1x1
    skip convolution; it uses stride 2 unless ``first_block`` is set. Every
    later unit maps ``num_channels`` to ``num_channels`` with the default
    identity skip.

    Returns:
        A list of ``num_residuals`` ``Residual`` modules.
    """
    def make_unit(position):
        # Only the opening unit changes the channel count (and maybe stride).
        if position > 0:
            return Residual(num_channels, num_channels)
        if first_block:
            return Residual(input_channels, num_channels,
                            use_1x1conv=True)
        return Residual(input_channels, num_channels,
                        use_1x1conv=True, strides=2)

    return [make_unit(position) for position in range(num_residuals)]

In [5]:
# Four residual stages with 3, 4, 6 and 3 units. Only the first stage is
# built with first_block=True (stride 1); the others open with a stride-2
# unit. Rows are (input channels, output channels, units, first_block).
b2, b3, b4, b5 = (
    nn.Sequential(*resnet_block(in_ch, out_ch, depth, first_block=is_first))
    for in_ch, out_ch, depth, is_first in [
        (64, 256, 3, True),
        (256, 512, 4, False),
        (512, 1024, 6, False),
        (1024, 2048, 3, False),
    ]
)

In [6]:
# Full network: stem, four residual stages, global average pooling, then
# flatten -> dropout -> linear classifier with 10 outputs.
net = nn.Sequential(b1, b2, b3, b4, b5,
                    nn.AdaptiveAvgPool2d((1,1)),
                    nn.Flatten(), nn.Dropout(0.25), nn.Linear(2048, 10))

# Comparing a torch.device with the string 'cuda' is always False, so the
# device *type* is tested instead. The net is wrapped only when more than one
# GPU is visible: DataParallel has nothing to split across with a single GPU,
# and the wrapper (unlike nn.Sequential) cannot be iterated layer by layer.
if torch.device(device).type == 'cuda' and torch.cuda.device_count() > 1:
    net = torch.nn.DataParallel(net)

In [7]:
# Push one random 3x32x32 input through the network one top-level layer at a
# time and print the output shape after each layer.
X = torch.rand(size=(1, 3, 32, 32))
# A DataParallel wrapper is not iterable; walk the Sequential it wraps.
layers = net.module if isinstance(net, nn.DataParallel) else net
with torch.no_grad():  # shapes only, so no autograd graph is needed
    for layer in layers:
        X = layer(X)
        print(layer.__class__.__name__,'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 64, 32, 32])
Sequential output shape:	 torch.Size([1, 256, 32, 32])
Sequential output shape:	 torch.Size([1, 512, 16, 16])
Sequential output shape:	 torch.Size([1, 1024, 8, 8])
Sequential output shape:	 torch.Size([1, 2048, 4, 4])
AdaptiveAvgPool2d output shape:	 torch.Size([1, 2048, 1, 1])
Flatten output shape:	 torch.Size([1, 2048])
Dropout output shape:	 torch.Size([1, 2048])
Linear output shape:	 torch.Size([1, 10])


In [8]:
def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Return the accuracy of ``net`` over every batch in ``data_iter``.

    Args:
        net: Model to evaluate. An ``nn.Module`` is switched to eval mode
            (and left in eval mode on return).
        data_iter: Iterable of ``(X, y)`` batches; ``X`` may be a tensor or
            a list of tensors.
        device: Device the batches are moved to. When falsy and ``net`` is
            an ``nn.Module``, the device of the net's first parameter is used.

    Returns:
        The sum of ``d2l.accuracy`` over all batches divided by the total
        number of label elements.
    """
    if isinstance(net, nn.Module):
        net.eval()
        if not device:
            first_param = next(iter(net.parameters()))
            device = first_param.device

    # Slot 0 accumulates d2l.accuracy results, slot 1 the label count.
    totals = d2l.Accumulator(2)
    with torch.no_grad():
        for features, labels in data_iter:
            if isinstance(features, list):
                features = [part.to(device) for part in features]
            else:
                features = features.to(device)
            labels = labels.to(device)
            batch_hits = d2l.accuracy(net(features), labels)
            totals.add(batch_hits, labels.numel())
    return totals[0] / totals[1]

In [9]:
def train_net(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` with SGD and print loss / accuracy after every epoch.

    The weights of every ``nn.Linear`` and ``nn.Conv2d`` are re-initialised
    with Kaiming-uniform, the net is moved to ``device`` and optimised with
    SGD (momentum 0.9, weight decay 0.001) on cross-entropy loss. The
    learning rate is multiplied by 0.1 at scheduler milestones 50, 100 and
    150, with the scheduler stepped once per epoch.

    Args:
        net: Model to train; modified in place.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches evaluated after each epoch.
        num_epochs: Number of passes over ``train_iter``.
        lr: Initial learning rate.
        device: Device the model and the batches are moved to.

    Returns:
        None. Progress is reported through ``print``.
    """
    def init_weights(m):
        # Exact type match: only plain Linear / Conv2d layers are touched.
        if type(m) in (nn.Linear, nn.Conv2d):
            torch.nn.init.kaiming_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(params=net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.001)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[50, 100, 150], gamma=0.1)
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        # Running sums for this epoch: loss * batch size, d2l.accuracy
        # results, and number of samples.
        metric = d2l.Accumulator(3)
        net.train()
        for X, y in train_iter:
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
        # Epoch averages are only needed once all batches are accumulated.
        train_l = metric[0] / metric[2]
        train_acc = metric[1] / metric[2]
        scheduler.step()
        # Evaluate on the same device the model was trained on.
        test_acc = evaluate_accuracy_gpu(net, test_iter, device)
        print(f'------- ------- Epoch{epoch+1}/{num_epochs} ------- -------')
        print(f'loss {train_l:.5f}, train acc {train_acc:.5f}, '
          f'test acc {test_acc:.5f}')

In [10]:
# Random crop (scale range 0.1-1, aspect-ratio range 0.5-2) resized to 28x28.
# NOTE(review): training images therefore end up 28x28, while test_augs
# applies no resize to the test images -- confirm the mismatch is intended.
shape_aug = tv.transforms.RandomResizedCrop(
    size=(28, 28), scale=(0.1, 1), ratio=(0.5, 2))

In [11]:
# Per-channel mean / std handed to Normalize in both pipelines
# (presumably CIFAR-10 channel statistics -- TODO confirm).
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: padded random crop and horizontal flip on the image,
# then tensor conversion, the random resized crop and normalisation.
train_augs = tv.transforms.Compose([
    tv.transforms.RandomCrop(32, padding=4),
    tv.transforms.RandomHorizontalFlip(),
    tv.transforms.ToTensor(),
    shape_aug,
    tv.transforms.Normalize(norm_mean, norm_std),
])

# Test pipeline: tensor conversion and normalisation only.
test_augs = tv.transforms.Compose([
    tv.transforms.ToTensor(),
    tv.transforms.Normalize(norm_mean, norm_std),
])

In [12]:
def load_cifar10(is_train, augs, batch_size, root="../data", num_workers=2):
    """Build a DataLoader over the CIFAR-10 training or test split.

    Args:
        is_train: True selects the training split and enables shuffling;
            False selects the test split, read without shuffling.
        augs: Transform applied to every image.
        batch_size: Number of samples per batch.
        root: Directory passed to ``CIFAR10`` as the dataset root;
            ``download=True`` is set for it.
        num_workers: Number of DataLoader worker processes.

    Returns:
        A ``torch.utils.data.DataLoader`` over the selected split.
    """
    dataset = tv.datasets.CIFAR10(root=root, train=is_train,
                                  transform=augs, download=True)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                    shuffle=is_train, num_workers=num_workers)
    return dataloader

In [13]:
# Training hyper-parameters: initial learning rate, epoch count, batch size.
lr = 0.01
num_epochs = 200
batch_size = 128

In [14]:
# Shuffled, augmented training loader and an unshuffled test loader.
train_iter = load_cifar10(is_train=True, augs=train_augs,
                          batch_size=batch_size)
test_iter = load_cifar10(is_train=False, augs=test_augs,
                         batch_size=batch_size)

Files already downloaded and verified
Files already downloaded and verified


In [15]:
# Run the full training schedule on the selected device.
train_net(net=net, train_iter=train_iter, test_iter=test_iter,
          num_epochs=num_epochs, lr=lr, device=device)

training on cuda
------- ------- Epoch1/200 ------- -------
loss 3.40733, train acc 0.18326, test acc 0.31320
------- ------- Epoch2/200 ------- -------
loss 2.05797, train acc 0.27400, test acc 0.36030
------- ------- Epoch3/200 ------- -------
loss 1.86335, train acc 0.32742, test acc 0.41460
------- ------- Epoch4/200 ------- -------
loss 1.76549, train acc 0.36428, test acc 0.43320
------- ------- Epoch5/200 ------- -------
loss 1.68314, train acc 0.38964, test acc 0.46890
------- ------- Epoch6/200 ------- -------
loss 1.60910, train acc 0.41830, test acc 0.47560
------- ------- Epoch7/200 ------- -------
loss 1.54691, train acc 0.44454, test acc 0.47340
------- ------- Epoch8/200 ------- -------
loss 1.48863, train acc 0.46638, test acc 0.53590
------- ------- Epoch9/200 ------- -------
loss 1.43597, train acc 0.48660, test acc 0.55680
------- ------- Epoch10/200 ------- -------
loss 1.38013, train acc 0.50632, test acc 0.57820
------- ------- Epoch11/200 ------- -------
loss 1.3