In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
import torchvision as tv
import os
import numpy as np

# Restrict the process to CUDA device index 0, then pick the GPU when one is
# available and fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
class ChannelAttentionModule(nn.Module):
    """Per-channel attention gate.

    The input is globally average-pooled and max-pooled to 1x1, both
    descriptors go through the same two-layer 1x1-conv bottleneck, and the
    sum of the two results is squashed with a sigmoid.

    Args:
        num_channels: Number of channels of the input feature map.
        ratio: Reduction factor of the bottleneck. The hidden width is
            ``num_channels // ratio``, clamped to at least 1 so that
            ``num_channels < ratio`` does not produce a zero-width layer.

    Forward input:  tensor with ``num_channels`` channels in dim 1.
    Forward output: gate of shape ``(N, num_channels, 1, 1)`` with values in
        (0, 1), meant to be multiplied onto the input.
    """

    def __init__(self, num_channels, ratio=16):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Integer division yields 0 for narrow inputs; keep at least one
        # hidden channel so the bottleneck can carry information.
        hidden_channels = max(1, num_channels // ratio)
        self.shared_MLP = nn.Sequential(
            nn.Conv2d(num_channels, hidden_channels, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(hidden_channels, num_channels, 1, bias=False)
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, X):
        # The same MLP (shared weights) scores both pooled descriptors.
        avgout = self.shared_MLP(self.avg_pool(X))
        maxout = self.shared_MLP(self.max_pool(X))
        return self.sigmoid(avgout + maxout)
 
class SpatialAttentionModule(nn.Module):
    """Per-location attention gate.

    The input is reduced over the channel dimension with a mean and a max,
    the two resulting maps are stacked into a 2-channel tensor, and a single
    convolution followed by a sigmoid turns them into one gate map.

    Args:
        kernel_size: Odd side length of the square convolution kernel.
            Padding is ``kernel_size // 2`` so the spatial size of the
            input is preserved. Defaults to 3.

    Raises:
        ValueError: If ``kernel_size`` is not a positive odd integer.

    Forward input:  tensor of shape ``(N, C, H, W)``.
    Forward output: gate of shape ``(N, 1, H, W)`` with values in (0, 1).
    """

    def __init__(self, kernel_size=3):
        super().__init__()
        if kernel_size < 1 or kernel_size % 2 == 0:
            # An even kernel cannot be padded symmetrically to keep H x W.
            raise ValueError(
                f'kernel_size must be a positive odd integer, got {kernel_size}')
        self.conv2d = nn.Conv2d(in_channels=2, out_channels=1,
                                kernel_size=kernel_size, stride=1,
                                padding=kernel_size // 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, X):
        # Channel-wise statistics, each of shape (N, 1, H, W).
        avgout = torch.mean(X, dim=1, keepdim=True)
        maxout, _ = torch.max(X, dim=1, keepdim=True)
        Y = torch.cat([avgout, maxout], dim=1)
        Y = self.sigmoid(self.conv2d(Y))
        return Y
 
class CBAM(nn.Module):
    """Applies channel attention and then spatial attention to a feature map.

    Args:
        num_channels: Channel count of the feature map passed to ``forward``.

    ``forward`` returns a tensor with the same shape as its input.
    """

    def __init__(self, num_channels):
        super().__init__()
        self.channel_attention = ChannelAttentionModule(num_channels)
        self.spatial_attention = SpatialAttentionModule()

    def forward(self, X):
        # Stage 1: reweight channels with the broadcast channel gate.
        channel_refined = self.channel_attention(X) * X
        # Stage 2: the spatial gate is computed from, and applied to, the
        # channel-refined map rather than the raw input.
        return self.spatial_attention(channel_refined) * channel_refined

In [3]:
class CBAM_Residual(nn.Module):
    """Residual unit whose main branch is refined by a CBAM block.

    Main branch: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN -> CBAM.
    Shortcut: the input itself, or a 1x1 convolution of it when
    ``use_1x1conv`` is set.

    Args:
        input_channels: Channels of the incoming tensor.
        num_channels: Channels produced by the unit.
        use_1x1conv: When true, project the shortcut with a 1x1 convolution
            (using ``strides``) so it matches the main branch.
        strides: Stride of the first 3x3 convolution and of the shortcut
            projection.
    """

    def __init__(self, input_channels, num_channels,
                 use_1x1conv=False, strides=1):
        super().__init__()
        # Sub-modules are created in the order conv1, conv2, conv3, bn1, bn2,
        # cbam so parameter registration order stays as it was.
        self.conv1 = nn.Conv2d(in_channels=input_channels,
                               out_channels=num_channels,
                               kernel_size=3, stride=strides, padding=1)
        self.conv2 = nn.Conv2d(in_channels=num_channels,
                               out_channels=num_channels,
                               kernel_size=3, padding=1)
        self.conv3 = (
            nn.Conv2d(input_channels, num_channels,
                      kernel_size=1, stride=strides)
            if use_1x1conv else None
        )

        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

        self.cbam = CBAM(num_channels)

    def forward(self, X):
        out = self.conv1(X)
        out = F.relu(self.bn1(out))
        out = self.bn2(self.conv2(out))

        # Attention is applied before the residual sum.
        out = self.cbam(out)

        shortcut = X if self.conv3 is None else self.conv3(X)

        out += shortcut
        return F.relu(out)

In [4]:
# Stem: 3x3 convolution to 64 channels, batch norm, ReLU and a stride-1 max
# pool. Every layer uses stride 1 with padding 1, so H x W is unchanged.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
)

In [5]:
def CBAM_resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """Build the list of CBAM_Residual units that make up one network stage.

    Args:
        input_channels: Channels of the tensor entering the stage.
        num_channels: Channels produced by every unit in the stage.
        num_residuals: Number of CBAM_Residual units to create.
        first_block: When false, the first unit uses stride 2 and a 1x1
            shortcut projection. When true, the first unit keeps stride 1
            and only projects the shortcut if ``input_channels`` differs
            from ``num_channels``.

    Returns:
        A list of ``num_residuals`` CBAM_Residual modules, in order.
    """
    blk = []
    for i in range(num_residuals):
        if i > 0:
            # Units after the first always map num_channels -> num_channels.
            blk.append(CBAM_Residual(num_channels, num_channels))
        elif first_block:
            # Honour input_channels for the first unit too; previously it was
            # ignored here, which only worked when both widths were equal.
            blk.append(CBAM_Residual(input_channels, num_channels,
                                use_1x1conv=input_channels != num_channels))
        else:
            blk.append(CBAM_Residual(input_channels, num_channels,
                                use_1x1conv=True, strides=2))
    return blk

In [6]:
# Four residual stages of two units each. Only the first stage keeps the
# incoming width; the later ones change the channel count (64 -> 128 -> 256 -> 512).
b2 = nn.Sequential(*CBAM_resnet_block(
    input_channels=64, num_channels=64, num_residuals=2, first_block=True))
b3 = nn.Sequential(*CBAM_resnet_block(
    input_channels=64, num_channels=128, num_residuals=2))
b4 = nn.Sequential(*CBAM_resnet_block(
    input_channels=128, num_channels=256, num_residuals=2))
b5 = nn.Sequential(*CBAM_resnet_block(
    input_channels=256, num_channels=512, num_residuals=2))

In [7]:
# Full classifier: stem, four residual stages, global average pooling,
# dropout and a 10-way linear head.
net = nn.Sequential(b1, b2, b3, b4, b5,
                    nn.AdaptiveAvgPool2d((1,1)),
                    nn.Flatten(), nn.Dropout(0.25), nn.Linear(512, 10))

# `device` is a torch.device, which does not compare equal to the plain
# string 'cuda', so the check has to look at `device.type`. Wrapping is only
# useful when more than one GPU is visible; with CUDA_VISIBLE_DEVICES limited
# to a single GPU the model stays an ordinary nn.Sequential.
if device.type == 'cuda' and torch.cuda.device_count() > 1:
    net = torch.nn.DataParallel(net)

In [8]:
# Shape sanity check: push one random 3x32x32 image through the network layer
# by layer. It runs in eval mode without autograd so the random batch does not
# update BatchNorm running statistics (train_net re-initialises only Conv2d
# and Linear weights) and no graph is built.
X = torch.rand(size=(1, 3, 32, 32))
# A DataParallel wrapper is not iterable; walk the wrapped Sequential instead.
layers = net.module if isinstance(net, nn.DataParallel) else net
was_training = net.training
net.eval()
with torch.no_grad():
    for layer in layers:
        X = layer(X)
        print(layer.__class__.__name__,'output shape:\t', X.shape)
# Leave the model in whatever mode it was in before the check.
net.train(was_training)

Sequential output shape:	 torch.Size([1, 64, 32, 32])
Sequential output shape:	 torch.Size([1, 64, 32, 32])
Sequential output shape:	 torch.Size([1, 128, 16, 16])
Sequential output shape:	 torch.Size([1, 256, 8, 8])
Sequential output shape:	 torch.Size([1, 512, 4, 4])
AdaptiveAvgPool2d output shape:	 torch.Size([1, 512, 1, 1])
Flatten output shape:	 torch.Size([1, 512])
Dropout output shape:	 torch.Size([1, 512])
Linear output shape:	 torch.Size([1, 10])


In [9]:
def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Return the accuracy of ``net`` over every batch in ``data_iter``.

    Args:
        net: Model to evaluate. If it is an ``nn.Module`` it is switched to
            eval mode (and left there).
        data_iter: Iterable of ``(X, y)`` batches; ``X`` may be a tensor or a
            list of tensors.
        device: Device the batches are moved to. When falsy and ``net`` is an
            ``nn.Module``, the device of its first parameter is used.

    Returns:
        The first accumulator slot divided by the second, i.e. the summed
        ``d2l.accuracy`` values over the total number of label elements.
    """
    if isinstance(net, nn.Module):
        net.eval()
        if not device:
            device = next(iter(net.parameters())).device
    # Slot 0 accumulates d2l.accuracy(...) (presumably the count of correct
    # predictions; confirm against d2l), slot 1 the number of labels seen.
    totals = d2l.Accumulator(2)
    with torch.no_grad():
        for features, labels in data_iter:
            if isinstance(features, list):
                features = [part.to(device) for part in features]
            else:
                features = features.to(device)
            labels = labels.to(device)
            predictions = net(features)
            totals.add(d2l.accuracy(predictions, labels), labels.numel())
    hits, seen = totals[0], totals[1]
    return hits / seen

In [10]:
def train_net(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` with SGD and print loss / accuracy after every epoch.

    Every call re-initialises the weights of all ``nn.Linear`` and
    ``nn.Conv2d`` modules with Kaiming-uniform values, moves the model to
    ``device`` and then runs ``num_epochs`` epochs of SGD (momentum 0.9,
    weight decay 0.001) with the learning rate multiplied by 0.1 after epochs
    50, 90 and 120.

    Args:
        net: Model to train (modified in place).
        train_iter: Iterable of ``(X, y)`` training batches; it is iterated
            once per epoch and does not need to support ``len()``.
        test_iter: Iterable of ``(X, y)`` batches used for the per-epoch
            test accuracy.
        num_epochs: Number of passes over ``train_iter``.
        lr: Initial learning rate.
        device: Device the model and batches are moved to.

    Raises:
        ValueError: If an epoch sees no training samples.
    """
    def init_weights(m):
        # Exact type match on purpose: subclasses are left untouched.
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.kaiming_uniform_(m.weight)
    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(params=net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.001)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[50, 90, 120], gamma=0.1)
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        # Slots: summed loss, summed d2l.accuracy(...) values, sample count.
        metric = d2l.Accumulator(3)
        # evaluate_accuracy_gpu leaves the model in eval mode, so training
        # mode has to be re-enabled at the start of every epoch.
        net.train()
        for X, y in train_iter:
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                # The loss is a batch mean; scale by batch size so the epoch
                # average is weighted per sample.
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
        if metric[2] == 0:
            raise ValueError('train_iter yielded no samples')
        # Epoch averages are only needed once, after the last batch.
        train_l = metric[0] / metric[2]
        train_acc = metric[1] / metric[2]
        scheduler.step()
        test_acc = evaluate_accuracy_gpu(net, test_iter, device)
        print(f'------- -------- Epoch{epoch+1}/{num_epochs} ------- -------')
        print(f'loss {train_l:.5f}, train acc {train_acc:.5f}, '
           f'test acc {test_acc:.5f}')

In [11]:
# Random crop covering 10%-100% of the image area with an aspect ratio between
# 1:2 and 2:1, resized to 28x28.
shape_aug = tv.transforms.RandomResizedCrop(
    size=(28, 28),
    scale=(0.1, 1),
    ratio=(0.5, 2),
)

In [12]:
# Training pipeline: padded random crop, horizontal flip, tensor conversion,
# the random resized crop from `shape_aug`, then per-channel normalisation.
# NOTE(review): `shape_aug` emits 28x28 crops while `test_augs` below does no
# resizing, so train and test inputs differ in size. Confirm this is intended.
train_augs = tv.transforms.Compose([
    tv.transforms.RandomCrop(size=32, padding=4),
    tv.transforms.RandomHorizontalFlip(),
    tv.transforms.ToTensor(),
    shape_aug,
    tv.transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                            std=(0.2023, 0.1994, 0.2010)),
])

# Evaluation pipeline: tensor conversion and the same normalisation only.
test_augs = tv.transforms.Compose([
    tv.transforms.ToTensor(),
    tv.transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                            std=(0.2023, 0.1994, 0.2010)),
])

In [13]:
def load_cifar10(is_train, augs, batch_size):
    """Create a DataLoader over one CIFAR-10 split.

    Args:
        is_train: Selects the training split when true, the test split
            otherwise; also controls whether batches are shuffled.
        augs: Transform applied to every image.
        batch_size: Number of samples per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` using 2 worker processes. The
        dataset is stored under ``../data`` and requested with
        ``download=True``.
    """
    cifar = tv.datasets.CIFAR10(
        root="../data", train=is_train, transform=augs, download=True)
    return torch.utils.data.DataLoader(
        cifar, batch_size=batch_size, shuffle=is_train, num_workers=2)

In [14]:
# Training hyperparameters.
lr = 0.01          # initial SGD learning rate
num_epochs = 150   # passes over the training set
batch_size = 128   # samples per mini-batch

In [15]:
# Shuffled, augmented training batches and unshuffled test batches.
train_iter = load_cifar10(is_train=True, augs=train_augs, batch_size=batch_size)
test_iter = load_cifar10(is_train=False, augs=test_augs, batch_size=batch_size)

Files already downloaded and verified
Files already downloaded and verified


In [16]:
# Run the full training schedule; per-epoch metrics are printed as it goes.
train_net(net=net, train_iter=train_iter, test_iter=test_iter,
          num_epochs=num_epochs, lr=lr, device=device)

training on cuda
------- -------- Epoch1/150 ------- -------
loss 1.89083, train acc 0.30774, test acc 0.40680
------- -------- Epoch2/150 ------- -------
loss 1.62597, train acc 0.40736, test acc 0.49380
------- -------- Epoch3/150 ------- -------
loss 1.49939, train acc 0.45564, test acc 0.55130
------- -------- Epoch4/150 ------- -------
loss 1.41044, train acc 0.49280, test acc 0.56690
------- -------- Epoch5/150 ------- -------
loss 1.33137, train acc 0.52464, test acc 0.52760
------- -------- Epoch6/150 ------- -------
loss 1.27236, train acc 0.54712, test acc 0.61920
------- -------- Epoch7/150 ------- -------
loss 1.22122, train acc 0.56724, test acc 0.68010
------- -------- Epoch8/150 ------- -------
loss 1.17176, train acc 0.58388, test acc 0.67640
------- -------- Epoch9/150 ------- -------
loss 1.12791, train acc 0.60124, test acc 0.69310
------- -------- Epoch10/150 ------- -------
loss 1.09250, train acc 0.61466, test acc 0.65730
------- -------- Epoch11/150 ------- -----