In [1]:
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torchsummary import summary
from torch.utils.data import DataLoader
 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os 

%matplotlib inline
%config InlineBackend.figure_format = 'retina'



In [2]:
!pip install einops

Collecting einops
  Downloading https://files.pythonhosted.org/packages/1e/00/919421f097de2a6ca2d9b4d9f3f596274e44c243a6ecca210cd0811032c0/einops-0.3.2-py3-none-any.whl
Installing collected packages: einops
Successfully installed einops-0.3.2


In [3]:
from einops import rearrange, repeat
from einops.layers.torch import Rearrange

In [4]:
# helpers

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# classes

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

In [5]:
class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head *  heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
model = ViT(image_size = 32,
    patch_size = 32,
    num_classes = 10,
    dim = 128,
    depth = 6,
    heads = 16,
    mlp_dim = 256,
    dropout = 0.1,
    emb_dropout = 0.1)
summary(model.to(device), (3,32,32))

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1, 128]              --
|    └─Rearrange: 2-1                    [-1, 1, 3072]             --
|    └─Linear: 2-2                       [-1, 1, 128]              393,344
├─Dropout: 1-2                           [-1, 2, 128]              --
├─Transformer: 1-3                       [-1, 2, 128]              --
├─Identity: 1-4                          [-1, 128]                 --
├─Sequential: 1-5                        [-1, 10]                  --
|    └─LayerNorm: 2-3                    [-1, 128]                 256
|    └─Linear: 2-4                       [-1, 10]                  1,290
Total params: 394,890
Trainable params: 394,890
Non-trainable params: 0
Total mult-adds (M): 7.87
Input size (MB): 0.01
Forward/backward pass size (MB): 0.00
Params size (MB): 1.51
Estimated Total Size (MB): 1.52


Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1, 128]              --
|    └─Rearrange: 2-1                    [-1, 1, 3072]             --
|    └─Linear: 2-2                       [-1, 1, 128]              393,344
├─Dropout: 1-2                           [-1, 2, 128]              --
├─Transformer: 1-3                       [-1, 2, 128]              --
├─Identity: 1-4                          [-1, 128]                 --
├─Sequential: 1-5                        [-1, 10]                  --
|    └─LayerNorm: 2-3                    [-1, 128]                 256
|    └─Linear: 2-4                       [-1, 10]                  1,290
Total params: 394,890
Trainable params: 394,890
Non-trainable params: 0
Total mult-adds (M): 7.87
Input size (MB): 0.01
Forward/backward pass size (MB): 0.00
Params size (MB): 1.51
Estimated Total Size (MB): 1.52

In [8]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
        Run one train epoch
    """
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    for i, (input, target) in enumerate(train_loader):

        input_var = input.cuda()
        target_var = target.cuda()

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        output = output.float()
        loss = loss.float()
        # measure accuracy and record loss
        acc = accuracy(output.data, target_var)[0]
        losses.update(loss.item(), input.size(0))
        top1.update(acc.item(), input.size(0))

    print(f'Epoch: [{epoch}]\t Loss {losses.val:.4f} ({losses.avg:.4f})\t acc {top1.val:.3f} ({top1.avg:.3f})')
    return top1.avg, losses.avg

In [9]:
def testing(val_loader, model, criterion):
    """
    Run evaluation
    """
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            input_var = input.cuda()
            target_var = target.cuda()

            # compute output
            output = model(input_var)
            loss = criterion(output, target_var)

            output = output.float()
            loss = loss.float()

            # measure accuracy and record loss and acc
            acc = accuracy(output.data, target_var)[0]
            losses.update(loss.item(), input.size(0))
            top1.update(acc.item(), input.size(0))


    print(f'Test\t  accuracy: {top1.avg:.3f} (Err: {losses.avg:.3f} )\n')

    return top1.avg, losses.avg

In [10]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=3):
    batch_size = target.size(0)

    _, pred = output.topk(topk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    correct_k = correct[:topk].reshape(-1).float().sum(0, keepdim=True)
    res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [12]:
train_loader = DataLoader(dataset=datasets.CIFAR10(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
, batch_size=128, shuffle=True, num_workers=2, pin_memory=True)


test_loader = DataLoader(
        datasets.CIFAR10(root='dataset/', train=False, transform=transforms.Compose([
            transforms.ToTensor()
        ])),
        batch_size=128, shuffle=False,
        num_workers=2, pin_memory=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to dataset/cifar-10-python.tar.gz


HBox(children=(IntProgress(value=0, max=170498071), HTML(value='')))


Extracting dataset/cifar-10-python.tar.gz to dataset/


In [14]:
def main(model):
    best_acc = 0

    model.cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), .1, momentum=.9,
                                weight_decay=1e-4)
    # print('Training {} model'.format(args.arch))
    test_accs, test_losses = [], []
    train_accs, train_losses = [], []
    for epoch in range(0,400):

        train_acc, train_loss = train(train_loader, model, criterion, optimizer, epoch)
        train_accs.append(train_acc)
        train_losses.append(train_loss)

        # evaluate on validation set
        acc, loss = testing(test_loader, model, criterion)
        test_accs.append(acc)
        test_losses.append(loss)

    return {"best_acc": max(test_accs), "test_accs":test_accs, "test_losses": test_losses, "train_acss": train_accs, "train_losses": train_losses}

In [17]:
if __name__ == '__main__':
    model_info = main(model)
    model_info.cuda()
    print(model_info)

Epoch: [0]	 Loss 1.2837 (1.2310)	 acc 85.000 (85.546)
Test	  accuracy: 82.610 (Err: 1.350 )

Epoch: [1]	 Loss 1.2353 (1.2246)	 acc 87.500 (85.546)
Test	  accuracy: 82.690 (Err: 1.374 )

Epoch: [2]	 Loss 1.0628 (1.2170)	 acc 90.000 (85.790)
Test	  accuracy: 81.810 (Err: 1.392 )

Epoch: [3]	 Loss 1.2177 (1.1966)	 acc 86.250 (86.158)
Test	  accuracy: 82.770 (Err: 1.357 )

Epoch: [4]	 Loss 1.0561 (1.1900)	 acc 91.250 (86.332)
Test	  accuracy: 82.540 (Err: 1.370 )

Epoch: [5]	 Loss 1.1811 (1.1830)	 acc 90.000 (86.514)
Test	  accuracy: 83.570 (Err: 1.337 )

Epoch: [6]	 Loss 1.1555 (1.1781)	 acc 86.250 (86.690)
Test	  accuracy: 83.420 (Err: 1.332 )

Epoch: [7]	 Loss 1.2718 (1.1624)	 acc 85.000 (86.946)
Test	  accuracy: 83.140 (Err: 1.355 )

Epoch: [8]	 Loss 1.1678 (1.1539)	 acc 83.750 (87.252)
Test	  accuracy: 82.790 (Err: 1.376 )

Epoch: [9]	 Loss 1.3423 (1.1448)	 acc 78.750 (87.340)
Test	  accuracy: 83.910 (Err: 1.322 )

Epoch: [10]	 Loss 0.9938 (1.1383)	 acc 95.000 (87.716)
Test	  accuracy

AttributeError: 'dict' object has no attribute 'cuda'