<a href="https://colab.research.google.com/github/Abudhagir/EVA8/blob/main/S10/S10_start_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import time
import argparse

In [2]:
class Residual(nn.Module):
  def __init__(self, fn):
    super().__init__()
    self.fn = fn 
  def forward(self, x):
    return self.fn(x) + x


def ConvMixer(dim, depth, kernel_size=5, patch_size=2, n_classes=10):
  return nn.Sequential(
      nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size),
      nn.GELU(),
      nn.BatchNorm2d(dim),
      *[nn.Sequential(
          Residual(nn.Sequential(
              nn.Conv2d(dim, dim, kernel_size, groups=dim, padding="same"),
              nn.GELU(),
              nn.BatchNorm2d(dim)
          )),
          nn.Conv2d(dim, dim, kernel_size=1),
          nn.GELU(),
          nn.BatchNorm2d(dim)
      ) for i in range(depth)],
      nn.AdaptiveAvgPool2d((1, 1)),
      nn.Flatten(),
      nn.Linear(dim, n_classes)
  )

In [3]:
cifar10_mean = (0.4914, 0.4822, 0.4465)
cifar10_std = (0.2471, 0.2435, 0.2616)

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(32, scale=(0.75, 1.0), ratio = (1.0, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5), 
    transforms.RandAugment(num_ops=1, magnitude=8),
    transforms.ColorJitter(0.1, 0.1, 0.1),
    transforms.ToTensor(),
    transforms.Normalize(cifar10_mean, cifar10_std),
    transforms.RandomErasing(p=0.25)
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(cifar10_mean, cifar10_std)
])

epochs = 25
batch_size = 512

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=train_transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=4)

testset = torchvision.datasets.CIFAR10(root='.data', train=False,
                                        download=True, transform=test_transform)

testloader = torch.utils.data.DataLoader(testset, batch_size=64,
                                        shuffle=False, num_workers=4)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:14<00:00, 11446315.52it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data




Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to .data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:13<00:00, 12897367.60it/s]


Extracting .data/cifar-10-python.tar.gz to .data


In [4]:
torch.cuda.get_device_name(0)

'Tesla T4'

In [5]:
lr_scheduler = lambda t: np.interp([t], [0, epochs*2//5, epochs*4//5, epochs],
                                    [0,0.01,0.01/20.0,0])[0]

depth = 10
hdim = 256
psize = 2
conv_ks = 5
clip_norm = True

model = ConvMixer(hdim, depth, patch_size=psize, kernel_size=conv_ks, n_classes=10)
model = nn.DataParallel(model, device_ids=[0]).cuda()

opt = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

for epoch in range(epochs):
    start = time.time()
    train_loss, train_acc, n = 0, 0, 0
    for i, (X,y) in enumerate(trainloader):
        model.train()
        X, y = X.cuda(), y.cuda()

        lr = lr_scheduler(epoch + (i +1)/len(trainloader))
        opt.param_groups[0].update(lr=lr)

        opt.zero_grad()
        with torch.cuda.amp.autocast():
            output = model(X)
            loss = criterion(output, y)

        scaler.scale(loss).backward()
        if clip_norm:
            scaler.unscale_(opt)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        scaler.step(opt)
        scaler.update()

        train_loss += loss.item() * y.size(0)
        train_acc += (output.max(1)[1] == y).sum().item()
        n += y.size(0)

    model.eval()
    test_acc, m = 0, 0

    with torch.no_grad():
        for i, (X,y) in enumerate(testloader):
            X,y = X.cuda(), y.cuda()
            with torch.cuda.amp.autocast():
                output = model(X)

            test_acc += (output.max(1)[1] == y).sum().item()
            m += y.size(0)

    print(f'ConvMixer: Epoch: {epoch} | Train Acc: {train_acc/n:.4f}, Test Acc: {test_acc/m:.4f}, Time: {time.time() - start:.1f}, lr: {lr:.6f}')


ConvMixer: Epoch: 0 | Train Acc: 0.3440, Test Acc: 0.5052, Time: 58.6, lr: 0.001000
ConvMixer: Epoch: 1 | Train Acc: 0.5468, Test Acc: 0.5940, Time: 55.0, lr: 0.002000
ConvMixer: Epoch: 2 | Train Acc: 0.6435, Test Acc: 0.6913, Time: 56.4, lr: 0.003000
ConvMixer: Epoch: 3 | Train Acc: 0.7016, Test Acc: 0.7066, Time: 56.5, lr: 0.004000
ConvMixer: Epoch: 4 | Train Acc: 0.7394, Test Acc: 0.7450, Time: 56.0, lr: 0.005000
ConvMixer: Epoch: 5 | Train Acc: 0.7606, Test Acc: 0.7678, Time: 56.0, lr: 0.006000
ConvMixer: Epoch: 6 | Train Acc: 0.7806, Test Acc: 0.7962, Time: 56.7, lr: 0.007000
ConvMixer: Epoch: 7 | Train Acc: 0.7939, Test Acc: 0.7931, Time: 55.8, lr: 0.008000
ConvMixer: Epoch: 8 | Train Acc: 0.8035, Test Acc: 0.8144, Time: 56.0, lr: 0.009000
ConvMixer: Epoch: 9 | Train Acc: 0.8133, Test Acc: 0.8229, Time: 56.7, lr: 0.010000
ConvMixer: Epoch: 10 | Train Acc: 0.8251, Test Acc: 0.8296, Time: 55.7, lr: 0.009050
ConvMixer: Epoch: 11 | Train Acc: 0.8413, Test Acc: 0.8508, Time: 55.6, lr: