## imports


In [2]:
import pytorch_lightning as pl
from tqdm.notebook import tqdm, trange
from pytorch_lightning.strategies import DDPStrategy
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.transforms as transforms
import numpy as np
import time

  warn(f"Failed to load image Python extension: {e}")


## parameters 

In [3]:
parameter= { "name": "ConvMixer",
             "batch-size": 1,
             "scale": 0.75,
             "reprob": 0.25,
             "ra-m": 8,
             "ra-n": 1,
             "jitter": 0.1,
             
             "hdim":    128,
             "depth":   4,
             "psize":   1,
             "conv-ks": 8,

             "wd" : 0.01,
             "epochs" : 200,
             "lr-max": 0.01,
             "workers": 32,
}

## model

In [4]:
class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x):
        return self.fn(x) + x


def ConvMixer(dim, depth, kernel_size=5, patch_size=2, n_classes=10):
    return nn.Sequential(
        nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size),
        nn.GELU(),
        nn.BatchNorm2d(dim),
        *[nn.Sequential(
                Residual(nn.Sequential(
                    nn.Conv2d(dim, dim, kernel_size, groups=dim, padding="same"),
                    nn.GELU(),
                    nn.BatchNorm2d(dim)
                )),
                nn.Conv2d(dim, dim, kernel_size=1),
                nn.GELU(),
                nn.BatchNorm2d(dim)
        ) for i in range(depth)],
        nn.AdaptiveAvgPool2d((1,1)),
        nn.Flatten(),
        nn.Linear(dim, n_classes)
    )

In [5]:
class ConvMixerModule(pl.LightningModule):
    def __init__(self, checkpoint=None):
        super().__init__()
        
        self.model = ConvMixer(parameter['hdim'], parameter['depth'], patch_size=parameter['psize'], kernel_size=parameter['conv-ks'], n_classes=10)
        self.lossFunction= nn.CrossEntropyLoss()
        self.train_loss=0
        self.train_acc=0
        if(checkpoint is not None):
             self.load_state_dict(checkpoint)
                
    def forward (self, x):
        return self.model(x)
        
    def training_step(self, batch, batch_idx):
        x, y= batch    
        print(f"y={y} y.shape={y.shape} dtype={y.dtype}")
        with torch.cuda.amp.autocast():
            x_predicted= self(x)  
            loss = self.lossFunction(x_predicted, y)
        print(f"x_predicted= {x_predicted}, y= {y}")
        self.log('train_loss', loss)
        
    #    self.train_loss = loss.item() * y.size(0)
    #    self.train_acc  = (x_predicted.max(1)[1] == y).sum().item()ff
    #    print(f"train loss = {self.train_loss}, train_acc={self.train_acc}")
        return loss


    def test_step(self, batch, batch_idx):
        test_acc, m = 0, 0
        x, y= batch   
        with torch.cuda.amp.autocast():
            output = self.model(x)
        test_acc += (output.max(1)[1] == y).sum().item()
        m += y.size(0)
        return  test_acc/ m

    def configure_optimizers(self):
        step_size_up= (parameter["batch-size"] + parameter["epochs"])*0.4 

        optimizer = optim.AdamW(self.parameters(), lr=parameter['lr-max'], weight_decay=parameter['wd'])
        lr_scheduler = {
            'scheduler' : optim.lr_scheduler.CyclicLR(optimizer=optimizer, base_lr=parameter['lr-max']/20.0, 
                            max_lr=parameter['lr-max'], step_size_up=step_size_up, cycle_momentum=False),
            'name': 'lr_monitor'}
        return ([optimizer], [lr_scheduler])
        
        
    
    def save(self, path= '/models'):
        torch.save(self.state_dict(), path)

## dataset

In [6]:
class cifar10Datamodule(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.cifar10_mean= (0.4914, 0.4822, 0.4465)
        self.cifar10_std = (0.2471, 0.2435, 0.2616)
        self.batch_size=parameter['batch-size']
        self.num_workers=parameter['workers']

        self.train_transform = transforms.Compose([
            transforms.RandomResizedCrop(32, scale=(parameter['scale'], 1.0), ratio=(1.0, 1.0)),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandAugment(num_ops=parameter['ra-n'], magnitude=parameter['ra-m']),
            transforms.ColorJitter(parameter['jitter'], parameter['jitter'], parameter['jitter']),
            transforms.ToTensor(),
            transforms.Normalize(self.cifar10_mean, self.cifar10_std),
            transforms.RandomErasing(p=parameter['reprob'])
            ])
        self.test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(self.cifar10_mean, self.cifar10_std)
            ])

    def setup(self, stage=None):
        self.trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=self.train_transform)
        self.testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=self.test_transform)  

    def train_dataloader(self):
        
          # Generating train_dataloader
        return DataLoader(self.trainset, 
                            batch_size= self.batch_size,
                            shuffle=True, 
                            num_workers=self.num_workers)

    def test_dataloader(self):
        
        # Generating test_dataloader
        return DataLoader(self.testset, 
                            batch_size= self.batch_size,
                            shuffle=True, 
                            num_workers=self.num_workers)

Files already downloaded and verified
<class 'int'>


## training

In [8]:
model  = ConvMixerModule()
data= cifar10Datamodule()
#trainer = Trainer(callbacks=[lr_monitor]) TODO
lr_monitor = LearningRateMonitor(logging_interval='epoch')
trainer = pl.Trainer(gpus=1, max_epochs=parameter["epochs"], progress_bar_refresh_rate=20, callbacks=[lr_monitor])
trainer.fit(model,data )


  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name         | Type             | Params
--------------------------------------------------
0 | model        | Sequential       | 103 K 
1 | lossFunction | CrossEntropyLoss | 0     
--------------------------------------------------
103 K     Trainable params
0         Non-trainable params
103 K     Total params
0.414     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

y=tensor([7], device='cuda:0') y.shape=torch.Size([1]) dtype=torch.int64


  return F.conv2d(input, weight, bias, self.stride,


x_predicted= tensor([[-0.0706,  0.0283,  0.0028,  0.0353,  0.0818, -0.0310,  0.0626, -0.0312,
         -0.0216, -0.0844]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddmmBackward0>), y= tensor([7], device='cuda:0')
y=tensor([1], device='cuda:0') y.shape=torch.Size([1]) dtype=torch.int64
x_predicted= tensor([[-0.0713,  0.0275,  0.0018,  0.0348,  0.0810, -0.0313,  0.0620, -0.0278,
         -0.0223, -0.0852]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddmmBackward0>), y= tensor([1], device='cuda:0')
y=tensor([4], device='cuda:0') y.shape=torch.Size([1]) dtype=torch.int64
x_predicted= tensor([[-0.0720,  0.0293,  0.0011,  0.0340,  0.0798, -0.0322,  0.0609, -0.0262,
         -0.0228, -0.0861]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddmmBackward0>), y= tensor([4], device='cuda:0')
y=tensor([9], device='cuda:0') y.shape=torch.Size([1]) dtype=torch.int64
x_predicted= tensor([[-0.0729,  0.0302,  0.0004,  0.0332,  0.0807, -0.0328,  0.0601, -0.0254,
      

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
