In [8]:
import torch 
from torch import nn

from typing import List, Optional

from climex.data.data_loader import CIFAR10DataModule

from climex.models.ResSKBlock import ResSKBlock
from climex.models.SKConv import SKConv

# Data review

In [2]:
data = CIFAR10DataModule("/CIFAR10/datasets/raw")

In [3]:
data.prepare_data()
data.setup()
data = data.train_dataloader()

Files already downloaded and verified
Files already downloaded and verified
None


In [4]:
dt = data.dataset.dataset

In [5]:
dt.data.shape

(50000, 32, 32, 3)

In [6]:
for i,j in data:
    print(i.shape)
    print(j.shape)
    break

torch.Size([256, 3, 32, 32])
torch.Size([256])


#

Data shapes is 255 3 32 32 (b,c,H,H) and 50000 3 32 32 in original dataset

# SK Res Block

In [9]:
from cifar_cnn.models.ResSKBlock import ResSKBlock

In [10]:
from typing import Tuple
from torch import nn
import torch
import lightning as L
from torchmetrics.functional import accuracy, auroc
from torch.nn import functional as F

In [11]:
BATCH_SIZE = 256 if torch.cuda.is_available() else 64

class CIFAR10Model(L.LightningModule):

    def __init__(self,num_classes,lr):
        super().__init__()
        self.learning_rate = lr
        self.num_classes = num_classes
        self.cnn_relu_seq = nn.Sequential(
            nn.Conv2d(3,16,5),
            nn.ReLU(),
            ResSKBlock(in_channels=16,out_channels=16*2,groups = 4),
            nn.ReLU(),
        )

        self.lin_layer_seq = nn.Sequential(
            nn.Linear(32*28*28,12544),
            nn.ReLU(),
            nn.Linear(12544,6272),
            nn.ReLU(),
            nn.Linear(6272,784),
            nn.ReLU(),
            nn.Linear(784,10)
        )
        

    def forward(self,x):
        conv_res = self.cnn_relu_seq(x)
        flattened = conv_res.view(conv_res.size(0), -1)
        lin_res = self.lin_layer_seq(flattened)
        return F.log_softmax(lin_res,dim=1)

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_nb: int) -> torch.Tensor:
        x,y = batch
        loss = F.cross_entropy(self(x), y)
        preds = self(x)

        """
        metrics
        """
        rocauc = auroc(preds, y, task="multiclass",num_classes=self.num_classes)
        self.log("train_rocauc", rocauc, prog_bar=True)
        acc = accuracy(preds, y, task="multiclass",num_classes=self.num_classes)
        self.log("train_accuracy", acc, prog_bar=True)
        return {'loss': loss, 'prediction': preds}

    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        """
        metrics
        """
        rocauc = auroc(logits,y,task="multiclass",num_classes=self.num_classes)
        acc = accuracy(logits, y, task="multiclass",num_classes=self.num_classes)
        self.log("val_accuracy", acc, prog_bar=True)
        self.log("val_rocauc",rocauc,prog_bar=True)
        self.log("val_loss", loss, prog_bar=True)

    def test_step(self,batch: Tuple[torch.Tensor, torch.Tensor], batch_idx):
        x, y = batch
        logits = self(x)

        test_loss = F.cross_entropy(logits, y)
        """
        metrics
        """
        rocauc = auroc(logits,y,task="multiclass",num_classes=self.num_classes)
        acc = accuracy(logits, y, task="multiclass",num_classes=self.num_classes)
        self.log("test_accuracy", acc, prog_bar=True)
        self.log("test_rocauc",rocauc,prog_bar=True)
        self.log("test_loss", test_loss, prog_bar=True)
        
    
    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), self.learning_rate,momentum=0.9)
        scheduler = {
            "scheduler": torch.optim.lr_scheduler.OneCycleLR(
                optimizer=optimizer,
                max_lr=1e-2,
                epochs=self.trainer.max_epochs,
                steps_per_epoch = 50000 // BATCH_SIZE),
            "interval": "step"
        }
        return {"optimizer": optimizer, "lr_scheduler":  scheduler}

In [12]:
from typing import List, Union
from cifar_cnn.data.data_loader import CIFAR10DataModule
import torch
import lightning as L
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import LearningRateMonitor
import wandb

ACCELERATOR = "gpu" if torch.cuda.is_available() else "cpu"
LOGGER = WandbLogger(log_model=True)
LOGGER.experiment.config.update({"architecture": "resnet", "batch_size": 256})


class ImageCallback(L.Callback):
    def __init__(self) -> None:
        super().__init__()
        self.outputs = None
        self.x = None
        self.y = None


    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        if batch_idx == trainer.num_training_batches-1:
            self.x, self.y = batch
            self.outputs = torch.argmax(outputs["prediction"],dim=1)


    def on_train_epoch_end(self, trainer, pl_module):
        n = 10
        x, y = self.x, self.y

        images = [img for img in x[:n]]
        captions = [f'Target: {y_i} - Prediction: {y_pred}' 
            for y_i, y_pred in zip(y[:n], self.outputs[:n])]

        trainer.logger.log_image(
                key='sample_images', 
                images=images, 
                caption=captions)


callbacks =[
    LearningRateMonitor(logging_interval='step'),
    ImageCallback()  
]

   
def train(epoch: int = 45,
          device: str = "auto",
          lr: float = 2e-3,
          path: str  = "/CIFAR10/datasets/raw") -> None:

    data_module = CIFAR10DataModule(path)
    model_module = CIFAR10Model(num_classes=data_module.num_classes,lr=lr)
    try:
        trainer = L.Trainer(
            accelerator=ACCELERATOR,
            devices=device,
            max_epochs=epoch,
            logger=LOGGER,
            callbacks= callbacks
        )

        trainer.fit(model_module, datamodule = data_module)           
        trainer.test(model_module,datamodule = data_module)

        wandb.finish(0)
    except RuntimeError:
        wandb.finish(1)


if __name__ == "__main__":
    train(epoch=8,lr=1e-5)
        

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified


You are using a CUDA device ('NVIDIA GeForce RTX 4060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


TrainerFn.FITTING


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type       | Params
---------------------------------------------
0 | cnn_relu_seq  | Sequential | 15.7 K
1 | lin_layer_seq | Sequential | 398 M 
---------------------------------------------
398 M     Trainable params
0         Non-trainable params
398 M     Total params
1,593.361 Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/root/miniconda3/envs/dl_env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
  return F.conv2d(input, weight, bias, self.stride,


                                                                           

/root/miniconda3/envs/dl_env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 7: 100%|██████████| 176/176 [00:44<00:00,  3.99it/s, v_num=6bty, train_rocauc=1.000, train_accuracy=0.985, val_accuracy=0.685, val_rocauc=0.948, val_loss=1.130]

`Trainer.fit` stopped: `max_epochs=8` reached.


Epoch 7: 100%|██████████| 176/176 [00:55<00:00,  3.17it/s, v_num=6bty, train_rocauc=1.000, train_accuracy=0.985, val_accuracy=0.685, val_rocauc=0.948, val_loss=1.130]
Files already downloaded and verified
Files already downloaded and verified
TrainerFn.TESTING


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/root/miniconda3/envs/dl_env/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 40/40 [00:05<00:00,  6.97it/s]




────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.6758000254631042
        test_loss            1.149863362312317
       test_rocauc          0.9462881088256836
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────




0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█
lr-SGD,▁▂▃▄▅▆▇██████▇▇▆▆▆▅▄▄▃▃▂▂▂▁▁
test_accuracy,▁
test_loss,▁
test_rocauc,▁
train_accuracy,▁▂▃▃▄▄▄▅▄▅▅▅▆▆▅▆▆▆▆▇▇▇▇█▇███
train_rocauc,▁▃▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇██████████
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_accuracy,▁▄▅▆▆▆▇█
val_loss,█▄▂▁▁▃▂▁

0,1
epoch,8.0
lr-SGD,0.00052
test_accuracy,0.6758
test_loss,1.14986
test_rocauc,0.94629
train_accuracy,0.98438
train_rocauc,0.99982
trainer/global_step,1408.0
val_accuracy,0.6846
val_loss,1.12585
