In [None]:

import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import wandb
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer

In [2]:
import os
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import pytorch_lightning as pl

class Nature12KDataModule(pl.LightningDataModule):
    def __init__(self, data_dir="../../inaturalist_12K", batch_size=64, image_size=(512, 512), data_aug=False):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.image_size = image_size
        self.data_aug = data_aug

    @staticmethod
    def get_transform(image_size, data_aug=False):
        transform_list = [transforms.Resize(image_size)]

        if data_aug:
            transform_list += [
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(15),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2)
            ]

        transform_list += [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]

        return transforms.Compose(transform_list)

    def setup(self, stage=None):
        train_transform = self.get_transform(self.image_size, self.data_aug)
        test_transform = self.get_transform(self.image_size, False)

        # Load train and test datasets
        full_train = datasets.ImageFolder(os.path.join(self.data_dir, "train"), transform=train_transform)
        test_set = datasets.ImageFolder(os.path.join(self.data_dir, "test"), transform=test_transform)

        # Split train into train + val (80-20 split)
        val_size = int(0.2 * len(full_train))
        train_size = len(full_train) - val_size
        self.train_set, self.val_set = random_split(full_train, [train_size, val_size])

        self.test_set = test_set
        self.class_names = full_train.classes

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size=self.batch_size, shuffle=False, num_workers=2)

    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size=self.batch_size, shuffle=False, num_workers=2)


In [3]:
class CNN(pl.LightningModule):
    def __init__(self,
                 input_channels,
                 conv_filters,
                 kernel_sizes,
                 activation,
                 dense_neurons,
                 num_classes,
                 lr,
                 batch_norm=False,        # ← ADD THIS
                 dropout=0.0):            # ← AND THIS IF NOT PRESENT
        super().__init__()
        self.save_hyperparameters()


        self.activation_fn = self._get_activation_fn(activation)

        # Conv Layers
        layers = []
        in_channels = input_channels
        for out_channels, ksize in zip(conv_filters, kernel_sizes):
            layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=ksize, padding=ksize // 2))
    
            if batch_norm:
                layers.append(nn.BatchNorm2d(out_channels))
    
            layers.append(self._get_activation_fn(activation))
            layers.append(nn.MaxPool2d(2, 2))

            if dropout > 0:
                layers.append(nn.Dropout2d(dropout))
    
            in_channels = out_channels

        
        self.conv_blocks = nn.Sequential(*layers)

        # Flattened dim
        with torch.no_grad():
            dummy_input = torch.zeros(1, input_channels, 256, 256)
            dummy_output = self.conv_blocks(dummy_input)
            flatten_dim = dummy_output.view(1, -1).shape[1]

        # Fully connected
        self.classifier = nn.Sequential(
            nn.Linear(flatten_dim, dense_neurons),
            self.activation_fn,
            nn.Linear(dense_neurons, num_classes)
        )

        self.loss_fn = nn.CrossEntropyLoss()
        self.lr = lr

    def _get_activation_fn(self, name):
        name = name.lower()
        if name == 'relu':
            return nn.ReLU()
        elif name == 'gelu':
            return nn.GELU()
        elif name == 'silu':
            return nn.SiLU()
        elif name == 'mish':
            return nn.Mish()
        else:
            raise ValueError(f"Unsupported activation: {name}")

    def forward(self, x):
        x = self.conv_blocks(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log("train_loss", loss)
        self.log("train_acc", acc)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        # self.log("test_loss", loss)
        # self.log("test_acc", acc)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


In [4]:
def train():
    wandb.init()
    config = wandb.config

    data_module = Nature12KDataModule(
        data_dir="../../inaturalist_12K",  # ✅ Correct path here
        batch_size=config.batch_size,
        image_size=(256, 256),
        data_aug=config.data_augmentation
    )

    data_module.prepare_data()
    data_module.setup()

    model = CNN(
        input_channels=3,
        conv_filters=config.conv_filters,
        kernel_sizes=config.kernel_sizes,
        activation=config.activation,
        dense_neurons=config.dense_neurons,
        num_classes=len(data_module.class_names),
        lr=config.lr,
        batch_norm=config.batch_norm,
        dropout=config.dropout
    )

    wandb_logger = WandbLogger(project=wandb.run.project, name=wandb.run.name)

    trainer = pl.Trainer(
        max_epochs=5,
        accelerator="auto",
        devices="auto",
        log_every_n_steps=10,
        logger=wandb_logger  # ✅ Correct logger
    )

    print("🚀 Training model...")
    trainer.fit(model, data_module.train_dataloader(), data_module.val_dataloader())

    print("🧪 Evaluating on test set...")
    trainer.test(model, data_module.test_dataloader())


In [5]:
def launch_sweep():
    sweep_config = {
        'method': 'bayes',
        'metric': {
            'name': 'val_acc',
            'goal': 'maximize'
        },
        'parameters': {
            'conv_filters': {
                'values': [[32,64 , 128]]
            },
            'kernel_sizes': {
                'values': [
                    [3, 3, 3],
                ]
            },
            'activation': {
                'values': ['relu', 'gelu', 'silu', 'mish']
            },
            'dense_neurons': {
                'values': [512, 256]
            },
            'lr': {
                'min': 0.01,
                'max': 0.1
            },
            'batch_norm': {
                'values': [True, False]
            },
            'dropout': {
                'values': [0.2, 0.3]
            },
            'batch_size': {
                'values': [8]
            },
            'data_augmentation': {
                'values': [True, False]
            }
        }
    }

    sweep_id = wandb.sweep(sweep_config, project='iNaturalist_CNN_Sweep')
    wandb.agent(sweep_id, function=train, count=50)
    # 6mpxfky1

In [None]:
launch_sweep()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: 3pzarrcm
Sweep URL: https://wandb.ai/da24s019-indian-institute-of-technology-madras/iNaturalist_CNN_Sweep/sweeps/3pzarrcm


[34m[1mwandb[0m: Agent Starting Run: h9uoeuoi with config:
[34m[1mwandb[0m: 	activation: mish
[34m[1mwandb[0m: 	batch_norm: False
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	conv_filters: [32, 64, 128]
[34m[1mwandb[0m: 	data_augmentation: False
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	kernel_sizes: [3, 3, 3]
[34m[1mwandb[0m: 	lr: 0.03444723990965818
[34m[1mwandb[0m: Currently logged in as: [33mda24s019[0m ([33mda24s019-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/user/anaconda3/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


🚀 Training model...



  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | activation_fn | Mish             | 0      | train
1 | conv_blocks   | Sequential       | 93.2 K | train
2 | classifier    | Sequential       | 33.6 M | train
3 | loss_fn       | CrossEntropyLoss | 0      | train
-----------------------------------------------------------
33.7 M    Trainable params
0         Non-trainable params
33.7 M    Total params
134.602   Total estimated model params size (MB)
18        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


🧪 Evaluating on test set...


Testing: |          | 0/? [00:00<?, ?it/s]

0,1
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆█████
train_acc,▃▃▃▆▆▁▁▆▁▁▁▃▁▁▃▁▆▃▁▁▃▃▆█▃▁▃▃▁▁▆▁▁▃▃▃▁▃▃▃
train_loss,▅▅▄██▇▅▇▆▅▅▅▅▆▄▆▆▄▇▃▆▆▅▅▁▄▆▇▄▄▅▃▆▇▆▅▆█▄▅
trainer/global_step,▁▁▁▁▁▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆█████
val_acc,▂▄▁█▂
val_loss,▇▃█▄▁

0,1
epoch,4.0
train_acc,0.0
train_loss,2.39292
trainer/global_step,4999.0
val_acc,0.09655
val_loss,2.30862


[34m[1mwandb[0m: Agent Starting Run: s1w7lggt with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	batch_norm: True
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	conv_filters: [32, 64, 128]
[34m[1mwandb[0m: 	data_augmentation: True
[34m[1mwandb[0m: 	dense_neurons: 256
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	kernel_sizes: [3, 3, 3]
[34m[1mwandb[0m: 	lr: 0.031067390713818963


You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/user/anaconda3/lib/python3.12/site-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | activation_fn | GELU             | 0      | train
1 | conv_blocks   | Sequential       | 93.7 K | train
2 | classifier    | Sequential       | 33.6 M | train
3 | loss_fn       | CrossEntropyLoss | 0      | train
-----------------------------------------------------------
33.7 M    Trainable params
0         No

🚀 Training model...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7d8199d70e00>> (for post_run_cell), with arguments args (<ExecutionResult object at 7d819c18acf0, execution_count=6 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7d819c2b1b80, raw_cell="launch_sweep()" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/home/user/Documents/Books/DA24S019_Assignment_2/Part_A/train2.ipynb#W5sZmlsZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

Exception in thread Thread-9 (_run_job):
Traceback (most recent call last):
  File "/home/user/anaconda3/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1251, in _try_get_data
    data = self._data_queue.get(timeout=timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/anaconda3/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/anaconda3/lib/python3.12/site-packages/torch/multiprocessing/reductions.py", line 541, in rebuild_storage_fd
    fd = df.detach()
         ^^^^^^^^^^^
  File "/home/user/anaconda3/lib/python3.12/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/anaconda3/lib/python3.12/multiprocessing/resource_sharer.py", line 86, in get_connection
    c = Client(address, authkey=process.current_proce