In [1]:
import os
import re
import PIL
import torch
import wandb
import torch.nn.functional as F
import numpy as np
from torch import nn
from pathlib import Path
from sklearn.utils.class_weight import compute_class_weight
from pytorch_lightning import LightningModule, Trainer, LightningDataModule
from pytorch_lightning.loggers import WandbLogger
from torchvision import transforms
from torchmetrics.functional import accuracy as tm_acc
from PIL import Image
from fastai.vision.all import *

device = torch.device('cuda')

In [5]:
# use fastai to load data with dataloader
class CustomTransform(Transform):
    def __init__(self, p=0.5):
        self.p = p
        # import dinov2 for embedding creation
        dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14') # dinov2_vits14, dinov2_vitb14, dinov2_vitl14, dinov2_vitg14
        self.dinov2 = dinov2.to(device)

    def encodes(self, x: PIL.Image.Image):
        if random.random() < self.p:
            transform = transforms.Compose([
                transforms.Resize(280),
                transforms.CenterCrop(280),
                transforms.ToTensor(),
                transforms.Normalize(*imagenet_stats)# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])
            x = transform(x) # this has to be applied first
            x = x.to(device)
            # apply your custom transformation here
            with torch.no_grad():
                embedding = self.dinov2(x.unsqueeze(0))
            # x = x
        return embedding.squeeze(0)

def get_images(dataset_path, batch_size, img_size, seed, subfolders=('train','valid')):
    "The beetles dataset"
    files = get_image_files(path=dataset_path, recurse=True, folders=subfolders)
    item_tfms = [CustomTransform(p=1)]
    
    dblock = DataBlock(blocks = (ImageBlock, CategoryBlock),
                       get_items = get_image_files,
                       splitter = GrandparentSplitter(train_name=subfolders[0], valid_name=subfolders[1]),
                       get_y = parent_label,
                       item_tfms = item_tfms, # resize trasnformation is applied during inference too                                    
                      )
    dls = dblock.dataloaders(dataset_path, bs = batch_size, num_workers=0)
    return dls

class FastaiDataModule(LightningDataModule):
    def __init__(self, train_loader, val_loader, test_loader):
        super().__init__()
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.val_loader

    def test_dataloader(self):
        return self.test_loader

class MLP(LightningModule):
    def __init__(self, input_size, num_classes, weight, hidden_layers=None, learning_rate=1e-3, label_smoothing=0.1):
        super().__init__()
        layers = []
        if hidden_layers:
            layers.append(nn.Linear(input_size, hidden_layers[0]))
            layers.append(nn.ReLU())
            for i in range(len(hidden_layers)-1):
                layers.append(nn.Linear(hidden_layers[i], hidden_layers[i+1]))
                layers.append(nn.ReLU())
            layers.append(nn.Linear(hidden_layers[-1], num_classes))
        else:
            layers.append(nn.Linear(input_size, num_classes))
        
        self.layers = nn.Sequential(*layers)
        self.learning_rate = learning_rate
        self.label_smoothing = label_smoothing
        self.weight = weight
        self.predictions = []
        self.labels = []

    def forward(self, x):
        x = self.layers(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, reduction='mean', label_smoothing=self.label_smoothing, weight=self.weight)
        preds = torch.argmax(y_hat, dim=1)
        acc = tm_acc(preds, y)
        self.log('train_acc', acc)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, reduction='mean', label_smoothing=self.label_smoothing, weight=self.weight)
        preds = torch.argmax(y_hat, dim=1)
        acc = tm_acc(preds, y)
        self.predictions.append(preds)
        self.labels.append(y)
        
        # Log loss and metric
        self.log('val_acc', acc)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
    
    def get_predictions_and_labels(self):
        return self.predictions, self.labels

def sweep_function():
    # initialize wandb run
    run = wandb.init()
    # Load data
    dls = get_images(dataset_path=r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data", batch_size=wandb.config.batch_size, img_size=280, seed=42, subfolders=('train','valid'))
    embedding_size = dls.one_batch()[0].shape[1]
    labels = np.array([re.split(r'/|\\', str(x))[-2] for x in dls.items])
    classes = np.unique(labels)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
    class_weights = {c: w for c, w in zip(classes, weights)}
    weights = tensor([class_weights[c] for c in dls.vocab]).to(dls.device)
    # Create a wandb logger
    wandb_logger = WandbLogger(project='DINOv2_sweep')
    # Create a trainer and pass the wandb logger
    trainer = Trainer(max_epochs=wandb.config.epochs, logger=wandb_logger)
    # Get the dataloaders in the correct format
    data_module = FastaiDataModule(train_loader=dls.train, val_loader=dls.valid, test_loader=dls.valid)
    # Define linear NN model
    model = MLP(input_size=embedding_size, num_classes=len(classes), weight=weights, hidden_layers=None, learning_rate=wandb.config.learning_rate, label_smoothing=wandb.config.label_smoothing)
    # Fit the model
    trainer.fit(model, datamodule=data_module)
    # finish wandb run
    run.finish()

# Sweep to tune parameters

In [None]:
# Define sweep configuration
sweep_config = {
  'method': 'bayes',
  'metric': {
    'name': 'val_loss',
    'goal': 'minimize'  
  },
  'parameters': {
      'learning_rate': {'min':1e-8,'max': 1e-1},
      'batch_size': {'values': [8, 16, 32, 64, 128, 256, 512, 1024, 2048]},
      'label_smoothing': {'min': 0.01, 'max': 0.99},
      'epochs': {'min': 1, 'max': 10}
  }
}

# Initialize sweep
sweep_id = wandb.sweep(sweep_config)
sweep_id = "v46lz29w"
# Run sweep agent
wandb.agent(sweep_id, function=sweep_function)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: v46lz29w
Sweep URL: https://wandb.ai/christopher-marais/uncategorized/sweeps/v46lz29w


[34m[1mwandb[0m: Agent Starting Run: up63omd0 with config:
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	epochs: 7
[34m[1mwandb[0m: 	label_smoothing: 0.6626195003940363
[34m[1mwandb[0m: 	learning_rate: 0.005711904853087127
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mchristopher-marais[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using cache found in /home/gmarais/.cache/torch/hub/facebookresearch_dinov2_main
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [7]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 12.3 K
--------------------------------------
12.3 K    Trainable params
0         Non-trainable params
12.3 K    Total params
0.049     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 24/24 [11:23<00:00, 28.48s/it, v_num=omd0]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0:  12%|█▎        | 1/8 [00:00<00:00, 66.74it/s][A
Validation DataLoader 0:  25%|██▌       | 2/8 [00:27<01:21, 13.54s/it][A
Validation DataLoader 0:  38%|███▊      | 3/8 [00:54<01:30, 18.16s/it][A
Validation DataLoader 0:  50%|█████     | 4/8 [01:21<01:21, 20.41s/it][A
Validation DataLoader 0:  62%|██████▎   | 5/8 [01:55<01:09, 23.03s/it][A
Validation DataLoader 0:  75%|███████▌  | 6/8 [02:22<00:47, 23.67s/it][A
Validation DataLoader 0:  88%|████████▊ | 7/8 [02:49<00:24, 24.24s/it][A
Epoch 0: 100%|██████████| 24/24 [14:54<00:00, 37.28s/it, v_num=omd0]t][A
Epoch 1: 100%|██████████| 24/24 [11:39<00:00, 29.15s/it, v_num=omd0]  [A
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/8 [00:00<?, ?it/s][A
Validation DataLoader 0: 

`Trainer.fit` stopped: `max_epochs=7` reached.


Epoch 6: 100%|██████████| 24/24 [14:37<00:00, 36.54s/it, v_num=omd0]


0,1
epoch,▁▂▃▃▅▆▆▇██
train_acc,▁▅█
train_loss,▅█▁
trainer/global_step,▁▂▂▃▅▅▆▇▇█
val_acc,▁▅▇▇███
val_loss,█▂▁▁▁▁▁

0,1
epoch,6.0
train_acc,0.97754
train_loss,2.45347
trainer/global_step,167.0
val_acc,0.96197
val_loss,2.65149


[34m[1mwandb[0m: Agent Starting Run: et4xn11n with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	label_smoothing: 0.057627711072090915
[34m[1mwandb[0m: 	learning_rate: 0.030176510405410757
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cache found in /home/gmarais/.cache/torch/hub/facebookresearch_dinov2_main
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [7]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 12.3 K
--------------------------------------
12.3 K    Trainable params
0         Non-trainable params
12.3 K    Total params
0.049     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 0: 100%|██████████| 97/97 [11:20<00:00,  7.02s/it, v_num=n11n]       
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/30 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/30 [00:00<?, ?it/s][A
Validation DataLoader 0:   3%|▎         | 1/30 [00:00<00:00, 66.16it/s][A
Validation DataLoader 0:   7%|▋         | 2/30 [00:07<01:46,  3.82s/it][A
Validation DataLoader 0:  10%|█         | 3/30 [00:14<02:12,  4.90s/it][A
Validation DataLoader 0:  13%|█▎        | 4/30 [00:22<02:26,  5.62s/it][A
Validation DataLoader 0:  17%|█▋        | 5/30 [00:29<02:25,  5.81s/it][A
Validation DataLoader 0:  20%|██        | 6/30 [00:35<02:22,  5.95s/it][A
Validation DataLoader 0:  23%|██▎       | 7/30 [00:42<02:18,  6.04s/it][A
Validation DataLoader 0:  27%|██▋       | 8/30 [00:49<02:16,  6.20s/it][A
Validation DataLoader 0:  30%|███       | 9/30 [00:56<02:11,  6.25s/it][A
Validation DataLoader 0:  33%|███▎      | 10/30 [01:02<02:05,  6.29s/it][A
Validation DataLoa

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 97/97 [14:41<00:00,  9.08s/it, v_num=n11n]


0,1
epoch,▁▁
train_acc,▁
train_loss,▁
trainer/global_step,▁█
val_acc,▁
val_loss,▁

0,1
epoch,0.0
train_acc,0.97656
train_loss,2.02256
trainer/global_step,96.0
val_acc,0.95768
val_loss,0.90767


[34m[1mwandb[0m: Agent Starting Run: nv24nguy with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	label_smoothing: 0.44890641967500666
[34m[1mwandb[0m: 	learning_rate: 0.0184930750109463
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Using cache found in /home/gmarais/.cache/torch/hub/facebookresearch_dinov2_main
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [7]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 12.3 K
--------------------------------------
12.3 K    Trainable params
0         Non-trainable params
12.3 K    Total params
0.049     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Epoch 0: 100%|██████████| 390/390 [11:28<00:00,  1.77s/it, v_num=nguy]     
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/117 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/117 [00:00<00:01, 77.38it/s][A
Validation DataLoader 0:   2%|▏         | 2/117 [00:01<01:37,  1.18it/s][A
Validation DataLoader 0:   3%|▎         | 3/117 [00:03<02:07,  1.12s/it][A
Validation DataLoader 0:   3%|▎         | 4/117 [00:05<02:24,  1.28s/it][A
Validation DataLoader 0:   4%|▍         | 5/117 [00:06<02:31,  1.35s/it][A
Validation DataLoader 0:   5%|▌         | 6/117 [00:08<02:35,  1.40s/it][A
Validation DataLoader 0:   6%|▌         | 7/117 [00:10<02:37,  1.43s/it][A
Validation DataLoader 0:   7%|▋         | 8/117 [00:11<02:38,  1.46s/it][A
Validation DataLoader 0:   8%|▊         | 9/117 [00:13<02:39,  1.48s/it][A
Validation DataLoader 0:   9%|▊         | 10/117 [00:15<02:40,  1.50s/it][A
Valida

# Single training run

In [None]:
# define parameters
batch_size=512
learning_rate=1e-3
label_smoothing=0.1
max_epochs=5

# prepare datalaodersby extracting features using DINOv2
dls = get_images(dataset_path=r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data", batch_size=batch_size, img_size=280, seed=42, subfolders=('train','valid'))
embedding_size = dls.one_batch()[0].shape[1]
labels = np.array([re.split(r'/|\\', str(x))[-2] for x in dls.items])
classes = np.unique(labels)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
class_weights = {c: w for c, w in zip(classes, weights)}
weights = tensor([class_weights[c] for c in dls.vocab]).to(dls.device)

# create a wandb logger
wandb_logger = WandbLogger(project='DINOv2_single')

# create a trainer and pass the wandb logger
trainer = Trainer(logger=wandb_logger)

# get the dataloaders in the correct format
data_module = FastaiDataModule(train_loader=dls.train, val_loader=dls.valid, test_loader=dls.valid)

# define linear NN model
model = MLP(input_size=embedding_size, num_classes=len(classes), weight=weights, hidden_layers=None, learning_rate=learning_rate, label_smoothing=label_smoothing) # the dinov2 output is of shape 384

# give details on trainer
trainer = Trainer(max_epochs=max_epochs, logger=wandb_logger, log_every_n_steps=8)

# fit the model
trainer.fit(model, datamodule=data_module)

# 5-Fold Cross validation

In [None]:
# define parameters
batch_size=512
learning_rate=1e-3
label_smoothing=0.1
max_epochs=5

# CV training
predictions_lst = []
labels_lst = []
for i in range(5):
    dls = get_images(dataset_path=r"/blue/hulcr/gmarais/Beetle_data/selected_images/train_data", batch_size=batch_size, img_size=280, seed=42, subfolders=('train_'+str(i),'valid_'+str(i)))
    embedding_size = dls.one_batch()[0].shape[1]
    labels = np.array([re.split(r'/|\\', str(x))[-2] for x in dls.items])
    classes = np.unique(labels)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
    class_weights = {c: w for c, w in zip(classes, weights)}
    weights = tensor([class_weights[c] for c in dls.vocab]).to(dls.device)

    # create a wandb logger
    wandb_logger = WandbLogger(project='DINOv2_CV')

    # create a trainer and pass the wandb logger
    trainer = Trainer(logger=wandb_logger)

    # get the dataloaders in the correct format
    data_module = FastaiDataModule(train_loader=dls.train, val_loader=dls.valid, test_loader=dls.valid)

    # define linear NN model
    model = MLP(input_size=embedding_size, num_classes=len(classes), weight=weights, hidden_layers=None, learning_rate=learning_rate, label_smoothing=label_smoothing) # the dinov2 output is of shape 384

    # give details on trainer
    trainer = Trainer(max_epochs=max_epochs, logger=wandb_logger)

    # fit the model
    trainer.fit(model, datamodule=data_module)
    predictions, labels = model.get_predictions_and_labels()
    predictions_lst.append(predictions)
    labels_lst.append(labels)