Copy and extract BenthicNet data to local node storage

In [None]:
import subprocess

move_script_path = "./slurm/copy_and_extract_data.sh"

# Run the copy/extract shell script. Failure is kept non-fatal (check=False),
# but a non-zero exit status is reported instead of being silently dropped so
# that later data-loading errors can be traced back to this step.
copy_result = subprocess.run(["bash", move_script_path], check=False)
if copy_result.returncode != 0:
    print(
        f"Warning: {move_script_path} exited with status "
        f"{copy_result.returncode}; the data may not have been copied."
    )

Display system GPU resources

In [None]:
import torch


def get_available_gpus():
    """Return the device names of the CUDA GPUs visible to PyTorch.

    Returns:
        A list with one name per CUDA device, ordered by device index, or an
        empty list when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return []

    device_indices = range(torch.cuda.device_count())
    return list(map(torch.cuda.get_device_name, device_indices))


# Query the GPUs once and print a short summary; `available_gpus` is kept as a
# module-level name because the parameter cell sizes `gpus` from it.
available_gpus = get_available_gpus()
if not available_gpus:
    print("No GPUs available on the system.")
else:
    print("Available GPUs:")
    for i, gpu in enumerate(available_gpus):
        # Displayed 1-based; the enumerate index itself starts at 0.
        print(f"GPU {i + 1}: {gpu}")

Set arguments and parameters

In [None]:
# --- Run identity -----------------------------------------------------------
# `name` prefixes the log directory and checkpoint file names.
name = "rn50hl_one_hot_probe"

# --- Encoder and configuration files ----------------------------------------
# `model` holds the encoder name here and is only used to derive the
# checkpoint path.
model = "hl_rn50"
model_checkpoint = f"./pretrained_encoders/{model}.ckpt"
train_cfg_path = "./cfgs/cnn/resnet50_hl.json"

# --- Data locations ---------------------------------------------------------
data_csv_path = (
    "/lustre06/project/6012565/isaacxu/benthicnet_probes/data_csv/one_hots/"
    "substrate_depth_2_data/substrate_depth_2_data.csv"
)
tar_dir = "/lustre06/project/6012565/become_labelled/compiled_labelled_512px/tar"

# Label column in the CSV; its distinct values determine the number of classes.
column = "CATAMI Substrate"

# --- Dataset options --------------------------------------------------------
one_hot = True
random_partition = False
seed = 42

# --- Training hyperparameters -----------------------------------------------
batch_size = 256
num_workers = 4
max_epochs = 100
# Layer widths passed to the model config as `dims`; the class count is added
# when the config is built.
dims = [2048]
dropout = 0.7

# --- Hardware ---------------------------------------------------------------
nodes = 1
# Use every GPU that was detected above.
gpus = len(available_gpus)

# --- Run mode ---------------------------------------------------------------
test_mode = False
fine_tune_mode = False

Prepare dataloaders

In [None]:
import ast

from omegaconf import OmegaConf

from utils.benthicnet_dataset import OneHotBenthicNetDataset, gen_datasets
from utils.utils import construct_dataloaders, get_augs, get_df, set_seed


def process_one_hot_df(data_df, col):
    """Replace each literal string in a column with its first parsed element.

    Every value in ``data_df[col]`` is parsed with ``ast.literal_eval`` and
    replaced by element ``[0]`` of the parsed result. The frame is modified in
    place and also returned.

    Args:
        data_df: DataFrame that contains the column to convert.
        col: Name of the column whose values are Python-literal strings.

    Returns:
        The same ``data_df`` object, with ``col`` overwritten.
    """

    def first_item(raw_value):
        parsed = ast.literal_eval(raw_value)
        return parsed[0]

    converted = data_df[col].apply(first_item)
    data_df[col] = converted
    return data_df


# Seed the random number generators before any data handling.
set_seed(seed)

# Load the label CSV and reduce the label column to one value per row.
data_df = process_one_hot_df(get_df(data_csv_path), column)
num_classes = len(data_df[column].unique())

# Build the layer widths without mutating the global `dims`: appending in place
# would add another `num_classes` entry every time this cell is re-run.
layer_dims = [*dims, num_classes]

# Training configuration consumed by the dataloader, model and trainer set-up.
train_cfg = {
    "backbone": {"name": "resnet50", "weights": None, "grad": True},
    "batch_size": batch_size,
    "num_workers": num_workers,
    "max_epochs": max_epochs,
    "dims": layer_dims,
    "dropout": dropout,
    "backbone_params": {"zero_init_residual": True},
    "optimizer": {
        "name": "adamw",
        "lr": 1e-5,
        "weight_decay": 1e-05,
        "extra_optimizer_args": {"betas": [0.9, 0.999]},
    },
    "scheduler": {
        "name": "warmup_cosine",
        "interval": "epoch",
        "warmup_epochs": 10,
        "warmup_start_lr": 1e-06,
        "min_lr": 1e-06,
    },
}
# Wrap the dict so entries can be read with attribute access
# (e.g. `train_kwargs.max_epochs`).
train_kwargs = OmegaConf.create(train_cfg)

# Augmentation pipelines for training and validation, with colour jitter off.
train_transform, val_transform = get_augs(colour_jitter=False)
transform = [train_transform, val_transform]

# Split the labelled data into train / validation / test datasets.
train_dataset, val_dataset, test_dataset = gen_datasets(
    data_df,
    tar_dir,
    transform,
    random_partition,
    one_hot=one_hot,
    seed=seed,
)

# Build one dataloader per split; the result is indexed in the same order as
# the datasets passed in.
dataset_splits = [train_dataset, val_dataset, test_dataset]
dataloaders = construct_dataloaders(dataset_splits, train_kwargs)

train_dataloader, val_dataloader, test_dataloader = (
    dataloaders[position] for position in range(3)
)

Prepare model

In [None]:
from utils.utils import construct_one_hot_model

# Build model
# NOTE: this rebinds `model`, which previously held the encoder-name string;
# the checkpoint path derived from that string is already in `model_checkpoint`.
model_options = {
    "enc_pth": model_checkpoint,
    "test_mode": test_mode,
    "fine_tune_mode": fine_tune_mode,
}
model = construct_one_hot_model(train_kwargs, **model_options)

Set up checkpointing and logging details

In [None]:
import os
from argparse import Namespace
from datetime import datetime

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.strategies.ddp import DDPStrategy

# Set up callbacks
# A single timestamp identifies this run in both the checkpoint directory and
# the CSV log version.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
directory_path = os.path.join("./scripts/checkpoints", timestamp)

csv_logger = CSVLogger("logs", name=name + "_logs", version=timestamp)

# Keep only the best checkpoint by validation loss, storing weights only.
# NOTE(review): with `every_n_epochs` equal to `max_epochs`, a checkpoint is
# presumably only written at the end of training - confirm this is intended.
checkpoint_callback = ModelCheckpoint(
    dirpath=directory_path,
    filename=name + "_{epoch:02d}-{val_loss:.4f}",
    save_top_k=1,
    monitor="val_loss",
    mode="min",
    every_n_epochs=train_kwargs.max_epochs,
    save_weights_only=True,
)

# Determine logging rate
total_steps_per_epoch = len(train_dataloader)
# Number of times to update logs per epoch
num_log_updates_per_epoch = 4

# Clamp to at least 1: when an epoch has fewer batches than
# `num_log_updates_per_epoch` (small sample size, large batch size) the floor
# division yields 0, which is not a usable logging interval.
log_every_n_steps = max(1, total_steps_per_epoch // num_log_updates_per_epoch)

# Automatically log learning rate
lr_monitor = LearningRateMonitor(logging_interval="epoch")

callbacks = [checkpoint_callback, lr_monitor]

# Expose the config entries as attributes for `Trainer.from_argparse_args`.
trainer_args = Namespace(**train_kwargs)

In [None]:
# Keyword arguments that are identical for the test and training trainers.
shared_trainer_kwargs = {
    "logger": csv_logger,
    "callbacks": callbacks,
    "strategy": "ddp_fork",
    "accelerator": "cuda",
    "log_every_n_steps": log_every_n_steps,
    "enable_progress_bar": True,
}

if not test_mode:
    # Training run: the test dataloader is not needed, so drop the reference.
    del test_dataloader
    trainer = Trainer.from_argparse_args(
        trainer_args,
        num_nodes=nodes,
        devices=gpus,
        **shared_trainer_kwargs,
    )

    trainer.fit(model, train_dataloader, val_dataloaders=val_dataloader)
else:
    # Evaluation run: drop the unused dataloaders and run on one node using
    # only device 0.
    del train_dataloader, val_dataloader
    trainer = Trainer.from_argparse_args(
        trainer_args,
        num_nodes=1,
        devices=[0],
        **shared_trainer_kwargs,
    )

    trainer.test(model, dataloaders=test_dataloader)