<a href="https://colab.research.google.com/github/AndromedaOMA/Advanced_Chapters_of_Neural_Network---Laboratories/blob/main/NoisyCIFAR100_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the environment requirements


## Mount drive

In [2]:
# Load the Colab Drive helper and mount Google Drive at /content/drive.
# The install cell and the experiment-config loader both read files under
# /content/drive/MyDrive, so this cell has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install pip packages


In [3]:
import subprocess
import sys

import yaml

# Project folder on the mounted Drive; the training cell also loads the
# per-experiment config files from below this path.
root_path = "/content/drive/MyDrive/NoisyCIFAR100"

with open(f"{root_path}/environment.yaml") as f:
    env = yaml.safe_load(f)

# Collect pip packages: only dict entries that carry a "pip" key contribute,
# every other dependency entry is ignored. `or` guards cover an empty file,
# a null "dependencies" value and a null "pip" list.
pip_packages = []
for dep in (env or {}).get("dependencies") or []:
    if isinstance(dep, dict) and "pip" in dep:
        pip_packages.extend(dep["pip"] or [])

# Install all pip packages at once. `sys.executable -m pip` installs into the
# interpreter running this kernel, and check=True surfaces a failed install
# here instead of as an ImportError in a later cell.
if pip_packages:
    subprocess.run([sys.executable, "-m", "pip", "install", *pip_packages], check=True)

In [4]:
!pip install detectors

Collecting detectors
  Using cached detectors-0.1.11-py3-none-any.whl.metadata (9.3 kB)
Collecting optuna (from detectors)
  Using cached optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting wilds (from detectors)
  Downloading wilds-2.0.0-py3-none-any.whl.metadata (22 kB)
Collecting faiss-cpu (from detectors)
  Using cached faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting colorlog (from optuna->detectors)
  Using cached colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting ogb>=1.2.6 (from wilds->detectors)
  Using cached ogb-1.3.6-py3-none-any.whl.metadata (6.2 kB)
Collecting outdated>=0.2.0 (from wilds->detectors)
  Using cached outdated-0.2.2-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting littleutils (from outdated>=0.2.0->wilds->detectors)
  Using cached littleutils-0.2.4-py3-none-any.whl.metadata (679 bytes)
Downloading detectors-0.1.11-py3-none-any.whl (616 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Verify the installation


In [5]:
# Print the installed `detectors` package metadata (version, location, requirements)
# to confirm the install above succeeded.
!pip show detectors

Name: detectors
Version: 0.1.11
Summary: Detectors: a python package to benchmark generalized out-of-distribution detection methods.
Home-page: https://github.com/edadaltocg/detectors
Author: Eduardo Dadalto
Author-email: edadaltocg@gmail.com
License: APACHE 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: accelerate, faiss-cpu, matplotlib, numpy, optuna, pandas, Pillow, psutil, scikit-image, scikit-learn, scipy, timm, torch, torchvision, tqdm, wilds
Required-by: 


### Import

In [6]:
import sys

import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

import sys

import torch

import sys
import torch

import timm
import sys
import detectors

import sys
import torch
# from sam import SAM

import yaml

import torch

# Preprocessing the data

In [7]:
def _build_loaders(train_dataset, test_dataset, batch_size):
    """Wrap a train/test dataset pair in DataLoaders; only the train loader shuffles."""
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
    return train_loader, test_loader


def _colour_transforms(mean, std, resize_to=None):
    """Return the (train, test) transform pair shared by the non-MNIST datasets.

    The train pipeline applies random crop (32, padding 4), horizontal flip and
    a rotation of up to 15 degrees before tensor conversion and normalisation;
    the test pipeline only converts and normalises. When ``resize_to`` is given,
    a fixed-size ``Resize`` is prepended to both pipelines.
    """
    leading = [] if resize_to is None else [transforms.Resize((resize_to, resize_to))]
    train_transformer = transforms.Compose(leading + [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    test_transformer = transforms.Compose(leading + [
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    return train_transformer, test_transformer


def preprocessing(config):
    """Build the train and test DataLoaders for the dataset named in ``config``.

    Reads ``config['dataset']['name']`` (one of 'MNIST', 'CIFAR10', 'CIFAR100',
    'OxfordIIITPet'), ``['data_dir']`` and ``['batch_size']``. Every dataset
    except MNIST additionally reads ``['mean']`` and ``['std']``, which are
    converted to lists of floats. Datasets are downloaded into
    ``<data_dir>/train`` and ``<data_dir>/test`` if not already present.

    Returns:
        tuple: ``(train_loader, test_loader)``.

    An unsupported dataset name prints a message and terminates via ``sys.exit()``.
    """
    dataset_cfg = config['dataset']
    name = dataset_cfg['name']

    if name == 'MNIST':
        train_transformer = transforms.Compose([
            transforms.RandomAffine(degrees=2, translate=[0.1, 0.1]),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.1307,), std=(0.3081,))
        ])
        test_transformer = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.1307,), std=(0.3081,))
        ])
        data_dir = dataset_cfg["data_dir"]
        train_dataset = datasets.MNIST(root=f'{data_dir}/train', train=True, transform=train_transformer, download=True)
        test_dataset = datasets.MNIST(root=f'{data_dir}/test', train=False, transform=test_transformer, download=True)
        return _build_loaders(train_dataset, test_dataset, dataset_cfg['batch_size'])

    elif name in ('CIFAR10', 'CIFAR100'):
        mean = list(map(float, dataset_cfg["mean"]))
        std = list(map(float, dataset_cfg["std"]))
        train_transformer, test_transformer = _colour_transforms(mean, std)

        # Both CIFAR classes share the same constructor signature, so the class
        # is looked up by its config name (datasets.CIFAR10 / datasets.CIFAR100).
        dataset_cls = getattr(datasets, name)
        data_dir = dataset_cfg["data_dir"]
        train_dataset = dataset_cls(root=f'{data_dir}/train', train=True, transform=train_transformer, download=True)
        test_dataset = dataset_cls(root=f'{data_dir}/test', train=False, transform=test_transformer, download=True)
        return _build_loaders(train_dataset, test_dataset, dataset_cfg['batch_size'])

    elif name == 'OxfordIIITPet':
        mean = list(map(float, dataset_cfg["mean"]))
        std = list(map(float, dataset_cfg["std"]))
        # The pet photos come in varying sizes; without a fixed-size resize the
        # test batches cannot be stacked by the default collate function.
        # NOTE(review): 32 matches the RandomCrop size used for training — confirm
        # this is the resolution the chosen model expects.
        train_transformer, test_transformer = _colour_transforms(mean, std, resize_to=32)

        # torchvision's OxfordIIITPet selects the subset with `split`
        # ('trainval' / 'test'); it has no `train` keyword.
        data_dir = dataset_cfg["data_dir"]
        train_dataset = datasets.OxfordIIITPet(root=f'{data_dir}/train', split='trainval', transform=train_transformer, download=True)
        test_dataset = datasets.OxfordIIITPet(root=f'{data_dir}/test', split='test', transform=test_transformer, download=True)
        return _build_loaders(train_dataset, test_dataset, dataset_cfg['batch_size'])

    else:
        print('The dataset name you have entered is not supported!')
        sys.exit()

# Get the Loss Function

In [8]:
def get_loss_function(name):
    """Return a freshly constructed torch loss module selected by ``name``.

    The lookup is case-insensitive. Supported names are 'CrossEntropyLoss' and
    'MSELoss'; a confirmation line is printed for the one that is loaded.
    Any other name prints an error message and terminates via ``sys.exit()``.
    """
    # Lower-cased name -> (confirmation message, loss class to instantiate).
    registry = {
        'crossentropyloss': ('CrossEntropyLoss Loss Function loaded!', torch.nn.CrossEntropyLoss),
        'mseloss': ('MSELoss Loss Function loaded!', torch.nn.MSELoss),
    }

    entry = registry.get(name.lower())
    if entry is None:
        print('The loss function name you have entered is not supported!')
        sys.exit()

    message, loss_cls = entry
    print(message)
    return loss_cls()

# Get Learning Rate Scheduler

In [9]:
def get_lr_scheduler(configs, optimizer):
    """Build the learning-rate scheduler named in ``configs['training']['scheduler']``.

    The name is matched case-insensitively:

    * 'StepLR' reads ``step_size`` and ``gamma`` from ``configs['training']``.
    * 'ReduceLROnPlateau' reads ``mode``, ``factor``, ``patience`` and ``threshold``.

    Numeric settings are cast explicitly because YAML scalars written like
    ``1e-4`` (no decimal point) are loaded by ``yaml.safe_load`` as strings;
    this mirrors the ``float(weight_decay)`` handling in ``get_optimizer``.

    Args:
        configs: Parsed experiment configuration (dict).
        optimizer: The optimizer whose learning rate is scheduled.

    Returns:
        The constructed torch scheduler. An unsupported name prints an error
        message and terminates via ``sys.exit()``.
    """
    training_cfg = configs["training"]
    name = training_cfg["scheduler"].lower()
    if name == 'steplr':
        print('StepLR Learning Rate Scheduler loaded!')
        return torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=int(training_cfg["step_size"]),
                                               gamma=float(training_cfg["gamma"]))
    elif name == 'reducelronplateau':
        print('ReduceLROnPlateau Learning Rate Scheduler loaded!')
        return torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                          mode=training_cfg["mode"],
                                                          factor=float(training_cfg["factor"]),
                                                          patience=int(training_cfg["patience"]),
                                                          threshold=float(training_cfg["threshold"]))
    else:
        print('The Learning Rate Scheduler name you have entered is not supported!')
        sys.exit()

# Get the model

In [10]:
def get_model(name, config):
    """Create the network selected by ``name`` (case-insensitive) through timm.

    Supported names: 'resnet50', 'resnet18', 'resnest14d', 'resnest26d'.
    The ``*_cifar100`` architectures are resolved by timm to factory functions
    that live in the ``detectors`` package, so ``detectors`` must be imported
    before this function is called.

    Args:
        name: Model identifier, typically ``config['model']['name']``.
        config: Parsed experiment configuration; ``config['model']['pretrained']``
            controls weight loading for every branch except 'resnet50'.

    Returns:
        The constructed model.

    Raises:
        NotImplementedError: for 'mlp', which is declared but has no implementation.

    Any other unsupported name prints an error message and terminates via ``sys.exit()``.
    """
    name = name.lower()
    if name == 'resnet50':
        # Alternative checkpoint: "hf_hub:anonauthors/cifar100-timm-resnet50"
        # (https://huggingface.co/anonauthors/cifar100-timm-resnet50).
        # NOTE(review): `pretrained` is hard-coded to True here while every other
        # branch honours config['model']['pretrained'] — confirm this is intended.
        model = timm.create_model("resnet50_cifar100", pretrained=True)     # https://huggingface.co/edadaltocg/resnet50_cifar100
        print('Model resnet50_cifar100 loaded!')
        return model
    elif name == 'resnet18':
        model = timm.create_model("resnet18_cifar100", pretrained=config['model']['pretrained'])    # https://huggingface.co/edadaltocg/resnet18_cifar100
        print('Model resnet18_cifar100 loaded!')
        return model
    elif name == 'resnest14d':
        model = timm.create_model("hf_hub:timm/resnest14d.gluon_in1k", pretrained=config['model']['pretrained'])
        print('Model resnest14d.gluon_in1k loaded!')
        return model
    elif name == 'resnest26d':
        model = timm.create_model("hf_hub:timm/resnest26d.gluon_in1k", pretrained=config['model']['pretrained'])
        print('Model resnest26d.gluon_in1k loaded!')
        return model
    elif name == 'mlp':
        # `name` is lower-cased above, so the comparison must be lower-case too;
        # fail loudly instead of returning None to a caller that chains `.to(device)`.
        raise NotImplementedError('The MLP network is not implemented yet!')
    else:
        print('The network name you have entered is not supported!')
        sys.exit()

# Get the optimizer

In [11]:
def get_optimizer(configs, params):
    """Build the optimizer named in ``configs['training']['optimizer']``.

    The name is matched case-insensitively against 'SGD', 'Adam', 'AdamW' and
    'Muon'. Hyperparameters are read from ``configs['training']``:
    ``learning_rate`` (all), ``momentum`` (SGD) and ``weight_decay``
    (SGD, AdamW, Muon). All of them are cast to ``float`` because YAML scalars
    written like ``1e-4`` (no decimal point) are loaded as strings.

    Args:
        configs: Parsed experiment configuration (dict).
        params: Iterable of parameters to optimise, e.g. ``model.parameters()``.

    Returns:
        The constructed torch optimizer. An unsupported name prints an error
        message and terminates via ``sys.exit()``.
    """
    training_cfg = configs["training"]
    name = training_cfg["optimizer"].lower()
    if name == 'sgd':
        print('SGD Optimizer loaded!')
        return torch.optim.SGD(params,
                               lr=float(training_cfg["learning_rate"]),
                               momentum=float(training_cfg["momentum"]),
                               weight_decay=float(training_cfg["weight_decay"]))
    elif name == 'adam':
        print('Adam Optimizer loaded!')
        # NOTE(review): weight_decay is not forwarded to Adam, so a swept
        # weight_decay value has no effect on Adam runs — confirm this is intended.
        return torch.optim.Adam(params, lr=float(training_cfg["learning_rate"]))
    elif name == 'adamw':
        print('AdamW Optimizer loaded!')
        return torch.optim.AdamW(params,
                                 lr=float(training_cfg["learning_rate"]),
                                 weight_decay=float(training_cfg["weight_decay"]))
    elif name == 'muon':
        print('Muon Optimizer loaded!')
        # NOTE(review): torch.optim.Muon presumably requires a recent PyTorch
        # release — verify it exists in the installed version.
        return torch.optim.Muon(params,
                                lr=float(training_cfg["learning_rate"]),
                                weight_decay=float(training_cfg["weight_decay"]))
    # TODO: SAM support is disabled (the `from sam import SAM` import is commented out);
    # it would wrap torch.optim.SGD as the base optimizer with learning_rate and momentum.
    else:
        print('The optimizer name you have entered is not supported!')
        sys.exit()

# Get YAML Configuration

In [12]:
def load_config(config_path):
    """Read the YAML file at ``config_path`` and return its parsed content.

    Parsing uses ``yaml.safe_load``; the file handle is closed before returning.
    """
    with open(config_path) as handle:
        return yaml.safe_load(handle)

# Get the Mixed Precision Gradient Scaler

In [13]:
def get_mixed_precision():
    return torch.amp.GradScaler()

# Training the model

In [14]:
import os
from tqdm import tqdm
import torch
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import pprint
import wandb

# Authenticate with Weights & Biases up front; in an interactive session this
# may prompt for an API key.
wandb.login()
# Module-level gradient scaler; train_per_epoch reads it as a global to scale
# the loss and step the optimizer.
scaler = get_mixed_precision()


def train_per_epoch(epoch):
    """Train the global ``model`` for one pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_function``,
    ``scheduler``, ``writer``, ``train_loader``, ``device`` and ``scaler``.
    Each batch runs the forward pass under autocast, back-propagates the scaled
    loss, clips the gradient norm to 1.0 and steps the optimizer through the
    scaler. The batch loss is logged to TensorBoard ("Train/Loss") and to W&B
    ("train_loss"). The scheduler is stepped once at the end of the epoch.

    Args:
        epoch: 1-based epoch index, used for the progress-bar label and to
            compute the global iteration number for TensorBoard.

    Returns:
        float: Mean of the per-batch training losses for this epoch.
    """
    global model, optimizer, loss_function, scheduler, writer, train_loader

    model.train()
    train_loss = 0.0

    for batch_index, (train_images, train_labels) in enumerate(
            tqdm(train_loader, desc=f"Epoch {epoch}")
    ):
        train_images, train_labels = train_images.to(device), train_labels.to(device)

        optimizer.zero_grad()
        # NOTE(review): the autocast device type is hard-coded to 'cuda' although
        # `device` comes from the experiment config — confirm runs are CUDA-only.
        with torch.amp.autocast(device_type='cuda'):
            outputs = model(train_images)
            loss = loss_function(outputs, train_labels)

        scaler.scale(loss).backward()
        # The gradients are still multiplied by the loss scale at this point;
        # unscale them first so that max_norm applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # Read the loss once and reuse it for both loggers and the running sum.
        batch_loss = loss.item()

        n_iter = (epoch - 1) * len(train_loader) + batch_index + 1
        writer.add_scalar("Train/Loss", batch_loss, n_iter)

        # W&B
        wandb.log({"train_loss": batch_loss})

        train_loss += batch_loss

    mean_train_loss = train_loss / len(train_loader)

    # ReduceLROnPlateau.step() requires the monitored metric; the other
    # schedulers take no argument. The epoch's mean training loss is used as the
    # metric here, so the scheduler's `mode` should be 'min'.
    if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(mean_train_loss)
    else:
        scheduler.step()

    return mean_train_loss


@torch.no_grad()
def eval_training(epoch=0):
    """Evaluate the global ``model`` on ``test_loader`` with gradients disabled.

    Prints the mean per-batch loss and the accuracy (in percent), writes both to
    TensorBoard ("Test/Loss", "Test/Accuracy") at step ``epoch`` and logs them
    to W&B as "val_loss" / "val_acc".

    Args:
        epoch: Epoch index used in the printed line and as the TensorBoard step.

    Returns:
        float: Accuracy over the whole test dataset, in percent.
    """
    global model, loss_function, test_loader, writer

    model.eval()
    loss_sum = 0.0
    n_correct = 0

    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        logits = model(images)
        loss_sum += loss_function(logits, labels).item()

        # A prediction is correct when the highest-scoring class equals the label.
        hits = logits.argmax(1) == labels
        n_correct += hits.sum().item()

    n_batches = len(test_loader)
    n_samples = len(test_loader.dataset)
    avg_loss = loss_sum / n_batches
    accuracy = 100.0 * n_correct / n_samples

    print(f"Epoch {epoch}: Test Loss {avg_loss:.3f}, Accuracy {accuracy:.3f}%")

    for tag, value in (("Test/Loss", avg_loss), ("Test/Accuracy", accuracy)):
        writer.add_scalar(tag, value, epoch)

    wandb.log({"val_loss": avg_loss, "val_acc": accuracy})

    return accuracy


def sweep_train():
    """Run one W&B sweep trial: build the pipeline, train, evaluate, checkpoint.

    Reads the module-level ``config`` and overrides ``learning_rate``,
    ``weight_decay``, ``batch_size`` and ``optimizer`` with the values chosen
    by the sweep (``wandb.config``). It then (re)creates the global
    ``train_loader``, ``test_loader``, ``model``, ``loss_function``,
    ``optimizer``, ``scheduler``, ``writer`` and ``device`` that
    ``train_per_epoch`` and ``eval_training`` rely on.

    After every epoch the model is evaluated; whenever the accuracy improves,
    the state dict is saved under the experiment's ``checkpoints`` directory.
    The TensorBoard writer is closed even if training raises.
    """
    global model, train_loader, test_loader, writer, loss_function, optimizer, scheduler, device

    run = wandb.init(project="NoisyCIFAR100")
    config_wb = wandb.config

    device = config['experiment']['device']

    experiment_number = config['experiment']['number']

    # override with sweep hyperparameters
    config['training']['learning_rate'] = config_wb.learning_rate
    config['training']['weight_decay'] = config_wb.weight_decay
    config['dataset']['batch_size'] = config_wb.batch_size
    config['training']['optimizer'] = config_wb.optimizer

    train_loader, test_loader = preprocessing(config)
    model = get_model(config['model']['name'], config).to(device)
    loss_function = get_loss_function(config["training"]["loss_function"])
    optimizer = get_optimizer(config, model.parameters())
    scheduler = get_lr_scheduler(config, optimizer)

    # NOTE(review): these paths are relative to the current working directory,
    # not to the Drive folder the config is loaded from — confirm results and
    # checkpoints end up where they are expected to persist.
    log_dir = f'../experiments/experiment{experiment_number}/results'
    checkpoint_dir = f'../experiments/experiment{experiment_number}/checkpoints'
    os.makedirs(log_dir, exist_ok=True)

    writer = SummaryWriter(
        os.path.join(log_dir, datetime.now().strftime(config["experiment"]["date_format"]))
    )

    try:
        best_acc = 0
        for epoch in range(1, config['training']['epochs'] + 1):
            train_per_epoch(epoch)
            acc = eval_training(epoch)

            if acc > best_acc:
                best_acc = acc
                os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(model.state_dict(), os.path.join(
                    checkpoint_dir, f'best_model_{best_acc}.pth'
                ))
    finally:
        # Flush and release the event file even when an epoch fails, so the
        # partial TensorBoard log of a crashed trial is not lost.
        writer.close()

    # Only reached on success; a failing trial propagates its exception to the caller.
    run.finish()


if __name__ == '__main__':
    # Ask which experiment to run and load its config from the Drive project folder.
    print("Type the number of the experiment you want to run:")
    experiment_number = int(input())
    config = load_config(f"{root_path}/experiment{experiment_number}/config.yml")

    # pprint on a pre-formatted f-string only line-wraps the string's repr;
    # print the label separately and pretty-print the dict itself.
    print("Sweep configuration:")
    pprint.pprint(config['sweep'])

    # Register the sweep with W&B, then let the agent call sweep_train once per trial.
    sweep_id = wandb.sweep(config['sweep'], project="NoisyCIFAR100")
    wandb.agent(sweep_id, function=sweep_train)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmarius-alexandru-olaru[0m ([33mmarius-alexandru-olaru-fii-uaic[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Type the number of the experiment you want to run:
1
("Sweep configuration: {'method': 'bayes', 'metric': {'name': 'val_acc', "
 "'goal': 'maximize'}, 'parameters': {'learning_rate': {'distribution': "
 "'uniform', 'min': '1e-4', 'max': '3e-3'}, 'batch_size': {'values': [32, 64, "
 "128]}, 'weight_decay': {'distribution': 'uniform', 'min': 0.0, 'max': "
 "0.005}, 'optimizer': {'values': ['SGD', 'Adam', 'AdamW']}}}")
Create sweep with ID: 8lfbxp0y
Sweep URL: https://wandb.ai/marius-alexandru-olaru-fii-uaic/NoisyCIFAR100/sweeps/8lfbxp0y


[34m[1mwandb[0m: Agent Starting Run: et4y8sbl with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.0008017152659061111
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	weight_decay: 0.0038541705822874145


100%|██████████| 169M/169M [00:05<00:00, 30.9MB/s]
100%|██████████| 169M/169M [00:04<00:00, 35.9MB/s]


Downloading: "https://huggingface.co/edadaltocg/resnet50_cifar100/resolve/main/pytorch_model.bin" to /root/.cache/torch/hub/checkpoints/resnet50_cifar100.pth


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/wandb/agents/pyagent.py", line 297, in _run_job
    self._function()
  File "/tmp/ipython-input-941492003.py", line 94, in sweep_train
    model = get_model(config['model']['name'], config).to(device)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-3748646692.py", line 5, in get_model
    model = timm.create_model("resnet50_cifar100", pretrained=True)     # https://huggingface.co/edadaltocg/resnet50_cifar100
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/timm/models/_factory.py", line 138, in create_model
    model = create_fn(
            ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/detectors/models/resnet.py", line 210, in resnet50_cifar100
    return _create_resnet_small("resnet50_cifar100", features_dim=2048, pretrained=pretrained, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

[34m[1mwandb[0m: [32m[41mERROR[0m Run et4y8sbl errored: HTTP Error 429: Too Many Requests
[34m[1mwandb[0m: Agent Starting Run: 7ym414pw with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	learning_rate: 0.0006228157055957308
[34m[1mwandb[0m: 	optimizer: SGD
[34m[1mwandb[0m: 	weight_decay: 0.00021143341900904775


Downloading: "https://huggingface.co/edadaltocg/resnet50_cifar100/resolve/main/pytorch_model.bin" to /root/.cache/torch/hub/checkpoints/resnet50_cifar100.pth


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/wandb/agents/pyagent.py", line 297, in _run_job
    self._function()
  File "/tmp/ipython-input-941492003.py", line 94, in sweep_train
    model = get_model(config['model']['name'], config).to(device)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-3748646692.py", line 5, in get_model
    model = timm.create_model("resnet50_cifar100", pretrained=True)     # https://huggingface.co/edadaltocg/resnet50_cifar100
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/timm/models/_factory.py", line 138, in create_model
    model = create_fn(
            ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/detectors/models/resnet.py", line 210, in resnet50_cifar100
    return _create_resnet_small("resnet50_cifar100", features_dim=2048, pretrained=pretrained, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

[34m[1mwandb[0m: [32m[41mERROR[0m Run 7ym414pw errored: HTTP Error 429: Too Many Requests
[34m[1mwandb[0m: Agent Starting Run: 311lftvj with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.002575775054348585
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	weight_decay: 0.002810919414539855


Downloading: "https://huggingface.co/edadaltocg/resnet50_cifar100/resolve/main/pytorch_model.bin" to /root/.cache/torch/hub/checkpoints/resnet50_cifar100.pth


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/wandb/agents/pyagent.py", line 297, in _run_job
    self._function()
  File "/tmp/ipython-input-941492003.py", line 94, in sweep_train
    model = get_model(config['model']['name'], config).to(device)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-3748646692.py", line 5, in get_model
    model = timm.create_model("resnet50_cifar100", pretrained=True)     # https://huggingface.co/edadaltocg/resnet50_cifar100
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/timm/models/_factory.py", line 138, in create_model
    model = create_fn(
            ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/detectors/models/resnet.py", line 210, in resnet50_cifar100
    return _create_resnet_small("resnet50_cifar100", features_dim=2048, pretrained=pretrained, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

[34m[1mwandb[0m: [32m[41mERROR[0m Run 311lftvj errored: HTTP Error 429: Too Many Requests
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true
