In [1]:
from torchvision.transforms import transforms
from torch import optim
import pytorch_lightning as pl
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from baal.active import ActiveLearningDataset, get_heuristic
from baal.bayesian.dropout import patch_module
from baal.utils.pytorch_lightning import (
    ActiveLightningModule,
    ResetCallback,
    BaalTrainer,
    BaaLDataModule,
)
from src.dataset.dataset import Dataset
from src.dataset.utils import train_test_validation_split
from torchvision.models import vgg16


In [2]:
class Cifar10DataModule(BaaLDataModule):
    """Baal data module built on the project's NPY-backed ``Dataset``.

    Despite its name, this module loads the arrays found under the given NPY
    directories rather than CIFAR-10; the class name is kept so that existing
    callers keep working.

    The train split is wrapped in an ``ActiveLearningDataset`` (exposed as
    ``self.active_set`` and, through the parent class, ``self.active_dataset``)
    and the test split is stored on ``self.test_set``.

    Args:
        path_to_npy_data: Directory handed to ``Dataset`` for the inputs.
        path_to_npy_targets: Directory handed to ``Dataset`` for the labels.
        batch_size: Batch size forwarded to the parent data module and used by
            the dataloaders defined here.
        num_workers: Number of worker processes per ``DataLoader``. Pass ``0``
            while debugging so that collate errors are raised in the main
            process instead of inside a worker.
    """

    def __init__(
        self,
        path_to_npy_data="data/NPY/volumes/",
        path_to_npy_targets="data/NPY/labels/",
        batch_size=2,
        num_workers=4,
    ):
        # NOTE(review): no resize/transform is applied to the samples here.
        # Default collation needs every sample of a batch to have the same
        # shape -- confirm that ``Dataset`` already yields equally sized items
        # (e.g. 256x256), or give it a transform.
        dataset = Dataset(
            path_to_npy_data=path_to_npy_data,
            path_to_npy_targets=path_to_npy_targets,
        )
        # The validation split is produced by the helper but not used here.
        train_ds, self.test_set, _valid_ds = train_test_validation_split(dataset)
        self.active_set = ActiveLearningDataset(
            train_ds,
        )
        super().__init__(
            active_dataset=self.active_set,
            batch_size=batch_size,
        )
        self.num_workers = num_workers

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """Return a shuffled loader over the active (labelled) dataset."""
        return DataLoader(
            self.active_dataset,
            self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def test_dataloader(self, *args, **kwargs) -> DataLoader:
        """Return an ordered (non-shuffled) loader over the held-out test split."""
        return DataLoader(
            self.test_set,
            self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )

In [3]:
class VGG16(ActiveLightningModule):
    """VGG16 classifier prepared for MC-Dropout active learning with Baal.

    Keyword arguments given to the constructor are stored as Lightning
    hyper-parameters. Any of the keys listed in ``_DEFAULT_HPARAMS`` that is
    not supplied falls back to the value given there, so ``VGG16()`` yields a
    fully configured module.
    """

    # Fallbacks for hyper-parameters this module (or, for ``iterations`` and
    # ``replicate_in_memory``, Baal's MC inference -- see the Baal Lightning
    # documentation) reads from ``self.hparams``.
    # ``learning_rate`` falls back to the value that used to be hard-coded in
    # the optimizer (1e-4); note that the CLI default below is 0.001.
    _DEFAULT_HPARAMS = {
        "num_classes": 3,
        "learning_rate": 1e-4,
        "iterations": 1,
        "replicate_in_memory": False,
    }

    def __init__(self, **kwargs):
        super().__init__()
        # Fill in missing hyper-parameters *before* saving them:
        # ``save_hyperparameters()`` captures the current content of ``kwargs``.
        for key, value in self._DEFAULT_HPARAMS.items():
            kwargs.setdefault(key, value)
        self.save_hyperparameters()
        self.name = "VGG16"
        self.version = "0.0.1"
        self.criterion = CrossEntropyLoss()
        self._build_model()

    def _build_model(self):
        """Create the VGG16 network and patch its dropout layers."""
        # We use `patch_module` to swap Dropout modules in the model
        # for our implementation which enables MC-Dropout.
        self.vgg16 = patch_module(vgg16(num_classes=self.hparams.num_classes))

    def forward(self, x):
        """Run the patched VGG16 on ``x`` and return its output."""
        return self.vgg16(x)

    def _loss_on_batch(self, batch):
        """Unpack ``(x, y)``, run the forward pass and return the criterion value."""
        x, y = batch
        y_hat = self(x)
        return self.criterion(y_hat, y)

    def training_step(self, batch, batch_idx):
        """
        Lightning calls this inside the training loop
        :param batch: pair ``(inputs, targets)``
        :param batch_idx: index of the batch (unused)
        :return: the training loss, also logged as ``train_loss``
        """
        loss_val = self._loss_on_batch(batch)
        self.log("train_loss", loss_val, prog_bar=True, on_epoch=True)
        return loss_val

    def test_step(self, batch, batch_idx):
        """
        Lightning calls this inside the test loop
        :param batch: pair ``(inputs, targets)``
        :param batch_idx: index of the batch (unused)
        :return: the test loss, also logged as ``test_loss``
        """
        loss_val = self._loss_on_batch(batch)
        self.log("test_loss", loss_val, prog_bar=True, on_epoch=True)
        return loss_val

    def configure_optimizers(self):
        """
        Build the SGD optimizer (momentum 0.9, weight decay 5e-4) using the
        ``learning_rate`` hyper-parameter.
        :return: list of optimizers and an empty list of schedulers
        """
        optimizer = optim.SGD(
            self.parameters(),
            lr=self.hparams.learning_rate,
            momentum=0.9,
            weight_decay=5e-4,
        )
        return [optimizer], []

    @staticmethod
    def _str2bool(value):
        """Convert a command-line token to ``bool``.

        ``type=bool`` must not be used with argparse: ``bool("False")`` is
        ``True`` because every non-empty string is truthy.

        :raises ValueError: if the token is not a recognised boolean spelling
            (argparse turns this into a regular usage error).
        """
        if isinstance(value, bool):
            return value
        token = str(value).strip().lower()
        if token in ("1", "true", "t", "yes", "y", "on"):
            return True
        # The empty string stays falsy, as it was with ``bool("")``.
        if token in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise ValueError(f"expected a boolean value, got {value!r}")

    @classmethod
    def add_model_specific_args(cls, parser):
        """Register the model's command-line options on ``parser`` and return it."""
        parser.add_argument("--num_classes", type=int, default=3)
        parser.add_argument("--learning_rate", type=float, default=0.001)
        parser.add_argument(
            "--iterations", type=int, default=1, help="Number of MC-Sampling to perform"
        )
        parser.add_argument("--replicate_in_memory", type=cls._str2bool, default=False)
        parser.add_argument("--batch_size", type=int, default=2)
        return parser

In [4]:
from argparse import ArgumentParser
def parse_arguments(argv=None):
    """Build the experiment's command-line parser and parse ``argv``.

    The model options registered by ``VGG16.add_model_specific_args`` are
    inherited through ``parents`` and extended with the active-learning
    options defined below.

    Args:
        argv: Sequence of argument strings to parse. ``None`` (the default)
            keeps the previous behaviour of reading ``sys.argv``. Inside a
            notebook pass an explicit list (e.g. ``[]``), because the
            kernel's own command-line flags are not known to this parser.

    Returns:
        The populated ``argparse.Namespace``.
    """
    model_parser = ArgumentParser()
    model_parser = VGG16.add_model_specific_args(model_parser)
    # The ``-h`` option comes from the parent parser, hence ``add_help=False``.
    parser = ArgumentParser(
        parents=[model_parser], conflict_handler="resolve", add_help=False
    )
    parser.add_argument("--heuristic", type=str, default="bald", help="Which heuristic to use.")
    parser.add_argument("--data_root", type=str, default="/tmp", help="Where to store data.")
    parser.add_argument(
        "--query_size", type=int, default=20, help="How many items to label per step."
    )
    parser.add_argument(
        "--training_duration", type=int, default=1, help="How many epochs per step."
    )
    parser.add_argument("--gpus", type=int, default=1, help="How many GPUs to use.")
    return parser.parse_args(argv)

In [5]:
import copy

def main(
    al_steps=100,
    initial_labels=100,
    query_size=20,
    epochs_per_step=1,
    mc_iterations=20,
    seed=42,
):
    """Run the Baal active-learning experiment with the VGG16 model.

    Each step trains on the currently labelled samples, evaluates on the test
    split and then asks the trainer to label the next ``query_size`` samples
    chosen by the BALD heuristic.

    Args:
        al_steps: Maximum number of active-learning steps.
        initial_labels: Number of samples labelled at random before step 0.
        query_size: Number of samples labelled after every step.
        epochs_per_step: ``max_epochs`` given to the trainer.
        mc_iterations: Number of MC-Dropout forward passes per pool sample,
            stored as the model's ``iterations`` hyper-parameter.
        seed: Global random seed.
    """
    pl.seed_everything(seed)

    # Create our dataset and label a first random subset.
    datamodule = Cifar10DataModule()
    datamodule.active_dataset.label_randomly(initial_labels)

    # Get our heuristic to compute uncertainty.
    heuristic = get_heuristic("bald", shuffle_prop=0.0, reduction="none")

    # Baal's ActiveLightningModule reads ``hparams.iterations`` and
    # ``hparams.replicate_in_memory`` when it runs MC inference on the pool
    # (see the Baal Lightning docs), so they are passed explicitly instead of
    # relying on command-line parsing, which is not used in the notebook.
    model = VGG16(iterations=mc_iterations, replicate_in_memory=False)

    # Make our PL Trainer
    trainer = BaalTrainer(
        # The weights of the model will change as it gets
        # trained; we need to keep a copy (deepcopy) so that
        # we can reset them.
        callbacks=[ResetCallback(copy.deepcopy(model.state_dict()))],
        dataset=datamodule.active_dataset,
        max_epochs=epochs_per_step,
        heuristic=heuristic,
        query_size=query_size,
    )

    for al_step in range(al_steps):
        print(f"Step {al_step} Dataset size {len(datamodule.active_dataset)}")
        trainer.fit(model, datamodule=datamodule)  # Train the model on the labelled set.
        trainer.test(model, datamodule=datamodule)  # Get test performance.
        # Label the top-k most uncertain examples; a falsy result stops the loop.
        should_continue = trainer.step(model, datamodule=datamodule)
        if not should_continue:
            break

In [6]:
# Change the kernel's working directory to the parent folder -- presumably so
# that the relative paths used later (e.g. "data/NPY/...") resolve from the
# repository root; confirm.
# NOTE: this is not idempotent: re-running the cell moves up one more level.
%cd ..

c:\Users\Bastian\Documents\Master Mathematik\MasterArbeit\Deep Bayesian Active Learning for Covid-19 Diagnosis\Deep-Bayesian-Active-Learning-for-Covid-19


In [7]:

# Launch the active-learning experiment defined in main().
main()

Global seed set to 42


4082


It seems that data augmentation is not disabled when iterating on the pool.
You can disable it by overriding attributes using `pool_specifics` 
when instantiating ActiveLearningDataset.
Example:
```
from torchvision.transforms import *
train_transform = Compose([Resize((224, 224)), RandomHorizontalFlip(),
                            RandomRotation(30), ToTensor()])
test_transform = Compose([Resize((224, 224)),ToTensor()])
dataset = CIFAR10(..., transform=train_transform)

al_dataset = ActiveLearningDataset(dataset,
                                    pool_specifics={'transform': test_transform})
```   

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Step 0 Dataset size 100


  rank_zero_deprecation(
  rank_zero_deprecation(

  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | vgg16     | VGG              | 134 M 
-----------------------------------------------
134 M     Trainable params
0         Non-trainable params
134 M     Total params
537.091   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "c:\Users\Bastian\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "c:\Users\Bastian\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in fetch
    return self.collate_fn(data)
  File "c:\Users\Bastian\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 172, in default_collate
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "c:\Users\Bastian\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 172, in <listcomp>
    return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
  File "c:\Users\Bastian\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\collate.py", line 137, in default_collate
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


In [None]:
# Debugging aid: re-display the traceback of the most recent exception.
# NOTE(review): leftover from investigating a failed run; consider removing
# it from the finished notebook.
%tb

In [None]:
class ImageDataModule(ImageClassificationData):
    """Image classification data module whose class count is fixed to three.

    NOTE(review): ``ImageClassificationData`` is not imported in the visible
    notebook cells -- presumably ``flash.image.ImageClassificationData``
    (Lightning Flash); confirm and add the import.
    """

    @property
    def num_classes(self) -> int:
        """Number of target classes; always 3 for this module."""
        return 3


def get_data_module(heuristic):
    """Create the active-learning data module for the NPY dataset.

    The project ``Dataset`` is split by ``train_test_validation_split``; the
    train and test parts are wrapped in an ``ImageDataModule`` (batch size 2,
    ``image_size=(256, 256)``), which is then wrapped in an
    ``ActiveLearningDataModule`` starting with 1024 labels and querying 100
    samples at a time, without a validation split.

    :param heuristic: name forwarded to ``get_heuristic``
    :return: the configured ``ActiveLearningDataModule``
    :raises AssertionError: if the resulting module reports no test set
    """
    full_dataset = Dataset(
        path_to_npy_data="data/NPY/volumes/",
        path_to_npy_targets="data/NPY/labels/",
    )
    # The third (validation) part of the split is not used here.
    train_part, test_part, _validation_part = train_test_validation_split(
        dataset=full_dataset
    )

    base_dm = ImageDataModule.from_datasets(
        train_dataset=train_part,
        test_dataset=test_part,
        transform_kwargs={"image_size": (256, 256)},
        batch_size=2,
    )

    active_learning_options = {
        "heuristic": get_heuristic(heuristic),
        "initial_num_labels": 1024,
        "query_size": 100,
        "val_split": 0.0,
    }
    active_dm = ActiveLearningDataModule(base_dm, **active_learning_options)

    assert active_dm.has_test, "No test set?"
    return active_dm



def get_model():
    """Build the ``ImageClassifier`` used for the Flash active-learning run.

    The classifier uses a pretrained ``"vgg16"`` backbone, a three-layer
    fully connected head with dropout that ends in 3 outputs, cross-entropy
    loss, and SGD (momentum 0.9, weight decay 5e-4) with a learning rate of
    0.001.

    :return: the configured ``ImageClassifier``
    """
    # ``nn``, ``torch`` and ``partial`` are not imported by any earlier cell,
    # so they are imported here to keep the function usable on a fresh kernel.
    from functools import partial

    import torch
    from torch import nn

    # NOTE(review): ``ImageClassifier`` is not imported in the visible cells
    # either -- presumably ``flash.image.ImageClassifier``; confirm and import.

    loss_fn = nn.CrossEntropyLoss()
    # Head input width is 512 -- presumably the backbone's feature size; confirm.
    head = nn.Sequential(
        nn.Linear(512, 512),
        nn.ReLU(True),
        nn.Dropout(),
        nn.Linear(512, 512),
        nn.ReLU(True),
        nn.Dropout(),
        nn.Linear(512, 3),
    )
    LR = 0.001
    model = ImageClassifier(
        num_classes=3,
        head=head,
        backbone="vgg16",
        pretrained=True,
        loss_fn=loss_fn,
        optimizer=partial(torch.optim.SGD, momentum=0.9, weight_decay=5e-4),
        learning_rate=LR,
    )
    return model

In [None]:
# ``torch`` is not imported by any earlier cell; import it here so the GPU
# check below works on a fresh kernel.
import torch

# NOTE(review): ``flash``, ``ImageClassifier``, ``ActiveLearningDataModule`` and
# ``ActiveLearningLoop`` are not imported in the visible cells -- presumably
# they come from Lightning Flash and its Baal integration; confirm and add
# the imports to the first cell.

# Use one GPU when CUDA is available, otherwise run on the CPU.
gpus = 1 if torch.cuda.is_available() else 0
active_dm: ActiveLearningDataModule = get_data_module("bald")
model: ImageClassifier = get_model()
# We use Flash trainer without validation set.
# In practice, using a validation set is risky because we overfit often.
trainer = flash.Trainer(
    gpus=gpus,
    max_epochs=2500,
    limit_val_batches=0,
)

# We will train for 20 epochs before doing 20 MC-Dropout iterations to estimate uncertainty.
active_learning_loop = ActiveLearningLoop(label_epoch_frequency=20, inference_iteration=20)
# Replace the trainer's fit loop with the active-learning loop, which is
# first connected to the original fit loop.
active_learning_loop.connect(trainer.fit_loop)
trainer.fit_loop = active_learning_loop
# We do not freeze the backbone, this gives better performance.
trainer.finetune(model, datamodule=active_dm, strategy="no_freeze")