In [1]:
import sys; sys.path.insert(0, '..')
import torch
import torch.nn as nn
import torch.optim as optim
from src.models.conv_net import MnistExampleModel
from src.dataset.dataset import CustomDataset, collate_fn_padd, CustomFileDataset
from src.dataset.utils import get_active_learning_datasets


In [2]:
data_path = "C:/Users/Bastian/Documents/Master Mathematik/MasterArbeit/Deep Bayesian Active Learning for Covid-19 Diagnosis/Deep-Bayesian-Active-Learning-for-Covid-19/data"

In [3]:
model = MnistExampleModel()
dataset = CustomDataset(data_path= data_path +"/NPY/volumes/", 
                                      target_path=data_path + "/NPY/labels/")


In [4]:
train_indices, test_indices, initial_pool = get_active_learning_datasets(dataset)

CP sample len 1531
NCP sample len 1473
Normal sample len 1078
Number of samples per class: 862
size train files: 2586
size test files: 645
size initial pool files: 3


In [5]:
from baal.active import ActiveLearningDataset
from torchvision import transforms

train_transform = transforms.Compose([transforms.Resize((256, 256)),
                                      transforms.RandomCrop(224),
                                      transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

# We use -1 to specify that the data is unlabeled.
train_dataset = CustomFileDataset([dataset.data_paths[idx] for idx in train_indices],
                                  [dataset.targets[idx] for idx in train_indices],
                                    train_transform)

test_transform = transforms.Compose([transforms.Resize((256, 256)),
                                     transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

# We use -1 to specify that the data is unlabeled.
test_dataset = CustomFileDataset([dataset.data_paths[idx] for idx in test_indices],
                                  [dataset.targets[idx] for idx in test_indices],
                                    test_transform)
active_learning_ds = ActiveLearningDataset(train_dataset, pool_specifics={'transform': test_transform})


In [None]:
from baal.active import ActiveLearningDataset

train_indices, test_indices, initial_pool = dataset.get_active_learning_datasets(3)
train = torch.utils.data.Subset(dataset, train_indices)
test = torch.utils.data.Subset(dataset, test_indices)
print("train set length {}".format(len(train)))
print("test set length {}".format(len(test)))
# Here we set `pool_specifics`, where we set the transform attribute for the pool.
active_set = ActiveLearningDataset(train)
active_set.label(initial_pool)
print(f"Num. labeled: {len(active_set)}/{len(train_indices)}")

In [6]:
import torch
from torch import nn, optim
from baal.modelwrapper import ModelWrapper
from baal.bayesian.dropout import MCDropoutModule

USE_CUDA = torch.cuda.is_available()
# This will modify all Dropout layers to be usable at test time which is
# required to perform Active Learning.
model = MCDropoutModule(model)
if USE_CUDA:
  model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)

# ModelWrapper is an object similar to keras.Model.
baal_model = ModelWrapper(model, criterion, replicate_in_memory=False)

In [7]:
from baal.active.heuristics import BALD
heuristic = BALD(shuffle_prop=0.1)

In [8]:
initial_pool

[811, 422, 672]

In [9]:
import numpy as np
# Let's label 100 training examples randomly first.
# Note: the indices here are relative to the pool of unlabelled items!
active_learning_ds.can_label = False
# train_idxs = np.random.permutation(np.arange(len(train_dataset)))[:9].tolist()
active_learning_ds.label(initial_pool)

print(f"Num. labeled: {len(active_learning_ds)}/{len(train_dataset)}")

Num. labeled: 3/2586


In [10]:
# 2. Train the model for a few epoch on the training set.
baal_model.train_on_dataset(active_learning_ds, optimizer, batch_size=1, epoch=1, collate_fn=collate_fn_padd, workers=4, use_cuda=USE_CUDA)
baal_model.test_on_dataset(test_dataset, batch_size=1, collate_fn=collate_fn_padd, workers=4, use_cuda=USE_CUDA)

print("Metrics:", {k:v.avg for k,v in baal_model.metrics.items()})



[6064-MainThread ] [baal.modelwrapper:train_on_dataset:83] 2023-02-06T16:16:56.223684Z [info     ] Starting training              dataset=3 epoch=1
[6064-MainThread ] [baal.modelwrapper:train_on_dataset:94] 2023-02-06T16:17:04.928177Z [info     ] Training complete              train_loss=0.837359607219696
[6064-MainThread ] [baal.modelwrapper:test_on_dataset:123] 2023-02-06T16:17:04.937728Z [info     ] Starting evaluating            dataset=645
[6064-MainThread ] [baal.modelwrapper:test_on_dataset:133] 2023-02-06T16:19:15.199137Z [info     ] Evaluation complete            test_loss=1.098615288734436
Metrics: {'test_loss': 1.098615288734436, 'train_loss': 0.837359607219696}


In [None]:
import random
idx = random.sample(range(len(active_learning_ds.pool)), 100)
active_learning_ds[idx]

In [11]:
import random

# 3. Select the K-top uncertain samples according to the heuristic.
pool = active_learning_ds.pool
if len(pool) == 0:
  raise ValueError("We're done!")
# We make 15 MCDropout iterations to approximate the uncertainty.
predictions = baal_model.predict_on_dataset(pool, batch_size=1, collate_fn=collate_fn_padd, iterations=1, use_cuda=USE_CUDA, verbose=True)
# We will label the 10 most uncertain samples.
top_uncertainty = heuristic(predictions)[:10]



[6064-MainThread ] [baal.modelwrapper:predict_on_dataset_generator:232] 2023-02-06T16:19:15.290361Z [info     ] Start Predict                  dataset=2583
100%|██████████| 2583/2583 [08:01<00:00,  5.37it/s]


In [None]:
print(top_uncertainty)

In [12]:
for idx in top_uncertainty:
    print(pool[idx][1])

tensor(2)
tensor(1)
tensor(1)
tensor(1)
tensor(0)
tensor(1)
tensor(1)
tensor(1)
tensor(0)
tensor(2)


In [13]:
# 4. Label those samples.
oracle_indices = active_learning_ds._pool_to_oracle_index(top_uncertainty)
# labels = [get_label(train_dataset.files[idx]) for idx in oracle_indices]
# print(list(zip(labels, oracle_indices)))
active_learning_ds.label(top_uncertainty)
