In [51]:
import pickle
import os
import sys #sys.exit()
import argparse
from pprint import pprint
import random
from copy import deepcopy
import csv
import datetime

import pandas as pd
import numpy as np
import torch
import torch.backends
from torch import optim
from torch.hub import load_state_dict_from_url
from torch.nn import CrossEntropyLoss
from torchvision import datasets
from torchvision.models import vgg16
from torchvision.transforms import transforms
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from baal.active import get_heuristic, ActiveLearningDataset
from baal.active.active_loop import ActiveLearningLoop
from baal.bayesian.dropout import patch_module
from baal.modelwrapper import ModelWrapper
from baal.utils.metrics import Accuracy
from baal.active.heuristics import BALD

import aug_lib

from baal_extended.ExtendedActiveLearningDataset_2 import ExtendedActiveLearningDataset

In [52]:
parser = argparse.ArgumentParser()
parser.add_argument("--epoch", default=1, type=int)
parser.add_argument("--batch_size", default=32, type=int)
parser.add_argument("--initial_pool", default=10, type=int)   # 1000, we will start training with only 1000(org)+1000(aug)=2000 labeled data samples out of the 50k (org) and
parser.add_argument("--query_size", default=1, type=int)    # request 100(org)+100(aug)=200 new samples to be labeled at every cycle
parser.add_argument("--lr", default=0.001)
parser.add_argument("--heuristic", default="bald", type=str)
parser.add_argument("--iterations", default=2, type=int)     # 20 sampling for MC-Dropout to kick paths with low weights for optimization
parser.add_argument("--shuffle_prop", default=0.05, type=float)
parser.add_argument("--learning_epoch", default=2, type=int) # 20
parser.add_argument("--augment", default=2, type=int)

_StoreAction(option_strings=['--augment'], dest='augment', nargs=None, const=None, default=2, type=<class 'int'>, choices=None, help=None, metavar=None)

In [53]:
def get_datasets(initial_pool, n_augmentations):
    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(3 * [0.5], 3 * [0.5]),
        ]
    )
    aug_transform = transforms.Compose(
        [
            aug_lib.TrivialAugment(),
            transforms.ToTensor(),
            transforms.Normalize(3 * [0.5], 3 * [0.5]),
        ]
    )
    test_transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(3 * [0.5], 3 * [0.5]),
        ]
    )
    # Note: We use the test set here as an example. You should make your own validation set.
    train_ds = datasets.CIFAR10(
        ".", train=True, transform=transform, target_transform=None, download=True
    )
    aug_train_ds = datasets.CIFAR10(
        ".", train=True, transform=aug_transform, target_transform=None, download=True
    )
    test_set = datasets.CIFAR10(
        ".", train=False, transform=test_transform, target_transform=None, download=True
    )

    #active_set = ActiveLearningDataset(train_ds, pool_specifics={"transform": test_transform})
    eald_set = ExtendedActiveLearningDataset(train_ds)
    eald_set.augment_n_times(n_augmentations, augmented_dataset=aug_train_ds)

    # We start labeling randomly.
    eald_set.label_randomly(initial_pool)
    return eald_set, test_set

In [54]:
args, unknown = parser.parse_known_args()
use_cuda = torch.cuda.is_available()
torch.backends.cudnn.benchmark = True
random.seed(1337)
torch.manual_seed(1337)
if not use_cuda:
    print("warning, the experiments would take ages to run on cpu")

now = datetime.datetime.now()
dt_string = now.strftime("%d_%m_%Y_%Hx%M")
csv_filename = "uncertainties/metrics_cifarnet_" + dt_string + "_.csv"
with open(csv_filename, "w+", newline="") as out_file:
    csvwriter = csv.writer(out_file)
    csvwriter.writerow(
    (
        "epoch",
        "test_acc",
        "train_acc",
        "test_loss",
        "train_loss",
        "Next training size",
        "amount original images labelled",
        "amount augmented images labelled"
    )
    )

hyperparams = vars(args)

active_set, test_set = get_datasets(hyperparams["initial_pool"], hyperparams["augment"])

heuristic = get_heuristic(hyperparams["heuristic"], hyperparams["shuffle_prop"])
criterion = CrossEntropyLoss()
model = vgg16(num_classes=10)

# change dropout layer to MCDropout
model = patch_module(model)

if use_cuda:
    model.cuda()
else: 
    print("WARNING! NO CUDA IN USE!")
optimizer = optim.SGD(model.parameters(), lr=hyperparams["lr"], momentum=0.9)

# Wraps the model into a usable API.
model = ModelWrapper(model, criterion, replicate_in_memory=False)
model.add_metric(name='accuracy', initializer=lambda : Accuracy())

logs = {}
logs["epoch"] = 0

# for prediction we use a smaller batchsize
# since it is slower
active_loop = ActiveLearningLoop(
    active_set,
    model.predict_on_dataset,
    heuristic,
    hyperparams.get("query_size", 1),
    batch_size=10,
    iterations=hyperparams["iterations"],
    use_cuda=use_cuda,
)
# We will reset the weights at each active learning step.
init_weights = deepcopy(model.state_dict())

layout = {
    "Loss/Accuracy": {
        "Loss": ["Multiline", ["loss/train", "loss/test"]],
        "Accuracy": ["Multiline", ["accuracy/train", "accuracy/test"]],
    },
}

writer = SummaryWriter("vgg_mcdropout_cifar10_org+aug_3")    # baal-serhiy/experiments/vgg_mcdropout_cifar10_org+aug_3
writer.add_custom_scalars(layout)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
orig len50000
augmented n times0


In [55]:
# save uncertainties in pickle file
def generate_pickle_file(epoch, oracle_indices, uncertainty):    
    pickle_filename = dt_string + (
        f"_uncertainty_epoch={epoch}" f"_labelled={len(active_set)}.pkl"
    )
    dir_path = os.path.join(os.getcwd(), "uncertainties")
    isExist = os.path.exists("uncertainties")
    if not isExist:
        os.makedirs(dir_path)
    pickle_file_path = os.path.join(dir_path, pickle_filename)
    print("Saving file " + pickle_file_path)
    pickle.dump(
        {
            "oracle_indices": oracle_indices,
            "uncertainty": uncertainty,
            "labelled_map": active_set.labelled_map,
        },
        open(pickle_file_path, "wb")
    )
    return dir_path, pickle_file_path


In [56]:
def generate_excel_file(pickle_file_path, epoch, pickle_dir_path, uncertainties_std): 
    excel_filename = dt_string + (
        f"_uncertainty_epoch={epoch}" f"_labelled={len(active_set)}.xlsx"
    )
    excel_path = os.path.join(pickle_dir_path, excel_filename)
    uncertainties_std.to_excel(excel_path)


In [57]:
for epoch in tqdm(range(args.epoch)):
    # if we are in the last round we want to train for longer epochs to get a more comparable result
    # if epoch == args.epoch:
    #     hyperparams["learning_epoch"] = 75
    # Load the initial weights.
    model.load_state_dict(init_weights)
    model.train_on_dataset(
        active_set,
        optimizer,
        hyperparams["batch_size"],
        hyperparams["learning_epoch"],
        use_cuda,
    )

    # Validation!
    model.test_on_dataset(test_set, hyperparams["batch_size"], use_cuda)
    metrics = model.metrics

    # get origin amount of labelled augmented/unaugmented images
    if(epoch == 0):
        with open(csv_filename, "a+", newline="") as out_file:
            csvwriter = csv.writer(out_file)
            csvwriter.writerow(
                (
                -1,
                0,
                0,
                0,
                0,
                active_set.n_labelled,
                active_set.n_unaugmented_images_labelled,
                active_set.n_augmented_images_labelled
                )
            )

    # replacement for step
    #pool = active_set._dataset  # len(active_set._dataset) 100000
    pool = active_set.pool
    if len(pool) > 0:
        probs = model.predict_on_dataset(
            active_set._dataset,
            batch_size=hyperparams["batch_size"],
            iterations=hyperparams["iterations"],
            use_cuda=use_cuda,
        )

        #if probs is not None and (isinstance(probs, types.GeneratorType) or len(probs) > 0):
        # -> "isinstance(...) needed when using predict_..._Generator"
        if probs is not None and len(probs) > 0:
            # 1. Get uncertainty
            uncertainty = active_loop.heuristic.get_uncertainties(probs)
            oracle_indices = np.argsort(uncertainty)
            active_set.labelled_map

            pickle_dir_path, pickle_file_path = generate_pickle_file(epoch, oracle_indices, uncertainty)

            mypickle = pd.read_pickle(pickle_file_path)

            uncertainty = mypickle['uncertainty']
            oracle_indices = mypickle['oracle_indices']
            labelled_map = mypickle['labelled_map']

            uncertainty_length = len(uncertainty)

            original = uncertainty[0:50000 - 1]
            aug1 = uncertainty[50000:100000 - 1]
            aug2 = uncertainty[100000:150000 - 1]

            if hyperparams["augment"] == 1:
                matrix = np.vstack([original, aug1])
            if hyperparams["augment"] == 2:   
                matrix = np.vstack([original, aug1, aug2])

            # 2. Calc standard deviation
            df_lab_img = pd.DataFrame(matrix)
            df_lab_img.std() # here
            df_lab_img = pd.DataFrame(np.vstack([matrix, df_lab_img.std()]))

            uncertainties_std = df_lab_img.transpose()
            if hyperparams["augment"] == 1:
                uncertainties_std.columns = ['original', 'aug1', 'std']
            if hyperparams["augment"] == 2:   
                uncertainties_std.columns = ['original', 'aug1', 'aug2', 'std']

            generate_excel_file(pickle_file_path, epoch, pickle_dir_path, uncertainties_std)
            
            # 3. Map std uncertainties to uncertainty array
            std_array = df_lab_img.std()
            for i in range(len(uncertainty)): # 150000
                uncertainty[i] = std_array[i % (50000-1)]
            oracle_indices = np.argsort(uncertainty)
            active_set.labelled_map
            # to_label -> indices sortiert von größter zu niedrigster uncertainty
            # uncertainty -> alle std uncertainties des pools
            to_label = heuristic.reorder_indices(uncertainty)
            to_label = oracle_indices[np.array(to_label)] # len(to_label) = 150000
            if len(to_label) > 0:
                active_set.label(to_label[: hyperparams.get("query_size", 1)])
            else: break
        else:
            break
    else: 
        break

  0%|          | 0/1 [00:00<?, ?it/s]

[15100-MainThread] [baal.modelwrapper:train_on_dataset:83] 2022-12-17T16:14:42.922254Z [info     ] Starting training              dataset=1500 epoch=2
[15100-MainThread] [baal.modelwrapper:train_on_dataset:94] 2022-12-17T16:15:18.868920Z [info     ] Training complete              train_loss=2.297689437866211
[15100-MainThread] [baal.modelwrapper:test_on_dataset:123] 2022-12-17T16:15:18.876916Z [info     ] Starting evaluating            dataset=10000
[15100-MainThread] [baal.modelwrapper:test_on_dataset:133] 2022-12-17T16:15:25.539357Z [info     ] Evaluation complete            test_loss=2.3021671772003174
[15100-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] 2022-12-17T16:15:25.683390Z [info     ] Start Predict                  dataset=150000

  0%|          | 0/4688 [00:00<?, ?it/s]
  0%|          | 1/4688 [00:08<11:07:33,  8.55s/it]
  0%|          | 9/4688 [00:08<54:38,  1.43it/s]   
  0%|          | 17/4688 [00:08<24:09,  3.22it/s]
  1%|          | 26/4688 [00:08<1

100%|██████████| 1/1 [01:58<00:00, 118.12s/it]


In [58]:
len(aug1)

49999

In [59]:
len(uncertainty)

150000

In [60]:
len(to_label)

150000

In [61]:
to_label # array[99999, 34555, ...]

array([ 2138, 68050, 80314, ..., 14453, 18398, 53986], dtype=int64)