# Multi-annotator Pool-based Deep Active Learning

This notebook gives an introduction for dealing with multiple annotators using `skactiveml`.

In [1]:
import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
import warnings

from copy import deepcopy

from torch.utils.data import RandomSampler

from skactiveml.classifier import SkorchClassifier
from skactiveml.classifier.multiannotator import CrowdLayerClassifier
from skactiveml.pool import RandomSampling
from skactiveml.pool.multiannotator import SingleAnnotatorWrapper
from skactiveml.utils import majority_vote
from skorch.callbacks import LRScheduler

import torch
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm

mlp.rcParams["figure.facecolor"] = "white"

MISSING_LABEL = -1
RANDOM_STATE = 0
FONTSIZE = 12

np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

warnings.filterwarnings("ignore")

# Loading Label-me Data Set

In [2]:
def load_data_set_label_me(data_dir):
    ds = {}
    
    X_train = np.load(f'{data_dir}/label-me-X.npy')
    y_train = np.load(f'{data_dir}/label-me-y.npy')
    y_train_true = np.load(f'{data_dir}/label-me-y-true.npy')
    X_valid = np.load(f'{data_dir}/label-me-X-valid.npy')
    y_valid_true = np.load(f'{data_dir}/label-me-y-true-valid.npy')
    X_test = np.load(f'{data_dir}/label-me-X-test.npy')
    y_test_true = np.load(f'{data_dir}/label-me-y-true-test.npy')
    
    ds['X_train'] = X_train
    ds['y_train'] = y_train
    ds['y_train_true'] = y_train_true
    ds['X_valid'] = X_valid
    ds['y_valid_true'] = y_valid_true
    ds['X_test'] = X_test
    ds['y_test_true'] = y_test_true

    return ds

In [3]:
data_dir = f'./data/label-me'
ds = load_data_set_label_me(data_dir)

In [11]:
classes = np.unique(ds['y_train'])
n_classes = len(classes)
X_train = ds['X_train'].astype(np.float32)
y_train = ds['y_train'].astype(np.float32)
n_features = X_train.shape[1]
n_annotators = y_train.shape[1]
n_samples = X_train.shape[0]

# Define the base Module

In [5]:
class ClassifierModule(nn.Module):
    def __init__(self, n_classes, n_features, dropout):
        super(ClassifierModule, self).__init__()
        n_hidden_neurons_1 = 256
        n_hidden_neurons_2 = 128
        self.embed_X_block = nn.Sequential(
            nn.Linear(in_features=n_features, out_features=n_hidden_neurons_1),
            nn.BatchNorm1d(num_features=n_hidden_neurons_1),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(in_features=n_hidden_neurons_1, out_features=n_hidden_neurons_2),
            nn.BatchNorm1d(num_features=n_hidden_neurons_2),
            nn.ReLU(),
            nn.Dropout(p=dropout),
        )
        self.mlp = nn.Linear(in_features=n_hidden_neurons_2, out_features=n_classes)

    def forward(self, x):
        embed_x = self.embed_X_block(x)
        logit_class = self.mlp(embed_x)

        return logit_class

In [6]:
hyper_parameter = {
    'max_epochs': 50,
    'batch_size': 64,
    'lr': 0.01,
    'optimizer__weight_decay': 0.0001,
}
lr_scheduler = LRScheduler(policy='CosineAnnealingLR', T_max=hyper_parameter['max_epochs'])

Majority Vote

In [7]:
net_mv = SkorchClassifier(
                ClassifierModule,
                module__n_classes=n_classes,
                module__n_features=n_features,
                module__dropout=0.5,
                classes=classes,
                missing_label=MISSING_LABEL,
                cost_matrix=None,
                random_state=1,
                criterion=nn.CrossEntropyLoss(),
                train_split=None,
                verbose=False,
                optimizer=torch.optim.RAdam,
                device=device,
                callbacks=[lr_scheduler],
                **hyper_parameter
            )

# Multi Annotator with Active Learning

In [17]:
al_n_cycles = 25
al_batch_size = 2 * n_classes
al_bs_annotator = 10

In [18]:
sa_qs = RandomSampling(random_state=RANDOM_STATE, missing_label=MISSING_LABEL)
ma_qs = SingleAnnotatorWrapper(sa_qs, random_state=RANDOM_STATE)

In [19]:
y = np.full(shape=(n_samples, n_annotators), fill_value=MISSING_LABEL)

query_idx = ma_qs.query(X_train, y, batch_size=16, n_annotators_per_sample=n_annotators)

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.