<h1>al_bench</h1>
Example use of the al_bench Active Learning Benchmark Tool.

In [None]:
# Install needed packages
!pip install h5py numpy tensorflow
!pip install -e /tf/notebooks/al_bench

<h2>Dataset</h2>
Fetch a dataset of 4598 feature vectors of length 1280 and their 4598 labels.

In [1]:
import al_bench as alb
import h5py as h5
import numpy as np

# Read the example features and labels from the HDF5 test file.  The file is opened
# explicitly in read-only mode because h5py releases before 3.0 did not default to
# read-only and could open (or create) the file for writing.
filename = "../test/TCGA-A2-A0D0-DX1_xmin68482_ymin39071_MPP-0.2500.h5py"
with h5.File(filename, "r") as ds:
    # Copy each dataset into a NumPy array so it stays usable after the file is closed
    my_features = np.array(ds["features"])
    print(
        f"Read in {my_features.shape[0]} feature vectors of length {my_features.shape[1]}."
    )
    my_labels = np.array(ds["labels"])
    print(f"Read in {my_labels.shape[0]} labels for the feature vectors.")

# A list holding a single dictionary that maps each label value to its description.
# NOTE(review): presumably there is one dictionary per label supplied for each feature
# vector -- confirm against the al_bench documentation.
my_label_definitions = [
    {
        0: {"description": "other"},
        1: {"description": "tumor"},
        2: {"description": "stroma"},
        3: {"description": "infiltrate"},
    }
]

# Hand the features, label definitions, and labels to a generic dataset handler
my_dataset_handler = alb.dataset.GenericDatasetHandler()
my_dataset_handler.set_all_features(my_features)
my_dataset_handler.set_all_label_definitions(my_label_definitions)
my_dataset_handler.set_all_labels(my_labels)

2022-09-26 08:32:15.386396: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Read in 4598 feature vectors of length 1280.
Read in 4598 labels for the feature vectors.


<h2>Model</h2>
Build a model that we will train.  We will build both a TensorFlow model and a PyTorch model, though normally one model is sufficient.  We'll choose one of them for use with the active learning strategy.

In [2]:
import tensorflow as tf

# Hyperparameters shared by the TensorFlow and PyTorch example models
dropout = 0.3
hidden_units = 128

# Input and output sizes are taken from the dataset that was read in
number_of_features = my_features.shape[1]
number_of_categories = len(my_label_definitions[0])

In [3]:
# Start from an empty, named Sequential model and append the layers in order:
# input -> dense (ReLU) -> dropout -> dense (softmax over the categories).
my_tensorflow_model = tf.keras.models.Sequential(
    name=(
        f"{number_of_categories}_labels_from_{number_of_features}_features_with_"
        f"dropout_{dropout}"
    )
)
my_tensorflow_model.add(tf.keras.Input(shape=(number_of_features,)))
my_tensorflow_model.add(tf.keras.layers.Dense(hidden_units, activation="relu"))
my_tensorflow_model.add(
    tf.keras.layers.Dropout(dropout, noise_shape=None, seed=20220909)
)
my_tensorflow_model.add(
    tf.keras.layers.Dense(number_of_categories, activation="softmax")
)

# Wrap the model in the al_bench handler for TensorFlow models
my_tensorflow_model_handler = alb.model.TensorFlowModelHandler()
my_tensorflow_model_handler.set_model(my_tensorflow_model)
print("Tensorflow model handler built")

Tensorflow model handler built


2022-09-26 08:32:17.022095: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1922] Ignoring visible gpu device (device: 1, name: Quadro P400, pci bus id: 0000:a6:00.0, compute capability: 6.1) with core count: 2. The minimum required count is 8. You can adjust this requirement with the env var TF_MIN_GPU_MULTIPROCESSOR_COUNT.
2022-09-26 08:32:17.022632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-26 08:32:17.604462: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22344 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:73:00.0, compute capability: 8.6


In [4]:
import torch


class MyTorchModel(torch.nn.Module):
    """Two-layer fully connected classifier that outputs per-category probabilities.

    The network is Linear -> ReLU -> Dropout -> Linear -> Softmax.
    """

    def __init__(
        self,
        number_of_features,
        number_of_categories,
        hidden_size=None,
        dropout_rate=None,
    ):
        """Create the layers of the network.

        Parameters
        ----------
        number_of_features : int
            Length of each input feature vector.
        number_of_categories : int
            Number of output categories.
        hidden_size : int, optional
            Width of the hidden layer.  When omitted, the notebook-level
            ``hidden_units`` value is used.
        dropout_rate : float, optional
            Dropout probability.  When omitted, the notebook-level ``dropout`` value
            is used.
        """
        super().__init__()
        # Fall back to the notebook-level settings only when no explicit value is
        # given, so that the class can also be used without those globals.
        if hidden_size is None:
            hidden_size = hidden_units
        if dropout_rate is None:
            dropout_rate = dropout
        self.fc1 = torch.nn.Linear(number_of_features, hidden_size)
        self.relu1 = torch.nn.ReLU()
        self.dropout1 = torch.nn.Dropout(p=dropout_rate)
        self.fc2 = torch.nn.Linear(hidden_size, number_of_categories)
        # Normalize over the last dimension, which holds the per-category scores
        self.softmax1 = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """Apply the layers in sequence to ``x`` and return the softmax output."""
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.softmax1(x)
        return x


# Seed PyTorch's global random number generator before the model is built so that the
# randomly initialized weights are the same on every Restart & Run All.  The value is
# the same one that is given to the TensorFlow Dropout layer.
torch.manual_seed(20220909)
my_torch_model = MyTorchModel(number_of_features, number_of_categories)

# Wrap the model in the al_bench handler for PyTorch models
my_pytorch_model_handler = alb.model.PyTorchModelHandler()
my_pytorch_model_handler.set_model(my_torch_model)
print("PyTorch model handler built")

PyTorch model handler built


In [5]:
# Use the PyTorch model handler with the active learning strategy; assign
# my_tensorflow_model_handler here instead to use the TensorFlow model.
my_model_handler = my_pytorch_model_handler

<h2>Active Learning Strategy</h2>
Choose an active learning strategy to test.

In [6]:
# Exactly one strategy handler is active; the commented-out lines are the alternative
# strategies that can be swapped in.
# my_strategy_handler = alb.strategy.RandomStrategyHandler()
my_strategy_handler = alb.strategy.LeastConfidenceStrategyHandler()
# my_strategy_handler = alb.strategy.LeastMarginStrategyHandler()
# my_strategy_handler = alb.strategy.EntropyStrategyHandler()

# Give the strategy the dataset handler and the model handler chosen above, then set
# the parameters for the run.
my_strategy_handler.set_dataset_handler(my_dataset_handler)
my_strategy_handler.set_model_handler(my_model_handler)
my_strategy_handler.set_learning_parameters(
    label_of_interest=0,  # We've supplied only one label per feature vector
    maximum_iterations=5,
    number_to_select_per_iteration=20,
)

<h2>Run the benchmarking tool</h2>

In [7]:
# Assume that we start with nothing labeled
currently_labeled_examples = set()
my_strategy_handler.run(currently_labeled_examples)

Predicting for 4598 examples
Training with 20 examples
Predicting for 4598 examples
Training with 40 examples
Predicting for 4598 examples
Training with 60 examples
Predicting for 4598 examples
Training with 80 examples
Predicting for 4598 examples
Training with 100 examples
Predicting for 4598 examples


In [8]:
# Fetch the log from the strategy handler, then show how many entries it has and
# print the first three entries as a sample.
log = my_strategy_handler.get_log()
print(f"{len(log) = }")
print(f"{log[:3] = }")

len(log) = 1852
log[:3] = [{'utcnow': datetime.datetime(2022, 9, 26, 12, 32, 18, 646768), 'model_step': <ModelStep.ON_PREDICT_BEGIN: 300>, 'logs': None}, {'utcnow': datetime.datetime(2022, 9, 26, 12, 32, 18, 663703), 'model_step': <ModelStep.ON_PREDICT_END: 305>, 'logs': {'outputs': array([[0.29915375, 0.24438834, 0.23698705, 0.21947087],
       [0.2740577 , 0.23741184, 0.24258009, 0.2459503 ],
       [0.2927576 , 0.23967767, 0.24158762, 0.22597717],
       ...,
       [0.30666745, 0.22335894, 0.248528  , 0.22144566],
       [0.29498193, 0.2238868 , 0.24112754, 0.24000369],
       [0.2854142 , 0.25165218, 0.24204808, 0.2208856 ]], dtype=float32)}}, {'utcnow': datetime.datetime(2022, 9, 26, 12, 32, 18, 665561), 'model_step': <ModelStep.ON_TRAIN_BEGIN: 100>, 'logs': None}]
