<h1>al_bench</h1>
Example use of the al_bench Active Learning Benchmark Tool.

In [None]:
# Install needed packages
!pip install h5py numpy tensorflow
!pip install -e /tf/notebooks/al_bench

<h2>Dataset</h2>
Fetch a dataset of 4598 feature vectors of length 1280 and their 4598 labels.

In [1]:
import al_bench as alb
import h5py as h5
import numpy as np

# Load the example feature vectors and their labels from an HDF5 file.
filename = "../test/TCGA-A2-A0D0-DX1_xmin68482_ymin39071_MPP-0.2500.h5py"
# Open explicitly read-only: the default mode of h5py.File has differed between
# h5py versions, and this cell must never create or modify the data file.
with h5.File(filename, "r") as ds:
    # np.array(...) copies each dataset into memory, so the arrays remain
    # usable after the file is closed at the end of this block.
    my_features = np.array(ds["features"])
    my_labels = np.array(ds["labels"])
print(
    f"Read in {my_features.shape[0]} feature vectors of length {my_features.shape[1]}."
)
print(f"Read in {my_labels.shape[0]} labels for the feature vectors.")
# Every feature vector needs a label; fail here rather than deep inside training.
assert my_features.shape[0] == my_labels.shape[0], (
    f"Got {my_features.shape[0]} feature vectors but {my_labels.shape[0]} labels"
)

# Human-readable description for each label value.
# NOTE(review): presumably the outer list holds one dict per kind of label
# (only one kind is supplied here) -- confirm against the al_bench documentation.
my_label_definitions = [
    {
        0: {"description": "other"},
        1: {"description": "tumor"},
        2: {"description": "stroma"},
        3: {"description": "infiltrate"},
    }
]

# Hand the features, label definitions, and labels to a dataset handler.
my_dataset_handler = alb.dataset.GenericDatasetHandler()
my_dataset_handler.set_all_features(my_features)
my_dataset_handler.set_all_label_definitions(my_label_definitions)
my_dataset_handler.set_all_labels(my_labels)

2022-09-22 10:42:08.223509: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Read in 4598 feature vectors of length 1280.
Read in 4598 labels for the feature vectors.


<h2>Model</h2>
Build a model that we will train.  We choose a TensorFlow model, but we could have chosen a PyTorch model.

In [2]:
import tensorflow as tf

# Size the network from the data: one input per feature, one output per
# label value described in the first (and only) label definition.
number_of_features = my_features.shape[1]
number_of_categories = len(my_label_definitions[0])
hidden_units = 128
dropout = 0.3

# A single hidden layer with dropout, followed by a softmax over the categories.
model_layers = [
    tf.keras.Input(shape=(number_of_features,)),
    tf.keras.layers.Dense(hidden_units, activation="relu"),
    tf.keras.layers.Dropout(dropout, noise_shape=None, seed=20220909),
    tf.keras.layers.Dense(number_of_categories, activation="softmax"),
]
# The model name records the key hyperparameters.
model_name = (
    f"{number_of_categories}_labels_from_{number_of_features}_features_with_"
    f"dropout_{dropout}"
)
my_model = tf.keras.models.Sequential(model_layers, name=model_name)

# Wrap the Keras model in al_bench's TensorFlow model handler.
my_model_handler = alb.model.TensorFlowModelHandler()
my_model_handler.set_model(my_model)

2022-09-22 10:42:09.794108: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1922] Ignoring visible gpu device (device: 1, name: Quadro P400, pci bus id: 0000:a6:00.0, compute capability: 6.1) with core count: 2. The minimum required count is 8. You can adjust this requirement with the env var TF_MIN_GPU_MULTIPROCESSOR_COUNT.
2022-09-22 10:42:09.794488: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-22 10:42:10.371732: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22344 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:73:00.0, compute capability: 8.6


<h2>Active Learning Strategy</h2>
Choose an active learning strategy to test. The cell below selects the least-confidence strategy; the alternatives are left commented out.

In [3]:
# Pick exactly one strategy handler; the others are kept here as alternatives.
# my_strategy_handler = alb.strategy.RandomStrategyHandler()
my_strategy_handler = alb.strategy.LeastConfidenceStrategyHandler()
# my_strategy_handler = alb.strategy.LeastMarginStrategyHandler()
# my_strategy_handler = alb.strategy.EntropyStrategyHandler()

# Connect the strategy to the dataset and the model prepared above.
my_strategy_handler.set_dataset_handler(my_dataset_handler)
my_strategy_handler.set_model_handler(my_model_handler)

# Parameters of the active learning run.
learning_parameters = {
    # We've supplied only one label per feature vector
    "label_of_interest": 0,
    "maximum_iterations": 5,
    "number_to_select_per_iteration": 20,
}
my_strategy_handler.set_learning_parameters(**learning_parameters)

<h2>Run the benchmarking tool</h2>

In [4]:
# Assume that we start with nothing labeled
# (an empty set is passed as the collection of already-labeled examples).
# NOTE(review): presumably the set would otherwise hold example indices --
# confirm against the al_bench documentation.
currently_labeled_examples = set()
# Run the configured strategy; progress messages are printed as it executes.
my_strategy_handler.run(currently_labeled_examples)

Predicting for 4598 examples


2022-09-22 10:42:12.026890: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Training with 20 examples
Predicting for 4598 examples
Training with 40 examples
Predicting for 4598 examples
Training with 60 examples
Predicting for 4598 examples
Training with 80 examples
Predicting for 4598 examples
Training with 100 examples
Predicting for 4598 examples


In [5]:
# Fetch the log recorded by the strategy handler, then show its length and
# its first three entries, one per line.
log = my_strategy_handler.get_log()
print(f"{len(log) = }", f"{log[:3] = }", sep="\n")

len(log) = 2090
log[:3] = [{'utcnow': datetime.datetime(2022, 9, 22, 14, 42, 10, 841424), 'method': 'on_predict_begin', 'logs': {}}, {'utcnow': datetime.datetime(2022, 9, 22, 14, 42, 10, 889419), 'method': 'on_predict_batch_begin', 'batch': 0, 'logs': {}}, {'utcnow': datetime.datetime(2022, 9, 22, 14, 42, 12, 29012), 'method': 'on_predict_batch_end', 'batch': 0, 'logs': {'outputs': array([[0.17637576, 0.30988383, 0.31052232, 0.20321806],
       [0.15839325, 0.29567295, 0.34109855, 0.2048352 ],
       [0.15700321, 0.4227508 , 0.29908744, 0.12115856],
       [0.11130828, 0.4421247 , 0.32737586, 0.1191912 ],
       [0.14481519, 0.37979108, 0.3072532 , 0.16814053],
       [0.1366824 , 0.41189665, 0.3156456 , 0.13577527],
       [0.12464081, 0.34451035, 0.42326963, 0.10757921],
       [0.19432417, 0.38617337, 0.26430535, 0.15519705],
       [0.2317029 , 0.24509075, 0.34228304, 0.18092331],
       [0.22914907, 0.24170472, 0.30612883, 0.22301738],
       [0.23914844, 0.21286182, 0.39124307, 0