<h2>Install histomics_detect and histomics_stream, then import packages</h2>

In [None]:
# install histomics_detect
!pip install -e /tf/notebooks/histomics_detect

# install histomics_stream
!pip install -e /tf/notebooks/histomics_stream

# add to system path
import sys

sys.path.append("/tf/notebooks/histomics_detect/")
sys.path.append("/tf/notebooks/histomics_stream/")

In [None]:
import os
import re
import numpy as np
import tensorflow as tf

# import dataset related packages
from histomics_detect.io import dataset
from histomics_detect.augmentation import crop, flip, jitter, shrink
from histomics_detect.visualization import plot_inference

# import whole-slide image handling pipeline
import histomics_stream as hs

# Used by both model.fit calls below as `epochs` and as `validation_freq`.
number_epochs = 50  # Set to a number smaller than 50 for speed during debug

<h2>Define dataset parameters and create datasets - DCC example</h2>

In [None]:
# location of the DCC input data
path = "/tf/notebooks/DCC/data/"

# training parameters
# input image size
train_tile = 224
# portion of object area that must be in random crop to be included
min_area_thresh = 0.5

# tensor versions of the parameters above
min_area = tf.constant(min_area_thresh, dtype=tf.float32)
width = tf.constant(train_tile, dtype=tf.int32)
height = tf.constant(train_tile, dtype=tf.int32)

# split dataset into training and validation
cases = [
    "131458",
    "91315_leica_at2_40x",
    "135062",
    "93094",
    "131453",
    "131450",
    "135060",
    "131463",
    "131459",
    "131440",
    "131460",
    "93096",
    "131449",
    "131457",
    "131461",
    "93098",
    "131447",
    "93092",
    "131443",
    "93095",
    "131448",
    "93099",
    "91316_leica_at2_40x",
    "131462",
    "93091",
    "135065",
    "131446",
    "131441",
    "101626",
    "93093",
    "131454",
    "93097",
    "131445",
    "131444",
    "131456",
    "93090",
]


def split_cases(case_names, train_fraction=0.9, rng=np.random):
    """Randomly partition case names into training and validation lists.

    Args:
        case_names: sequence of case identifiers.
        train_fraction: fraction of the cases assigned to training; the
            resulting count is rounded up with ``np.ceil``.
        rng: object exposing ``permutation(n)``. Defaults to the ``np.random``
            module; pass a seeded ``np.random.RandomState`` or
            ``np.random.Generator`` for a reproducible split.

    Returns:
        Tuple ``(training, validation)``. ``training`` is in shuffled order;
        ``validation`` holds the remaining names in the order they appear in
        ``case_names``.
    """
    n_train = int(np.ceil(train_fraction * len(case_names)))
    # Permute ALL indices. The earlier code drew len(cases) - 1 random numbers,
    # so the last case could never be selected for training.
    shuffled = rng.permutation(len(case_names))
    train_names = [case_names[i] for i in shuffled[:n_train]]
    selected = set(train_names)
    validation_names = [name for name in case_names if name not in selected]
    return train_names, validation_names


training, validation = split_cases(cases)

# define parser for filenames
def parser(file):
    """Split a dot-delimited filename into its case and roi identifiers.

    The extension is dropped and the remaining stem is split on ".".

    Args:
        file: filename whose stem has at least three dot-separated fields.

    Returns:
        Tuple ``(case, roi)`` where ``case`` is the third field and ``roi`` is
        the second field followed by the last three fields, joined with ".".

    Raises:
        IndexError: if the stem has fewer than three fields.
    """
    stem, _extension = os.path.splitext(file)
    fields = stem.split(".")
    case = fields[2]
    roi = ".".join([fields[1], *fields[-3:]])
    return case, roi


# generate training, validation datasets
ds_train_roi = dataset(path, parser, parser, train_tile, training)
ds_validation_roi = dataset(path, parser, parser, 0, validation)

# build training dataset: crop -> flip -> jitter -> shrink, then prefetch.
# Each element is a 3-tuple; the third item is passed through every step untouched.
ds_train_roi = (
    ds_train_roi
    .map(lambda rgb, boxes, other: (*crop(rgb, boxes, width, height, min_area_thresh), other))
    .map(lambda rgb, boxes, other: (*flip(rgb, boxes), other))
    .map(lambda rgb, boxes, other: (rgb, jitter(boxes, 0.05), other))
    .map(lambda rgb, boxes, other: (rgb, shrink(boxes, 0.05), other))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# build validation datasets (no augmentation, prefetch only)
ds_validation_roi = ds_validation_roi.prefetch(tf.data.experimental.AUTOTUNE)

<h2>Create and train detection model - DCC example</h2>

In [None]:
# import network generation and training packages
from histomics_detect.networks.rpns import rpn
from histomics_detect.models.faster_rcnn import FasterRCNN

# choices for anchor sizes - all anchors 1:1 aspect ratio
# (width/height of square anchors in pixels at input mag.)
anchor_px = tf.constant([32, 64, 96], dtype=tf.int32)

# feature network parameters
# strides in feature generation network convolution
backbone_stride = 1
# number of residual blocks to use in backbone
backbone_blocks = 14
# number of features generated by rpn convolution
backbone_dimension = 256

# rpn network parameters
# kernel size for rpn convolution
rpn_kernel = [3]
# activation for rpn convolutional layers
rpn_act_conv = ["relu"]

# anchor filtering parameters
# maximum number of negative/positive anchors to keep in each roi
neg_max = pos_max = 128
# weighting for rpn regression loss
rpn_lmbda = 10.0
# roialign - number of horizontal/vertical tiles in a proposal
roialign_tiles = 3.0
# roialign - number of horizontal/vertical samples in each tile
# (variable name spelled "roialing" as in the original; kept unchanged)
roialing_pool = 2.0

# create backbone and rpn networks
# ResNet50 with ImageNet weights and without its top (include_top=False)
resnet50 = tf.keras.applications.ResNet50(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(train_tile, train_tile, 3),
    pooling=None,
)
rpn_settings = dict(
    n_anchors=tf.size(anchor_px),
    stride=backbone_stride,
    blocks=backbone_blocks,
    kernels=rpn_kernel,
    dimensions=[backbone_dimension],
    activations=rpn_act_conv,
)
rpnetwork, backbone = rpn(resnet50, **rpn_settings)

# create FasterRCNN keras model
model = FasterRCNN(rpnetwork, backbone, [width, height], anchor_px, rpn_lmbda)

# compile FasterRCNN model with losses
# NOTE(review): presumably the first loss scores objectness and the second box
# regression - confirm against FasterRCNN.
adam = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss_functions = [
    tf.keras.losses.BinaryCrossentropy(from_logits=True),
    tf.keras.losses.Huber(),
]
model.compile(optimizer=adam, loss=loss_functions)

# fit FasterRCNN model
# validation_freq equals the epoch count, so validation runs once, after the last epoch
fit_settings = dict(
    x=ds_train_roi,
    batch_size=1,
    epochs=number_epochs,
    verbose=1,
    validation_data=ds_validation_roi,
    validation_freq=number_epochs,
)
model.fit(**fit_settings)

<h2>Define dataset parameters and create datasets - DLBCL example</h2>

In [None]:
# import dataset related packages
from histomics_detect.io import dataset, resize
from histomics_detect.augmentation import crop, flip, jitter, shrink
from histomics_detect.visualization import plot_inference
import numpy as np
import os

# location of the DLBCL input data
path = "/tf/notebooks/DLBCL/detection/"

# training parameters
# input image size
train_tile = 224
# portion of object area that must be in crop to be included
min_area_thresh = 0.5

# tensor versions of the parameters above
min_area = tf.constant(min_area_thresh, dtype=tf.float32)
width = tf.constant(train_tile, dtype=tf.int32)
height = tf.constant(train_tile, dtype=tf.int32)

# define filename parsers
def png_parser(png):
    """Split an image filename into its case and roi identifiers.

    The extension is dropped; everything before the first "." of the stem is
    the case and everything after it is the roi.

    Args:
        png: image filename.

    Returns:
        Tuple ``(case, roi)``; ``roi`` is an empty string when the stem
        contains no ".".
    """
    stem = os.path.splitext(png)[0]
    case, _dot, roi = stem.partition(".")
    return case, roi


def csv_parser(csv):
    """Split an annotation filename into its case and roi identifiers.

    The extension is dropped and the stem is split on ".".

    Args:
        csv: annotation filename.

    Returns:
        Tuple ``(case, roi)`` where ``case`` is the first field and ``roi`` is
        the second field (if present) followed by the last three fields,
        joined with ".".
    """
    fields = os.path.splitext(csv)[0].split(".")
    roi_fields = fields[1:2] + fields[-3:]
    return fields[0], ".".join(roi_fields)


# Cases are named DCBT_<number>_CMYC; list the numbers and expand them into names.
# NOTE(review): case number 7 appears in neither list - confirm this is intended.
training = [f"DCBT_{number}_CMYC" for number in (2, 3, 5, 9, 10, 12, 14, 18, 19, 20, 21, 22)]
validation = [f"DCBT_{number}_CMYC" for number in (1, 4, 6, 8, 11, 13, 15, 16, 17)]


# generate training, validation datasets
ds_train_roi = dataset(path, png_parser, csv_parser, train_tile, training)
ds_validation_roi = dataset(path, png_parser, csv_parser, 0, validation)

# build training dataset: resize -> crop -> flip -> jitter -> shrink, then prefetch.
# Each element is a 3-tuple; the third item is passed through every step untouched.
ds_train_roi = (
    ds_train_roi
    .map(lambda rgb, boxes, other: (*resize(rgb, boxes, 2.0), other))
    .map(lambda rgb, boxes, other: (*crop(rgb, boxes, width, height, min_area_thresh), other))
    .map(lambda rgb, boxes, other: (*flip(rgb, boxes), other))
    .map(lambda rgb, boxes, other: (rgb, jitter(boxes, 0.05), other))
    .map(lambda rgb, boxes, other: (rgb, shrink(boxes, 0.05), other))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# build validation datasets: same resize factor as training, no augmentation
ds_validation_roi = (
    ds_validation_roi
    .map(lambda rgb, boxes, other: (*resize(rgb, boxes, 2.0), other))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

<h2>Create and train detection model - DLBCL example</h2>

In [None]:
# import network generation and training packages
from histomics_detect.networks.rpns import rpn
from histomics_detect.models.faster_rcnn import FasterRCNN

# choices for anchor sizes - all anchors 1:1 aspect ratio
# (width/height of square anchors in pixels at input mag.)
anchor_px = tf.constant([32, 48, 64], dtype=tf.int32)

# feature network parameters
# strides in feature generation network convolution
backbone_stride = 1
# number of residual blocks to use in backbone
backbone_blocks = 14
# number of features generated by rpn convolution
backbone_dimension = 256

# rpn network parameters
# kernel size for rpn convolution
rpn_kernel = [3]
# activation for rpn convolutional layers
rpn_act_conv = ["relu"]

# anchor filtering parameters
# maximum number of negative/positive anchors to keep in each roi
neg_max = pos_max = 128
# weighting for rpn regression loss
rpn_lmbda = 10.0
# roialign - number of horizontal/vertical tiles in a proposal
roialign_tiles = 3.0
# roialign - number of horizontal/vertical samples in each tile
# (variable name spelled "roialing" as in the original; kept unchanged)
roialing_pool = 2.0

# create backbone and rpn networks
# ResNet50 with ImageNet weights and without its top (include_top=False)
resnet50 = tf.keras.applications.ResNet50(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(train_tile, train_tile, 3),
    pooling=None,
)
rpn_settings = dict(
    n_anchors=tf.size(anchor_px),
    stride=backbone_stride,
    blocks=backbone_blocks,
    kernels=rpn_kernel,
    dimensions=[backbone_dimension],
    activations=rpn_act_conv,
)
rpnetwork, backbone = rpn(resnet50, **rpn_settings)

# create FasterRCNN keras model
model = FasterRCNN(rpnetwork, backbone, [width, height], anchor_px, rpn_lmbda)

# compile FasterRCNN model with losses
# NOTE(review): presumably the first loss scores objectness and the second box
# regression - confirm against FasterRCNN.
adam = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss_functions = [
    tf.keras.losses.BinaryCrossentropy(from_logits=True),
    tf.keras.losses.Huber(),
]
model.compile(optimizer=adam, loss=loss_functions)

# fit FasterRCNN model
# validation_freq equals the epoch count, so validation runs once, after the last epoch
fit_settings = dict(
    x=ds_train_roi,
    batch_size=1,
    epochs=number_epochs,
    verbose=1,
    validation_data=ds_validation_roi,
    validation_freq=number_epochs,
)
model.fit(**fit_settings)

<h2>Inference on a single image - model.call() </h2>

In [None]:
# pick one validation element at random (shuffle buffer of 100 elements)
one_example = ds_validation_roi.shuffle(100).take(1)
data = one_example.get_single_element()

# the first item of the element is the image; the model is fed uint8 pixels
rgb = tf.cast(data[0], dtype=tf.uint8)

# generate and visualize thresholded, roialign outputs
regressions = model(rgb, tau=0.5, nms_iou=0.3)
plot_inference(rgb, regressions)

<h2>Raw inference on a single image - model.raw() </h2>

In [None]:
# Step-by-step version of the inference above, run on the `rgb` image defined
# in the previous cell. Thresholds are read from the model (model.tau,
# model.nms_iou) rather than passed explicitly.

# generate raw rpn outputs
objectness, boxes, features = model.raw(rgb)

# threshold rpn proposals
boxes_positive, objectness_positive, positive = model.threshold(boxes, objectness, model.tau)

# perform non-max suppression on rpn positive predictions
boxes_nms, objectness_nms, selected = model.nms(boxes_positive, objectness_positive, model.nms_iou)

# generate roialign predictions for rpn positive predictions
align_boxes = model.align(boxes_nms, features, model.field, model.pool, model.tiles)

# visualize the boxes produced by the threshold -> nms -> roialign steps above
plot_inference(rgb, align_boxes)

<h2>Batch inference using tf.data.Dataset.map </h2>

In [None]:
# mapping model using data.Dataset.map keeps outputs from different images separate;
# the mapped dataset is materialized into a list with one entry per image
map_output = list(ds_validation_roi.take(5).map(lambda x, y, z: (model(x), y, z)))

# compare to using model.predict which merges the outputs from all images
predict_output = model.predict(ds_validation_roi.take(5))

<h2>Batch evaluation - model.evaluate() </h2>

In [None]:
# performance evaluation on multiple images from a tf.data.Dataset
# (runs over the whole validation dataset; the result is stored in `metrics`, not displayed)
metrics = model.evaluate(ds_validation_roi)

<h2>Build Dataset from dictionary of instructions</h2>

In [None]:
import copy

# Describe one slide, including slide-wide information with it.
my_slide0 = {
    "filename": "/tf/notebooks/histomics_detect/example/DCBT_10_CMYC.svs",
    "slide_name": "DCBT_10_CMYC",
    "slide_group": "DCBT_10",
    "number_pixel_rows_for_chunk": 2048,
    "number_pixel_columns_for_chunk": 2048,
}

# Collect the slides of the study, keyed by an arbitrary slide identifier.
my_slides = {"Slide_0": my_slide0}

# Create the study from the study-wide information plus its slides.
my_study0 = {
    "version": "version-1",
    "number_pixel_rows_for_tile": 256,
    "number_pixel_columns_for_tile": 256,
    "slides": my_slides,
}

# For each slide, find the appropriate resolution given the desired_magnification and magnification_tolerance.  In this
# example, we use the same parameters for each slide, but this is not required generally.
find_resolution_for_slide = hs.configure.FindResolutionForSlide(
    my_study0,
    desired_magnification=20,
    magnification_tolerance=0.02,
)
for slide in my_study0["slides"].values():
    find_resolution_for_slide(slide)
print("================================================================")
print(f"my_study0 = {my_study0}")

# We are going to demonstrate several approaches to choosing tiles.  Each approach will start with its own copy of the
# my_study0 that we have built so far.
#
# Every approach below sits behind an `if True:` / `if False:` toggle that can be edited by hand.
# Dependencies between the toggles:
#   - TilesByList reads the tiles produced by the first TilesByGridAndMask demonstration
#     (my_study_tiles_by_grid), so that first toggle must stay enabled for TilesByList to run.
#   - The cell that follows uses my_study_tiles_by_grid as well.

if True:
    # Demonstrate TilesByGridAndMask without a mask
    my_study_tiles_by_grid = copy.deepcopy(my_study0)
    tiles_by_grid = hs.configure.TilesByGridAndMask(
        my_study_tiles_by_grid,
        number_pixel_overlap_rows_for_tile=32,
        number_pixel_overlap_columns_for_tile=32,
        randomly_select=1000,
    )
    # We could apply this to a subset of the slides, but we will apply it to all slides in this example.
    for slide in my_study_tiles_by_grid["slides"].values():
        tiles_by_grid(slide)
    # print("================================================================")
    print("Finished with TilesByGrid")
    # print(f"my_study_tiles_by_grid = {my_study_tiles_by_grid}")

# Disabled by default; change to `if True:` to run the masked variant.
if False:
    # Demonstrate TilesByGridAndMask with a mask
    my_study_tiles_by_grid_and_mask = copy.deepcopy(my_study0)
    tiles_by_grid_and_mask = hs.configure.TilesByGridAndMask(
        my_study_tiles_by_grid_and_mask,
        number_pixel_overlap_rows_for_tile=0,
        number_pixel_overlap_columns_for_tile=0,
        mask_filename="/tf/notebooks/histomics_stream/example/TCGA-BH-A0BZ-01Z-00-DX1.45EB3E93-A871-49C6-9EAE-90D98AE01913-mask.png",
        randomly_select=1000,
    )
    # We could apply this to a subset of the slides, but we will apply it to all slides in this example.
    for slide in my_study_tiles_by_grid_and_mask["slides"].values():
        tiles_by_grid_and_mask(slide)
    # print("================================================================")
    print("Finished with TilesByGridAndMask")
    # print(f"my_study_tiles_by_grid_and_mask = {my_study_tiles_by_grid_and_mask}")

if True:
    # Demonstrate TilesByList
    # The candidate tiles are the ones already chosen for Slide_0 by the grid demonstration above.
    my_study_tiles_by_list = copy.deepcopy(my_study0)
    tiles_by_list = hs.configure.TilesByList(
        my_study_tiles_by_list, randomly_select=5, tiles_dictionary=my_study_tiles_by_grid["slides"]["Slide_0"]["tiles"]
    )
    # We could apply this to a subset of the slides, but we will apply it to all slides in this example.
    for slide in my_study_tiles_by_list["slides"].values():
        tiles_by_list(slide)
    # print("================================================================")
    print("Finished with TilesByList")
    # print(f"my_study_tiles_by_list = {my_study_tiles_by_list}")

if True:
    # Demonstrate TilesRandomly
    my_study_tiles_randomly = copy.deepcopy(my_study0)
    tiles_randomly = hs.configure.TilesRandomly(my_study_tiles_randomly, randomly_select=3)
    # We could apply this to a subset of the slides, but we will apply it to all slides in this example.
    for slide in my_study_tiles_randomly["slides"].values():
        tiles_randomly(slide)
    # print("================================================================")
    print("Finished with TilesRandomly")
    # print(f"my_study_tiles_randomly = {my_study_tiles_randomly}")

# We choose one of the above examples for further processing.
# The chosen study must have been built above: my_study_tiles_by_grid only exists
# when the first `if True:` demonstration was left enabled.
my_study_of_tiles = my_study_tiles_by_grid
# my_study_of_tiles = my_study_tiles_randomly

# Turn the study description into the `tiles` dataset used by the cells below.
create_tensorflow_dataset = hs.tensorflow.CreateTensorFlowDataset()
tiles = create_tensorflow_dataset(my_study_of_tiles)
print("Finished with CreateTensorFlowDataset")

# print("================================================================")
# print(tiles)
# print("================================================================")
# tf.print(tiles)


<h3>Run with the tiles dataset</h3>

In [None]:
# keyword arguments shared by every Dataset.map call on the tiles
dataset_map_options = dict(
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
    deterministic=False,
)

# Convert pixel data to uint8; the second item of each element is passed through unchanged
tiles = tiles.map(lambda pixels, rest: (tf.cast(pixels, tf.uint8), rest), **dataset_map_options)

In [None]:
# Run the model on the tiles
# Each element becomes (x, model output, y); tau and nms_iou match the values
# used in the single-image inference cell.
tiles1 = tiles.map(lambda x, y: (x, model(x, tau=0.5, nms_iou=0.3), y), **dataset_map_options)

<h3>Find a tile with many detections</h3>

In [None]:
# Attach the number of detections (size of the first dimension of the
# prediction) to every tile.
tiles2 = tiles1
tiles2 = tiles2.map(lambda x, p, y: (x, p, tf.shape(p)[0], y), **dataset_map_options)

# Scan the tiles and remember the one with the most detections so far.
# max_rgb / max_regressions start as None so that an empty dataset is reported
# cleanly instead of raising NameError at the plotting call.
max_number_detections = -1
max_rgb = None
max_regressions = None
number_tiles = 0
for tile in tiles2:
    number_tiles = number_tiles + 1
    rgb, regressions, number_detections, _ = tile
    # Work with a plain Python int for the comparison, the message and the stopping rule.
    number_detections = int(number_detections)
    if number_detections > max_number_detections:
        tf.print(f"New best: Tile #{number_tiles} has {number_detections} detections.")
        max_number_detections = number_detections
        max_rgb = rgb
        max_regressions = regressions
    # Stopping rule: the more detections the best tile has, the fewer tiles are examined.
    if max_number_detections ** 2 * number_tiles >= 10000:
        break
tf.print(f"Examined {number_tiles} tiles in total.")
if max_rgb is None:
    tf.print("No tiles were available, so there is nothing to plot.")
else:
    plot_inference(max_rgb, max_regressions)

<h2>Save and Load Model Weights</h2>

In [None]:
# save checkpoint
model.save_weights("/tf/notebooks/histomics_detect/example/saved_model/")

# create dummy network for restore
# NOTE(review): `restored` is constructed from the same `rpnetwork` and `backbone`
# objects as `model`. If FasterRCNN keeps them by reference, the two models share
# weights and the check below would pass even if load_weights restored nothing -
# confirm, or build fresh networks for the restored model.
restored = FasterRCNN(rpnetwork, backbone, [width, height], anchor_px, rpn_lmbda)
restored.load_weights("/tf/notebooks/histomics_detect/example/saved_model/")

# check that outputs are same
# `rgb` is whatever image an earlier cell last assigned to that name.
# The comparison is exact element-wise equality, with no tolerance.
assert tf.math.reduce_all(tf.math.equal(restored(rgb), model(rgb)))

<h2>Wrap the model so that predictions can be done with annotations</h2>

In [None]:
class WrappedModel(tf.keras.Model):
    """Keras model that forwards the first item of a pair to an inner model.

    The second item of the pair is returned untouched next to the inner
    model's output, so it stays attached to the prediction it belongs to.
    """

    def __init__(self, model, *args, **kwargs):
        """Store the inner model; remaining arguments go to tf.keras.Model."""
        super().__init__(*args, **kwargs)
        self.model = model

    def call(self, pair, *args, **kwargs):
        """Return ``(inner_model(pair[0], *args, **kwargs), pair[1])``."""
        prediction = self.model(pair[0], *args, **kwargs)
        return prediction, pair[1]

# Wrap the trained model so that each prediction is returned with its annotation.
wrapped_model = WrappedModel(model, name="wrapped_model")
# Repackage each (rgb, annot) element as ((rgb, annot), None, None) so that the pair
# reaches WrappedModel.call as a single input.
# NOTE(review): presumably the two None entries fill the target / sample-weight
# slots that Keras expects in a dataset element - confirm.
wrapped_tiles = tiles.map(lambda rgb, annot: ((rgb, annot), None, None), **dataset_map_options)
print("Starting wrapped_model.predict")
wrapped_predict_output = wrapped_model.predict(wrapped_tiles)
print("Finished wrapped_model.predict")