How to filter tiles using labels
--------------------------------

In [1]:
import os
import numpy as np
import pandas as pd
import slide_tools

In [2]:
# Dataset root directory and the training-split CSV (given relative to the root).
root = "/mnt/data/Lennard/gyn"
csv_train = "ago-tr1/csv/finetune_train_1.csv"

# Load the table that lists, per slide, its label, slide and annotation files.
csv_train_path = os.path.join(root, csv_train)
frame = pd.read_csv(csv_train_path)
frame

Unnamed: 0,SlideNr,HRD(BRCA1),label,slide,annotation
0,10,0,ago-tr1/labels/10.json,ago-tr1/slides/10.svs,ago-tr1/annotations/10.geojson
1,100,1,ago-tr1/labels/100.json,ago-tr1/slides/100.svs,ago-tr1/annotations/100.geojson
2,101,1,ago-tr1/labels/101.json,ago-tr1/slides/101.svs,ago-tr1/annotations/101.geojson
3,103,1,ago-tr1/labels/103.json,ago-tr1/slides/103.svs,ago-tr1/annotations/103.geojson
4,106,1,ago-tr1/labels/106.json,ago-tr1/slides/106.svs,ago-tr1/annotations/106.geojson
...,...,...,...,...,...
127,84,1,ago-tr1/labels/84.json,ago-tr1/slides/84.svs,ago-tr1/annotations/84.geojson
128,85,1,ago-tr1/labels/85.json,ago-tr1/slides/85.svs,ago-tr1/annotations/85.geojson
129,9,0,ago-tr1/labels/9.json,ago-tr1/slides/9.svs,ago-tr1/annotations/9.geojson
130,92,1,ago-tr1/labels/92.json,ago-tr1/slides/92.svs,ago-tr1/annotations/92.geojson


### You can filter tiles (regions) by an arbitrary function

This function receives a labels dictionary as input, for example:
```json
{
    "white_or_blurry": [0.1, 0.7, 1, ...],
    "tile_classifier": [
                        [0.1, 0.2, 0.0, 0.4, 0.0, 0.0, 0.1, 0.2],
                        ...
    ]
}
```

and must return a 1D boolean mask in which `True` means "keep this tile", e.g. `[True, False, False, ...]`.

In [3]:
def white_blurry_tumor(labels, *, max_white_or_blurry=0.5, tumor_class=8):
    """Keep tiles that are neither white/blurry nor non-tumor.

    Args:
        labels: Mapping with two per-tile entries:
            ``"white_or_blurry"`` - one score per tile, and
            ``"tile_classifier"`` - one row of class scores per tile.
            Plain (JSON-style) lists and numpy arrays are both accepted.
        max_white_or_blurry: A tile is kept only if its ``white_or_blurry``
            score is strictly below this value.
        tumor_class: Index of the class whose arg-max marks a tile as tumor.
            NOTE(review): index 8 requires at least 9 classifier outputs, while
            the example in the notebook text shows 8 values per tile - confirm
            against the real label files.

    Returns:
        1D boolean numpy array with one entry per tile; ``True`` means keep.
    """
    # Coerce to arrays so element-wise comparison also works for plain lists
    # (``[0.1, 0.7] < 0.5`` raises TypeError on a Python list).
    white_or_blurry = np.asarray(labels["white_or_blurry"])
    class_scores = np.asarray(labels["tile_classifier"])

    # No tiles: argmax along axis 1 is undefined for an empty 1D array.
    if white_or_blurry.size == 0:
        return np.zeros(0, dtype=bool)

    not_white_or_blurry = white_or_blurry < max_white_or_blurry
    tumor = np.argmax(class_scores, axis=1) == tumor_class
    return not_white_or_blurry & tumor

In [4]:
%%time

def rootify(path):
    """Prefix a path taken from the CSV with the dataset root."""
    return os.path.join(root, path)


# Resolve the per-slide file columns of the table to paths under `root`.
slide_paths = frame["slide"].apply(rootify)
annotation_paths = frame["annotation"].apply(rootify)
label_paths = frame["label"].apply(rootify)

# One {"HRD(BRCA1)": value} record per row of the table.
global_labels = frame[["HRD(BRCA1)"]].to_dict(orient="records")

ds = slide_tools.tile_level.TileLevelDataset(
    slide_paths=slide_paths,
    annotation_paths=annotation_paths,
    label_paths=label_paths,
    global_labels=global_labels,
    simplify_tolerance=100,
)

# Build the regions; `white_blurry_tumor` supplies the keep-mask for the tiles.
ds.setup_regions(
    centroid_in_annotation=True,
    region_overlap=0.5,
    with_labels=True,
    filter_by_label_func=white_blurry_tumor,
)

# Balance modes: MIN=Undersampling, MAX=Oversampling, MEDIAN/MEAN=Mix
ds.setup_epoch(
    shuffle=True,
    shuffle_chunk_size=16,
    balance_size_by=slide_tools.objects.BalanceMode.MEDIAN,
    balance_label_key="HRD(BRCA1)",
    balance_label_bins=2,
)

print(f"Setup epoch with {len(ds)} tiles.")

# Setup epoch with 8986362 tiles.
# CPU times: user 2min 5s, sys: 3.29 s, total: 2min 9s
# Wall time: 2min 9s

[Plugin: cucim.kit.cuslide] Loading the dynamic library from: /home/caduser/anaconda3/envs/tmmae/lib/python3.8/site-packages/cucim/clara/cucim.kit.cuslide@22.02.00.so
Initializing plugin: cucim.kit.cuslide (interfaces: [cucim::io::IImageFormat v0.1]) (impl: cucim.kit.cuslide)
[Plugin: cucim.kit.cumed] Loading the dynamic library from: /home/caduser/anaconda3/envs/tmmae/lib/python3.8/site-packages/cucim/clara/cucim.kit.cumed@22.02.00.so
Initializing plugin: cucim.kit.cumed (interfaces: [cucim::io::IImageFormat v0.1]) (impl: cucim.kit.cumed)


Setup epoch with 8986362 tiles.
CPU times: user 2min 5s, sys: 3.29 s, total: 2min 9s
Wall time: 2min 9s
