How to use the TileLevelDataset
-------------------------------

In [1]:
import os
import time
import glob
import pandas as pd
import slide_tools

In [2]:
# Base directory that every relative path in the CSV is resolved against.
# The default is the original author's mount point; set the environment
# variable SLIDE_TOOLS_DATA_ROOT to run this notebook on another machine.
root = os.environ.get("SLIDE_TOOLS_DATA_ROOT", "/mnt/data/Lennard/gyn")
csv_train = "ago-tr1/csv/finetune_train_1.csv"

# One row per slide: the slide-level "HRD(BRCA1)" label plus the relative
# paths to the tile-label JSON, the slide image and the annotation GeoJSON.
frame = pd.read_csv(os.path.join(root, csv_train))
frame

Unnamed: 0,SlideNr,HRD(BRCA1),label,slide,annotation
0,10,0,ago-tr1/labels/10.json,ago-tr1/slides/10.svs,ago-tr1/annotations/10.geojson
1,100,1,ago-tr1/labels/100.json,ago-tr1/slides/100.svs,ago-tr1/annotations/100.geojson
2,101,1,ago-tr1/labels/101.json,ago-tr1/slides/101.svs,ago-tr1/annotations/101.geojson
3,103,1,ago-tr1/labels/103.json,ago-tr1/slides/103.svs,ago-tr1/annotations/103.geojson
4,106,1,ago-tr1/labels/106.json,ago-tr1/slides/106.svs,ago-tr1/annotations/106.geojson
...,...,...,...,...,...
127,84,1,ago-tr1/labels/84.json,ago-tr1/slides/84.svs,ago-tr1/annotations/84.geojson
128,85,1,ago-tr1/labels/85.json,ago-tr1/slides/85.svs,ago-tr1/annotations/85.geojson
129,9,0,ago-tr1/labels/9.json,ago-tr1/slides/9.svs,ago-tr1/annotations/9.geojson
130,92,1,ago-tr1/labels/92.json,ago-tr1/slides/92.svs,ago-tr1/annotations/92.geojson


### Create a dataset from slides, annotations, tile labels and slide labels

In [3]:
def rootify(path):
    """Return ``path`` joined onto the dataset ``root`` directory.

    ``root`` is read from the notebook's global namespace at call time.
    """
    return os.path.join(root, path)

In [4]:
%%time
ds = slide_tools.tile_level.TileLevelDataset(
    slide_paths=frame.slide.apply(rootify),
    annotation_paths=frame.annotation.apply(rootify),
    label_paths=frame.label.apply(rootify),
    global_labels=frame[["HRD(BRCA1)"]].to_dict("records"),
    simplify_tolerance=100,
)
print(f"Found {len(ds.samples)} tiles of native size")

# Found 16101629 tiles of native size
# CPU times: user 1min 5s, sys: 2.82 s, total: 1min 8s
# Wall time: 1min 8s

[Plugin: cucim.kit.cuslide] Loading the dynamic library from: /home/caduser/anaconda3/envs/tmmae/lib/python3.8/site-packages/cucim/clara/cucim.kit.cuslide@22.02.00.so
Initializing plugin: cucim.kit.cuslide (interfaces: [cucim::io::IImageFormat v0.1]) (impl: cucim.kit.cuslide)
[Plugin: cucim.kit.cumed] Loading the dynamic library from: /home/caduser/anaconda3/envs/tmmae/lib/python3.8/site-packages/cucim/clara/cucim.kit.cumed@22.02.00.so
Initializing plugin: cucim.kit.cumed (interfaces: [cucim::io::IImageFormat v0.1]) (impl: cucim.kit.cumed)


Found 16101629 tiles of native size
CPU times: user 1min 5s, sys: 2.82 s, total: 1min 8s
Wall time: 1min 8s


### Tiles with overlap and centroid within annotation

In [5]:
%%time
region_overlap = 0.5
ds.setup_regions(centroid_in_annotation=True, region_overlap=region_overlap, with_labels=True)
ds.setup_epoch()
print(f"Found {len(ds.samples)} tiles of native size inside annotations with overlap of {region_overlap}")

# Found 28201545 tiles of native size inside annotations with overlap of 0.5
# CPU times: user 36.9 s, sys: 5.08 s, total: 42 s
# Wall time: 42.1 s

Found 28201545 tiles of native size inside annotations with overlap of 0.5
CPU times: user 36.9 s, sys: 5.08 s, total: 42 s
Wall time: 42.1 s


### Balancing by size and label

In [6]:
%%time
ds.setup_epoch(
    shuffle=True,
    balance_size_by=slide_tools.objects.BalanceMode.MIN,  # MIN=Undersampling, MAX=Oversampling, MEDIAN/MEAN=Mix
    balance_label_key="HRD(BRCA1)",
    balance_label_bins=2,
    shuffle_chunk_size=16
)
print(f"Found {len(ds.samples)} tiles of native size inside annotations with overlap of {region_overlap} and balanced by size and label")

# Found 1733028 tiles of native size inside annotations with overlap of 0.5 and balanced by size and label
# CPU times: user 4.74 s, sys: 2.51 s, total: 7.25 s
# Wall time: 7.29 s

Found 1733028 tiles of native size inside annotations with overlap of 0.5 and balanced by size and label
CPU times: user 4.74 s, sys: 2.51 s, total: 7.25 s
Wall time: 7.29 s


### Let's have a look at the speed

In [7]:
import torch

In [8]:
bs = 128          # tiles per batch
num_workers = 8   # DataLoader worker processes

# shuffle=False: the loader walks ds.samples in the order the dataset provides.
dl = torch.utils.data.DataLoader(
    dataset=ds,
    batch_size=bs,
    shuffle=False,
    num_workers=num_workers,
)

In [9]:
N = 100       # number of batches that are timed
WARMUP = 20   # leading batches skipped before the clock starts
              # (presumably to keep worker start-up out of the measurement)

t = None
for i, batch in enumerate(dl):
    if i == WARMUP:
        # perf_counter() is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        t0 = time.perf_counter()
    if i == (N + WARMUP):
        t = time.perf_counter() - t0
        break

# Without this guard a short loader would leave `t` unset and the print below
# would fail with an unrelated-looking NameError.
if t is None:
    raise RuntimeError(
        f"DataLoader yielded fewer than {N + WARMUP + 1} batches; "
        "lower N or WARMUP to measure throughput"
    )

print(f"{bs*N/t:.0f} 240x240 tiles per second with {num_workers=}")

# Output recorded from the original run:
# 1139 240x240 tiles per second with num_workers=8

1139 240x240 tiles per second with num_workers=8


### Check out the main documentation for further options

In [10]:
slide_tools.tile_level.TileLevelDataset?

[0;31mInit signature:[0m [0mslide_tools[0m[0;34m.[0m[0mtile_level[0m[0;34m.[0m[0mTileLevelDataset[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
An abstract class representing a :class:`Dataset`.

All datasets that represent a map from keys to data samples should subclass
it. All subclasses should overwrite :meth:`__getitem__`, supporting fetching a
data sample for a given key. Subclasses could also optionally overwrite
:meth:`__len__`, which is expected to return the size of the dataset by many
:class:`~torch.utils.data.Sampler` implementations and the default options
of :class:`~torch.utils.data.DataLoader`.

.. note::
  :class:`~torch.utils.data.DataLoader` by default constructs a index
  sampler that yields integral indices.  To make it work with a map-style
  dataset with non-integral indices/keys, a custom sampler must be provided.
[0;31mInit docstring:[0m
Map-style datase

In [11]:
ds.setup_regions?

[0;31mSignature:[0m
[0mds[0m[0;34m.[0m[0msetup_regions[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msize[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mSequence[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0munit[0m[0;34m:[0m [0mslide_tools[0m[0;34m.[0m[0mobjects[0m[0;34m.[0m[0mconstants[0m[0;34m.[0m[0mSizeUnit[0m [0;34m=[0m [0;34m<[0m[0mSizeUnit[0m[0;34m.[0m[0mPIXEL[0m[0;34m:[0m [0;34m'pixel'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevel[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcentroid_in_annotation[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mannotation_align[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mregion_overlap[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0

In [12]:
ds.setup_epoch?

[0;31mSignature:[0m
[0mds[0m[0;34m.[0m[0msetup_epoch[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mbalance_size_by[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mslide_tools[0m[0;34m.[0m[0mobjects[0m[0;34m.[0m[0mconstants[0m[0;34m.[0m[0mBalanceMode[0m[0;34m,[0m [0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbalance_label_key[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbalance_label_bins[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshuffle[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshuffle_chunk_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Populate .samples 