# Advanced Workflow (Working Example)

This workflow is used to preprocess data for a self-supervised learning pilot.

### The dataset consists of 400 slides (700 GB)

For each slide:

1. Detect regions of tissue, using a low-resolution image of the whole slide
2. Load the full-resolution slide in chunks
3. Divide each chunk into 224px tiles

The ability to define our own custom TileExtractor class allows us to create this pipeline.

1. SlideLoader loads a low-resolution image into SlideData.image
2. SlidePreprocessor performs tissue detection on the low-resolution image, and puts the resulting mask into SlideData.mask
3. TileExtractor:
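    - the full-resolution image is divided into square chunks (in the code below, each chunk spans 1000 low-resolution pixels per side, multiplied by the downsample factor of the low-resolution level)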
    - for each chunk:
        - use SlideData.wsi.slide.read_region() to read the full-resolution image for the chunk
        - get corresponding mask region from tissue detection
        - upsample the mask (https://stackoverflow.com/a/32848377)
        - divide into tiles
        - write each tile to disk

In [63]:
import os
import glob
from tqdm import tqdm
import matplotlib.pyplot as plt

from pathml.tiling import extract_tiles_with_mask
from pathml.image_utils import pil_to_rgb, upsample_array, plot_mask
from pathml.pipeline import Pipeline
from pathml.base_preprocessor import (BaseSlideLoader,
                                      BaseSlidePreprocessor,
                                      BaseTileExtractor,
                                      BaseTilePreprocessor)
from pathml.transforms_HandE import TissueDetectionHE
from pathml.wsi import HESlide
from pathml.transforms import ForegroundDetection

In [66]:
# step 1
class MySlideLoader(BaseSlideLoader):
    """Slide-level loader which reads an H&E slide at one fixed level."""

    def __init__(self, level):
        # level index forwarded to HESlide.load_data() on every call to apply()
        self.level = level

    def apply(self, path):
        """
        Load the slide at ``path`` at ``self.level`` and return the loaded data.

        The level is also stored on the returned object as ``.level`` so that
        later pipeline steps can look up which level the image came from.
        """
        slide = HESlide(path)
        loaded = slide.load_data(level=self.level)
        loaded.level = self.level
        return loaded

# step 2
class MySlidePreprocessor(BaseSlidePreprocessor):
    """Slide-level preprocessor which stores a tissue mask on the slide data."""

    def apply(self, data):
        """
        Run H&E tissue detection on ``data.image`` and save the result in ``data.mask``.

        Returns the same ``data`` object that was passed in.
        """
        # the image is downsampled, so the region/hole size thresholds are
        # lowered to match the smaller pixel counts
        foreground = ForegroundDetection(min_region_size=1000, max_hole_size=1000)
        detector = TissueDetectionHE(foreground_detection=foreground)
        data.mask = detector.apply(data.image)
        return data

Now we get to the trickier part, which is tile extraction.

In [128]:
class MyTileExtractor(BaseTileExtractor):
    """
    Tile extractor which cuts full-resolution tiles out of tissue regions.

    The low-resolution ``data.mask`` is walked in square chunks. For every chunk
    that contains any mask, the matching full-resolution region is read from the
    slide, the mask is upsampled to the same scale, the region is divided into
    tiles, and each tile is written to disk.
    """

    def __init__(self, tile_size=224, chunk_size_low_res=1000):
        """
        Args:
            tile_size: tile side length, passed to ``extract_tiles_with_mask``.
            chunk_size_low_res: chunk side length, measured in pixels of the
                low-resolution mask. The full-resolution chunk side is this
                value multiplied by the level's downsample factor.
        """
        self.tile_size = tile_size
        self.chunk_size_low_res = chunk_size_low_res

    @staticmethod
    def _output_dir(slide_path):
        """
        Choose the output directory from keywords found in ``slide_path``.

        Raises:
            ValueError: if the path matches none of the known dataset splits.
        """
        if "testing" in slide_path:
            return "/mnt/disks/pilot-data/preprocessed/testing"
        if "training" in slide_path:
            if "tumor" in slide_path:
                return "/mnt/disks/pilot-data/preprocessed/training/tumor"
            if "normal" in slide_path:
                return "/mnt/disks/pilot-data/preprocessed/training/normal"
        raise ValueError(
            f"Cannot determine output directory for slide path {slide_path!r}: "
            "expected 'testing', or 'training' together with 'tumor' or 'normal'"
        )

    def apply(self, data, debug=False):
        """
        Use the downsampled data.mask to get full-resolution tiles.
        Process full-resolution image in chunks.
        No tile-level preprocessing being performed.
        This will also write the tiles.

        Args:
            data: slide data providing ``mask``, ``level`` and ``wsi``
                (with ``wsi.path`` and ``wsi.slide``).
            debug: accepted for interface compatibility; currently unused.

        Raises:
            ValueError: if no output directory can be derived from ``data.wsi.path``.
        """
        # scale for upscaling the mask to full-res; rounded rather than
        # truncated so a factor just below an integer is not cut down by one
        scale = int(round(data.wsi.slide.level_downsamples[data.level]))
        # size of each chunk, at low-resolution and at full-resolution
        chunk_low = self.chunk_size_low_res
        chunk_size = chunk_low * scale
        # note that openslide uses (width, height) format
        full_res_j, full_res_i = data.wsi.slide.level_dimensions[0]

        # filepath for saving tiles
        out_dir = self._output_dir(data.wsi.path)
        os.makedirs(out_dir, exist_ok=True)
        wsi_base_name = os.path.splitext(os.path.basename(data.wsi.path))[0]

        # NOTE: floor division means a partial chunk at the bottom or right
        # edge of the slide is not processed.
        n_chunk_i = full_res_i // chunk_size
        n_chunk_j = full_res_j // chunk_size

        # loop thru chunks
        for ix_i in range(n_chunk_i):
            for ix_j in range(n_chunk_j):
                # get mask for this chunk
                mask = data.mask[ix_i * chunk_low:(ix_i + 1) * chunk_low,
                                 ix_j * chunk_low:(ix_j + 1) * chunk_low]

                if mask.mean() == 0.0:
                    # empty chunk, no need to continue processing
                    continue

                mask_upsampled = upsample_array(mask, scale)
                # get full-res image; location is given as (x, y), i.e. (j, i)
                region = data.wsi.slide.read_region(
                    location=(ix_j * chunk_size, ix_i * chunk_size),
                    level=0,
                    size=(chunk_size, chunk_size),
                )
                region_rgb = pil_to_rgb(region)

                # divide into tiles
                good_tiles = extract_tiles_with_mask(
                    im=region_rgb,
                    tile_size=self.tile_size,
                    mask=mask_upsampled,
                )

                for tile in good_tiles:
                    # adjust i and j coords for each tile to account for the chunk offset
                    tile.i += ix_i * chunk_size
                    tile.j += ix_j * chunk_size
                    tile.save(out_dir=out_dir, filename=f"{wsi_base_name}_{tile.i}_{tile.j}.jpeg")

In [129]:
# compose into pipeline
class PipelineSSL(Pipeline):
    """
    Preprocessing pipeline for the self-supervised learning project
    (CAMELYON16 data, with slide-level labels).

    Chains three steps: slide loading, slide-level preprocessing, tile extraction.
    """

    def __init__(self, slide_loader, slide_preprocessor, tile_extractor):
        # each step is an object exposing an ``apply`` method
        self.slide_loader = slide_loader
        self.slide_preprocessor = slide_preprocessor
        self.tile_extractor = tile_extractor

    def run(self, path, debug=False):
        """
        Run the full pipeline on the slide at ``path``.

        ``debug`` is passed through to the tile extractor. Nothing is returned.
        """
        # load the slide, then apply slide-level preprocessing (tissue detection)
        slide_data = self.slide_preprocessor.apply(self.slide_loader.apply(path))

        # extract tiles
        self.tile_extractor.apply(slide_data, debug=debug)

In [130]:
# build the pipeline instance from its three steps; slides are loaded at level 3
pipeline_ssl = PipelineSSL(
    slide_loader=MySlideLoader(level=3),
    slide_preprocessor=MySlidePreprocessor(),
    tile_extractor=MyTileExtractor(),
)

In [131]:
# collect every .tif file found anywhere below the CAMELYON16 directory
impaths = glob.glob(
    "/mnt/disks/pilot-data/parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100439/CAMELYON16/**/*.tif",
    recursive=True,
)

In [None]:
# run the pipeline on every slide path in turn, with the iterable wrapped in tqdm
for im in tqdm(impaths):
    pipeline_ssl.run(im)