In [1]:
import pandas as pd
import numpy as np
import os
import multiprocessing
from tqdm import tqdm
from PIL import Image

# Remove Pillow's pixel-count limit so that very large images can be opened.
# With the limit unset, Pillow no longer guards against decompression bombs,
# so only trusted image files should be processed.
Image.MAX_IMAGE_PIXELS = None

'''
Most of the code is from:
https://www.kaggle.com/code/jirkaborovec/bloodclots-eda-load-wsi-prune-background/notebook
'''

'\nMost of the code is from:\nhttps://www.kaggle.com/code/jirkaborovec/bloodclots-eda-load-wsi-prune-background/notebook\n'

In [2]:
# Root folder of the dataset: train.csv, train/ (input .tif files) and
# train_images/ (output .png files) are all resolved relative to it.
DATASET_FOLDER = './data/'
# Number of worker processes used for preprocessing:
#   1  -> run sequentially in this process (with a progress bar)
#   -1 -> use os.cpu_count() workers
#   any other value is passed to multiprocessing.Pool as-is
N_JOBS = -1

In [3]:
# Load the training metadata; its "image_id" column lists the images to preprocess.
path_csv = os.path.join(DATASET_FOLDER, "train.csv")
df_train = pd.read_csv(path_csv)

In [4]:
def mask_median(im, val=255):
    '''
    Builds a background mask from the per-channel medians and paints the masked pixels with `val`.

    A pixel is marked as background when, in every one of the first three channels, its value is
    at least (channel median - 5). The mask is later used by prune_image_rows_cols to decide which
    rows/columns to delete.
    To work properly this assumes the background is bright (white, i.e. (255,255,255)) and that it
    dominates the image, so that the median of each channel is a background value.

    Args:
        im: image array of shape (height, width, channels) with at least 3 channels.
            It is modified IN PLACE: masked pixels are overwritten with `val` in all channels.
        val: value written to the masked (background) pixels.

    Returns:
        (im, mask): the same (modified) array object and the 2D boolean background mask.
    '''
    channel_masks = [im[..., c] >= np.median(im[..., c]) - 5 for c in range(3)]

    # np.logical_and is a *binary* ufunc: np.logical_and(a, b, c) treats `c` as the output
    # buffer, which silently drops the third channel. Reduce over the list instead so that
    # all three channels take part in the AND.
    mask = np.logical_and.reduce(channel_masks)
    im[mask, :] = val

    return im, mask


def prune_image_rows_cols(im, mask, thr=0.990):
    '''
    Deletes the rows and columns whose fraction of masked pixels is greater than the threshold.

    Both fractions are computed on the full, unpruned mask, so the decision for a row does not
    depend on which columns are removed (and vice versa).

    Args:
        im: image array whose first two dimensions match `mask`.
        mask: 2D boolean array, True where the pixel is considered background.
        thr: a row/column is removed when its masked fraction is strictly greater than this value.

    Returns:
        A new array containing only the kept rows and columns (the input is not modified).
    '''
    # Fraction of masked pixels per column / per row
    col_fraction = np.sum(mask, axis=0) / float(mask.shape[0])
    row_fraction = np.sum(mask, axis=1) / float(mask.shape[1])

    keep_cols = ~(col_fraction > thr)
    keep_rows = ~(row_fraction > thr)

    # One selection instead of one np.delete (a full copy of the image) per removed row/column
    return im[np.ix_(keep_rows, keep_cols)]

In [8]:
def image_load_scale_norm(img_path, prune_thr=0.990, bg_val=255):
    '''
    Loads an image, prunes its background rows/columns and downsizes it if it is still too big.

    Steps:
        1. Open the image and, if its shorter side exceeds 2000 px, shrink it (keeping the aspect
           ratio) so that the shorter side is about 2000 px.
        2. Mask the background (mask_median) and delete mostly-background rows/columns
           (prune_image_rows_cols).
        3. If the shorter side of the pruned image still exceeds 1000 px, resize it so that the
           shorter side is about 1000 px.

    Args:
        img_path: path of the image file to load.
        prune_thr: threshold forwarded to prune_image_rows_cols.
        bg_val: value used to paint the background, forwarded to mask_median.

    Returns:
        The preprocessed PIL image.
    '''
    # The context manager closes the underlying file as soon as the pixels are in memory
    with Image.open(img_path) as img:
        scale = min(img.height / 2e3, img.width / 2e3)

        if scale > 1:
            tmp_size = int(img.width / scale), int(img.height / scale)
            img.thumbnail(tmp_size, resample=Image.Resampling.BILINEAR, reducing_gap=scale)

        # np.array copies the pixel data, so it stays valid after the file is closed
        pixels = np.array(img)

    im, mask = mask_median(pixels, val=bg_val)
    im = prune_image_rows_cols(im, mask, thr=prune_thr)
    img = Image.fromarray(im)
    scale = min(img.height / 1e3, img.width / 1e3)

    if scale > 1:
        img = img.resize((int(img.width / scale), int(img.height / scale)), Image.Resampling.LANCZOS)

    return img

In [10]:
def preprocess_image(image_id):
    '''
    Gets an image and creates the preprocessed one in the "train_images" folder.

    Reads  <DATASET_FOLDER>/train/<image_id>.tif and
    writes <DATASET_FOLDER>/train_images/<image_id>.png.

    Args:
        image_id: identifier of the image (file name without extension).
    '''
    img_path = os.path.join(DATASET_FOLDER, "train", f"{image_id}.tif")
    img = image_load_scale_norm(img_path)

    if not img:
        return

    out_dir = os.path.join(DATASET_FOLDER, "train_images")
    # The output folder may not exist on a fresh run; exist_ok keeps this safe when several
    # worker processes reach this line at the same time.
    os.makedirs(out_dir, exist_ok=True)
    img.save(os.path.join(out_dir, f"{image_id}.png"))

In [None]:
if N_JOBS == 1:
    # Sequential path: one image at a time, with a progress bar
    for name in tqdm(df_train["image_id"]):
        preprocess_image(name)
else:
    # Parallel path: -1 means "one worker per CPU", any other value is used as given.
    # NOTE (original author): this path "has problems with space" - it is not stated whether
    # that refers to disk or memory; TODO confirm.
    n_workers = os.cpu_count() if N_JOBS == -1 else N_JOBS
    image_ids = list(df_train["image_id"])

    with multiprocessing.Pool(processes=n_workers) as pool:
        pool.map(func=preprocess_image, iterable=image_ids)
        pool.close()

In [None]:
from scipy.ndimage.morphology import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk


def optical_density(tile):
    """
    Map the pixel values of a tile to optical density values.

    The value computed for every element is -ln((value + 1) / 240), evaluated in float64.

    Code from (deep-histopath/deephistopath/preprocessing.py):
    https://github.com/CODAIT/deep-histopath/tree/c8baf8d47b6c08c0f6c7b1fb6d5dd6b77e711c33

    Args:
    tile (np.array): A 3D NumPy array of shape (tile_size, tile_size, channels).

    Returns:
    A float64 NumPy array with the same shape as the input, holding the optical density values.
    """
    # Alternative formulation left by the author for reference: -np.log10(tile/255 + 1e-8)
    intensities = tile.astype(np.float64)
    return -np.log((intensities + 1) / 240)


def keep_tile(tile:np.ndarray, tile_size:int, tissue_threshold:float=0.9)->bool:
    """
    Determine if a tile should be kept.
    
    This filters out tiles based on size and a tissue percentage
    threshold, using a custom algorithm. If a tile has height &
    width equal to (tile_size, tile_size), and contains greater
    than or equal to the given percentage, then it will be kept;
    otherwise it will be filtered out. Both checks below must pass.
    
    Check 1:
    Uses edge detection (Canny) to see where the tissue is, as the
    tissue should be full of edges while the background should not.
    The algorithm follows the following steps: 
        . Convert image from RGB to gray (and invert it)
        . Canny (for edge detection)
        . Closing (dilatation -> erosion)
        . Dilatation
        . Fill holes
        . Percentage of binary image that represent the % of tissue (check with tissue_threshold)
        
    Check 2 (only evaluated when Check 1 passes, since the result is False otherwise):
    It is based on the optical density of the image.
    The algorithm follows the following steps: 
        . Calculate the optical density values of the tile
        . Binarize the image with the threshold of 0.15 (on the minimum over the channels)
        . Closing (dilatation -> erosion)
        . Dilatation
        . Fill holes
        . Percentage of binary image that represent the % of tissue (check with tissue_threshold)
        
    Code from (deep-histopath/deephistopath/preprocessing.py): 
    https://github.com/CODAIT/deep-histopath/tree/c8baf8d47b6c08c0f6c7b1fb6d5dd6b77e711c33

    Args:
        tile (np.ndarray): 3D NumPy array with the RGB tile; its first two dimensions must both equal tile_size.
        tile_size (int): Desired tile size.
        tissue_threshold (float, optional): Fraction (0-1) of the image that has to be tissue. Defaults to 0.9.

    Returns:
        bool: A Boolean indicating whether or not a tile should be kept for future usage.
    """
    if tile.shape[0:2] != (tile_size, tile_size):
        return False
    
    # Check 1: edge-based tissue estimate.
    # None of the steps below writes into `tile`, so no defensive copy is needed.
    inverted_gray = 1 - rgb2gray(tile) # Inverted grayscale: background close to 0, dense tissue close to 1
    tissue = canny(inverted_gray)
    tissue = binary_closing(tissue, disk(10))
    tissue = binary_dilation(tissue, disk(10))
    tissue = binary_fill_holes(tissue)
    
    if not (tissue.mean() >= tissue_threshold):
        # Check 1 failed, so the tile is rejected whatever Check 2 would say
        return False
    
    # Check 2: optical-density-based tissue estimate
    beta = 0.15
    tissue = np.min(optical_density(tile), axis=2) >= beta
    tissue = binary_closing(tissue, disk(2))
    tissue = binary_dilation(tissue, disk(2))
    tissue = binary_fill_holes(tissue)
    
    return bool(tissue.mean() >= tissue_threshold)


In [None]:
class WSI:
    """
    Whole-slide image that is split into tiles through a Deep Zoom generator.

    NOTE(review): `open_slide` and `DeepZoomGenerator` are not imported anywhere in the visible
    notebook; they are presumably `openslide.open_slide` and `openslide.deepzoom.DeepZoomGenerator`
    and must be imported before a WSI is created. The `Tile` class must also be defined before a
    WSI is created. Annotations that mention those names are written as strings so that defining
    this class does not require them to exist yet.

    Attributes:
        path: path of the slide file.
        tile_size, overlap: forwarded to the Deep Zoom generator.
        tile_level: Deep Zoom level to tile; negative values count from the last level.
        tissue_threshold: minimum tissue fraction for a tile to be kept (see keep_tile).
        basename: file name of the slide without folder and extension.
        slide: the opened slide.
        generator: the Deep Zoom generator built on top of the slide.
        tiles: tuple with the tiles that passed keep_tile.
    """

    def __init__(self, path:str, tile_size:int=256, overlap:int=0, tile_level:int=-1, tissue_threshold:float=0.90):
        self.path = path
        self.tile_size = tile_size
        self.overlap = overlap
        self.tile_level = tile_level
        self.tissue_threshold = tissue_threshold
        self.basename = os.path.splitext(os.path.basename(self.path))[0]
        
        self.slide = open_slide(self.path)
        self.generator = DeepZoomGenerator(self.slide, tile_size=self.tile_size, overlap=self.overlap)
        self.tiles = self._create_tiles(
            self.generator,
            self.tile_level,
            self.tile_size,
            self.tissue_threshold,
            base_image_name=self.basename,
        )
    
    def get_top_N_tiles(self, N:int, target_dir:"Optional[str]"=None)->"Tuple[Tile, ...]":
        """
        Placeholder: the original notebook left this method unimplemented.

        Args:
            N: number of tiles requested.
            target_dir: output folder. The original draft used `self.path` as the default, which
                cannot be evaluated in a signature; None stands in for "use the default" here.

        Raises:
            NotImplementedError: always, until the selection criterion is defined.
        """
        # TODO: implement the selection (presumably by tissue percentage - confirm) and the export
        raise NotImplementedError("WSI.get_top_N_tiles is not implemented yet")
    
    @staticmethod
    def _create_tiles(
        generator:"DeepZoomGenerator",
        tile_level:int,
        tile_size:int,
        tissue_threshold:float,
        target_dir:"Optional[str]"=None,
        base_image_name:str="tile",
    )->"Tuple[Tile, ...]":
        """
        Walk over every tile of one Deep Zoom level and keep those accepted by keep_tile.

        Args:
            generator: Deep Zoom generator of the slide.
            tile_level: level to read; negative values count from the last level.
            tile_size: expected tile side, forwarded to keep_tile.
            tissue_threshold: minimum tissue fraction, forwarded to keep_tile.
            target_dir: when given, each kept tile is also saved there as
                "<base_image_name>-<count>.png" (the folder must already exist).
            base_image_name: prefix of the saved files and basename stored in each Tile.

        Returns:
            Tuple with one Tile per kept tile, in row-major order.
        """
        # Turn a negative level (e.g. the default -1 = last level) into a real index:
        # tuple indexing accepts it, but get_tile is expected to reject negative levels
        # (TODO confirm against the installed openslide version).
        if tile_level < 0:
            tile_level += len(generator.level_tiles)
        
        cols, rows = generator.level_tiles[tile_level]
        kept = []
        count = 0
    
        for row in range(rows):
            for col in range(cols):
                tile = generator.get_tile(tile_level, (col, row))
                tile_np = np.array(tile)
                
                if keep_tile(tile_np, tile_size, tissue_threshold):
                    if target_dir is not None:
                        tile.save(os.path.join(target_dir, f"{base_image_name}-{count}.png"))
                    kept.append(Tile(tile, col, row, base_image_name))
                    count += 1
        
        return tuple(kept)

In [None]:
class Tile:
    """
    A single tile of a slide together with its grid position.

    Attributes:
        tile: the tile image (a PIL image, or anything np.array can turn into an
            (height, width, channels) array).
        col, row: position of the tile in the tile grid.
        basename: name of the image the tile comes from, used when saving.
        tissue_percentage: fraction (0-1) of the tile estimated to be tissue.
    """

    def __init__(self, tile:"Image.Image", col:int, row:int, basename:str):
        self.tile = tile
        self.col = col
        self.row = row
        self.basename = basename
        self.tissue_percentage = self._tissue_percentage(tile)
        
    def _tissue_percentage(self, tile):
        """
        Estimate the fraction of the tile that is tissue.

        NOTE(review): the original method had no body. This implementation follows the first two
        steps of "Check 2" in keep_tile (optical density, then the 0.15 threshold on the minimum
        over the channels) without the morphological clean-up - confirm this is the intended metric.

        Returns:
            float: fraction of pixels classified as tissue, between 0 and 1.
        """
        density = optical_density(np.array(tile))
        return float((np.min(density, axis=2) >= 0.15).mean())
    
    def save_tile(self, target_dir:str="."):
        """
        Save the tile as "<basename>-<col>-<row>.png" inside target_dir.

        NOTE(review): the original method was an unfinished stub without parameters; the
        `target_dir` parameter and the file naming were chosen here - confirm.

        Args:
            target_dir: output folder; it is created when missing.

        Returns:
            str: path of the written file.
        """
        os.makedirs(target_dir, exist_ok=True)
        tile_path = os.path.join(target_dir, f"{self.basename}-{self.col}-{self.row}.png")
        self.tile.save(tile_path)
        return tile_path

lista -> [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
indice -> 15
post precessing -> 15
valor obtido -> 15
