# Data preprocessing: Creating tiles from microscopy images
### Kaggle Competition: UBC Ovarian Cancer Subtype Classification and Outlier Detection (UBC-OCEAN)

This notebook creates tiles from raw histopathology images from the Kaggle Ovarian Cancer Subtype Classification competition dataset. 
It should ideally be run on a Kaggle kernel, since downloading the whole training data locally (~700 GB) is cumbersome.

Problem: 
1) Images are too large to be fed into CNNs (~100k x 100k pixels, up to 6 GB).
2) Only a small part of the images contains information about the cancer subtype
3) Processing all training images (~700 GB) takes several hours

Solution: 
1) Create tiles of size 512 x 512 pixels
2) Use filters to extract patches with maximum information about cancer subtype
3) Use multiprocessing to extract tiles in batches

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import matplotlib.pyplot as plt
import os
import glob
import time
from tqdm import tqdm
import random
from PIL import Image
Image.MAX_IMAGE_PIXELS = None


In [4]:
def extract_tiles(image_name,
                  directory_images='/kaggle/input/UBC-OCEAN/train_images/',
                  directiory_tiles='/kaggle/working/tiles',
                  patch_size=2048,
                  scale=0.25,
                  num_tiles=100,
                  threshold_black_pixel=10*3,
                  threshold_black_background_ratio=0.1,
                  threshold_variability=0.15):
    """Cut an image into square patches and save the informative ones as PNG tiles.

    The image is divided into a regular grid of ``patch_size`` x ``patch_size``
    patches (partial patches at the right/bottom border are ignored). The grid
    is visited in a shuffled order and every patch that passes both filters
    below is written to ``directiory_tiles`` as ``<image_id>_<n>.png``, where
    ``n`` counts the tiles saved so far for this image.

    A patch is kept only if
      * fewer than ``threshold_black_background_ratio * patch_size**2`` of its
        pixels have a channel sum below ``threshold_black_pixel``, and
      * its variability (std / mean of the channel-sum image) is larger than
        ``threshold_variability``.

    Arguments:
        image_name: File name of the image. The part before the extension must
            be an integer; it is used as the image id in the tile names. If an
            absolute path is given, ``os.path.join`` ignores ``directory_images``.
        directory_images: Directory of raw images.
        directiory_tiles: Directory into which tiles are saved. It is created
            if it does not exist yet.
        patch_size: Tile size in pixels of the original image.
        scale: Scale factor for the tiles. When ``scale < 1`` the saved tile has
            dimension (patch_size*scale x patch_size*scale); otherwise the
            patch is saved without resizing.
        num_tiles: Extraction stops once this many tiles have been saved.
        threshold_black_pixel: Channel-sum value beneath which a pixel is
            considered black.
        threshold_black_background_ratio: A tile is discarded when its ratio of
            black pixels is not smaller than this value.
        threshold_variability: A tile is discarded when its variability
            (std/mean) is not larger than this value.

    Returns:
        List with the file paths of the tiles created (empty if no patch
        passed the filters, in which case 'no patch found' is printed).
    """
    # exist_ok avoids an error when several workers reach this line concurrently.
    os.makedirs(directiory_tiles, exist_ok=True)

    image_path = os.path.join(directory_images, image_name)
    image_id = int(os.path.splitext(os.path.basename(image_path))[0])

    file_paths = []
    with Image.open(image_path) as image:
        n_patches_h = int(image.width // patch_size)
        n_patches_v = int(image.height // patch_size)

        # Shuffled column / row orders so tiles are sampled across the slide
        # rather than always starting from the top-left corner.
        grid_h = list(range(n_patches_h))
        random.shuffle(grid_h)
        grid_v = list(range(n_patches_v))
        random.shuffle(grid_v)

        # Loop invariants.
        max_black_pixels = threshold_black_background_ratio * patch_size ** 2
        tile_size = int(scale * patch_size)

        for i in grid_h:
            if len(file_paths) == num_tiles:
                break
            for j in grid_v:
                left, upper = i * patch_size, j * patch_size
                patch = image.crop((left, upper, left + patch_size, upper + patch_size))

                # Sum over the channel axis gives one brightness value per pixel.
                bw = np.asarray(patch).sum(axis=2)
                num_black_pixels = np.sum(bw < threshold_black_pixel)

                mean_brightness = np.mean(bw)
                if mean_brightness > 0:
                    variability = np.std(bw) / mean_brightness
                else:
                    variability = 0

                # Discard patches that are mostly black background ...
                if not num_black_pixels < max_black_pixels:
                    continue
                # ... or that are too uniform to carry information.
                if not variability > threshold_variability:
                    continue

                if scale < 1:
                    patch = patch.resize((tile_size, tile_size))
                file_path = os.path.join(directiory_tiles, f'{image_id}_{len(file_paths)}.png')
                patch.save(file_path)
                file_paths.append(file_path)
                if len(file_paths) == num_tiles:
                    break

    if not file_paths:
        print('no patch found')
    return file_paths

In [1]:
# Get the on-disk size of every input image and order the files from smallest
# to largest, so that consecutive files have similar sizes.
im_paths = sorted(glob.glob('/kaggle/input/UBC-OCEAN/train_images/*.png'))
im_sizes = [os.path.getsize(im_path) for im_path in im_paths]
df = pd.DataFrame({'paths': im_paths, 'sizes': im_sizes})
# Sorted input + stable sort make the order reproducible even for equal sizes;
# drop=True avoids keeping the old positions as an extra 'index' column.
df = df.sort_values('sizes', ascending=True, kind='stable').reset_index(drop=True)
file_paths = df['paths']
# Reuse the sizes already collected instead of querying the filesystem again.
sizes = df['sizes'].tolist()
df.head()

NameError: name 'glob' is not defined

In [None]:
# Create batches of similarly sized images for multiprocessing.
# Files are taken in their current order; each batch receives as many
# consecutive files as fit while their combined size stays strictly below
# max_size. A file that reaches the limit on its own forms a batch of one,
# which guarantees that the loop always advances.
max_size = 3e9  # size budget per batch, in the same unit as `sizes`

start = 0
start_list = [0]     # start_list[k]: index of the first file of batch k; the last entry equals len(sizes)
num_files_list = []  # num_files_list[k]: number of files in batch k
while start < len(sizes):
    batch_bytes = 0
    num_files = 0
    # Single greedy pass instead of recomputing a cumulative sum per batch.
    while (start + num_files < len(sizes)
           and batch_bytes + sizes[start + num_files] < max_size):
        batch_bytes += sizes[start + num_files]
        num_files += 1
    num_files = max(num_files, 1)
    num_files_list.append(num_files)
    start += num_files
    start_list.append(start)

In [6]:
# Create task for multiprocessing
def task(i):
    """Extract tiles from the i-th entry of ``file_paths``.

    Thin wrapper around ``extract_tiles`` with the settings used for this run,
    so that it can be submitted to an executor with a single integer argument.

    Arguments:
        i: Index into ``file_paths``.

    Returns:
        The list of tile file paths returned by ``extract_tiles``.
    """
    return extract_tiles(file_paths[i],
                         directory_images='/kaggle/input/UBC-OCEAN/train_images/',
                         directiory_tiles='/kaggle/working/tiles',
                         patch_size=1200,
                         num_tiles=100,
                         scale=0.25,
                         threshold_black_pixel=10*3,
                         threshold_black_background_ratio=0.1,
                         threshold_variability=0.15)

In [9]:
# Create output directory; exist_ok makes the cell safe to re-run
os.makedirs('/kaggle/working/tiles', exist_ok=True)

In [10]:
from concurrent.futures import ThreadPoolExecutor


# Process images batch by batch
MAX_BATCHES = 80  # upper bound on the number of batches handled in this run
MAX_WORKERS = 4   # worker threads used within one batch

# start_list ends with a sentinel equal to the total number of files, so
# consecutive entries (start, stop) delimit one batch each. Pairing them up
# front avoids indexing past the end when there are fewer than MAX_BATCHES batches.
batch_bounds = list(zip(start_list[:-1], start_list[1:]))[:MAX_BATCHES]

t0 = time.time()

for batch_idx, (start, stop) in enumerate(batch_bounds):
    print()
    num_files = stop - start
    print(start, num_files)
    if num_files > 1:
        print(f'Batch {batch_idx}/{len(batch_bounds)}: Multi')
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = [executor.submit(task, file_idx) for file_idx in range(start, stop)]
        # Leaving the with-block waits for all tasks; result() re-raises any
        # exception that occurred inside a worker.
        for future in futures:
            future.result()
    else:
        print('Single')
        task(start)
    print(f'{stop} files processed ({np.sum(sizes[:stop])/1e9} GB)')
    print(f'This took {(time.time()-t0)/60} min')



0 35
Batch 0/80: Multi
35 files processed (2.755583074 GB)
This took 2.739610687891642 min

35 8
Batch 1/80: Multi
43 files processed (5.692699395 GB)
This took 5.322237706184387 min

43 7
Batch 2/80: Multi
50 files processed (8.638527512 GB)
This took 8.044220793247224 min

50 6
Batch 3/80: Multi
56 files processed (11.447639553 GB)
This took 10.28765914440155 min

56 5
Batch 4/80: Multi
61 files processed (13.956695882 GB)
This took 12.417055316766103 min

61 5
Batch 5/80: Multi
66 files processed (16.57341403 GB)
This took 14.461115423838297 min

66 5
Batch 6/80: Multi
71 files processed (19.297620169 GB)
This took 16.672872189680735 min

71 5
Batch 7/80: Multi
76 files processed (22.157630479 GB)
This took 18.81913568576177 min

76 5
Batch 8/80: Multi
81 files processed (25.09045332 GB)
This took 21.20768335660299 min

81 4
Batch 9/80: Multi
85 files processed (27.524259917 GB)
This took 22.476172542572023 min

85 4
Batch 10/80: Multi
89 files processed (30.001061311 GB)
This took