<a href="https://colab.research.google.com/github/matjesg/deepflash2/blob/master/paper/challenge_data/preprocess_gleason.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing for the Gleason 2019 Challenge Dataset


![Gleason Logo](https://rumc-gcorg-p-public.s3.amazonaws.com/i/2020/01/21/550cece0.png)

**References:** 

- Nir G, Hor S, Karimi D, Fazli L, Skinnider BF, Tavassoli P, Turbin D, Villamil CF, Wang G, Wilson RS, Iczkowski KA. Automatic grading of prostate cancer in digitized histopathology images: Learning from multiple experts. Medical image analysis. 2018 Dec 1;50:167-80.

- Karimi D, Nir G, Fazli L, Black PC, Goldenberg L, Salcudean SE. Deep Learning-Based Gleason Grading of Prostate Cancer From Histopathology Images—Role of Multiscale Decision Aggregation and Data Augmentation. IEEE journal of biomedical and health informatics. 2019 Sep 30;24(5):1413-26.

See https://gleason2019.grand-challenge.org/ for more information.


## 1. Download and extract data

In [None]:
# Train data with labels
# The URL is quoted: unquoted, the shell treats every `&` as a command
# separator (and `?` as a glob), so wget would receive a truncated URL.
!wget -O train.zip "https://m209.syncusercontent.com/zip/00ba920b1d8700367e5a42f336a954de/Train%20Imgs.zip?linkcachekey=2312d2d50&pid=00ba920b1d8700367e5a42f336a954de&jid=56108426"
# `mkdir -p` keeps the cell re-runnable: a plain `mkdir` fails when `train`
# already exists and the `&&` would then skip the unzip step.
!mkdir -p train && unzip -qju train.zip -d train/images

In [None]:
# Expert segmentations
# One archive per expert; each is extracted (flattened with -j) into its own
# sub-folder of expert_segmentations/.
# All URLs are quoted: unquoted, the shell treats every `&` as a command
# separator (and `?` as a glob), so wget would receive a truncated URL.
# `mkdir -p` keeps the cell re-runnable when the folder already exists.
!mkdir -p expert_segmentations

!wget -O maps1.zip "https://m209.syncusercontent.com/zip/74d1fd441a935c8566eba260a388c946/Maps1_T.zip?linkcachekey=7c1511b70&pid=74d1fd441a935c8566eba260a388c946&jid=1eab4912"
!unzip -qju maps1.zip -d expert_segmentations/expert1

!wget -O maps2.zip "https://m209.syncusercontent.com/zip/b8da9b621d450b16dd5a6e14520223b1/Maps2_T.zip?linkcachekey=40029c030&pid=b8da9b621d450b16dd5a6e14520223b1&jid=57518d06"
!unzip -qju maps2.zip -d expert_segmentations/expert2

!wget -O maps3.zip "https://m209.syncusercontent.com/zip/f2998fa4353fb6f41f1df491fd07de0c/Maps3_T.zip?linkcachekey=46047d8f0&pid=f2998fa4353fb6f41f1df491fd07de0c&jid=611d4658"
!unzip -qju maps3.zip -d expert_segmentations/expert3

!wget -O maps4.zip "https://m209.syncusercontent.com/zip/3e19f34db9df54c43ddaf528b9010d0d/Maps4_T.zip?linkcachekey=fb19fb780&pid=3e19f34db9df54c43ddaf528b9010d0d&jid=ba72b487"
!unzip -qju maps4.zip -d expert_segmentations/expert4

!wget -O maps5.zip "https://m209.syncusercontent.com/zip/ed46068e96fe2669fe3dfc20d933613f/Maps5_T.zip?linkcachekey=4ca5bebb0&pid=ed46068e96fe2669fe3dfc20d933613f&jid=432ea1a2"
!unzip -qju maps5.zip -d expert_segmentations/expert5

!wget -O maps6.zip "https://m209.syncusercontent.com/zip/fc98a2b5b5ba5735ab395be560aba46b/Maps6_T.zip?linkcachekey=ee2ab68f0&pid=fc98a2b5b5ba5735ab395be560aba46b&jid=d9167209"
!unzip -qju maps6.zip -d expert_segmentations/expert6

## 2. Imports and functions

In [None]:
# deepflash2 preprocessing required
# Installs deepflash2 from the master branch of its GitHub repository;
# `-qq` keeps pip's output quiet.
!pip install -qq git+https://github.com/matjesg/deepflash2.git@master

In [None]:
#!pip install -U SimpleITK

In [None]:
# Imports
import SimpleITK as sitk
import imageio
import numpy as np
from pathlib import Path
from fastai.vision.all import *
from sklearn.model_selection import train_test_split

In [None]:
def read_msk(msk_path, **kwargs):
    """Read a mask image and remap its labels to a consecutive range.

    The file is loaded with `imageio.imread` and relabelled in place:
    3 -> 1, 4 -> 2, 5 -> 3 and 6 -> 3. All other values are left untouched.
    Extra keyword arguments are accepted but not used.

    Returns the relabelled array; an `AssertionError` is raised if any
    value above 3 remains after the remapping.
    """
    mask = imageio.imread(msk_path)
    # Applied in this exact order, so an earlier replacement is never
    # picked up again by a later one.
    relabel = ((3, 1), (4, 2), (5, 3), (6, 3))
    for old_label, new_label in relabel:
        mask[mask == old_label] = new_label
    assert mask.max()<=3
    return mask

def staple(segmentations):
    """STAPLE: Simultaneous Truth and Performance Level Estimation with SimpleITK.

    Fuses several label masks into one mask with
    `sitk.MultiLabelSTAPLEImageFilter`. Pixels the filter leaves undecided
    (marked with label 255) are filled in from the input segmentation whose
    confusion matrix has the largest trace.

    Args:
        segmentations: sequence of label arrays, one per annotator, each
            convertible with `sitk.GetImageFromArray`.
            # assumes all arrays share the same shape - TODO confirm

    Returns:
        Array holding the fused labels.

    Raises:
        NotImplementedError: if a confusion matrix returned by the filter
            does not have `n*(n+1)` entries for some `n >= 1`.
        AssertionError: if the fused mask contains a label above 3.
    """
    undecided_label = 255
    sitk_segmentations = [sitk.GetImageFromArray(x) for x in segmentations]

    staple_filter = sitk.MultiLabelSTAPLEImageFilter()
    staple_filter.SetLabelForUndecidedPixels(undecided_label)
    msk = staple_filter.Execute(sitk_segmentations)
    msk = sitk.GetArrayFromImage(msk)

    traces = []
    for idx in range(len(segmentations)):
        cm = np.array(staple_filter.GetConfusionMatrix(idx))
        # The matrix arrives flattened and is interpreted as (n+1, n), so its
        # length is n*(n+1). Solving for n replaces the former hard-coded
        # table, whose last entry (54 -> (8, 7)) could never be reshaped.
        n_cols = int(round((np.sqrt(4*cm.size + 1) - 1) / 2))
        if n_cols < 1 or n_cols*(n_cols + 1) != cm.size:
            raise NotImplementedError(
                f'Unexpected confusion matrix size: {cm.size}')
        traces.append(np.trace(cm.reshape((n_cols + 1, n_cols))))
    best_seg = np.argmax(traces)

    # Replace undecided pixels with values from 'best' segmentation
    undecided = msk == undecided_label
    msk[undecided] = segmentations[best_seg][undecided]
    assert msk.max()<=3
    return msk

## 3. Ground truth estimation from expert masks

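We use STAPLE (Simultaneous Truth and Performance Level Estimation) instead of majority voting to estimate the ground truth from the expert masks.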

Example:
![](https://rumc-gcorg-p-public.s3.amazonaws.com/i/2020/01/21/f9f06df6.png)



In [None]:
# Folders
exp_dir = Path('expert_segmentations')
out_dir = Path('train')/'masks_STAPLE'
# Tolerate an existing folder so the cell can be re-run.
out_dir.mkdir(parents=True, exist_ok=True)
# Path of the mask file `msk` as annotated by expert `exp`.
mask_fn = lambda exp,msk: exp_dir/exp/msk

# Get expert and file names
fnames = get_image_files(exp_dir)
# masks: mask file name -> names of the expert folders that contain it
# (files are visited in sorted order).
masks = {}
for m in sorted(fnames):
    masks.setdefault(m.name, []).append(m.parent.name)
# Sorted, de-duplicated names of all expert folders.
experts = sorted({m.parent.name for m in fnames})

In [None]:
# Fuse the expert masks of every file with STAPLE and save the result
# under the same file name in `out_dir`.
for m, exps in progress_bar(masks.items()):
    # Use a dedicated name here: rebinding `masks` inside the loop would
    # replace the file-name -> experts dict with a list, so the cell could
    # not be run a second time.
    expert_masks = [read_msk(mask_fn(exp,m)) for exp in exps]
    ref = staple(expert_masks)
    imageio.imsave(out_dir/m, ref)

## 4. Split into train and test set

In [None]:
# Settings
train_path = Path('train')
test_path = Path('test')
image_folder = 'images'
mask_folder = 'masks_STAPLE'
mask_suffix = '_classimg_nonconvex.png'

# Functions for copying data
# Each helper works on its own argument `o` (an image path) instead of
# reading a global loop variable, so it can be called from anywhere.
# Target path of image `o` in the test image folder.
cp_fn = lambda o: test_path/image_folder/o.name
# Target path of the mask belonging to image `o` in the test mask folder.
# NOTE(review): the target keeps the image's file name (including its
# extension), not the mask's `mask_suffix` name - confirm this is intended.
cp_fn_msk = lambda o: test_path/mask_folder/o.name
# Path of the STAPLE mask in the train folder that belongs to image `o`.
label_fn = lambda o: train_path/mask_folder/f'{o.stem}{mask_suffix}'

(test_path/image_folder).mkdir(exist_ok=True, parents=True)
(test_path/mask_folder).mkdir(exist_ok=True)

In [None]:
#f_names = get_image_files(train_path/image_folder)
#_, val = train_test_split(f_names, train_size=0.8, shuffle=True, random_state=0)

# Original split was not sorted, so fixing file names here.
# Held-out images, listed as (slide, core numbers) and expanded below to
# paths of the form train/images/<slide>_core<NNN>.jpg in the listed order.
_val_cores = (
    ('slide001', (4, 5, 10, 11, 14, 30, 39, 59, 95)),
    ('slide002', (41, 50, 62, 67, 74, 84, 96)),
    ('slide003', (55, 67, 97, 114, 134, 135, 136)),
    ('slide005', (17, 18, 21, 29, 38, 45, 51, 57, 64, 122, 147)),
    ('slide006', (16, 23, 86, 102, 105, 108, 109, 110, 113, 114, 125, 142)),
    ('slide007', (47, 55, 56)),
)
val = [Path(f'train/images/{slide}_core{core:03d}.jpg')
       for slide, cores in _val_cores
       for core in cores]

In [None]:
# Move every image listed in `val`, together with its mask, to the
# locations given by `cp_fn` and `cp_fn_msk`.
for p in progress_bar(val):
    image_target = cp_fn(p)
    shutil.move(str(p), str(image_target))
    msk_p = label_fn(p)
    mask_target = cp_fn_msk(p)
    shutil.move(str(msk_p), str(mask_target))