# Tools to prepare Training Data (from raw data)

Raw Training Data are delivered in a folder containing
* images in format .jpg 
* zipped .roi files

For each image there is a corresponding zip file, containing several .roi files.
A .roi file is a contour created by a biologist by using Fiji.

The Mask R-CNN model expects a binary mask rather than a .roi contour. 
This notebook is a set of tools you can use to convert ROI into MASKS, output in neat folders, and control the result.

The output folder architecture is the following : 
* glomerulus/train/
    * ImageID/
        * images/ -> contains the image in jpg format
        * masks/  -> contains the image's masks in png format
        * rois/   -> original rois

## FOLDERS

In [1]:
import os
import shutil
import stat
# Project root on the author's machine; the paths below are all derived from it
ROOT_FOLDER = "C:/Users/Fred/FRED_PRO/8. JEDHA/Projets/Nath - ROI detection/_GIT_Segmenting-Histology-Images"
# Raw delivery folder: the .jpg images are listed from here
IMG_FOLDER = os.path.join(ROOT_FOLDER,"Data/Images with ROI")
# Zipped .roi sets, read by the unzip step
ROI_FOLDER = os.path.join(IMG_FOLDER, 'ROI')
# Destination of the extracted and renamed .roi files
ROI_unzipped_FOLDER = os.path.join(IMG_FOLDER, 'ROI_unzipped')
# Destination of the png masks generated from the .roi files
MASK_FOLDER = os.path.join(IMG_FOLDER, 'Masks')
# Scratch folder: zip extraction area, also receives the sanity-check overlay images
TEMP =  os.path.join(IMG_FOLDER, 'temp')
# Final dataset layout: one sub-folder per image is created under this folder
DATASET_FOLDER = os.path.join(ROOT_FOLDER,"Data/glomerulus/train")

For each image "ImgName.jpg" there is a corresponding "ZipName.zip" file, containing several .roi files. <br>
For instance "**R22** VEGF 2_15.0x**C1**.jpg" has a corresponding "RoiSet**R22C1**.zip" <br>
The names of the .roi files inside the .zip file are composed of seemingly random digits, like 0075-1108 <br>
The unzip step below creates a subfolder "ROI_unzipped", containing all .roi files, renamed as 'ImgName_RoiName.roi' <br> 
(ex: R22 VEGF 2_15.0xC1_0075-1108.roi). 


In [2]:
def fresh_start(folder):
    """
    Reset `folder` to an empty directory.

    If the folder exists it is deleted with all its content, then it is
    recreated empty. Missing parent folders are created as well.

    folder : path of the directory to reset
    """
    def _make_writable_and_retry(func, path, _exc_info):
        # Called by shutil.rmtree when a removal fails (typically a read-only
        # entry on Windows): lift the protection and retry the failed call once.
        os.chmod(path, stat.S_IRWXU)
        func(path)

    if os.path.isdir(folder):
        # Owner read/write/execute: write-only (S_IWRITE alone) would make the
        # folder impossible to list on POSIX, so the deletion itself would fail.
        os.chmod(folder, stat.S_IRWXU)
        shutil.rmtree(folder, onerror=_make_writable_and_retry)  # delete folder and its content
    # Default mode: an explicit S_IWRITE mode creates an unreadable folder on POSIX
    os.makedirs(folder)  # recreate empty

## UNZIP ROIs

In [75]:
# Start from an empty ROI_unzipped_FOLDER (any existing content is deleted)
fresh_start(ROI_unzipped_FOLDER)

In [78]:
# Unzip Roi Files from the ROI_FOLDER to the ROI_unzipped_FOLDER
# Each zip is extracted into TEMP, then its .roi files are moved to
# ROI_unzipped_FOLDER under the name '<image name without .jpg>_<roi name>.roi'
import zipfile
files=os.listdir(IMG_FOLDER)
fresh_start(TEMP)
for img in files:
    if img.endswith('.jpg'):
        # Zip name = 'ROISet' + first 3 characters of the image name + the 2 characters before '.jpg'
        # NOTE(review): this slicing assumes a 3-character prefix and a 2-character suffix
        # (e.g. 'R22' and 'C1'); a suffix such as 'C10' would not be handled - confirm
        zip_name = 'ROISet'+ img[0:3] + img[-6:-4] + '.zip'
        with zipfile.ZipFile(os.path.join(ROI_FOLDER,zip_name),"r") as zip_ref:
            zip_ref.extractall(TEMP)
        for file in os.listdir(TEMP):
            # Only .roi entries are renamed and moved; anything else extracted from the
            # zip stays in TEMP (the move used to run for every entry, with a stale name)
            if file.endswith('.roi'):
                filename = img[0:-4] +'_' + file # img_name without .jpg + '_' + current name
                shutil.move(os.path.join(TEMP,file), os.path.join(ROI_unzipped_FOLDER,filename))

## CONVERT ROIS TO MASKS

We convert .roi into masks in four steps
1. read .roi file with the package https://pypi.org/project/read-roi/ 
2. convert into the opencv object 'contour'
3. use the opencv function to fill in the contour 
4. export as a binary mask (actually png with black and white pixels)

The structure of roi is a dict with two levels :

* level 1 : just one key, the name of the file without extension
* level 2 : the interesting information
    * type : 'freehand' - this means that the roi was drawn with mouse or pad. There are other ways, like geometric shapes. Normally all the roi in the dataset should be freehand style => worth checking
    * x, y : pixel coordinates
    * n : number of pixels
    * width : 0 ( ? check what it means)
    * name : name of the file without extension
    * position : 0 ( ? check what it means)

In [3]:
import cv2
import numpy as np
from read_roi import read_roi_file

def fill_contour(cnt,img_shape):
    '''
        Draw a filled contour on a black canvas.

        cnt       : contour points, convertible to an integer array
                    (roi2mask passes an array of shape (N, 1, 2) holding [x, y] pairs)
        img_shape : shape of the image to create, as accepted by np.zeros
        return    : uint8 image, black everywhere except the contour, drawn and filled in white
    '''
    img = np.zeros(img_shape, np.uint8)  # Create a black image
    # OpenCV only accepts contours stored as 32-bit integers; an array built from
    # python ints may default to int64 depending on the platform and be rejected
    cnt = np.asarray(cnt, dtype=np.int32)
    cv2.drawContours(img, [cnt], 0, (255,255,255), -1)  # Draw contour and fill it (meaning of -1)
    return img

def roi2mask(roi_path,img_size):
    '''
        Read a .roi file and return it as a filled mask.

        roi_path : path of the .roi file
        img_size : shape of the mask to create (generate_masks passes the shape of the source image)
        return   : the image produced by fill_contour for the roi
        raises   : ValueError when no roi can be read from the file;
                   KeyError when the roi has no 'x' / 'y' coordinates
    '''
    roi = read_roi_file(roi_path)
    if not roi:
        # Without this guard the function would fail on an unbound 'mask' below
        raise ValueError('no roi found in ' + roi_path)
    # read_roi_file returns a dict keyed by roi name. If a file ever held several
    # entries, the last one wins (unchanged behaviour).
    for roi_name in roi:
        points = roi[roi_name]
        # Contour layout: one [[x, y]] per point, stored as int32
        cnt = np.array([[[x,y]] for (x,y) in zip(points['x'],points['y'])], dtype=np.int32)
        # Use the img_size argument: the body used to read a global named img_shape
        mask = fill_contour(cnt,img_size)
    return mask

In [4]:
def generate_masks(img_id,img_shape):
    """
    Convert every roi of one image into a png mask.

    Looks in ROI_unzipped_FOLDER for the files named '<img_id>_<roi name>.roi',
    converts each of them with roi2mask and saves the result in MASK_FOLDER
    under the same name with a .png extension.

    img_id    : image file name without its extension
    img_shape : shape of the source image, used as shape of the masks
    raises    : IOError when a mask cannot be written
    """
    # Match on 'img_id_' rather than 'img_id' alone, so that an image whose id is a
    # prefix of another id (e.g. '...C1' and '...C10') does not pick up foreign rois
    prefix = img_id + '_'
    for roi_name in os.listdir(ROI_unzipped_FOLDER):
        if not (roi_name.startswith(prefix) and roi_name.endswith('.roi')):
            continue
        roi_path = os.path.join(ROI_unzipped_FOLDER,roi_name)
        mask = roi2mask(roi_path,img_shape)
        mask_path = os.path.join(MASK_FOLDER,roi_name[0:-4]+'.png')
        # PNG format mandatory to avoid compression loss like with jpg.
        # cv2.imwrite reports a failure through its return value, so check it.
        if not cv2.imwrite(mask_path, mask):
            raise IOError('could not write mask ' + mask_path)

In [6]:
# generate masks for all images in IMG_FOLDER
import skimage.io
# Empty the mask folder first (the constant is MASK_FOLDER; 'MASKS' is not defined)
fresh_start(MASK_FOLDER)
images=os.listdir(IMG_FOLDER)
for image in images:
    if image.endswith('.jpg'):
        img_id = image[0:-4]  # file name without '.jpg'
        # The masks get the same shape as the image they belong to
        img_shape = skimage.io.imread(os.path.join(IMG_FOLDER,image)).shape
        generate_masks(img_id,img_shape)

## SANITY CHECK : GENERATE IMAGES WITH MASKS

We generate images with superposed roi and masks, in order to check the result is consistent <br>
(I was not sure the contour filling would always work, since many contours were not properly closed)

In [34]:
import skimage.io
import matplotlib.pyplot as plt
from read_roi import read_roi_file

def generate_image_with_masks(img_name):
    """
    Save a control picture of one image with its rois and masks superposed.

    The image is displayed, every roi of the image is drawn as a red line, the
    masks of the image are overlaid half-transparent, and the figure is saved
    in TEMP as '<image name without extension>_with_rois.jpg'.

    img_name : file name of the image inside IMG_FOLDER (with its extension)
    """
    import numpy as np  # local import so that this cell does not depend on an earlier one

    img_id = img_name[0:-4]
    # 'img_id_' rather than 'img_id' alone: avoids picking up the files of another
    # image whose id merely starts with this one
    prefix = img_id + '_'
    img_read = skimage.io.imread(os.path.join(IMG_FOLDER,img_name))
    plt.figure(figsize=(20,11))
    try:
        plt.imshow(img_read)

        # Read and display Rois
        for roi_file in os.listdir(ROI_unzipped_FOLDER):
            if not (roi_file.endswith('.roi') and roi_file.startswith(prefix)):
                continue
            contours = read_roi_file(os.path.join(ROI_unzipped_FOLDER,roi_file))
            for name in contours:
                # Plot inside the loop: nothing is drawn for an empty file,
                # and every contour of the file is shown
                plt.plot(contours[name]['x'],contours[name]['y'],color='red')

        # Read and display Masks
        mask_union = None
        for mask_file in os.listdir(MASK_FOLDER):
            if not (mask_file.endswith('.png') and mask_file.startswith(prefix)):
                continue
            mask_read = skimage.io.imread(os.path.join(MASK_FOLDER,mask_file))
            # Element-wise max instead of a sum: overlapping masks stay at 255
            # (with uint8, 255 + 255 wraps to 254)
            mask_union = mask_read if mask_union is None else np.maximum(mask_union, mask_read)
        if mask_union is not None:  # an image without any mask is saved with its rois only
            plt.imshow(mask_union, alpha=0.5)

        # Save in TEMP/
        plt.savefig(os.path.join(TEMP,img_id+'_with_rois.jpg'), dpi=96)
    finally:
        # Always release the figure, even when reading or saving fails
        plt.close()


In [None]:
# Build the control picture (image + rois + masks) of every jpg found in IMG_FOLDER
images = os.listdir(IMG_FOLDER)
for image in images:
    if not image.endswith('.jpg'):
        continue  # skip sub-folders and any file that is not a jpg
    generate_image_with_masks(image)

## POPULATE OUTPUT FOLDER

In [39]:
# For every jpg of IMG_FOLDER, create DATASET_FOLDER/<img_id>/{images,rois,masks}
# and move the image, its rois and its masks into these sub-folders.
# NB : the files are moved, not copied, so IMG_FOLDER no longer contains the images afterwards.
images=os.listdir(IMG_FOLDER)
rois=os.listdir(ROI_unzipped_FOLDER)
masks=os.listdir(MASK_FOLDER)

for image in images:
    if image.endswith('.jpg'):
        img_id = image[0:-4]
        print(img_id)
        # rois and masks are named '<img_id>_<roi name>'; matching on 'img_id_' avoids
        # picking up the files of another image whose id merely starts with this one
        prefix = img_id + '_'

        # folder architecture creation
        image_folder = os.path.join(DATASET_FOLDER, img_id)
        images_subfolder = os.path.join(image_folder,'images')
        rois_subfolder = os.path.join(image_folder,'rois')
        masks_subfolder = os.path.join(image_folder,'masks')
        # makedirs also creates DATASET_FOLDER / image_folder when missing and accepts
        # folders left by a previous run; the default mode is used because an explicit
        # S_IWRITE mode creates folders that cannot be read on POSIX
        for subfolder in (images_subfolder, rois_subfolder, masks_subfolder):
            os.makedirs(subfolder, exist_ok=True)

        # moving files (shutil.move also works when source and destination are on different drives)
        shutil.move(os.path.join(IMG_FOLDER,image),os.path.join(images_subfolder,image))
        for roi in rois:
            if (roi.endswith('.roi') and roi.startswith(prefix)):
                shutil.move(os.path.join(ROI_unzipped_FOLDER,roi),os.path.join(rois_subfolder,roi))
        for mask in masks:
            if (mask.endswith('.png') and mask.startswith(prefix)):
                shutil.move(os.path.join(MASK_FOLDER,mask),os.path.join(masks_subfolder,mask))
        

R22 VEGF 2_15.0xC1
R22 VEGF 2_15.0xC2
R22 VEGF 2_15.0xC3
R22 VEGF 2_15.0xC4
R22 VEGF 2_15.0xC5
R24 VEGF_15.0xC1
R24 VEGF_15.0xC2
R24 VEGF_15.0xC3
R24 VEGF_15.0xC4
R24 VEGF_15.0xC5
R25 VEGF_15.0xC1
R25 VEGF_15.0xC2
R25 VEGF_15.0xC3
R25 VEGF_15.0xC4
R25 VEGF_15.0xC5
R27 VEGF 2_15.0xC1
R27 VEGF 2_15.0xC2
R27 VEGF 2_15.0xC3
R27 VEGF 2_15.0xC4
R27 VEGF 2_15.0xC5
R27 VEGF 2_15.0xC6
R29 VEGF 2_15.0xC1
R29 VEGF 2_15.0xC2
R29 VEGF 2_15.0xC3
R29 VEGF 2_15.0xC4
R30 RD VEGF_15.0xC1
R30 RD VEGF_15.0xC2
R30 RD VEGF_15.0xC3
R30 RD VEGF_15.0xC4
R30 RD VEGF_15.0xC5
R31 RD VEGF_15.0xC1
R31 RD VEGF_15.0xC2
R31 RD VEGF_15.0xC3
R31 RD VEGF_15.0xC4
R32 RG VEGF_15.0xC1
R32 RG VEGF_15.0xC2
R32 RG VEGF_15.0xC3
R32 RG VEGF_15.0xC4
R32 RG VEGF_15.0xC5
R33 RD VEGF_15.0xC1
R33 RD VEGF_15.0xC2
R33 RD VEGF_15.0xC3
R33 RD VEGF_15.0xC4
R33 RD VEGF_15.0xC5
R33 RD VEGF_15.0xC6
R33 RD VEGF_15.0xC7
R33 RD VEGF_15.0xC8
R33 RD VEGF_15.0xC9
R34 RD VEGF_15.0xC1
R34 RD VEGF_15.0xC2
R34 RD VEGF_15.0xC3
R34 RD VEGF_15.0xC4
R34 R

# compute MEAN_PIXEL constant

In [32]:
import skimage.io
import numpy as np

# One [R, G, B] mean per jpg image, then the plain average of these per-image
# means (every image counts the same, whatever its size)
means_pixel = []
IMG_FOLDER = "C:/Users/Fred/Downloads/Images with ROI"
images = os.listdir(IMG_FOLDER)
for image in images:
    if not image.endswith('.jpg'):
        continue
    img_read = skimage.io.imread(os.path.join(IMG_FOLDER, image))
    rgb_mean = [img_read[:, :, channel].mean() for channel in range(3)]
    means_pixel.append(rgb_mean)
means_pixel = np.array(means_pixel)
mean_pixel = [means_pixel[:, channel].mean() for channel in range(3)]
mean_pixel

[209.92926586595058, 203.35202129297485, 204.79692678635834]