In [12]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import pycocotools.mask as mask_utils
import json
from tqdm import tqdm
from numpy.random import choice,seed
from numba import jit
from skimage import measure

In [None]:
@jit(nopython=True)
def bimask_to_rle(binary_mask):
    """Run-length encode a 2D binary mask.

    The mask is read column by column (the transpose is flattened), and the
    returned list holds the lengths of consecutive runs of equal values.
    The first entry always counts background pixels: when the very first
    pixel equals 1 a leading 0 is emitted.

    Args:
        binary_mask: 2D array; values are cast to uint8 before comparison.
    Returns:
        List of run lengths.
    """
    # column-major pixel vector
    pixels = binary_mask.T.flatten().astype(np.uint8)
    counts = []
    # a mask that starts on foreground gets an empty leading background run
    if pixels[0] == 1:
        counts.append(0)
    current = pixels[0]
    run_length = 1
    for idx in range(1, pixels.size):
        if pixels[idx] != current:
            # value changed: close the current run and start a new one
            counts.append(run_length)
            current = pixels[idx]
            run_length = 1
        else:
            run_length += 1
    # flush the run that reaches the end of the vector
    counts.append(run_length)
    return counts
@jit(nopython=True)
def encpix_to_bimask(encpix,h,w):
    """Decode a (start, length) pixel encoding into an h-by-w binary mask.

    Args:
        encpix: flat sequence of integers read as consecutive
            (start, length) pairs; starts are 1-based positions in the
            column-major pixel vector. A trailing unpaired value is ignored.
        h: mask height (rows).
        w: mask width (columns).
    Returns:
        Array of shape (h, w) holding 1.0 inside the encoded runs and 0.0
        elsewhere.
    """
    flat=np.zeros(h*w) # column-major pixel vector
    # odd positions hold run lengths; the preceding entry is the run start
    for k in range(1,len(encpix),2):
        first=encpix[k-1]-1 # convert 1-based start to 0-based index
        flat[first:first+encpix[k]]=1
    # the vector is column-major, so reshape as (w, h) and transpose
    return flat.reshape(w,h).T

# The following code is taken from https://github.com/waspinator/pycococreator/blob/master/pycococreatortools/pycococreatortools.py

def close_contour(contour):
    """Return the contour with its first point repeated at the end.

    If the first and last points already coincide the input is returned
    unchanged; otherwise a new array with the first point appended is
    returned.
    """
    first_point, last_point = contour[0], contour[-1]
    if np.array_equal(first_point, last_point):
        return contour
    return np.vstack((contour, first_point))

def binary_mask_to_polygon(binary_mask, tolerance=0):
    """Converts a binary mask to COCO polygon representation
    Args:
        binary_mask: a 2D binary numpy array where '1's represent the object
        tolerance: Maximum distance from original points of polygon to approximated
            polygonal chain. If tolerance is 0, the original coordinate array is returned.
    Returns:
        A list of polygons. Each polygon is a flat list of coordinates with
        the two columns of every contour point swapped relative to what
        find_contours produced. Contours with fewer than 3 points after
        approximation are dropped, so the list may be empty.
    """
    polygons = []
    # pad mask to close contours of shapes which start and end at an edge
    padded_binary_mask = np.pad(binary_mask, pad_width=1, mode='constant', constant_values=0)
    contours = measure.find_contours(padded_binary_mask, 0.5)
    for contour in contours:
        # undo the 1-pixel padding offset; done per contour because the
        # contours have different lengths and cannot be combined into a
        # single rectangular array
        contour = np.subtract(contour, 1)
        contour = close_contour(contour)
        contour = measure.approximate_polygon(contour, tolerance)
        if len(contour) < 3:
            continue
        # swap the two coordinate columns before flattening
        contour = np.flip(contour, axis=1)
        segmentation = contour.ravel().tolist()
        # after padding and subtracting 1 we may get -0.5 points in our segmentation
        segmentation = [0 if i < 0 else i for i in segmentation]
        polygons.append(segmentation)
    # the collected polygons must be handed back to the caller
    return polygons

In [13]:
def coco_dict_gen(file_dir,labels,id_lst):
    """Assemble a COCO-style dataset dictionary for the selected images.

    Args:
        file_dir: root directory [str]; each image is read from
            file_dir + <id> + '/images/' + <id> + '.png'
        labels: dataframe with 'ImageId' and 'EncodedPixels' columns
        id_lst: selected image ids
    Returns:
        dict with the keys 'info', 'licenses', 'images', 'annotations'
        and 'categories'. Image ids are positions in id_lst; annotation
        ids count up from 0 across all images.
    """
    images=[]
    annotations=[]
    next_anno_id=0
    for img_idx in tqdm(range(len(id_lst))):
        image_id=id_lst[img_idx]
        image_path=file_dir+image_id+'/images/'+image_id+'.png'
        # the image is loaded only to obtain its height and width
        pixels=plt.imread(image_path)
        height=pixels.shape[0]
        width=pixels.shape[1]
        images.append({
            'file_name':image_path,
            'height':int(height),
            'width':int(width),
            'id':img_idx,
        })

        # one annotation per encoded mask belonging to this image
        image_rows=labels[labels.ImageId==image_id]
        for encoded in image_rows['EncodedPixels']:
            run_values=[int(token) for token in encoded.split(' ')]
            bimask=encpix_to_bimask(run_values,height,width)
            rle_uncprs={'counts':bimask_to_rle(bimask),'size':list(bimask.shape)}
            # the RLE object from pycocotools is used only for area and bbox
            rle_cprs=mask_utils.frPyObjects(rle_uncprs,height,width)
            poly=binary_mask_to_polygon(bimask)

            annotations.append({
                'image_id':img_idx,
                'id':next_anno_id,
                'category_id':1,
                'iscrowd':0,
                'segmentation':poly,
                'area':float(mask_utils.area(rle_cprs)),
                'bbox':list(mask_utils.toBbox(rle_cprs)),
            })
            next_anno_id+=1

    return {
        'info':{},
        'licenses':[],
        'images':images,
        'annotations':annotations,
        'categories':[{'supercategory': 'Nuclei', 'id': 1, 'name': 'mask'}],
    }

## Training set and validation set

In [14]:
# Root directory of the training images; coco_dict_gen appends
# '<id>/images/<id>.png' to it.
file_dir='./stage1_train/'
# Label table; the 'ImageId' and 'EncodedPixels' columns are read later.
labels=pd.read_csv('stage1_train_labels.csv')
# Sorted unique image ids (np.unique returns its result sorted).
id_lst=list(np.unique(labels.ImageId))
n=len(id_lst)
# Fraction of the image ids held out for validation.
val_ratio=0.1

In [15]:
# Fix the global numpy RNG so the train/validation split is reproducible.
seed(2018)
# Draw the validation ids without replacement. The builtin int() replaces
# np.int, an alias that was removed in NumPy 1.24; both truncate the same way.
val_id_lst=choice(id_lst,size=int(val_ratio*n),replace=False)
# A set gives constant-time membership tests instead of scanning the array
# once per id.
val_id_set=set(val_id_lst)
# Everything not drawn for validation is used for training (order preserved).
train_id_lst=[i for i in id_lst if i not in val_id_set]

In [16]:
# Build the COCO dictionary for the validation ids and write it to
# ./validation.json.
val_dict=coco_dict_gen(file_dir,labels,val_id_lst)
with open("./validation.json","w",encoding='utf-8') as f:
    json.dump(val_dict,f)
    # printed inside the with-block, i.e. before the file is closed
    print("Complete!")

100%|██████████| 67/67 [00:15<00:00,  7.66it/s]


Complete!


In [17]:
# Build the COCO dictionary for the training ids and write it to ./train.json.
train_dict=coco_dict_gen(file_dir,labels,train_id_lst)
# Explicit UTF-8 keeps the output independent of the locale's default
# encoding and matches how validation.json is written.
with open("./train.json","w",encoding='utf-8') as f:
    json.dump(train_dict,f)
    print("Complete!")

100%|██████████| 603/603 [03:16<00:00,  4.40it/s]


Complete!


In [18]:
# mean
# Collect the height, width and per-channel mean pixel value of every image
# listed in id_lst.
import PIL.Image as plimg
h_lst=[]
w_lst=[]
pix_mean=[]
for imgid in tqdm(id_lst):
    # image file directory
    image_dir=file_dir+imgid+'/images/'+imgid+'.png'
    # Load the image and normalise it to three RGB channels. Converting
    # (rather than slicing [:,:,:3]) also handles images that are not stored
    # with at least three channels; for RGB/RGBA input the values are the
    # same as the first three channels.
    with plimg.open(image_dir) as img:
        image=np.array(img.convert('RGB'))
    h_lst.append(image.shape[0])
    w_lst.append(image.shape[1])
    # mean over rows and columns -> one value per colour channel
    pix_mean.append(np.mean(image,axis=(0,1)))

100%|██████████| 670/670 [00:07<00:00, 95.50it/s] 


In [19]:
# Largest and smallest side length over all heights and widths, followed by
# the per-channel mean averaged over images (displayed as the cell result).
all_sides=h_lst+w_lst
print(np.max(all_sides))
print(np.min(all_sides))
np.mean(pix_mean,axis=0)

1388
256


array([43.53287505, 39.56061986, 48.22454996])

## Test Set

In [20]:
# Re-point the globals at the test set; this overwrites the values assigned
# for the training data earlier in the notebook.
file_dir='./stage1_test/'
labels=pd.read_csv('stage1_solution.csv')
# Sorted unique image ids (np.unique returns its result sorted).
id_lst=list(np.unique(labels.ImageId))

In [21]:
# Build the COCO dictionary for every test image id and write it to ./test.json.
test_dict=coco_dict_gen(file_dir,labels,id_lst)
# Explicit UTF-8 keeps the output independent of the locale's default
# encoding and matches how validation.json is written.
with open("./test.json","w",encoding='utf-8') as f:
    json.dump(test_dict,f)
    print("Complete!")

100%|██████████| 65/65 [00:32<00:00,  1.84it/s]


Complete!


In [22]:
# Largest and smallest value of the Height and Width label columns,
# one line per column.
for dim in ('Height','Width'):
    print(labels[dim].max(),labels[dim].min())

524 256
696 161


In [23]:
# NOTE(review): ad-hoc arithmetic; the result is not assigned or used by any
# other visible cell. Confirm whether this scratch cell can be removed.
64/72

0.8888888888888888