In [1]:
# webstie https://nucleisegmentationbenchmark.weebly.com
# paper   https://drive.google.com/file/d/0ByERBiBsEbuTOEJISEpwSkR0SlE/view
import os
import re
import PIL
import time
import random
import shutil
import itertools
import collections
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
from xml.dom import minidom
from PIL import Image
from skimage.io import imread, imshow

In [2]:
TRAIN_PATH = '../orig_data/Tissue images_png/'
XML_PATH   = '../orig_data/Annotations/'
OUT_PATH  = '../orig_data/external_TCGA_train/'  # manual create
#MASK_PATH  = OUT_PATH + '/masks'
#IMG_PATH = OUT_PATH + '/images'

train_ids = next(os.walk(TRAIN_PATH))[2]
xml_ids = next(os.walk(XML_PATH))[2]
print('train_ids = ' + str(len(train_ids)) + '\nxml_ids = ' + str(len(xml_ids)))
IMG_HEIGHT = 1000
IMG_WIDTH = 1000
IMG_CHANNELS = 1

mask_dict = {}
if not os.path.isdir(OUT_PATH):
    os.mkdir(OUT_PATH) 


train_ids = 30
xml_ids = 30


In [3]:
class AutoVivification(dict):
    """Implementation of perl's autovivification feature."""
    def __getitem__(self, item):
        try:
            return dict.__getitem__(self, item)
        except KeyError:
            value = self[item] = type(self)()
            return value

In [4]:
#img_filenames = [TRAIN_PATH + f for f in train_ids]
xml_filenames = [XML_PATH + f for f in xml_ids]
#print(img_filenames[0])
print(xml_filenames[0])


../orig_data/Annotations/TCGA-E2-A14V-01Z-00-DX1.xml


In [5]:
def fill_circle(mask_img,vertex_list):
    # max x value for each y
    max_right = {}     
    for i in range(len(vertex_list)):
        current_y = vertex_list[i][0]
        current_x = vertex_list[i][1]
        if current_y not in max_right:
            max_right[current_y] = current_x 
        else:
            max_right[current_y] = max(max_right[current_y],current_x)
    
    for i in range(len(vertex_list)):
        current_y = vertex_list[i][0]
        current_x = vertex_list[i][1]
        mask_img[current_y,current_x:max_right[current_y]] = 255
    
    return mask_img

In [6]:
def check_valid_xy(y, x):
    if x >= IMG_WIDTH: x = IMG_WIDTH-1
    if y >= IMG_HEIGHT: y = IMG_HEIGHT-1                
    if x <= 0: x = 0
    if y <= 0: y = 0
    return y,x

In [7]:
def generate_miss_node(y, x, py, px):
    dist_x = int(x) - px
    dist_y = int(y) - py
    miss_node = []
    if max(abs(dist_y),abs(dist_x)) >= 2 and py != IMG_HEIGHT and px != IMG_WIDTH:
        #print('------------------------------------------------------------')
        #print('x =>' + str(x) + '\ty => ' + str(y) )
        #print('px =>' + str(px) + '\tpy => ' + str(py) )
        a = int(x); b = px
        newx = list(map(int,np.linspace(a,b, abs(b-a)+1).tolist()))
        #print('newx => ' + str(newx))
        if len(newx) >= 2: newx.pop(-1)
        if len(newx) >= 2: newx.pop(0)
        a = int(y); b = py
        newy = list(map(int,np.linspace(a,b, abs(b-a)+1).tolist()))
        #print('newy => ' + str(newy))
        if len(newy) >= 2: newy.pop(-1)
        if len(newy) >= 2: newy.pop(0)
        miss_node = list(itertools.product(newy, newx))
    #miss_node = []  # enable this line if you wanna disable this function
    return miss_node

In [8]:
def regions2mask(Resions, xml_idx, mask_folder):
    mask_contour = np.zeros((IMG_HEIGHT, IMG_WIDTH), dtype=np.uint8)
    mask_solid   = np.zeros((IMG_HEIGHT, IMG_WIDTH), dtype=np.uint8)

    for region_idx in range(len(Regions)):
        #print('region_idx ' + str(region_idx))
        Region=Regions.item(region_idx)
        verticies=Region.getElementsByTagName('Vertex');
        Region_ID = Region.getAttribute('Id')
        #print('Region_ID ==> ' + str(Region_ID))
        single_mask = np.zeros((IMG_HEIGHT, IMG_WIDTH), dtype=np.uint8)    
        vertix_list = []
        px   = IMG_WIDTH
        py   = IMG_HEIGHT        

        for vertexi in range(len(verticies)):
            x=int(float(verticies.item(vertexi).getAttribute('X')))
            y=int(float(verticies.item(vertexi).getAttribute('Y')))

            y, x = check_valid_xy(y, x)

            miss_node = generate_miss_node(y, x, py, px)
            if miss_node != []:
                #print(miss_node)
                vertix_list.extend(miss_node)
                for node_y, node_x in miss_node:
                    mask_contour[node_y,node_x] = 255 
                    single_mask[node_y,node_x] = 255 # signal mask
            px = x
            py = y
            # fill mask contour
            mask_contour[y, x] = 255 # all for one mask   
            single_mask[y, x] = 255 # signal mask
            vertix_list.append([y, x]) 
        # consider relation between first node and last node
        miss_node = generate_miss_node(vertix_list[0][0], vertix_list[0][1], vertix_list[-1][0],vertix_list[-1][1])
        if miss_node != []:
            #print(miss_node)
            vertix_list.extend(miss_node)
            for node_y, node_x in miss_node:
                mask_contour[node_y,node_x] = 255 
                single_mask[node_y,node_x] = 255 # signal mask
        
        
        # generate folder and file name
        fname = re.sub('.*/|\.xml','',xml_filenames[xml_idx])
        #sigle_mask_folder = mask_folder + fname
        #print(sigle_mask_folder)
        #os.mkdir(sigle_mask_folder) if not os.path.isdir(sigle_mask_folder) else 'folder existed .....'
        sigle_mask_name = mask_folder + fname + '_' + '{:0>4}'.format(str(Region_ID)) + '.png'
        #print(sigle_mask_name)

        # fill mask circle w/ solid and generate png 
        mask_solid_ = fill_circle(single_mask,vertix_list )
        newImg1= Image.fromarray(mask_solid_,'L')
        newImg1.save(sigle_mask_name,"PNG")
        # merge all single solid-circle to one
        #mask_solid = np.maximum(mask_solid, mask_solid_)
        
        # dict storage
        #if fname not in mask_dict:
        #mask_dict[fname]['single_contour'][region_idx] = single_mask
        #mask_dict[fname]['single_solid'][region_idx] = mask_solid_
        #mask_dict[fname]['Region_Id'][region_idx] = Region_ID
        #mask_dict[fname]['solid'] = mask_solid
        #mask_dict[fname]['contour'] = mask_contour  

    # merge all sub mask and generate png
    #mask_contour_name = MASK_PATH + '/'+ fname + '_contour_interpolate.png'
    #newImg2= Image.fromarray(mask_contour,'L')
    #newImg2.save(mask_contour_name,"PNG")
    # output multiple solid-circle
    #mask_solid_name = MASK_PATH + '/'+ fname + '_solid_interpolate.png'
    #mask_solid_name = MASK_PATH + '/'+ fname + '.png'
    #newImg3= Image.fromarray(mask_solid,'L')
    #newImg3.save(mask_solid_name,"PNG")    

In [None]:
def convertTIF2PNG(TRAIN_PATH,tif_fname,png_path):
    TIF_full_name = TRAIN_PATH + tif_fname 
    if os.path.isfile(TIF_full_name+'.tiff'):
        TIF_full_name += '.tiff'
    elif os.path.isfile(TIF_full_name+'.tif'):
        TIF_full_name += '.tif'
    else:
        print('TRAIN_PATH dont contain subname tiff/tif for ' + tif_fname)
        
    PNG_full_name = png_path + tif_fname + '.png'
    print('TIF_full_name = ' + TIF_full_name)
    print('PNG_full_name = ' + PNG_full_name)
    '''
    # now work unstill
    TIF_Img = Image.open(TIF_full_name)
    imarray_new = np.array(TIF_Img)
    print('TIF_Img size = ' + str(TIF_Img.size))
    print(imarray_new.shape)
    PNG_Img = Image.fromarray(imarray_new,'RGB')
    PNG_Img.save(PNG_full_name,"PNG",quailty=100)
    '''

In [9]:
def generate_folder(fname):
    FILE_folder = OUT_PATH + fname + '/'
    MASK_folder = FILE_folder + 'masks/'
    IMG_folder = FILE_folder + 'images/'
    if not os.path.isdir(FILE_folder):
        os.mkdir(FILE_folder)
    if not os.path.isdir(MASK_folder):
        os.mkdir(MASK_folder)
    if not os.path.isdir(IMG_folder):
        os.mkdir(IMG_folder)
    #print('FILE_folder = ' + FILE_folder)
    #print('MASK_folder = ' + MASK_folder)
    #print('IMG_folder = ' + IMG_folder)
    return MASK_folder, IMG_folder
    

In [10]:
mask_dict = AutoVivification()
start_time = time.time()
region_count = 0
for xml_idx in range(len(xml_filenames)):
    fname = re.sub('.*/|\.xml','',xml_filenames[xml_idx])
    # generate ncecssary folder 
    MASK_folder, IMG_folder = generate_folder(fname)
    # move original png to indivitual folder
    shutil.copy(TRAIN_PATH+fname+'.png',IMG_folder)
    # parsing xml and generate label picture for each region
    root = minidom.parse(xml_filenames[xml_idx])
    Regions=root.getElementsByTagName('Region');
    #print(xml_filenames[xml_idx] + '\t' + str(len(Regions)))
    regions2mask(Regions, xml_idx, MASK_folder)
    region_count += len(Regions)

    #break
print("\ntotal region count is " + str(region_count))
print("--- %s seconds ---" % (time.time() - start_time))    


total region count is 16966
--- 159.69799828529358 seconds ---


In [None]:
#%matplotlib inline
#newImg1 = Image.new('PNG', (IMG_HEIGHT,IMG_WIDTH))
#mask = np.random.random((IMG_HEIGHT,int(IMG_WIDTH/2)))
#mask = mask.reshape((IMG_HEIGHT,IMG_WIDTH))
#print(mask.shape)
#newImg1= Image.fromarray(singal_mask,'L')
#newImg1.save("img1.png","PNG")
tmp = mask_dict['TCGA-E2-A14V-01Z-00-DX1']['single_contour'][0]
imshow(tmp[200:300,200:300])
#newImg1.show()
plt.show()

In [None]:
len(mask_dict['TCGA-E2-A14V-01Z-00-DX1']['single_contour'][1])

In [None]:
mask_dict['TCGA-E2-A14V-01Z-00-DX1']['single_contour'][0].shape

In [None]:
xml_filenames[0]

In [None]:
# plot specific region
target_fname  = 'TCGA-B0-5711-01Z-00-DX1'
target_region = 320 #374
#target_fname  = ''
#target_region = ''


if target_fname != '':
    select_id = [i for i, s in enumerate(xml_filenames) if target_fname in s]
    xml_idx = select_id[0]
    root = minidom.parse(xml_filenames[xml_idx])
    Regions=root.getElementsByTagName('Region');
else:
    xml_idx = random.randint(0, len(xml_filenames))
    root = minidom.parse(xml_filenames[xml_idx])
    Regions=root.getElementsByTagName('Region');
    target_region = random.randint(0, len(Regions))
    
    
plt.figure(figsize=(16, 8))
plt.cla()
ax1 = plt.subplot(1,2,1)
ax2 = plt.subplot(1,2,2)
target_contour_mask = mask_dict['TCGA-E2-A14V-01Z-00-DX1']['single_contour'][target_region]
target_solid_mask   = mask_dict['TCGA-E2-A14V-01Z-00-DX1']['single_solid'][target_region]
min_idx_y = min(np.where(target_contour_mask>0)[0])
max_idx_y = max(np.where(target_contour_mask>0)[0])
min_idx_x = min(np.where(target_contour_mask>0)[1])
max_idx_x = max(np.where(target_contour_mask>0)[1])

plot_y_idx = int(min_idx_y/100)*100 + 20
plot_x_idx = int(min_idx_x/100)*100 + 80
print('Region_Id = ' + str( mask_dict['TCGA-E2-A14V-01Z-00-DX1']['Region_Id'][target_region]))
print('plot_y_idx ' + str(plot_y_idx))
print('plot_x_idx ' + str(plot_x_idx))
print('min_idx_x ==> ' + str(min_idx_x))
width=80
ax1.imshow(target_contour_mask[plot_y_idx:plot_y_idx+width,plot_x_idx:plot_x_idx+width],cmap='gray')  
ax2.imshow(target_solid_mask[plot_y_idx:plot_y_idx+width,plot_x_idx:plot_x_idx+width],cmap='gray')  


    
plt.show()
print(target_contour_mask)
#root = minidom.parse(xml_filenames[xml_idx])

In [None]:
target_solid_mask[plot_y_idx:plot_y_idx+width,plot_x_idx:plot_x_idx+width].shape

In [None]:
import pandas as pd
print(plot_y_idx)
print(plot_x_idx)
#xx = pd.DataFrame(target_contour_mask[plot_y_idx:plot_y_idx+width,plot_x_idx:plot_x_idx+width])
xx = pd.DataFrame(target_contour_mask[730:770,299:320])
#target_contour_mask[plot_y_idx:plot_y_idx+width,plot_x_idx:plot_x_idx+width]
xx

In [None]:
mask_dict['TCGA-E2-A14V-01Z-00-DX1']['solid']

In [None]:
target_contour_mask
min_idx_y = min(np.where(target_region_mask>0)[0])
max_idx_y = max(np.where(target_region_mask>0)[0])
min_idx_x = min(np.where(target_region_mask>0)[1])
max_idx_x = max(np.where(target_region_mask>0)[1])
print(min_idx)
print('-----------------')
print(min_idx[0])


In [None]:
min_idx_y = min(np.where(target_region_mask>0)[0])
max_idx_y = max(np.where(target_region_mask>0)[0])
print(min_idx_y)
print(max_idx_y)

In [None]:
from pprint import pprint
pprint(dict(mask_dict))

In [None]:
TIF_full_name = '../orig_data/Tissue images/TCGA-E2-A14V-01Z-00-DX1.tif'
PNG_full_name = '../orig_data/external_TCGA_train/TCGA-E2-A14V-01Z-00-DX1/images/TCGA-E2-A14V-01Z-00-DX1.png'

In [None]:
#from time import sleep
ima = Image.open(TIF_full_name)
#dtype = {'F': np.float32, 'L': np.uint8, 'RGB': np.uint8}[ima.mode]
print('format = ' + ima.format)
print('size = '+ str(ima.size))
print('mode = ' + ima.mode)
im_npp = np.zeros((IMG_HEIGHT, IMG_WIDTH), dtype=np.uint8)
im_npp = np.array(ima)
#im_npp = np.array(ima)
#sleep(5)
print(im_npp.shape)

In [None]:
im_npp = np.array(ima)

In [None]:
im_npp.shape

In [None]:
ima.getdata()