In [1]:
import torch
import tables
import os,sys
import glob
import PIL
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn import model_selection
import sklearn.feature_extraction.image
import random


seed = random.randrange(sys.maxsize) #get a random seed so that we can reproducibly do the cross validation setup
random.seed(seed) # set the seed
print(f"random seed (note down for reproducibility): {seed}")

random seed (note down for reproducibility): 984757887166951114


In [2]:




targetMag = 8
#17S050572_1A_L12 - 2018-03-20 13.09.26.ndpi Masked out because of too large to be saved in mag=8
baseMag = 20
file_ext= ".png"
mask_ext = ".png"
mask_pattern = "*"

#imgdir = f"/home/yxz1826/downsampled/{targetMag}_group"
#maskroot = '/Skin/QupathOut/multi_mag'

#maskdir = maskroot+f"/{targetMag:.1f}/"
#pytable_root = '/mnt/ccipd_home/yxz1826/Skin/pytable'

imgdir = f"Y:\\home\\yxz1826\\Skin\\tiles\\{targetMag:.0f}_tile"
maskroot = 'Y:\\home\\yxz1826\\Skin\\tiles\\'
maskdir = maskroot+f"\\{targetMag:.0f}_mask"
pytable_root = 'E:\\pytable'
print(imgdir,maskdir,pytable_root)

Y:\home\yxz1826\Skin\tiles\8_tile Y:\home\yxz1826\Skin\tiles\\8_mask E:\pytable


In [3]:
dataname="melanoma_png"+"_"+str(targetMag)
patch_size=500 #size of the tiles to extract and save in the database, must be >= to training size
test_set_size=.1 # what percentage of the dataset should be used as a held out validation/testing set
classes=[0,1,2,3] #what classes we expect to have in the data, here we have only 2 classes but we could add additional classes and/or specify an index from which we would like to ignore

stride_size=250
mirror_pad_size=250
resize = 1.
print(dataname)

melanoma_png_8


In [4]:
img_dtype = tables.UInt8Atom()  # dtype in which the images will be saved, this indicates that images will be saved as unsigned int 8 bit, i.e., [0,255]
filenameAtom = tables.StringAtom(itemsize=255) #create an atom to store the filename of the image, just incase we need it later, 

In [5]:
maskpattern = maskdir+'\\'+mask_pattern+mask_ext
files=glob.glob(maskpattern) # create a list of the files, in this case we're only interested in files which have masks so we can use supervised learning

#create training and validation stages and split the files appropriately between them
phases={}
phases["train"],phases["val"]=next(iter(model_selection.ShuffleSplit(n_splits=10,test_size=test_set_size).split(files)))

#specify that we'll be saving 2 different image types to the database, an image and its associated masked
imgtypes=["img","mask"]
print(len(files))

777


In [6]:
from tqdm import tqdm
import readslide
from readslide import getThumbByMag
from readslide import ndpiAdaptor
from readslide import pngAdaptor
import traceback

In [7]:
def defaultIOAdaptor(imgtype,imgdir,fname,file_ext,classes,mags,totals):
    if(imgtype=="img"): #if we're looking at an img, it must be 3 channel, but cv2 won't load it in the correct channel order, so we need to fix that
        io=cv2.cvtColor(cv2.imread("./imgs/"+os.path.basename(fname).replace("_mask.png",file_ext)),cv2.COLOR_BGR2RGB)
        interp_method=PIL.Image.BICUBIC

    else: #if its a mask image, then we only need a single channel (since grayscale 3D images are equal in all channels)
        io=cv2.imread(fname)/255 #the image is loaded as {0,255}, but we'd like to store it as {0,1} since this represents the binary nature of the mask easier
        interp_method=PIL.Image.NEAREST #want to use nearest! otherwise resizing may cause non-existing classes to be produced via interpolation (e.g., ".25")

        for i,key in enumerate(classes): #sum the number of pixels, this is done pre-resize, the but proportions don't change which is really what we're after
            totals[1,i]+=sum(sum(io[:,:,0]==key))    
    return io

In [16]:
#
imageIOAdaptor = pngAdaptor;
storage={} #holder for future pytables
err_report = []
h5_debug = {}
block_shape={} #block shape specifies what we'll be saving into the pytable array, here we assume that masks are 1d and images are 3d
block_shape["img"]= np.array((patch_size,patch_size,3))
block_shape["mask"]= np.array((patch_size,patch_size))  # the output for both imgtypes has 3 channels here
patch_shape={}
patch_shape['img']=(patch_size,patch_size,3)
patch_shape['mask']=(patch_size,patch_size,3)

filters=tables.Filters(complevel=6, complib='zlib') #we can also specify filters, such as compression, to improve storage speed
interpolation  = {}
interpolation['img'] = PIL.Image.BICUBIC
interpolation['mask'] = PIL.Image.NEAREST
pad_shape = {}
pad_shape['img'] =  [(mirror_pad_size, mirror_pad_size), (mirror_pad_size, mirror_pad_size), (0, 0)]
pad_shape['mask'] =  [(mirror_pad_size, mirror_pad_size), (mirror_pad_size, mirror_pad_size),(0,0)]

for phase in phases.keys(): #now for each of the phases, we'll loop through the files
    print(phase)
    
    totals=np.zeros((2,len(classes))) # we can to keep counts of all the classes in for in particular training, since we 
    totals[0,:]=classes               # can later use this information to create better weights

    hdf5_file = tables.open_file(f"{pytable_root}/{dataname}_{phase}.pytable", mode='w') #open the respective pytable
    h5_debug[phase] = hdf5_file
    storage["filename"] = hdf5_file.create_earray(hdf5_file.root, 'filename', filenameAtom, (0,)) #create the array for storage
    
    for imgtype in imgtypes: #for each of the image types, in this case mask and image, we need to create the associated earray
        storage[imgtype]= hdf5_file.create_earray(hdf5_file.root, imgtype, img_dtype,  
                                                  shape=np.append([0],block_shape[imgtype]), 
                                                  chunkshape=np.append([1],block_shape[imgtype]),
                                                  filters=filters)
    io_arr_out = {}
    for filei in tqdm(phases[phase]): #now for each of the files
        fname=files[filei] 
        
        
        succeed = True;
        try:
            for imgtype in imgtypes:
                #read image
                io = imageIOAdaptor(imgtype,imgdir,fname,file_ext,classes,(targetMag,baseMag),totals)
                interp_method = interpolation[imgtype]
                io = cv2.resize(io,(0,0),fx=resize,fy=resize, interpolation=interp_method) #resize it as specified above
                io = np.pad(io,pad_shape[imgtype],mode="reflect")

                #convert input image into overlapping tiles, size is ntiler x ntilec x 1 x patch_size x patch_size x3
                io_arr_out[imgtype]=sklearn.feature_extraction.image.extract_patches(io,patch_shape[imgtype],stride_size)

                #resize it into a ntile x patch_size x patch_size x 3
                io_arr_out[imgtype]=io_arr_out[imgtype].reshape(-1,patch_size,patch_size,3)
                succeed = True;    
        except Exception as e:
                succeed = False; 
                raise e
                #err_report.append((phase,fname,imgtype,str(e),traceback.extract_stack()))
            #save the 4D tensor to the table
        if (succeed):
            print(1)
            storage["img"].append(io_arr_out["img"])
            storage["mask"].append(io_arr_out["mask"][:,:,:,0].squeeze()) #only need 1 channel for mask data
            storage["filename"].append([fname for x in range(io_arr_out[imgtype].shape[0])]) #add the filename to the storage array
        else:
            pass
            print("skip:",err_report[-1])
    #lastely, we should store the number of pixels
    npixels=hdf5_file.create_carray(hdf5_file.root, 'numpixels', tables.Atom.from_dtype(totals.dtype), totals.shape)
    npixels[:]=totals
    hdf5_file.close()

train


  0%|                                                                                          | 0/699 [00:00<?, ?it/s]

1


  0%|                                                                                  | 1/699 [00:00<11:27,  1.01it/s]

1


  0%|▏                                                                                 | 2/699 [00:02<12:38,  1.09s/it]

1


  0%|▎                                                                                 | 3/699 [00:03<12:26,  1.07s/it]

1


  1%|▍                                                                                 | 4/699 [00:03<09:45,  1.19it/s]

1


  1%|▌                                                                                 | 5/699 [00:05<12:21,  1.07s/it]

1





KeyboardInterrupt: 

In [None]:
sklearn.feature_extraction.image.extract_patches(io,(patch_size,patch_size,None,3),stride_size)

In [None]:
err_report

In [15]:
hdf5_file.close()

In [13]:
#useful reference
#http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html
sum(sum(io==key))

NameError: name 'key' is not defined

In [14]:
pad_shape
block_shape

{'img': array([500, 500,   3]), 'mask': array([500, 500,   3])}

In [13]:
io_arr_out["mask"][:,:,:,0].squeeze().shape

(25, 500, 500)