In [1]:
import torch
import tables
import os,sys
import glob
import PIL
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn import model_selection
import sklearn.feature_extraction.image
import random
from types import SimpleNamespace


seed = random.randrange(sys.maxsize) #get a random seed so that we can reproducibly do the cross validation setup
random.seed(seed) # set the seed
print(f"random seed (note down for reproducibility): {seed}")

random seed (note down for reproducibility): 3193652305956924952


In [3]:


params = {}

params['targetMag'] = 5
#17S050572_1A_L12 - 2018-03-20 13.09.26.ndpi Masked out because of too large to be saved in mag=8
params['baseMag'] = 20
params['file_ext']= ".png"
params['mask_ext'] = ".png"
params['mask_pattern'] = "*"

#imgdir = f"/home/yxz1826/downsampled/{targetMag}_group"
#maskroot = '/Skin/QupathOut/multi_mag'

#maskdir = maskroot+f"/{targetMag:.1f}/"
#pytable_root = '/mnt/ccipd_home/yxz1826/Skin/pytable'

params['imgdir'] = f"Y:\\home\\yxz1826\\Skin\\tiles\\{params.get('targetMag'):.0f}_tile"
params['maskroot'] = 'Y:\\home\\yxz1826\\Skin\\tiles\\'
params['maskdir'] = params.get('maskroot')+f"\\{params.get('targetMag'):.0f}_mask"
params['pytable_root'] = 'C:\\pytable'
print(params.get('imgdir'),params.get('maskdir'),params.get('pytable_root'))

params['dataname']="melanoma_png"+"_"+str(params.get('targetMag'))
params['patch_size']=500 #size of the tiles to extract and save in the database, must be >= to training size
params['test_set_size']=.1 # what percentage of the dataset should be used as a held out validation/testing set
params['classes']=[0,1,2,3] #what classes we expect to have in the data, here we have only 2 classes but we could add additional classes and/or specify an index from which we would like to ignore

params['stride_size']=250
params['mirror_pad_size']=250
params['resize'] = 1.
params['img_dtype'] = tables.UInt8Atom()  # dtype in which the images will be saved, this indicates that images will be saved as unsigned int 8 bit, i.e., [0,255]
params['filenameAtom'] = tables.StringAtom(itemsize=255) #create an atom to store the filename of the image, just incase we need it later, 
#params = SimpleNamespace(**params)
print(params.get('dataname'))

Y:\home\yxz1826\Skin\tiles\5_tile Y:\home\yxz1826\Skin\tiles\\5_mask C:\pytable
melanoma_png_5


In [4]:
maskpattern = params.get('maskdir')+'\\'+params.get('mask_pattern')+params.get('mask_ext')
files=glob.glob(maskpattern) # create a list of the files, in this case we're only interested in files which have masks so we can use supervised learning

#create training and validation stages and split the files appropriately between them
phases={}
phases["train"],phases["val"]=next(iter(model_selection.ShuffleSplit(n_splits=10,test_size=params.get('test_set_size')).split(files)))

#specify that we'll be saving 2 different image types to the database, an image and its associated masked
imgtypes=["mask","img"]
print(len(files))

0


In [5]:
from tqdm import tqdm
import readslide
from readslide import getThumbByMag

from readslide import ioAdaptor
from readslide import defaultNameParser
from readslide import defaultLabelMap
import traceback


In [6]:
############ function handles
imageIOAdaptor = ioAdaptor;
nameParser = defaultNameParser;
inputfun = {}
inputfun['img'] = cv2.imread
inputfun['mask'] = cv2.imread

###################################
storage={} #holder for future pytables
err_report = []
h5_debug = {}
block_shape={} #block shape specifies what we'll be saving into the pytable array, here we assume that masks are 1d and images are 3d
block_shape["img"]= np.array((params.get('patch_size'),params.get('patch_size'),3))
block_shape["mask"]= np.array((params.get('patch_size'),params.get('patch_size')))  # the output for both imgtypes has 3 channels here
patch_shape={}
patch_shape['img']=(params.get('patch_size'),params.get('patch_size'),3)
patch_shape['mask']=(params.get('patch_size'),params.get('patch_size'),3)

filters=tables.Filters(complevel=6, complib='zlib') #we can also specify filters, such as compression, to improve storage speed
interpolation  = {}
interpolation['img'] = PIL.Image.BICUBIC
interpolation['mask'] = PIL.Image.NEAREST
pad_shape = {}
pad_shape['img'] =  [(params.get('mirror_pad_size'), params.get('mirror_pad_size')), (params.get('mirror_pad_size'), params.get('mirror_pad_size')), (0, 0)]
pad_shape['mask'] =  [(params.get('mirror_pad_size'), params.get('mirror_pad_size')), (params.get('mirror_pad_size'), params.get('mirror_pad_size')),(0,0)]

for phase in phases.keys(): #now for each of the phases, we'll loop through the files
    print(phase)
    
    totals=np.zeros((2,len(params.get('classes')))) # we can to keep counts of all the classes in for in particular training, since we 
    totals[0,:]=params.get('classes')               # can later use this information to create better weights

    hdf5_file = tables.open_file(f"{params.get('pytable_root')}/{params.get('dataname')}_{phase}.pytable", mode='w') #open the respective pytable
    h5_debug[phase] = hdf5_file
    storage["filename"] = hdf5_file.create_earray(hdf5_file.root, 'filename', params.get('filenameAtom'), (0,)) #create the array for storage
    
    for imgtype in imgtypes: #for each of the image types, in this case mask and image, we need to create the associated earray
        storage[imgtype]= hdf5_file.create_earray(hdf5_file.root, imgtype, params.get('img_dtype'),  
                                                  shape=np.append([0],block_shape[imgtype]), 
                                                  chunkshape=np.append([1],block_shape[imgtype]),
                                                  filters=filters)
    io_arr_out = {}
    for filei in tqdm(phases[phase]): #now for each of the files
        fname=files[filei] 
        succeed = True;
        try:
            for imgtype in imgtypes:
                #read image
                
                name_dict = nameParser(fname,params)
                io = ioAdaptor(name_dict[imgtype],imgtype,inputfun[imgtype],params,totals,labelChange = None)             
                interp_method = interpolation[imgtype]
                io = cv2.resize(io,(0,0),fx=params.get('resize'),fy=params.get('resize'), interpolation=interp_method) #resize it as specified above
                io = np.pad(io,pad_shape[imgtype],mode="reflect")

                #convert input image into overlapping tiles, size is ntiler x ntilec x 1 x params.get('patch_size') x params.get('patch_size') x3
                io_arr_out[imgtype]=sklearn.feature_extraction.image.extract_patches(io,patch_shape[imgtype],params.get('stride_size'))
 
                #resize it into a ntile x params.get('patch_size') x params.get('patch_size') x 3
                io_arr_out[imgtype]=io_arr_out[imgtype].reshape(-1,params.get('patch_size'),params.get('patch_size'),3)

                succeed = True;    
        except Exception as e:
                succeed = False; 
                #raise e
                err_report.append((phase,fname,imgtype,str(e),traceback.extract_stack()))
            #save the 4D tensor to the table
        if (succeed):
            #print(1)
            storage["img"].append(io_arr_out["img"])
            storage["mask"].append(io_arr_out["mask"][:,:,:,0]) #only need 1 channel for mask data
            storage["filename"].append([fname for x in range(io_arr_out[imgtype].shape[0])]) #add the filename to the storage array
        else:
            pass
            #print("skip:",err_report[-1])
    #lastely, we should store the number of pixels
    npixels=hdf5_file.create_carray(hdf5_file.root, 'numpixels', tables.Atom.from_dtype(totals.dtype), totals.shape)
    npixels[:]=totals
    hdf5_file.close()

train


0it [00:00, ?it/s]


val


0it [00:00, ?it/s]


In [8]:
h5_debug["img"].close()
h5_debug["mask"].close()

KeyError: 'img'

In [12]:
#useful reference
#http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html


IndexError: index 2 is out of bounds for axis 0 with size 2

In [14]:
pad_shape
block_shape

{'img': array([500, 500,   3]), 'mask': array([500, 500,   3])}

In [26]:
io_arr_out["mask"].shape

(120, 500, 500, 3)

In [24]:
err_report

[]