In [1]:
import torch
import tables
import os
import os,sys
import glob

import PIL
import numpy as np
from numpy.lib.stride_tricks import as_strided

import cv2
import matplotlib.pyplot as plt

from sklearn import model_selection
import random


# Draw a fresh seed and print it so that this exact run can be repeated later.
# Python's `random` accepts arbitrarily large integer seeds, but NumPy's legacy
# global RNG only accepts seeds in [0, 2**32 - 1], hence the modulo below.
seed = random.randrange(sys.maxsize) #get a random seed so that we can reproducibly do the cross validation setup
random.seed(seed) # set the seed for Python's own RNG
# scikit-learn splitters called with random_state=None draw from NumPy's global RNG,
# not from `random`, so NumPy must be seeded too for the split to be reproducible
np.random.seed(seed % (2**32))
print(f"random seed (note down for reproducibility): {seed}")

random seed (note down for reproducibility): 6341798946737309275


In [2]:




# ---- Dataset / tiling configuration: everything a reader might want to tune ----
patch_size=500 #size of the tiles to extract and save in the database, must be >= to training size
test_set_size=.1 # fraction (0-1, not a percentage) of the files held out as the validation/testing set

classes=[0,1,2,3] #label values we expect to have in the data (four classes here); we could add additional classes and/or specify an index which we would like to ignore

targetMag = 8
#17S050572_1A_L12 - 2018-03-20 13.09.26.ndpi was masked out because it is too large to be saved at mag=8

dataname="melanoma_png"+"_"+str(targetMag)  # prefix of the output pytable file names

baseMag = 20
file_ext= ".png"
mask_ext = ".png"
mask_pattern = "*"


imgdir = f"/home/yxz1826/downsampled/{targetMag}_group"
# The mask tree lives under $REMOTE. Fall back to the mount point already hard-coded in
# `pytable_root` below so a fresh kernel without REMOTE exported does not die with a bare KeyError.
remote_root = os.environ.get("REMOTE", "/mnt/ccipd_home/yxz1826")
maskroot = remote_root+'/Skin/QupathOut/multi_mag'

maskdir = maskroot+f"/{targetMag:.1f}/"  # masks are grouped in one folder per magnification, e.g. ".../8.0/"
pytable_root = '/mnt/ccipd_home/yxz1826/Skin/pytable'  # directory the .pytable files are written to
print(maskdir)
print(imgdir)
print(dataname)

/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/
/home/yxz1826/downsampled/8_group
melanoma_png_8


In [3]:
img_dtype = tables.UInt8Atom()  # dtype in which the images will be saved, this indicates that images will be saved as unsigned int 8 bit, i.e., [0,255]
filenameAtom = tables.StringAtom(itemsize=255) #create an atom to store the filename of the image, just in case we need it later (fixed width of 255 bytes)

# create a list of the files, in this case we're only interested in files which have masks so we can use supervised learning
# glob returns paths in arbitrary order, so sort them: the split below stores *indices* into
# this list, and those indices only mean the same thing between runs if the order is stable
files=sorted(glob.glob(maskdir+mask_pattern+mask_ext))
if not files:
    raise FileNotFoundError(f"no mask files found matching {maskdir+mask_pattern+mask_ext!r}")

#create training and validation stages and split the files appropriately between them
phases={}
# only one split is consumed, so only one is requested; random_state ties the split to the
# printed seed (scikit-learn requires an integer seed in [0, 2**32 - 1], hence the modulo)
splitter=model_selection.ShuffleSplit(n_splits=1,test_size=test_set_size,random_state=seed % (2**32))
phases["train"],phases["val"]=next(splitter.split(files))

#specify that we'll be saving 2 different image types to the database, an image and its associated mask
imgtypes=["img","mask"]

In [4]:
import readslide
from readslide import getThumbByMag
from readslide import ndpiAdaptor

In [6]:
##Define adaptor- contract of reading
#imageIOAdaptor = defaultIOAdaptor
from readslide import pngAdaptor


In [7]:
imageIOAdaptor = pngAdaptor  # reader used for both images and masks in the loop below


storage={} #holder for future pytables
temp_handle = {} #keeps a reference to each phase's file handle (the handle is closed when its phase ends)

block_shape={} #block shape specifies what we'll be saving into the pytable array, here we assume that masks are 2D (H,W) and images are 3D (H,W,3)
block_shape["img"]= np.array((patch_size,patch_size,3))
block_shape["mask"]= np.array((patch_size,patch_size)) 

filters=tables.Filters(complevel=6, complib='zlib') #we can also specify filters, such as compression, to improve storage speed


for phase in phases.keys(): #now for each of the phases, we'll loop through the files
    print(phase)
    
    totals=np.zeros((2,len(classes))) # we want to keep counts of all the classes, in particular for training, since we
    totals[0,:]=classes               # can later use this information to create better weights
    # NOTE(review): row 1 of `totals` is presumably filled in by imageIOAdaptor, which receives it below - confirm in readslide

    # `with` guarantees the table is closed even if reading or tiling one of the files raises,
    # so a failed run does not leave an open HDF5 handle behind
    with tables.open_file(f"{pytable_root}/{dataname}_{phase}.pytable", mode='w') as hdf5_file: #open the respective pytable
        temp_handle[phase] = hdf5_file
        storage["filename"] = hdf5_file.create_earray(hdf5_file.root, 'filename', filenameAtom, (0,)) #create the array for storage

        for imgtype in imgtypes: #for each of the image types, in this case mask and image, we need to create the associated earray
            storage[imgtype]= hdf5_file.create_earray(hdf5_file.root, imgtype, img_dtype,
                                                      shape=np.append([0],block_shape[imgtype]),
                                                      chunkshape=np.append([1],block_shape[imgtype]),
                                                      filters=filters)
        count = 0
        for filei in phases[phase]: #now for each of the files
            fname=files[filei]
            tiles_per_type = {} #number of tiles written for each image type of this file
            for imgtype in imgtypes:
                count = count +1
                print(count)
                io = imageIOAdaptor(imgtype,imgdir,fname,file_ext,classes,(targetMag,baseMag),totals)
                io_shape = np.array(io.shape) #get the final shape

                #the code below chops up the image into non-overlapping tiles of the appropriate shape;
                #floor division means partial tiles at the right/bottom edges are dropped
                #this code could be improved to take more samples, for example with a stride of 1/2 of the tile size to better cover the data
                new_shape = tuple(io_shape  // block_shape[imgtype]) + tuple(block_shape[imgtype])
                new_strides = tuple(io.strides * block_shape[imgtype]) + io.strides
                io_arr_out = as_strided(io, shape=new_shape, strides=new_strides)
                #flatten the tile grid into a single leading axis: (n_tiles, *block_shape)
                io_arr_out=io_arr_out.reshape(np.append([-1],block_shape[imgtype]))
                #save the tiles to the table
                storage[imgtype].append(io_arr_out)
                tiles_per_type[imgtype] = io_arr_out.shape[0]

            # one filename row is written per tile, so every image type must contribute the
            # same number of tiles; otherwise the rows of the arrays no longer line up
            if len(set(tiles_per_type.values())) > 1:
                raise ValueError(f"tile count mismatch for {fname}: {tiles_per_type}")

            storage["filename"].append([fname for x in range(io_arr_out.shape[0])]) #add the filename to the storage array
        #lastly, we should store the number of pixels
        npixels=hdf5_file.create_carray(hdf5_file.root, 'numpixels', tables.Atom.from_dtype(totals.dtype), totals.shape)
        npixels[:]=totals
        print(hdf5_file.root.numpixels)
print("finish")

train
1
imgMode:/home/yxz1826/downsampled/8_group/17S049691_1A_L1 - 2018-03-20 08.01.28.png
2
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S049691_1A_L1 - 2018-03-20 08.01.28_mask.png
3
imgMode:/home/yxz1826/downsampled/8_group/17S050297_4A_L12 - 2018-03-21 14.36.00.png
4
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S050297_4A_L12 - 2018-03-21 14.36.00_mask.png
5
imgMode:/home/yxz1826/downsampled/8_group/17Z024109_2A_L1 - 2018-03-22 18.09.33.png
6
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z024109_2A_L1 - 2018-03-22 18.09.33_mask.png
7
imgMode:/home/yxz1826/downsampled/8_group/17S050412_1A_L1 - 2018-03-21 14.57.01.png
8
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S050412_1A_L1 - 2018-03-21 14.57.01_mask.png
9
imgMode:/home/yxz1826/downsampled/8_group/17Z024398_1A-L123 - 2018-03-22 18.25.44.png
10
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z024398_1A-L123 - 2018-03-22 18.25.44_mask.png
11

83
imgMode:/home/yxz1826/downsampled/8_group/17Z033106_1A_L123 - 2018-03-26 17.20.34.png
84
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z033106_1A_L123 - 2018-03-26 17.20.34_mask.png
85
imgMode:/home/yxz1826/downsampled/8_group/17S049827_1A_L12 - 2018-03-20 11.55.38.png
86
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S049827_1A_L12 - 2018-03-20 11.55.38_mask.png
87
imgMode:/home/yxz1826/downsampled/8_group/17S049764_1A_L12 - 2018-03-20 11.23.59.png
88
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S049764_1A_L12 - 2018-03-20 11.23.59_mask.png
89
imgMode:/home/yxz1826/downsampled/8_group/17Z018025_1A_L123 - 2018-03-24 13.45.17.png
90
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z018025_1A_L123 - 2018-03-24 13.45.17_mask.png
91
imgMode:/home/yxz1826/downsampled/8_group/17Z017322 1A L456.png
92
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z017322 1A L456_mask.png
93
imgMode:/home/yxz1826/downsamp

165
imgMode:/home/yxz1826/downsampled/8_group/17Z018731_1A_L123 - 2018-03-24 15.04.29.png
166
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z018731_1A_L123 - 2018-03-24 15.04.29_mask.png
167
imgMode:/home/yxz1826/downsampled/8_group/17S049631_1A_L123 - 2018-03-20 07.34.16.png
168
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S049631_1A_L123 - 2018-03-20 07.34.16_mask.png
169
imgMode:/home/yxz1826/downsampled/8_group/17Z023854_1A_L123 - 2018-03-23 13.27.59.png
170
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z023854_1A_L123 - 2018-03-23 13.27.59_mask.png
171
imgMode:/home/yxz1826/downsampled/8_group/17Z024073_1A_L1 - 2018-03-22 17.40.08.png
172
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z024073_1A_L1 - 2018-03-22 17.40.08_mask.png
173
imgMode:/home/yxz1826/downsampled/8_group/17S050297_1A_L123 - 2018-03-21 14.25.05.png
174
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S050297_1A_L123 - 2018-03-

246
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z032903_1A_L12 - 2018-03-20 10.02.32_mask.png
247
imgMode:/home/yxz1826/downsampled/8_group/17z023746_2A_L123 - 2018-03-23 13.02.04.png
248
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17z023746_2A_L123 - 2018-03-23 13.02.04_mask.png
249
imgMode:/home/yxz1826/downsampled/8_group/17Z032978_2B_L12 - 2018-03-26 16.52.18.png
250
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z032978_2B_L12 - 2018-03-26 16.52.18_mask.png
251
imgMode:/home/yxz1826/downsampled/8_group/17S051235_1A_L123 - 2018-03-20 15.43.54.png
252
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S051235_1A_L123 - 2018-03-20 15.43.54_mask.png
253
imgMode:/home/yxz1826/downsampled/8_group/17Z018511_1A_L123 - 2018-03-24 14.12.32.png
254
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z018511_1A_L123 - 2018-03-24 14.12.32_mask.png
255
imgMode:/home/yxz1826/downsampled/8_group/17Z032862_1A_L123 - 20

327
imgMode:/home/yxz1826/downsampled/8_group/17S050373_1A_123 - 2018-03-21 14.48.41.png
328
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S050373_1A_123 - 2018-03-21 14.48.41_mask.png
329
imgMode:/home/yxz1826/downsampled/8_group/17S049745_2A_L1 - 2018-03-20 11.07.15.png
330
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17S049745_2A_L1 - 2018-03-20 11.07.15_mask.png
331
imgMode:/home/yxz1826/downsampled/8_group/17Z024587_1A_L1 - 2018-03-22 17.03.48.png
332
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z024587_1A_L1 - 2018-03-22 17.03.48_mask.png
333
imgMode:/home/yxz1826/downsampled/8_group/17Z018506_2A_L123 - 2018-03-24 14.02.17.png
334
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z018506_2A_L123 - 2018-03-24 14.02.17_mask.png
335
imgMode:/home/yxz1826/downsampled/8_group/17Z024545_1A_L123 - 2018-03-22 16.45.04.png
336
maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z024545_1A_L123 - 2018-03-22 16.

In [None]:
#useful reference
#http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html

In [16]:
# Show the path of the table for whichever phase the loop variable `phase` currently holds.
current_table_path = f"{pytable_root}/{dataname}_{phase}.pytable"
print(current_table_path)

/mnt/ccipd_home/yxz1826/Skin/pytable/melanoma_png_8_val.pytable


In [10]:
# Inspect the loop variable left over from the table-building loop (the last phase it reached).
phase

'val'

In [8]:
# Inspect the train/val arrays of indices into `files` produced by the ShuffleSplit.
phases

{'train': array([110, 153,  38,  29,  44,  83,  64,  57, 194, 179,  97,  32,  62,
        176,  74,  81,   6, 101, 128, 201,  22, 200,  88, 183,  34, 151,
        173,  13,  46,   5,  31,  25,  48, 124, 188,  77, 150,  69,  50,
        100, 193,  98, 103, 147,   7, 162,  14, 154, 142,  15,  40,  56,
        122,  78, 159, 118, 112, 172,  89, 181,  65, 186,   1,  17, 163,
         18,  27, 171, 135, 161,  66,  95,  20, 182, 184, 185, 169,  99,
         92, 140,  39,  85, 175, 158, 178,  93, 192, 160,  10, 165, 105,
        109, 115, 166, 138, 125,  53,  91,  71, 116, 157,  96,  21,   4,
        139, 149,  63, 137,  51,  60, 174,  52, 197,  19,  37,  36, 120,
         28, 189, 121, 190,  73,  72,  80,  87,  45,  76,  59, 129, 199,
        143,  33,  79, 102, 177,  16,   0, 114, 156, 146, 108, 127, 132,
          3, 196,  47,  70, 144, 111, 155, 198,  43, 195,  67,  23, 202,
         41, 104, 107, 119, 134, 152, 148, 170, 126, 136,  86,  30,  55,
        113, 180,  12, 164,  58,  68, 130,

In [17]:
# NOTE(review): this looks like a write-permission probe for the pytable directory - confirm intent.
# mode='w' creates (or truncates) the scratch file; the `with` block closes the handle
# immediately so the probe does not leave an open HDF5 file behind.
with tables.open_file("/mnt/ccipd_home/yxz1826/Skin/pytable/melanoma_png_test_val.pytable",mode='w') as testcreate:
    pass

OSError: directory ``/mnt/ccipd_home/yxz1826/Skin/pytable`` exists but it can not be written

In [8]:
# Re-read the mask for the file currently bound to `fname`, for interactive inspection.
# NOTE(review): relies on `fname` and `totals` left in the kernel by the table-building loop.
mask_reader_args = ('mask', imgdir, fname, file_ext, classes, (targetMag, baseMag), totals)
test = imageIOAdaptor(*mask_reader_args)

maskMode:/mnt/ccipd_home/yxz1826/Skin/QupathOut/multi_mag/8.0/17Z024763_2A_L123 - 2018-03-22 16.26.20_mask.png


In [9]:
# Largest value present in the mask loaded into `test`.
test.max()

3

In [11]:
# Dimensions of the mask object held in `test`.
test.shape

(8601, 32256)

In [11]:
# Largest value in `io`, the last object returned by imageIOAdaptor inside the table-building loop.
io.max()

3