# Image preprocessing and Tile-structured dataset formation --Workflow
## 1. Image preprocessing
1. tile regular sampling
2. labeling
3. color space transformation
4. remove non-tissue area<br>

## 2. Filename handling
Rename the WSI files so that their filenames match those of the corresponding masks;<br>
this simplifies the labeling process.

## 3. Tile-structured dataset formation

## 1.  Image preprocessing

In [34]:
import cv2
import math
import os 
import shutil
import numpy as np

#used prior to CNN model training
#four subtasks are included in function: sampling, non-tissue regions removing,
#color space transformation and tile labeling
#Raw tiles, unprocessed but labelled tiles,as well as processed and labelled tiles are stored 

def _reset_dir(path):
    """Remove ``path`` if it exists, then recreate it as an empty directory."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)


def img_sampling_labeling(
    input_path_WSI,input_path_mask,output_path_RawTile,output_path_processed_positive,output_path_processed_negative, 
    output_path_unprocessed_positive,output_path_unprocessed_negative):
    """Tile every whole-slide image (WSI), drop non-tissue tiles and label the rest.

    For each file in ``input_path_WSI`` the mask with the same filename is read
    from ``input_path_mask``. Both are cut into non-overlapping 101x101 tiles
    (tiles on the right/bottom border may be smaller). For each tile:

    1. the raw tile is written to ``output_path_RawTile``;
    2. the tile is counted as non-tissue, and not processed further, when the
       mean of its pixel values scaled to [0, 1] is >= 0.85 and their standard
       deviation is <= 0.1;
    3. otherwise it is converted with ``cv2.COLOR_RGB2YUV``;
    4. it is labelled positive when the mean of mask channel 0 divided by 255
       is >= 0.55, negative otherwise, and both the converted and the
       unconverted tile are written to the matching output folders.

    All five output directories are deleted and recreated on every call.

    Returns:
        ``(positive_collection, negative_collection, non_tissue_collection)``,
        three lists of tile ids of the form ``"<imgname>_<row>_<col>"``.

    Raises:
        ValueError: if an image or its mask cannot be read by ``cv2.imread``.
    """
    positive_collection = []
    negative_collection = []
    non_tissue_collection = []
    tile_size = (101, 101)  # (width, height)
    offset = (101, 101)     # (horizontal stride, vertical stride)

    # remove and recreate the output directories
    for out_dir in (output_path_RawTile,
                    output_path_processed_positive, output_path_processed_negative,
                    output_path_unprocessed_positive, output_path_unprocessed_negative):
        _reset_dir(out_dir)

    for imgname in os.listdir(input_path_WSI):
        wsi_file = os.path.join(input_path_WSI, imgname)
        mask_file = os.path.join(input_path_mask, imgname)
        img_WSI = cv2.imread(wsi_file)
        img_mask = cv2.imread(mask_file)
        # cv2.imread signals failure by returning None instead of raising
        if img_WSI is None:
            raise ValueError('Cannot read WSI image: ' + wsi_file)
        if img_mask is None:
            raise ValueError('Cannot read mask image: ' + mask_file)

        img_shape = img_WSI.shape
        n_rows = int(math.ceil(img_shape[0] / (offset[1] * 1.0)))  # height
        n_cols = int(math.ceil(img_shape[1] / (offset[0] * 1.0)))  # width
        stem = imgname[0:-4]  # filename without its 4-character suffix such as ".png"

        for i in range(n_rows):
            top = offset[1] * i
            bottom = min(top + tile_size[1], img_shape[0])
            for j in range(n_cols):
                left = offset[0] * j
                right = min(left + tile_size[0], img_shape[1])
                tile_id = imgname + '_' + str(i) + "_" + str(j)
                tile_suffix = '_tile_' + str(i) + "_" + str(j) + '.png'

                # step1. tile sampling
                cropped_img_WSI = img_WSI[top:bottom, left:right]
                cropped_img_mask = img_mask[top:bottom, left:right]
                if cropped_img_WSI.size == 0:  # checkpoint: nothing to save or classify
                    non_tissue_collection.append(tile_id)
                    continue

                # save plain tiles (the raw-tile name keeps the full source filename)
                cv2.imwrite(os.path.join(output_path_RawTile, imgname + tile_suffix), cropped_img_WSI)

                # step2. remove non-tissue: bright and nearly uniform tiles
                scaled_tile = cropped_img_WSI / 255
                if scaled_tile.mean() >= 0.85 and scaled_tile.std() <= 0.1:
                    non_tissue_collection.append(tile_id)
                    continue

                # step3. color space transformation
                # normalization and standardization are left to the image generator
                # NOTE(review): cv2.imread returns BGR channel order, so COLOR_RGB2YUV
                # swaps the red and blue channels here. Kept unchanged so regenerated
                # tiles stay comparable with existing ones; confirm before switching
                # to COLOR_BGR2YUV.
                img_processed = cv2.cvtColor(cropped_img_WSI, cv2.COLOR_RGB2YUV)

                # step4. labeling according to the binary mask:
                # mean of mask channel 0, scaled to [0, 1], compared with 0.55
                mask_fraction = np.mean(cropped_img_mask[:, :, 0]) / 255
                if mask_fraction >= 0.55:
                    positive_collection.append(tile_id)
                    processed_dir = output_path_processed_positive
                    unprocessed_dir = output_path_unprocessed_positive
                else:
                    negative_collection.append(tile_id)
                    processed_dir = output_path_processed_negative
                    unprocessed_dir = output_path_unprocessed_negative
                cv2.imwrite(os.path.join(processed_dir, stem + tile_suffix), img_processed)
                cv2.imwrite(os.path.join(unprocessed_dir, stem + tile_suffix), cropped_img_WSI)
    return positive_collection, negative_collection, non_tissue_collection


#### CWRU

In [13]:
# Input folders for the CWRU cohort: whole-slide images (WSI) and their masks.
original_path_training_WSI_CWRU = r'\ECSE484_Fall2020\training_2020\CWRU_imgs_idx8'
original_path_training_mask_CWRU = r'\ECSE484_Fall2020\training_2020\CWRU_masks'

In [39]:
# Tile, filter and label the CWRU cohort.
# img_sampling_labeling deletes and recreates all five output folders.
# The raw-tile path now starts with a backslash like every other path in this
# notebook; without it the folder was resolved relative to the working directory.
output_path_RawTile = r'\ECSE484_Fall2020\training_2020\CWRU_RawTile'
output_path_processed_positive = r'\ECSE484_Fall2020\training_2020\CWRU_processed_tiles\positive'
output_path_processed_negative = r'\ECSE484_Fall2020\training_2020\CWRU_processed_tiles\negative'
output_path_unprocessed_positive = r'\ECSE484_Fall2020\training_2020\CWRU_unprocessed_tiles\positive'
output_path_unprocessed_negative = r'\ECSE484_Fall2020\training_2020\CWRU_unprocessed_tiles\negative'
input_path_WSI = original_path_training_WSI_CWRU
input_path_mask = original_path_training_mask_CWRU
positive_collection, negative_collection, non_tissue_collection = img_sampling_labeling(
    input_path_WSI, input_path_mask, output_path_RawTile,
    output_path_processed_positive, output_path_processed_negative,
    output_path_unprocessed_positive, output_path_unprocessed_negative)


In [15]:
# Summarise the CWRU run from the three lists returned by img_sampling_labeling.
print('Number of CWRU image tiles in total:',
      sum(map(len, (non_tissue_collection, positive_collection, negative_collection))))
print('Number of CWRU positive tiles:', len(positive_collection))
print('Number of CWRU negative tiles:', len(negative_collection))
print('Number of CWRU non-tissue tiles:', len(non_tissue_collection))
      

Number of CWRU image tiles in total: 101576
Number of CWRU positive tiles: 6821
Number of CWRU negative tiles: 31526
Number of CWRU non-tissue tiles: 63229


#### HUP

In [16]:
# Input folders for the HUP cohort.
# The *_1 / *_2 variables point at the two-part WSI and mask folders; the tiling
# cell below reads original_path_training_WSI_HUP / original_path_training_mask_HUP.
# NOTE(review): presumably HUP_imgs / HUP_masks hold the merged parts - confirm.
original_path_training_mask_HUP_1 = r'\ECSE484_Fall2020\training_2020\HUP_masks_part1' 
original_path_training_mask_HUP_2 = r'\ECSE484_Fall2020\training_2020\HUP_mask_part2'
original_path_training_WSI_HUP_1  = r'\ECSE484_Fall2020\training_2020\HUP_imgs_idx5_Part_1'
original_path_training_WSI_HUP_2 = r'\ECSE484_Fall2020\training_2020\HUP_imgs_idx5_Part_2'
original_path_training_WSI_HUP = r'\ECSE484_Fall2020\training_2020\HUP_imgs'
original_path_training_mask_HUP = r'\ECSE484_Fall2020\training_2020\HUP_masks'

In [45]:
# Tile, filter and label the HUP cohort.
# img_sampling_labeling deletes and recreates all five output folders, and the
# three returned lists overwrite the ones from the previous cohort.
output_path_RawTile = r'\ECSE484_Fall2020\training_2020\HUP_RawTile'
output_path_processed_positive = r'\ECSE484_Fall2020\training_2020\HUP_processed_tiles\positive'
output_path_processed_negative = r'\ECSE484_Fall2020\training_2020\HUP_processed_tiles\negative'
output_path_unprocessed_positive = r'\ECSE484_Fall2020\training_2020\HUP_unprocessed_tiles\positive'
output_path_unprocessed_negative = r'\ECSE484_Fall2020\training_2020\HUP_unprocessed_tiles\negative'
input_path_WSI = original_path_training_WSI_HUP
input_path_mask =  original_path_training_mask_HUP
positive_collection, negative_collection, non_tissue_collection = img_sampling_labeling(
    input_path_WSI,input_path_mask,output_path_RawTile,output_path_processed_positive,output_path_processed_negative, 
    output_path_unprocessed_positive,output_path_unprocessed_negative) 

In [19]:
# Smoke test: read back the first listed tile from the processed positive folder.
# Raises IndexError if that folder is empty.
temp = os.listdir(output_path_processed_positive)
a = cv2.imread(os.path.join(output_path_processed_positive,temp[0] ))

In [18]:
# Summarise the HUP run from the three lists returned by img_sampling_labeling.
print('Number of HUP image tiles in total:',
      sum(map(len, (non_tissue_collection, positive_collection, negative_collection))))
print('Number of HUP positive tiles:', len(positive_collection))
print('Number of HUP negative tiles:', len(negative_collection))
print('Number of HUP non-tissue tiles:', len(non_tissue_collection))

Number of HUP image tiles in total: 172197
Number of HUP positive tiles: 12628
Number of HUP negative tiles: 49716
Number of HUP non-tissue tiles: 109853


#### TCGA

In [22]:
# Input folders for the TCGA cohort: whole-slide images (WSI) and their masks.
original_path_training_WSI_TCGA = r'\ECSE484_Fall2020\training_2020\TCGA_imgs_idx5'
original_path_training_mask_TCGA = r'\ECSE484_Fall2020\training_2020\TCGA_masks'

In [48]:
# Tile, filter and label the TCGA cohort.
# img_sampling_labeling deletes and recreates all five output folders, and the
# three returned lists overwrite the ones from the previous cohort.
output_path_RawTile = r'\ECSE484_Fall2020\training_2020\TCGA_RawTile'
output_path_processed_positive = r'\ECSE484_Fall2020\training_2020\TCGA_processed_tiles\positive'
output_path_processed_negative = r'\ECSE484_Fall2020\training_2020\TCGA_processed_tiles\negative'
output_path_unprocessed_positive = r'\ECSE484_Fall2020\training_2020\TCGA_unprocessed_tiles\positive'
output_path_unprocessed_negative = r'\ECSE484_Fall2020\training_2020\TCGA_unprocessed_tiles\negative'
input_path_WSI = original_path_training_WSI_TCGA
input_path_mask =  original_path_training_mask_TCGA
positive_collection, negative_collection, non_tissue_collection = img_sampling_labeling(
    input_path_WSI,input_path_mask,output_path_RawTile,output_path_processed_positive,output_path_processed_negative, 
    output_path_unprocessed_positive,output_path_unprocessed_negative) 

In [24]:
# Summarise the TCGA run from the three lists returned by img_sampling_labeling.
print('Number of TCGA image tiles in total:',
      sum(map(len, (non_tissue_collection, positive_collection, negative_collection))))
print('Number of TCGA positive tiles:', len(positive_collection))
print('Number of TCGA negative tiles:', len(negative_collection))
print('Number of TCGA non-tissue tiles:', len(non_tissue_collection))

Number of TCGA image tiles in total: 132110
Number of TCGA positive tiles: 24733
Number of TCGA negative tiles: 30365
Number of TCGA non-tissue tiles: 77012


#### CINJ

In [25]:
# Input folders for the CINJ cohort.
# Despite the "training" in the variable names, both paths are under testing_2020.
original_path_training_WSI_CINJ = r'\ECSE484_Fall2020\testing_2020\CINJ_imgs_idx5'
original_path_training_mask_CINJ = r'\ECSE484_Fall2020\testing_2020\CINJ_masks_HG'

In [61]:
# Tile, filter and label the CINJ cohort; outputs go under testing_2020.
# img_sampling_labeling deletes and recreates all five output folders, and the
# three returned lists overwrite the ones from the previous cohort.
output_path_RawTile = r'\ECSE484_Fall2020\testing_2020\CINJ_RawTile'
output_path_processed_positive = r'\ECSE484_Fall2020\testing_2020\CINJ_processed_tiles\positive'
output_path_processed_negative = r'\ECSE484_Fall2020\testing_2020\CINJ_processed_tiles\negative'
output_path_unprocessed_positive = r'\ECSE484_Fall2020\testing_2020\CINJ_unprocessed_tiles\positive'
output_path_unprocessed_negative = r'\ECSE484_Fall2020\testing_2020\CINJ_unprocessed_tiles\negative'
input_path_WSI = original_path_training_WSI_CINJ
input_path_mask =  original_path_training_mask_CINJ
positive_collection, negative_collection, non_tissue_collection = img_sampling_labeling(
    input_path_WSI,input_path_mask,output_path_RawTile,output_path_processed_positive,output_path_processed_negative, 
    output_path_unprocessed_positive,output_path_unprocessed_negative) 

In [27]:
# Summarise the CINJ run from the three lists returned by img_sampling_labeling.
print('Number of CINJ image tiles in total:',
      sum(map(len, (non_tissue_collection, positive_collection, negative_collection))))
print('Number of CINJ positive tiles:', len(positive_collection))
print('Number of CINJ negative tiles:', len(negative_collection))
print('Number of CINJ non-tissue tiles:', len(non_tissue_collection))

Number of CINJ image tiles in total: 28994
Number of CINJ positive tiles: 3800
Number of CINJ negative tiles: 10330
Number of CINJ non-tissue tiles: 14864


## 2. Filename Handling

To match the filename of WSI with that of masks

In [38]:
# Remove every non-digit character (letters included) from each filename
import re
def RemoveAlphebats(input_path):
    """Rename each file in ``input_path`` to the digits of its name plus '.png'.

    Every non-numeric character of the full filename (extension included) is
    dropped, so digits inside the original extension are kept as well. Files
    whose name is already in the target form are left untouched.

    Raises:
        FileExistsError: if two files would end up with the same name. Without
            this check ``os.rename`` silently overwrites the earlier file on
            POSIX systems.
    """
    for old_imgname in os.listdir(input_path):
        new_imgname = ''.join(ch for ch in old_imgname if ch.isnumeric()) + '.png'  # suffix
        if new_imgname == old_imgname:
            continue
        src = os.path.join(input_path, old_imgname)
        dst = os.path.join(input_path, new_imgname)
        # samefile guards against case-insensitive filesystems, where dst may
        # "exist" only because it is src under a different letter case
        if os.path.exists(dst) and not os.path.samefile(src, dst):
            raise FileExistsError(
                'Cannot rename ' + src + ': target ' + dst + ' already exists')
        os.rename(src, dst)


In [54]:
# Only for HUP_imgs_idx5_Part_1 and HUP_imgs_idx5_Part_2,
# i.e. original_path_training_WSI_HUP_1, original_path_training_WSI_HUP_2,
# and original_path_validation_WSI_CINJ.
# Run only once: every call strips one more trailing digit!
import os
def RemoveLas5(input_path):
    """Rename each file in ``input_path`` to its digits minus the last one, plus '.png'.

    All non-numeric characters of the full filename are dropped, then the last
    remaining digit is removed. The function is not idempotent: every call
    strips one more digit, so it must be run only once per folder.

    Raises:
        FileExistsError: if two files would end up with the same name. Without
            this check ``os.rename`` silently overwrites the earlier file on
            POSIX systems.
    """
    for old_imgname in os.listdir(input_path):
        digits_only = ''.join(ch for ch in old_imgname if ch.isnumeric())
        new_imgname = digits_only[:-1] + '.png'  # suffix
        if new_imgname == old_imgname:
            continue
        src = os.path.join(input_path, old_imgname)
        dst = os.path.join(input_path, new_imgname)
        # samefile guards against case-insensitive filesystems, where dst may
        # "exist" only because it is src under a different letter case
        if os.path.exists(dst) and not os.path.samefile(src, dst):
            raise FileExistsError(
                'Cannot rename ' + src + ': target ' + dst + ' already exists')
        os.rename(src, dst)


In [58]:
# Folders whose files are renamed by the cells below.
# NOTE(review): the two HUP WSI paths carry a '\paper_material' prefix that the
# other paths in this notebook do not have - confirm which root is correct.
original_path_validation_WSI_CINJ = r'\ECSE484_Fall2020\testing_2020\CINJ_imgs_idx5'
original_path_training_mask_HUP_1 = r'\ECSE484_Fall2020\training_2020\HUP_masks_part1' 
original_path_training_mask_HUP_2 = r'\ECSE484_Fall2020\training_2020\HUP_mask_part2'
original_path_training_WSI_HUP_1  = r'\paper_material\ECSE484_Fall2020\training_2020\HUP_imgs_idx5_Part_1'
original_path_training_WSI_HUP_2 = r'\paper_material\ECSE484_Fall2020\training_2020\HUP_imgs_idx5_Part_2'

In [53]:
# Reduce every filename in these five folders to its digits plus '.png'.
RemoveAlphebats(original_path_validation_WSI_CINJ)
RemoveAlphebats(original_path_training_mask_HUP_1)
RemoveAlphebats(original_path_training_mask_HUP_2)
RemoveAlphebats(original_path_training_WSI_HUP_1)
RemoveAlphebats(original_path_training_WSI_HUP_2)

In [55]:
# Run only once: RemoveLas5 strips one more trailing digit from every filename each time it is called.
RemoveLas5(original_path_training_WSI_HUP_1)
RemoveLas5(original_path_training_WSI_HUP_2)

In [59]:
# Run only once: RemoveLas5 strips one more trailing digit from every filename each time it is called.
RemoveLas5(original_path_validation_WSI_CINJ)

In [111]:
# Check that the CWRU WSI folder and the mask folder contain the same filenames.
original_path_training_WSI_CWRU = r'\ECSE484_Fall2020\training_2020\CWRU_imgs_idx8'
original_path_training_mask_CWRU = r'\ECSE484_Fall2020\training_2020\CWRU_masks'
import numpy as np
import re
# os.listdir returns names in arbitrary order, so both listings are sorted before
# being stored. The comparison itself is done on sets: an element-wise array
# comparison depends on ordering and breaks when the folders differ in size.
training_WSI_CWRU_filenames = np.array(sorted(os.listdir(original_path_training_WSI_CWRU)))
training_mask_CWRU_filenames = np.array(sorted(os.listdir(original_path_training_mask_CWRU)))
_wsi_names = set(training_WSI_CWRU_filenames.tolist())
_mask_names = set(training_mask_CWRU_filenames.tolist())
# Kept as a 1-tuple so that Inconsistency[0] can still be indexed as before;
# it now holds the names that appear in only one of the two folders.
Inconsistency = (sorted(_wsi_names ^ _mask_names),)
print('Number of inconsistent filenames: ', len(Inconsistency[0]))
for name in Inconsistency[0]:
    if name in _wsi_names:
        print('WSI without mask:', name)
    else:
        print('mask without WSI:', name)
    

Number of inconsistent filenames:  0


## 3.  Dataset Construction:

### Case 1 

$$
\begin{array}{|l|l|l|l|l|}
\hline & \text { CWRU } & \text { HUP } & \text { TCGA } & \text { CINJ } \\
\hline \text { Training:positive } & 3000 & 2000 & 2000 & 0 \\
\hline \text { Training:negative } & 3000 & 2000 & 2000 & 0 \\
\hline \text { Training: total } & 6000 & 4000 & 4000 & 0 \\
\hline \text { Validation:positive } & 500 & 500 & 500 & 0 \\
\hline \text { Validation:negative } & 500 & 500 & 500 & 0 \\
\hline \text { Validation: total } & 1000 & 1000 & 1000 & 0 \\
\hline \text { Testing:positive } & 0 & 0 & 0 & 1500 \\
\hline \text { Testing:negative } & 0 & 0 & 0 & 1500 \\
\hline \text { Testing:total } & 0 & 0 & 0 & 3000 \\
\hline
\end{array}
$$

### Case 2 

$$
\begin{array}{|l|l|l|l|l|}
\hline & \text { CWRU } & \text { HUP } & \text { TCGA } & \text { CINJ } \\
\hline \text { Training:positive } & 1750 & 1750 & 1750 & 1750 \\
\hline \text { Training:negative } & 1750 & 1750 & 1750 & 1750 \\
\hline \text { Total training } & 3500 & 3500 & 3500 & 3500 \\
\hline \text { Validation:positive } & 375 & 375 & 375 & 375 \\
\hline \text { Validation:negative } & 375 & 375 & 375 & 375 \\
\hline \text { Validation:total } & 750 & 750 & 750 & 750 \\
\hline \text { Testing:positive } & 0 & 0 & 0 & 1500 \\
\hline \text { Testing:negative } & 0 & 0 & 0 & 1500 \\
\hline \text { Testing: total } & 0 & 0 & 0 & 3000 \\
\hline
\end{array}
$$

### Training and validation set construction

dataset1---delete and recreate `dir`

In [29]:
import os 
# Dataset 1: wipe the training and validation roots and recreate their
# 'positive' / 'negative' class folders.
output_path_training = r'\ECSE484_Fall2020\training_for_CNN'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN'
output_positive_path_training = os.path.join(output_path_training, 'positive')
output_negative_path_training = os.path.join(output_path_training, 'negative')
output_positive_path_validation = os.path.join(output_path_validation, 'positive')
output_negative_path_validation = os.path.join(output_path_validation, 'negative')
for split_root, class_dirs in (
        (output_path_training,
         (output_positive_path_training, output_negative_path_training)),
        (output_path_validation,
         (output_positive_path_validation, output_negative_path_validation))):
    if os.path.exists(split_root):
        shutil.rmtree(split_root)
    for class_dir in class_dirs:
        # makedirs also creates the intermediate split root
        os.makedirs(class_dir, exist_ok=True)


dataset2---delete and recreate `dir`

In [4]:
import os 
import shutil
# Dataset 2: wipe the training, validation and testing roots, then recreate
# their 'positive' / 'negative' class folders.
output_path_training = r'\ECSE484_Fall2020\training_for_CNN_2'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN_2'
output_path_testing = r'\ECSE484_Fall2020\testing_for_CNN_2'
output_positive_path_training = os.path.join(output_path_training, 'positive')
output_negative_path_training = os.path.join(output_path_training, 'negative')
output_positive_path_validation = os.path.join(output_path_validation, 'positive')
output_negative_path_validation = os.path.join(output_path_validation, 'negative')
output_positive_path_testing = os.path.join(output_path_testing, 'positive')
output_negative_path_testing = os.path.join(output_path_testing, 'negative')

for split_root in (output_path_training, output_path_validation, output_path_testing):
    if os.path.exists(split_root):
        shutil.rmtree(split_root)

for positive_dir, negative_dir in (
        (output_positive_path_training, output_negative_path_training),
        (output_positive_path_validation, output_negative_path_validation),
        (output_positive_path_testing, output_negative_path_testing)):
    # creating the positive folder also creates the intermediate split root
    os.makedirs(positive_dir, exist_ok=True)
    os.makedirs(negative_dir)

`TransportFilesToDataset`  <br>
1. Distributes files from the CWRU, HUP and TCGA datasets to the training and validation sets, using the requested number of files per class.<br> 2. Used in constructing both dataset1 and dataset2. <br> 
Only CINJ is used in building testing set <br>


In [5]:
import os
import random
import shutil

def _copy_files(src_dir, filenames, dst_dir):
    """Copy each of ``filenames`` from ``src_dir`` into the directory ``dst_dir``."""
    # Without an existing directory shutil.copy2 would treat dst_dir as a file
    # name and overwrite that single file on every iteration.
    os.makedirs(dst_dir, exist_ok=True)
    for name in filenames:
        shutil.copy2(os.path.join(src_dir, name), dst_dir)


def TransportFilesToDataset(input_path,output_path_training,
                            output_path_validation,HalfNumRequired_training, HalfNumRequired_validation):
    """Randomly copy tiles of one cohort into the training and validation sets.

    Only for the training and validation sets. ``input_path``,
    ``output_path_training`` and ``output_path_validation`` are the parent
    directories of a 'positive' and a 'negative' folder. For each class,
    ``HalfNumRequired_training + HalfNumRequired_validation`` distinct files
    are drawn without replacement; the first ``HalfNumRequired_training`` go to
    the training set and the rest to the validation set, so no file lands in
    both.

    Args:
        input_path: folder with the 'positive' and 'negative' source folders.
        output_path_training: training-set root.
        output_path_validation: validation-set root.
        HalfNumRequired_training: files per class for the training set.
        HalfNumRequired_validation: files per class for the validation set.

    Raises:
        ValueError: if a class folder holds fewer files than requested.
    """
    num_per_class = HalfNumRequired_training + HalfNumRequired_validation

    # Select for both classes before copying anything, so that a shortage in
    # one class does not leave the datasets half-populated.
    selections = []
    for label in ('positive', 'negative'):
        src_dir = os.path.join(input_path, label)
        # os.listdir order is arbitrary; sorting makes a seeded run reproducible
        candidates = sorted(os.listdir(src_dir))
        if num_per_class > len(candidates):
            raise ValueError(
                'Need %d %s files but %s contains only %d'
                % (num_per_class, label, src_dir, len(candidates)))
        # random.sample already returns its picks in random order
        chosen = random.sample(candidates, num_per_class)
        selections.append((label, src_dir, chosen))

    for label, src_dir, chosen in selections:
        _copy_files(src_dir, chosen[:HalfNumRequired_training],
                    os.path.join(output_path_training, label))
        _copy_files(src_dir, chosen[HalfNumRequired_training:],
                    os.path.join(output_path_validation, label))

 

In [6]:
import os
# Parent folders of the 'positive' / 'negative' processed tiles of each cohort.
main_path_training = r'D:\semester 1th\intro to bioinfo\paper_material\ECSE484_Fall2020\training_2020'
# The CINJ tiles are written under testing_2020 by the tiling step, not under
# training_2020, so CINJ gets its own root.
main_path_testing = r'D:\semester 1th\intro to bioinfo\paper_material\ECSE484_Fall2020\testing_2020'
CWRU_path_processed = os.path.join(main_path_training, 'CWRU_processed_tiles')
HUP_path_processed = os.path.join(main_path_training, 'HUP_processed_tiles')
TCGA_path_processed = os.path.join(main_path_training, 'TCGA_processed_tiles')
CINJ_path_processed = os.path.join(main_path_testing, 'CINJ_processed_tiles')

CWRU_dataset1

In [32]:
# CWRU -> dataset 1.
# NOTE: despite the "TotalNumRequired_*" names, these values are passed as
# TransportFilesToDataset's HalfNumRequired_* arguments, i.e. counts per class.
input_path = CWRU_path_processed
output_path_training = r'\ECSE484_Fall2020\training_for_CNN'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN'
TotalNumRequired_training = 3000
TotalNumRequired_validation = 500
TransportFilesToDataset(input_path,output_path_training,output_path_validation,
                        TotalNumRequired_training, TotalNumRequired_validation)

CWRU_dataset2

In [7]:
# CWRU -> dataset 2.
# NOTE: despite the "TotalNumRequired_*" names, these values are passed as
# TransportFilesToDataset's HalfNumRequired_* arguments, i.e. counts per class.
input_path = CWRU_path_processed
output_path_training = r'\ECSE484_Fall2020\training_for_CNN_2'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN_2'
TotalNumRequired_training = 1750
TotalNumRequired_validation = 375
TransportFilesToDataset(input_path,output_path_training,output_path_validation,
                        TotalNumRequired_training, TotalNumRequired_validation)

HUP_dataset_1

In [None]:
# HUP -> dataset 1.
# NOTE: despite the "TotalNumRequired_*" names, these values are passed as
# TransportFilesToDataset's HalfNumRequired_* arguments, i.e. counts per class.
input_path = HUP_path_processed
output_path_training = r'\ECSE484_Fall2020\training_for_CNN'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN'
TotalNumRequired_training = 2000
TotalNumRequired_validation = 500
TransportFilesToDataset(input_path,output_path_training,output_path_validation,
                        TotalNumRequired_training, TotalNumRequired_validation)

HUP_dataset2

In [8]:
# HUP -> dataset 2.
# NOTE: despite the "TotalNumRequired_*" names, these values are passed as
# TransportFilesToDataset's HalfNumRequired_* arguments, i.e. counts per class.
input_path = HUP_path_processed
output_path_training = r'\ECSE484_Fall2020\training_for_CNN_2'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN_2'
TotalNumRequired_training = 1750
TotalNumRequired_validation = 375
TransportFilesToDataset(input_path,output_path_training,output_path_validation,
                        TotalNumRequired_training, TotalNumRequired_validation)

TCGA_dataset_1

In [34]:
# TCGA -> dataset 1.
# NOTE: despite the "TotalNumRequired_*" names, these values are passed as
# TransportFilesToDataset's HalfNumRequired_* arguments, i.e. counts per class.
input_path = TCGA_path_processed
output_path_training = r'\ECSE484_Fall2020\training_for_CNN'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN'
TotalNumRequired_training = 2000
TotalNumRequired_validation = 500
TransportFilesToDataset(input_path,output_path_training,output_path_validation,
                        TotalNumRequired_training, TotalNumRequired_validation)

TCGA_dataset_2

In [9]:
# TCGA -> dataset 2.
# NOTE: despite the "TotalNumRequired_*" names, these values are passed as
# TransportFilesToDataset's HalfNumRequired_* arguments, i.e. counts per class.
input_path = TCGA_path_processed
output_path_training = r'\ECSE484_Fall2020\training_for_CNN_2'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN_2'
TotalNumRequired_training =  1750
TotalNumRequired_validation = 375
TransportFilesToDataset(input_path,output_path_training,output_path_validation,
                        TotalNumRequired_training, TotalNumRequired_validation)

CINJ_dataset2

In [15]:
# CINJ -> dataset 2 (training, validation and testing sets).
# NOTE(review): TransportFilesToAllDataset is not defined in the visible part of
# this file, so this cell raises NameError unless it is defined elsewhere -
# confirm. The function documented for CINJ in this notebook is
# TransportFilesToDataset_dataset2_CINJ, which takes no testing count.
input_path = CINJ_path_processed
output_path_training = r'\ECSE484_Fall2020\training_for_CNN_2'
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN_2'
output_path_testing = r'\ECSE484_Fall2020\testing_for_CNN_2'
HalfNumRequired_training = 1750
HalfNumRequired_validation = 375
HalfNumRequired_testing = 375
TransportFilesToAllDataset(input_path,output_path_training,output_path_validation,output_path_testing,
                        HalfNumRequired_training, HalfNumRequired_validation,HalfNumRequired_testing)

In [22]:
# Sanity check: list the 'negative' folder of the current training set and
# print the number of entries and the number of distinct entries.
num_test = os.listdir( os.path.join(output_path_training, 'negative'))
print(len(num_test))
print(len(set(num_test))) # check for the existence of duplicate filenames

7000
7000


`TransportFilesToDataset_dataset2_CINJ` is used for CINJ only.<br>
For dataset2, the CINJ dataset is divided into three parts, which go to the training, validation and testing sets respectively.

In [15]:
def _copy_files(src_dir, filenames, dst_dir):
    """Copy each of ``filenames`` from ``src_dir`` into the directory ``dst_dir``."""
    # Without an existing directory shutil.copy2 would treat dst_dir as a file
    # name and overwrite that single file on every iteration.
    os.makedirs(dst_dir, exist_ok=True)
    for name in filenames:
        shutil.copy2(os.path.join(src_dir, name), dst_dir)


def TransportFilesToDataset_dataset2_CINJ(input_path, output_path_testing, 
                                   output_path_training,output_path_validation,
                                   HalfNumRequired_training, HalfNumRequired_validation):
    """Copy CINJ tiles not used for testing into the training and validation sets.

    All four paths are parent directories of a 'positive' and a 'negative'
    folder. ``output_path_testing`` is only read: any filename already present
    in its class folder is excluded from the candidates of that class, so the
    testing set does not overlap the other two. For each class,
    ``HalfNumRequired_training + HalfNumRequired_validation`` of the remaining
    files are drawn without replacement; the first ``HalfNumRequired_training``
    go to the training set and the rest to the validation set.

    Args:
        input_path: folder with the processed CINJ tiles.
        output_path_testing: existing testing-set root (read only).
        output_path_training: training-set root.
        output_path_validation: validation-set root.
        HalfNumRequired_training: files per class for the training set.
        HalfNumRequired_validation: files per class for the validation set.

    Raises:
        ValueError: if fewer files than requested remain for a class after
            the testing files are excluded.
    """
    num_per_class = HalfNumRequired_training + HalfNumRequired_validation

    # Select for both classes before copying anything, so that a shortage in
    # one class does not leave the datasets half-populated.
    selections = []
    for label in ('positive', 'negative'):
        src_dir = os.path.join(input_path, label)
        # Each class is checked against its own testing folder and sampled from
        # its own candidate list; the original read the positive folder and the
        # positive list length for the negative class too.
        held_out = set(os.listdir(os.path.join(output_path_testing, label)))
        # os.listdir order is arbitrary; sorting makes a seeded run reproducible
        candidates = sorted(
            name for name in os.listdir(src_dir) if name not in held_out)
        if num_per_class > len(candidates):
            raise ValueError(
                'Need %d %s files but only %d remain in %s after excluding the testing set'
                % (num_per_class, label, len(candidates), src_dir))
        # random.sample already returns its picks in random order
        chosen = random.sample(candidates, num_per_class)
        selections.append((label, src_dir, chosen))

    # copy the selected files to the training and validation sets
    for label, src_dir, chosen in selections:
        _copy_files(src_dir, chosen[:HalfNumRequired_training],
                    os.path.join(output_path_training, label))
        _copy_files(src_dir, chosen[HalfNumRequired_training:],
                    os.path.join(output_path_validation, label))


In [16]:
import os
# CINJ -> dataset 2 (training and validation sets).
# The files already in testing_for_CNN_2 are excluded by the called function.
# Despite its name, main_path_training points at testing_2020 here.
main_path_training = r'\ECSE484_Fall2020\testing_2020'
CINJ_path_processed = os.path.join(main_path_training, 'CINJ_processed_tiles')
input_path = CINJ_path_processed
output_path_testing = r'\ECSE484_Fall2020\testing_for_CNN_2'
output_path_training = r'\ECSE484_Fall2020\training_for_CNN_2'
# A stray '=' at the start of this literal was removed: it pointed at a folder
# that does not match the validation root used by every other dataset-2 cell.
output_path_validation = r'\ECSE484_Fall2020\validation_for_CNN_2'
HalfNumRequired_training = 1750
HalfNumRequired_validation = 375

TransportFilesToDataset_dataset2_CINJ(input_path, output_path_testing,
                                      output_path_training, output_path_validation,
                                      HalfNumRequired_training, HalfNumRequired_validation)

### Testing set
only CINJ is used in testing set<br>
The testing set of both dataset1 and dataset2 is the same 

`TransportFilesToTestDataset` : only for forming testing dataset

In [5]:
import os
import random
import shutil

def TransportFilesToTestDataset(input_path,output_path_testing,HalfNumRequired_testing):
    """Randomly copy ``HalfNumRequired_testing`` files per class into the testing set.

    ``input_path`` and ``output_path_testing`` are the parent directories of a
    'positive' and a 'negative' folder. Files are drawn without replacement, so
    ``random.sample`` raises ValueError when a class folder holds fewer files
    than requested.
    """
    src_positive_dir = os.path.join(input_path, 'positive')
    src_negative_dir = os.path.join(input_path, 'negative')
    dst_positive_dir = os.path.join(output_path_testing, 'positive')
    dst_negative_dir = os.path.join(output_path_testing, 'negative')

    positive_names = os.listdir(src_positive_dir)
    negative_names = os.listdir(src_negative_dir)

    # draw random indices into each listing (positive first, then negative)
    positive_picks = random.sample(range(len(positive_names)), HalfNumRequired_testing)
    negative_picks = random.sample(range(len(negative_names)), HalfNumRequired_testing)

    copy_jobs = (
        (src_positive_dir, positive_names, positive_picks, dst_positive_dir),
        (src_negative_dir, negative_names, negative_picks, dst_negative_dir),
    )
    for src_dir, names, picks, dst_dir in copy_jobs:
        for pick in picks:
            # the destination is a directory; the file keeps its name
            shutil.copy2(os.path.join(src_dir, names[pick]), dst_dir)
        
     

 

In [6]:
import os
# Build the testing set from the processed CINJ tiles.
# Despite its name, main_path_training points at testing_2020 here.
main_path_training = r'\ECSE484_Fall2020\testing_2020'
CINJ_path_processed = os.path.join(main_path_training, 'CINJ_processed_tiles')
input_path = CINJ_path_processed
output_path_testing = r'\ECSE484_Fall2020\testing_for_CNN'

# start from an empty testing root with fresh class folders
if os.path.exists(output_path_testing):
    shutil.rmtree(output_path_testing)
for class_name, tolerate_existing in (('positive', True), ('negative', False)):
    os.makedirs(os.path.join(output_path_testing, class_name), exist_ok=tolerate_existing)

HalfNumRequired_testing = 1500
TransportFilesToTestDataset(input_path, output_path_testing, HalfNumRequired_testing)