In [1]:
import os
import numpy as np
import re
import shutil

# Output dataset folder. Change the suffix for the other datasets:
# _t1_frontal/sagittal/transversal,
# _t1ce_frontal/sagittal/transversal,
# _flair_frontal/sagittal/transversal
model_data_path = '/local/data1/elech646/Tumor_grade_classification/dataset224_t1_flair_single_sagittal'

# Create the <split>/<grade> folders for training/validation/test.
# makedirs(exist_ok=True) creates missing parent folders and also fills in
# grade sub-folders when a split folder already exists but is incomplete
# (a plain "if not exists(split): mkdir ..." would skip them).
for split_name in ('train', 'val', 'test'):
    for grade_name in ('G2', 'G3', 'G4'):
        os.makedirs(os.path.join(model_data_path, split_name, grade_name), exist_ok=True)

In [5]:
# List the entries of each grade's frontal folder. get_images_40 uses these
# entries as per-patient sub-folder names when building its source paths.
#
# os.listdir returns names in arbitrary order, so the listings are sorted:
# the seeded split below only gives the same result if its input order is
# stable across machines and file systems.
# NOTE: sorting can change which patients land in each split compared with
# splits produced from the unsorted listing -- regenerate train/val/test together.
G4 = sorted(os.listdir("/local/data1/elech646/Tumor_grade_classification/resized_single_channel_datasets/resized_t1_flair/G4/frontal_grade_classification"))
G3 = sorted(os.listdir("/local/data1/elech646/Tumor_grade_classification/resized_single_channel_datasets/resized_t1_flair/G3/frontal_grade_classification"))
G2 = sorted(os.listdir("/local/data1/elech646/Tumor_grade_classification/resized_single_channel_datasets/resized_t1_flair/G2/frontal_grade_classification"))

def train_val_test_split(grade):
    """Split a collection of identifiers into train / validation / test lists.

    Roughly 70 % of ``grade`` is drawn for training; roughly 70 % of the
    remainder is drawn for validation; whatever is left is the test set.

    The NumPy global seed is reset to 4 on every call, so the result depends
    only on the content and order of ``grade``. The remainders are built by
    filtering in input order rather than by converting a ``set`` to a list:
    string hashes are randomised per interpreter, so a set-derived order
    would give a different validation/test split after every kernel restart
    even with a fixed seed.

    Parameters
    ----------
    grade : sequence
        Identifiers to split (e.g. the names returned by ``os.listdir``).

    Returns
    -------
    tuple of (list, list, list)
        ``(train, val, test)``; the three lists are pairwise disjoint.
    """
    # Get always the same result (note: this reseeds NumPy's global RNG)
    np.random.seed(4)

    patients = list(grade)

    # Get 70 % as training (approx)
    train_idx = np.random.choice(patients, size=round(0.7 * len(patients)), replace=False)

    # Find the non-training patients, de-duplicated and in input order
    train_set = set(train_idx)
    non_train = [p for p in dict.fromkeys(patients) if p not in train_set]

    # Get 70 % of non-training as validation (approx)
    val_idx = np.random.choice(non_train, size=round(0.7 * len(non_train)), replace=False)

    # Neither training nor validation => test (still in input order)
    val_set = set(val_idx)
    test_idx = [p for p in non_train if p not in val_set]

    return list(train_idx), list(val_idx), test_idx

# Split every grade once and keep all three subsets, so that train,
# validation and test always come from the same partition instead of being
# recomputed in separate runs with lines commented in and out.

# Grade 2
G2_train, G2_val, G2_test = train_val_test_split(G2)

# Grade 3
G3_train, G3_val, G3_test = train_val_test_split(G3)

# Grade 4
G4_train, G4_val, G4_test = train_val_test_split(G4)

In [6]:
def get_images_40(grade_set, grade, new_path,
                  modality_list=('t1', 't1ce', 't1Gd', 't2',
                                 'flair', 't2_flair_flair',
                                 't1_flair_flair'),
                  source_root='/local/data1/elech646/Tumor_grade_classification/resized_single_channel_datasets/resized_t1_flair'):
    """Copy selected sagittal images of the given patients to ``new_path/grade``.

    For every patient folder under
    ``source_root/<grade>/sagittal_grade_classification`` an image is copied
    when both conditions hold:

    * its modality (the file name with the plane prefix and the trailing
      ``_<n>_<n>.png`` / ``_<n>_NA.png`` part stripped) is in ``modality_list``;
    * the last integer in its file name (called the percentage in the
      original comments) lies within the inclusive bounds for ``grade``.

    Parameters
    ----------
    grade_set : iterable of str
        Patient folder names to process; folders that do not exist are skipped.
    grade : str
        'G2', 'G3' or 'G4'. Also the name of the destination sub-folder.
    new_path : str
        Destination root; ``new_path/grade`` must already exist.
    modality_list : collection of str, optional
        Modalities to keep.
    source_root : str, optional
        Root of the resized dataset (contains one folder per grade).

    Raises
    ------
    OSError
        Propagated from ``shutil.copyfile`` / ``os.listdir`` (e.g. when the
        destination folder is missing).
    """
    # Plane exported by this function. Only the sagittal selection is active;
    # frontal/transversal images are never copied, so their folders are not read.
    plane = 'sagittal'

    # Source folders exist per grade; any label other than G2/G3 reads from G4.
    # NOTE(review): an unknown label therefore reads G4 images but uses the
    # G2 bounds below -- behaviour kept from the original if/elif/else chains.
    tumor_grade = grade if grade in ('G2', 'G3') else 'G4'

    # Inclusive bounds on the last number of the file name, per grade.
    # Candidate bounds noted for the other planes (not applied here):
    #   frontal: G4 12-90, G3 6-93, G2 8-93
    #   trans:   G4 17-90, G3 13-91, G2 9-92
    sagittal_bounds = {'G4': (13, 88), 'G3': (16, 92)}
    lower, upper = sagittal_bounds.get(grade, (36, 96))

    # Raw strings: "\d" in a normal string literal is an invalid escape sequence.
    modality_pattern = re.compile(r".+trans_|.+sag_|.+fro_|_\d+_\d+.png|_\d+_NA.png")
    number_pattern = re.compile(r"[0-9]+")

    for train_patient in grade_set:
        path = f'{source_root}/{tumor_grade}/{plane}_grade_classification/{train_patient}'

        # Patients without a folder for this plane are skipped
        if not os.path.isdir(path):
            continue

        for image in os.listdir(path):
            # Extract the name of the modality and check it is wanted
            curr_modality = modality_pattern.sub("", image)
            if curr_modality not in modality_list:
                continue

            # Grab the last number in the file name; files without any
            # number cannot satisfy the bounds, so skip them instead of failing.
            # NOTE(review): for names ending in "_<n>_NA.png" the last number
            # is <n> -- confirm that comparing it to the bounds is intended.
            numbers = number_pattern.findall(image)
            if not numbers:
                continue
            perc = int(numbers[-1])

            if lower <= perc <= upper:
                shutil.copyfile(f'{path}/{image}', f'{new_path}/{grade}/{image}')

In [7]:
# Modality to export -- change accordingly for the modality of choice
modality_list = ['t1_flair_flair']

# Destination root. Mind the directory locations: they change according to
# modality and plane.
save_path = '/local/data1/elech646/Tumor_grade_classification/dataset224_t1_flair_single_sagittal'

# Export the test split of every grade (G2, G3, G4 in that order).
# For the training / validation splits, call get_images_40 the same way with
# the matching patient list and save_path + '/train' or save_path + '/val'.
test_dir = save_path + '/test'
for grade_label, test_patients in (('G2', G2_test), ('G3', G3_test), ('G4', G4_test)):
    get_images_40(test_patients, grade_label, test_dir, modality_list)