In [3]:
import os
import numpy as np
import nibabel as nib
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import json
from sklearn.model_selection import GroupKFold

This code is using an older version of pydicom, which is no longer 
maintained as of Jan 2017.  You can access the new pydicom features and API 
by installing `pydicom` from PyPI.
See 'Transitioning to pydicom 1.x' section at pydicom.readthedocs.org 
for more information.



# Format Data

In [4]:
def permute(image):
    """Move the last axis of a 4-D array to the front.

    The input is wrapped in a ``torch.Tensor``, its axes are reordered from
    ``(a0, a1, a2, a3)`` to ``(a3, a0, a1, a2)``, and the result is returned
    as a NumPy array.
    """
    reordered = torch.Tensor(image).permute(3, 0, 1, 2)
    return reordered.numpy()

In [5]:
# Root of the raw dataset; the cells below read its `imagesTr/` and
# `labelsTr/` sub-folders.
DATA_PATH = '../data/prostate_dataset/raw_data/'
# Root under which one folder of `.npy` slabs is written per patient.
OUT_PATH = '../data/prostate_dataset/processed_data/'

In [6]:
# File names found in the label folder, skipping every entry whose name
# contains '._'.
patient_list = [
    fname
    for fname in os.listdir(DATA_PATH+'labelsTr')
    if '._' not in fname
]

# Number of consecutive slices (along the last axis) stored in each saved slab.
n_slices_width = 5

In [5]:
CROP_SIZE = 256  # maximum in-plane extent (axes 1 and 2) of every saved slab


def _center_crop(volume, size=CROP_SIZE):
    """Center-crop axes 1 and 2 of ``volume`` to at most ``size`` elements.

    Each axis is handled independently and is left untouched when it is
    already ``size`` or smaller. The crop window is ``[start, start + size)``,
    so the result is exactly ``size`` wide even when the excess is odd or a
    single element (a symmetric ``dim:-dim`` slice would give ``size + 1``
    elements, or an empty array when ``dim`` rounds down to 0).

    Parameters
    ----------
    volume : numpy.ndarray
        Array with at least three axes.
    size : int
        Target extent for axes 1 and 2.

    Returns
    -------
    numpy.ndarray
        A view of ``volume`` restricted to the central window.
    """
    for axis in (1, 2):
        excess = volume.shape[axis] - size
        if excess > 0:
            start = excess // 2
            window = [slice(None)] * volume.ndim
            window[axis] = slice(start, start + size)
            volume = volume[tuple(window)]
    return volume


for patient in tqdm(patient_list):
    # Load the image and its segmentation (same file name in both folders).
    img = np.array(nib.load(DATA_PATH+'imagesTr/'+patient).dataobj)
    seg = np.array(nib.load(DATA_PATH+'labelsTr/'+patient).dataobj)
    # Append a singleton axis so the mask is 4-D, as permute() requires.
    seg = seg.reshape(seg.shape[0],seg.shape[1],seg.shape[2],1)

    #permute to feed into 3D CNN (last axis moves to the front)
    img = permute(img)
    seg = permute(seg)

    # One output folder per patient, named after the file minus its last
    # 7 characters (presumably the '.nii.gz' suffix -- confirm).
    patient_dir = OUT_PATH+patient[:-7]
    os.makedirs(patient_dir,exist_ok=True)

    # Cut the last axis into non-overlapping slabs of n_slices_width slices;
    # trailing slices that do not fill a whole slab are not saved.
    for i in range(img.shape[-1]//n_slices_width):
        temp = img[:,:,:,i*n_slices_width:(i+1)*n_slices_width]
        temp_y = seg[:,:,:,i*n_slices_width:(i+1)*n_slices_width]

        # Crop image and mask with the same rule so they stay aligned.
        temp = _center_crop(temp)
        temp_y = _center_crop(temp_y)

        #save
        np.save(patient_dir+f'/{i}_voxels.npy',temp)
        np.save(patient_dir+f'/{i}_labels.npy',temp_y)
    


100%|██████████| 32/32 [00:09<00:00,  3.48it/s]


In [14]:
# Inspect the shape of `temp`, i.e. the last image slab assigned by the
# processing loop.
temp.shape

(2, 256, 256, 5)

# Prepare split tables

In [7]:
# Folder that receives the JSON split tables written by the cells below.
TABLE_PATH = '../data/split_tables/prostate/'
# open(..., 'w') does not create missing parent folders, so make sure the
# target directory exists before any split table is written.
os.makedirs(TABLE_PATH, exist_ok=True)

In [8]:
# os.listdir() returns entries in arbitrary order; sorting makes the record
# order (and hence the generated split tables) reproducible across runs.
# Entries whose name contains a '.' are skipped.
# OUT_PATH[1:] drops the first character of OUT_PATH, so the stored paths
# start with './' instead of '../'; the next cell prepends '.' again to reach
# the folders from this notebook.
patient_list = [OUT_PATH[1:]+name for name in sorted(os.listdir(OUT_PATH)) if '.' not in name]
print(f'Total number of patients: {len(patient_list)}')

Total number of patients: 32


In [8]:
# Build two parallel arrays: `records` holds the path of every '*voxels*'
# file and `patient_arr` holds, at the same index, the patient folder the
# file belongs to (used as the group key when splitting).
patient_arr = []
records = []
for patient in patient_list:
    # List each folder once (sorted for a reproducible order) so both arrays
    # are derived from the same snapshot of the directory.
    voxel_files = sorted(fname for fname in os.listdir('.'+patient) if 'voxels' in fname)
    records += [patient+'/'+fname for fname in voxel_files]
    patient_arr += [patient]*len(voxel_files)

records = np.array(records)
patient_arr = np.array(patient_arr)

In [9]:
# Hold out a test set: GroupKFold (grouped by patient_arr) yields two folds
# and only the first (train, test) index pair is used.
kf = GroupKFold(n_splits=2)
train, test = next(kf.split(records,records,patient_arr))

# Persist the held-out records under the 'test' key.
records_test = records[test]
split = dict(test=records_test.tolist())
with open(f'{TABLE_PATH}test_split_table.json', 'w') as outfile:
    json.dump(split, outfile)

# Only the remaining half is carried forward to the following cells.
# NOTE: `records` and `patient_arr` are overwritten here, so re-running this
# cell shrinks them again.
patient_arr, records = patient_arr[train], records[train]

In [10]:
# NOTE(review): this cell repeats the test-split cell above statement for
# statement. It splits the records that survived that cell in two again,
# rewrites test_split_table.json with the new 'test' half (replacing what was
# written before) and keeps only the other half in `records`/`patient_arr`.
# The original heading read "create train and validation", but no train/val
# table is written here -- confirm that this second hold-out is intended.
kf = GroupKFold(n_splits=2)
train, test = next(iter(kf.split(records,records,patient_arr)))

records_test = records[test]
split = {'test': records_test.tolist()}

with open(f'{TABLE_PATH}test_split_table.json', 'w') as outfile:
    json.dump(split, outfile)

# Carry forward only the records (and their patient keys) not written above.
records, patient_arr = records[train], patient_arr[train]

In [11]:
#create train and validation
# Number of training patients for each generated table ({i}_split_table.json).
n_patients = [2,4,8]
# The last N_VAL_PATIENTS patients (in sorted order) form the validation set.
N_VAL_PATIENTS = 2

patients_unique = np.unique(patient_arr)


def _records_for(patients):
    """Return the records of ``patients``, grouped in the order given.

    Reads the notebook-level ``records`` and ``patient_arr`` arrays. An empty
    ``patients`` sequence yields an empty array (np.concatenate rejects an
    empty list, hence the explicit branch).
    """
    chunks = [records[patient_arr==patient] for patient in patients]
    if not chunks:
        return np.empty(0, dtype=records.dtype)
    return np.concatenate(chunks)


# The validation set is identical for every table, so build it once.
val_patients = patients_unique[-N_VAL_PATIENTS:]
val_records = _records_for(val_patients.tolist())

# Training patients are drawn only from patients outside the validation set.
# Slicing patients_unique[:i] directly would include validation patients as
# soon as i exceeds len(patients_unique) - N_VAL_PATIENTS (train/val leakage).
train_candidates = patients_unique[:-N_VAL_PATIENTS]

for i in n_patients:

    train_patients = train_candidates[:i]
    if len(train_patients) < i:
        print(f'Warning: requested {i} training patients but only '
              f'{len(train_patients)} are available outside the validation set; '
              f'{i}_split_table.json uses {len(train_patients)}.')
    train_records = _records_for(train_patients.tolist())

    split = {
            'train': train_records.tolist(),
            'val': val_records.tolist(),
            # every record still in the pool, validation patients included
            'pretrain': records.tolist(),
        }
    with open(f'{TABLE_PATH}{i}_split_table.json', 'w') as outfile:
            json.dump(split, outfile)


In [12]:
#create UB
# Writes UB_split_table.json: the first half of `patients_unique` is used for
# training and the second half for validation ("UB" presumably stands for
# upper bound -- confirm).


def _records_for(patients):
    """Return the records of ``patients``, grouped in the order given.

    Reads the notebook-level ``records`` and ``patient_arr`` arrays. All
    per-patient selections are joined with one np.concatenate call instead of
    repeatedly appending to a float64 ``np.empty(0)`` seed, which recopied the
    array on every iteration and made NumPy promote float64 and string dtypes.
    An empty ``patients`` sequence yields an empty array.
    """
    chunks = [records[patient_arr==patient] for patient in patients]
    if not chunks:
        return np.empty(0, dtype=records.dtype)
    return np.concatenate(chunks)


half = patients_unique.shape[0]//2

train_patients = patients_unique[:half]
train_records = _records_for(train_patients.tolist())

val_patients = patients_unique[half:]
val_records = _records_for(val_patients.tolist())

split = {
        'train': train_records.tolist(),
        'val': val_records.tolist(),
        }
with open(f'{TABLE_PATH}UB_split_table.json', 'w') as outfile:
    json.dump(split, outfile)
