The code here is a lot simpler than Resampling-HeadNeck since the CT, PET and Mask files are in a nicer format.
All we need to do is to resample them to the required size and convert them into numpy arrays.
Note that the TIFF files are read by SITK's ImageSeriesReader in the wrong order, so we must manually sort the files before reading.

In [2]:

import pydicom
import SimpleITK as sitk
import os
import time
import matplotlib.pyplot as plt
import numpy as np

In [3]:
#read files from directory

#per-patient input locations, filled while walking the dataset
#every entry (CT, PET and mask alike) is the path of a directory
ct_vols, pet_vols, mask_vols = [], [], []

    
def get_processed_path(path):
    """Return `path` with the abridged-dataset folder name swapped for the processed one."""
    abridged_name = 'Breast Cancer Scans-Abridged'
    processed_name = 'Breast Cancer Scans-Processed'
    return path.replace(abridged_name, processed_name)

#walk the dataset once and record every CT / PET_before / mask directory
#sibling folders are visited in the same os.walk step, so the three lists stay
#index-aligned as long as every patient folder contains all three of them
for root, dirs, files in os.walk('/home/jzhe0882/datasets/Breast Cancer Scans-Abridged'):
    for directory in dirs:
        directory_path = os.path.join(root, directory)

        if directory == 'CT':
            ct_vols.append(directory_path)

        elif directory == 'PET_before':
            pet_vols.append(directory_path)

        elif directory == 'mask':
            mask_vols.append(directory_path)

        #we only need resampled images in nrrd format for CT and PET for radiomics processing
        if directory in ('CT', 'PET_before'):
            processed_dir = get_processed_path(directory_path)
            #exist_ok replaces the separate isdir check (no check-then-create race)
            os.makedirs(processed_dir, exist_ok=True)

print('num ct:', len(ct_vols))
print('num pet:', len(pet_vols))
print('num mask:', len(mask_vols))

num ct: 160
num pet: 160
num mask: 160


In [4]:
#make sure the pet/ct/mask arrays are in the same order

def up_folder(path):
    """Return the absolute path of the directory that contains `path`."""
    return os.path.abspath(os.path.join(path, '..'))

#zip() stops at the shortest list, so a missing folder must be caught explicitly
assert len(ct_vols) == len(pet_vols) == len(mask_vols), 'CT/PET/mask counts differ'

for ct, pet, mask in zip(ct_vols, pet_vols, mask_vols):
    #the three folders at the same index must share one parent (patient) directory
    assert up_folder(ct) == up_folder(pet), (ct, pet)
    assert up_folder(ct) == up_folder(mask), (ct, mask)

In [6]:
#calculate the average resolution/origins/spacings of all the breast cancer scans

def get_volume_data(file_paths):
    """Collect size, origin and spacing statistics over a set of DICOM series.

    Args:
        file_paths: iterable of directories, each holding one DICOM series.

    Returns:
        A 6-tuple of length-3 arrays
        (size mean, size std, origin mean, origin std, spacing mean, spacing std),
        computed component-wise across all volumes. With no input paths every
        entry is NaN.

    Raises:
        AssertionError: if a volume's direction matrix is not the identity.
    """
    ref_direction = (1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) #every volume should have the same direction

    #collect rows in plain lists; re-concatenating arrays per volume is quadratic
    sizes, origins, spacings = [], [], []

    for path in file_paths:
        reader = sitk.ImageSeriesReader()
        reader.SetFileNames(reader.GetGDCMSeriesFileNames(path))
        #NOTE: Execute() loads the whole series, pixel data included
        volume = reader.Execute()

        sizes.append(volume.GetSize())
        origins.append(volume.GetOrigin())
        spacings.append(volume.GetSpacing())

        assert volume.GetDirection() == ref_direction, \
            'unexpected direction {} in {}'.format(volume.GetDirection(), path)

    def as_rows(values):
        #keep the (n, 3) shape even when no volume was read
        return np.array(values, dtype=float).reshape(len(values), 3)

    sizes, origins, spacings = as_rows(sizes), as_rows(origins), as_rows(spacings)

    return np.mean(sizes, axis=0), np.std(sizes, axis=0), \
            np.mean(origins, axis=0), np.std(origins, axis=0), \
            np.mean(spacings, axis=0), np.std(spacings, axis=0)

#print arrays with two decimals
np.set_printoptions(precision=2)

#(mean, std) of resolution, origin and spacing for each modality
pet_stats = get_volume_data(pet_vols)
ct_stats = get_volume_data(ct_vols)

pet_res_mean, pet_res_std, pet_ori_mean, pet_ori_std, pet_spa_mean, pet_spa_std = pet_stats
ct_res_mean, ct_res_std, ct_ori_mean, ct_ori_std, ct_spa_mean, ct_spa_std = ct_stats

print('PET resolution: {} {}, origin: {} {}, spacing: {} {}'.format(*pet_stats))
print('CT resolution: {} {}, origin: {} {}, spacing: {} {}'.format(*ct_stats))



PET resolution: [171.2  171.2  129.09] [ 9.6   9.6  57.07], origin: [-347.78 -513.97 -661.9 ] [ 20.12  19.9  132.55], spacing: [4.06 4.06 4.45] [0.   0.   0.81]
CT resolution: [512.   512.   119.97] [ 0.    0.   51.62], origin: [-339.34 -505.34 -661.64] [ 29.94  29.94 132.71], spacing: [1.33 1.33 4.66] [0.12 0.12 0.47]


In [6]:
#derive the reference ("ideal") spatial attributes used for resampling
#the printed volumes should all be approximately equal

pet_extent = pet_res_mean * pet_spa_mean
ct_extent = ct_res_mean * ct_spa_mean
print('pet avg volume', np.prod(pet_extent))
print('ct avg volume', np.prod(ct_extent))

#fixed 128^3 grid at the mean PET origin; the spacing is stretched so that
#the grid spans the same physical extent as the mean PET volume
ideal_ori = pet_ori_mean
ideal_res = np.array([128, 128, 128])
res_ratio = ideal_res / pet_res_mean
ideal_spa = pet_spa_mean / res_ratio

ideal_extent = ideal_res * ideal_spa
print('ideal volume', np.prod(ideal_extent))
print('ideal resolution', ideal_res)
print('ideal spacing', ideal_spa)
print('ideal origin', ideal_ori)

pet avg volume 278048364.8815038
ct avg volume 258312173.74999997
ideal volume 278048364.88150376
ideal resolution [128 128 128]
ideal spacing [5.43522578 5.43522578 4.4880249 ]
ideal origin [-347.77678757 -513.96842575 -661.896875  ]


In [27]:
#resample and convert into numpy arrays

def resample_volume(source_path, mode, reference_volume=None):
    """Load one volume, resample it onto the shared reference grid and return it as a numpy array.

    The target grid comes from the notebook-level `ideal_res` (size), `ideal_spa`
    (spacing) and `ideal_ori` (origin); the direction is taken from the loaded volume.

    Args:
        source_path: directory holding the DICOM series, or the TIFF slices when
            mode is 'mask'.
        mode: 'mask' reads TIFF slices and resamples with nearest-neighbour
            interpolation; any other value reads a DICOM series and resamples
            linearly. For 'ct' and 'pet' the resampled volume is also written
            as <patient>.nrrd, <patient> being the name of source_path's parent folder.
        reference_volume: optional directory of a DICOM series whose spatial
            information is copied onto the loaded volume before resampling.

    Returns:
        The resampled volume as a numpy array, transposed because sitk volumes
        and numpy arrays index their dimensions in reverse order.

    Raises:
        AssertionError: if the loaded volume and the reference series differ in size.
    """

    def get_tiff_files(directory):
        tiff_files = [os.path.join(root, file)
                      for root, dirs, files in os.walk(directory)
                      for file in files]

        #manually sort the tiff files since SITK's image series reader reads them in the wrong order
        #sort numerically by slice number: ../.../2.tiff -> 2.tiff -> 2
        #(an integer key also keeps 1000.tiff after 999.tiff, unlike a zero-padded string)
        return sorted(tiff_files, key=lambda f: int(os.path.splitext(os.path.basename(f))[0]))

    reader = sitk.ImageSeriesReader()

    #masks are in a .tiff format, which can't be read as a DCM series
    if mode == 'mask':
        files = get_tiff_files(source_path)
    else:
        files = reader.GetGDCMSeriesFileNames(source_path)

    reader.SetFileNames(files)
    volume = reader.Execute()

    #assign spatial attributes to mask files, as the spatial attributes in tiff format can't be read
    if reference_volume is not None:
        #NOTE: Execute() loads the full reference series, pixel data included
        ref_reader = sitk.ImageSeriesReader()
        ref_reader.SetFileNames(ref_reader.GetGDCMSeriesFileNames(reference_volume))
        reference_image = ref_reader.Execute()

        #masks should be registered with CT images
        assert volume.GetSize() == reference_image.GetSize()

        volume.CopyInformation(reference_image)

    resampler = sitk.ResampleImageFilter()

    #nearest neighbour keeps mask labels intact; intensities are interpolated linearly
    if mode == 'mask':
        resampler.SetInterpolator(sitk.sitkNearestNeighbor)
    else:
        resampler.SetInterpolator(sitk.sitkLinear)

    #start from the volume's own geometry (this keeps its direction), then override the grid
    #values are converted to plain python numbers before being handed to SimpleITK
    resampler.SetReferenceImage(volume)
    resampler.SetSize(tuple(int(n) for n in ideal_res))
    resampler.SetOutputSpacing(tuple(float(s) for s in ideal_spa))
    resampler.SetOutputOrigin(tuple(float(o) for o in ideal_ori))
    new_volume = resampler.Execute(volume)

    #radiomics analysis requires sitk volumes as input; doesnt accept numpy arrays
    if mode in ('ct', 'pet'):
        filename = os.path.basename(os.path.abspath(os.path.join(source_path, '..'))) + '.nrrd'
        output_dir = os.path.join('/home/jzhe0882/datasets/Breast Cancer Scans-Processed', mode.upper())
        #the writer does not create missing folders
        os.makedirs(output_dir, exist_ok=True)

        writer = sitk.ImageFileWriter()
        writer.SetFileName(os.path.join(output_dir, filename))
        writer.Execute(new_volume)

    #convert into numpy arrays
    array = sitk.GetArrayFromImage(new_volume)
    return np.transpose(array) #sitk volumes and np array dimensions are indexed in reverse order

base_path = '/home/jzhe0882/numpydata/BreastCancer'

#np.save does not create missing folders
for modality in ('PET', 'CT', 'Mask'):
    os.makedirs(os.path.join(base_path, modality), exist_ok=True)

#zip() would silently drop trailing entries if the lists were uneven
assert len(ct_vols) == len(pet_vols) == len(mask_vols), 'CT/PET/mask counts differ'

num_sampled = 0
for ct_dir, pet_dir, mask_dir in zip(ct_vols, pet_vols, mask_vols):

    #e.g. 10558339.npy
    file_name = os.path.basename(os.path.abspath(os.path.join(ct_dir, '..'))) + '.npy'

    pet_path = os.path.join(base_path, 'PET', file_name)
    ct_path = os.path.join(base_path, 'CT', file_name)
    mask_path = os.path.join(base_path, 'Mask', file_name)

    #the mask is resampled first so that CT/PET are only processed when it is kept
    mask = resample_volume(mask_dir, 'mask', ct_dir)

    #the mask must contain exactly the labels 0 and 1
    #NOTE(review): a mask stored with other values (e.g. 0/255) is reported as 'no tumour' - confirm
    if np.array_equal(np.unique(mask), [0, 1]): #only save volumes with malignant tumours
        ct = resample_volume(ct_dir, 'ct')
        pet = resample_volume(pet_dir, 'pet')

        np.save(mask_path, mask)
        np.save(pet_path, pet)
        np.save(ct_path, ct)

        num_sampled += 1

        if num_sampled % 20 == 0:
            print('resampled', num_sampled)
    else:
        print(file_name, 'has no tumour')

print('resampled', num_sampled)

10686803.nrrd
10686803.nrrd
resampled 1


In [14]:
def get_tiff_files(source_path):
    """Return every file below `source_path`, ordered by the integer in its file name.

    Args:
        source_path: directory that holds the slices, named <number>.<ext>.

    Returns:
        List of file paths sorted by slice number (1, 2, ..., 10, ..., 999, 1000).

    Raises:
        ValueError: if a file name (without extension) is not an integer.
    """
    tiff_files = [os.path.join(root, file)
                  for root, dirs, files in os.walk(source_path)
                  for file in files]

    #manually sort the tiff files since SITK's image series reader reads them in the wrong order
    #sort numerically: ../.../2.tiff -> 2.tiff -> 2
    #(a zero-padded 3-digit string key would place 1000.tiff before 999.tiff)
    return sorted(tiff_files, key=lambda f: int(os.path.splitext(os.path.basename(f))[0]))

# Put a volume through 3D slicer to see how the mask coincides with the segmentation. 
#check the mask of 10666779
def output_tiff_to_nrrd(patient_name='10666779',
                        dataset_dir='/home/jzhe0882/datasets/Breast Cancer Scans-Abridged',
                        output_dir='/home/jzhe0882/SegmentationOutput'):
    """Write a patient's TIFF mask as <patient_name>.nrrd carrying the CT's spatial information.

    Args:
        patient_name: name of the patient folder inside `dataset_dir`.
        dataset_dir: dataset root holding <patient_name>/CT and <patient_name>/mask.
        output_dir: folder that receives the .nrrd file.
    """
    reader = sitk.ImageSeriesReader()

    mask_files = get_tiff_files(os.path.join(dataset_dir, patient_name, 'mask'))
    ct_files = reader.GetGDCMSeriesFileNames(os.path.join(dataset_dir, patient_name, 'CT'))

    reader.SetFileNames(ct_files)
    ct_volume = reader.Execute()

    reader.SetFileNames(mask_files)
    mask_volume = reader.Execute()

    #the tiff slices carry no usable spatial attributes, so take them from the CT
    mask_volume.CopyInformation(ct_volume)

    writer = sitk.ImageFileWriter()
    writer.SetFileName(os.path.join(output_dir, '{}.nrrd'.format(patient_name)))
    writer.Execute(mask_volume)
    
def output_numpy_to_nrrd(patient_name='10666779'):
    """Write a patient's resampled numpy mask as <patient_name>.nrrd.

    The saved array was resampled onto the grid given by the notebook-level
    `ideal_ori` (origin) and `ideal_spa` (spacing) and keeps the direction of
    the CT, so those attributes are restored here before writing.

    Args:
        patient_name: patient folder name, which is also the stem of the .npy file.
    """
    reader = sitk.ImageSeriesReader()
    ct_files = reader.GetGDCMSeriesFileNames(os.path.join(
        '/home/jzhe0882/datasets/Breast Cancer Scans-Abridged', patient_name, 'CT'))
    reader.SetFileNames(ct_files)
    ct_volume = reader.Execute()

    mask_array = np.load(
        os.path.join('/home/jzhe0882/numpydata/BreastCancer/Mask', '{}.npy'.format(patient_name)))

    #the array was transposed when it was saved; transpose back for sitk
    mask_volume = sitk.GetImageFromArray(np.transpose(mask_array))

    #numpy arrays carry no spatial attributes; re-attach the resampling grid
    mask_volume.SetOrigin(tuple(float(o) for o in ideal_ori))
    mask_volume.SetSpacing(tuple(float(s) for s in ideal_spa))
    mask_volume.SetDirection(ct_volume.GetDirection())

    #NOTE: this is the same output file that output_tiff_to_nrrd writes for this patient
    writer = sitk.ImageFileWriter()
    writer.SetFileName(os.path.join('/home/jzhe0882/SegmentationOutput', '{}.nrrd'.format(patient_name)))
    writer.Execute(mask_volume)
    
#export the resampled numpy mask (the function is named output_numpy_to_nrrd)
output_numpy_to_nrrd()