In [1]:
#mount Google Drive under /content/gdrive/ and authenticate the Colab user
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default
#authorize a gspread client with the default credentials of the authenticated user
creds, _ = default()
#NOTE(review): `gc` is not used in the visible cells and shares its name with the stdlib `gc` module -- confirm it is still needed
gc = gspread.authorize(creds)

Mounted at /content/gdrive/


In [2]:
!pip install --quiet SimpleITK

[K     |████████████████████████████████| 52.8 MB 183 kB/s 
[?25h

In [3]:
#import libraries
import glob
import os
import pandas as pd
import numpy as np
import shutil
from shutil import copyfile
import SimpleITK as sitk

In [None]:
#identify the patients to remove

#specify the path to the CTAs and SegGT (Both Old and New)
path_CTA = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/CTA/'
path_SegGT = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/SegGT/'
hospital = 'NMH'

#get the save paths: '<dir>_New/' and '<dir>_Old/' next to each output directory
#rstrip('/') rather than [:-1] so a path written without the trailing slash does not lose its last character
path_CTA_new = path_CTA.rstrip('/') + '_New/'
path_CTA_old = path_CTA.rstrip('/') + '_Old/'
path_SegGT_new = path_SegGT.rstrip('/') + '_New/'
path_SegGT_old = path_SegGT.rstrip('/') + '_Old/'

#identify the samples which may cause issues (from the old dataset)
path_to_old = sorted(os.listdir(path_CTA_old))
#keep the entries that mention the hospital and reduce each to its patient ID;
#the '-<hospital>' tag is built from `hospital` so the filter and the stripping cannot drift apart
ls = [
  patient.replace('_MAIN.nii.gz', '').replace('-' + hospital, '')
  for patient in path_to_old
  if hospital in patient
]

#patients to remove as per the info on the old set
print(ls)

['GEO-PI-002', 'GEO-PI-003', 'GEO-PII-001', 'GEO-PII-004']


In [None]:
#helper functions
def find_files(path, ext = '.nii.gz'):
  """Return the full paths of the entries of `path` whose names end with `ext`.

  Entries are taken in sorted name order and each one is joined onto `path`.
  """
  matching = [entry for entry in sorted(os.listdir(path)) if entry.endswith(ext)]
  return [os.path.join(path, entry) for entry in matching]

def set_frames(path1, path2):
  """Build a two-column table pairing the '.nii.gz' files of two directories.

  The sorted listing of `path1` fills 'Original CTA' and the sorted listing of
  `path2` fills 'Original SegGT'. Files are paired purely by position, and zip
  stops at the shorter listing, so surplus files in the longer one are ignored.
  """
  ct_paths = find_files(path1, '.nii.gz')
  gt_paths = find_files(path2, '.nii.gz')
  pairs = [[ct_path, gt_path] for ct_path, gt_path in zip(ct_paths, gt_paths)]
  return pd.DataFrame(pairs, columns = ['Original CTA', 'Original SegGT'])

In [None]:
#get the frames: one table of (CTA, SegGT) path pairs for the new data and one for the old data
new_df = set_frames(path_CTA_new, path_SegGT_new)
old_df = set_frames(path_CTA_old, path_SegGT_old)

In [None]:
#prepare the new data
def prepare_new(row, path_CTA = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/CTA/', path_SegGT = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/SegGT/'):
  """Derive the metadata and output paths for one row of the new dataset.

  The file name of row['Original CTA'] is cut at its last '-' to give the save
  name. The save name must consist of exactly four '-'-separated fields; the
  second is the status code and the fourth is the hospital.

  Args:
    row: mapping with an 'Original CTA' entry holding the path of the CTA file.
    path_CTA: prefix (including the trailing separator) for the output CTA path.
    path_SegGT: prefix (including the trailing separator) for the output SegGT path.

  Returns:
    Tuple (status, hospital, save_name, CTA output path, SegGT output path).
    Status 'PI' is reported as 'Elective Repair' and 'PII' as 'Surveillance';
    any other code is returned unchanged.

  Raises:
    ValueError: if the file name contains no '-' or the save name does not
      split into exactly four fields.
  """
  #file name without its directories; basename also accepts a bare file name
  patient = os.path.basename(row['Original CTA'])
  #everything before the last '-' is the save name
  save_name = patient[:patient.rindex('-')]
  #exactly four fields are expected; only the second and fourth are used
  _, status, _, hospital = save_name.split('-')
  #translate the known status codes, pass anything else through
  status = {'PI': 'Elective Repair', 'PII': 'Surveillance'}.get(status, status)
  return status, hospital, save_name, path_CTA + save_name + '.nii.gz', path_SegGT + save_name + '.nii.gz'

In [None]:
#new: apply prepare_new to every row and spread the returned tuples over the Status, Hospital, Patient, CTA and SegGT columns
new_df['Status'], new_df['Hospital'], new_df['Patient'], new_df['CTA'], new_df['SegGT'] = zip(*new_df.apply(prepare_new, axis = 1))

In [None]:
#prepare the old data
def prepare_old(row, path_CTA = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/CTA/', path_SegGT = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/SegGT/'):
  """Derive the metadata and output paths for one row of the old dataset.

  The file name of row['Original CTA'] is cut at its last '_' to give the save
  name. The save name must consist of exactly four '-'-separated fields; the
  second is the status code and the fourth is the hospital.

  Args:
    row: mapping with an 'Original CTA' entry holding the path of the CTA file.
    path_CTA: prefix (including the trailing separator) for the output CTA path.
    path_SegGT: prefix (including the trailing separator) for the output SegGT path.

  Returns:
    Tuple (status, hospital, save_name, CTA output path, SegGT output path).
    Status 'PI' is reported as 'Elective Repair' and 'PII' as 'Surveillance';
    any other code is returned unchanged.

  Raises:
    ValueError: if the file name contains no '_' or the save name does not
      split into exactly four fields.
  """
  #file name without its directories; basename also accepts a bare file name
  patient = os.path.basename(row['Original CTA'])
  #everything before the last '_' is the save name
  save_name = patient[:patient.rindex('_')]
  #exactly four fields are expected; only the second and fourth are used
  _, status, _, hospital = save_name.split('-')
  #translate the known status codes, pass anything else through
  status = {'PI': 'Elective Repair', 'PII': 'Surveillance'}.get(status, status)
  return status, hospital, save_name, path_CTA + save_name + '.nii.gz', path_SegGT + save_name + '.nii.gz'

In [None]:
#old: apply prepare_old to every row and spread the returned tuples over the Status, Hospital, Patient, CTA and SegGT columns
old_df['Status'], old_df['Hospital'], old_df['Patient'], old_df['CTA'], old_df['SegGT'] = zip(*old_df.apply(prepare_old, axis = 1))

In [None]:
#combine the new rows followed by the old rows into one table with a fresh index
#NOTE(review): the 'CTA'/'SegGT' output paths are built from the save name only, so a patient present in both
#frames gets the same output path twice and a row-by-row export would let the later row overwrite the earlier
#one -- confirm whether the duplicated patients should be dropped before exporting
df = pd.concat([new_df, old_df]).reset_index(drop = True)

In [None]:
#export all the data
#create the output directories; makedirs also creates missing parent directories and
#exist_ok=True makes the cell safe to re-run without a separate isdir check
os.makedirs(path_CTA, exist_ok = True)
os.makedirs(path_SegGT, exist_ok = True)
#export
def export(row):
  """Copy the images of one row to their output locations.

  Reads the image at row['Original CTA'] and writes it to row['CTA'], then
  reads row['Original SegGT'] and writes it to row['SegGT'].
  """
  for source_col, target_col in (('Original CTA', 'CTA'), ('Original SegGT', 'SegGT')):
    image = sitk.ReadImage(row[source_col])
    sitk.WriteImage(image, row[target_col])

In [None]:
#export every row; the result of apply is discarded because only the written files matter
_ = df.apply(export, axis = 1)

In [41]:
#save the combined table as a pickle inside the dataset directory
df.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/dataset.pkl')

In [None]:
#now need to construct the model inputs around the splitting!
#want z-normalization
#masks of ILT, AAA, calcifications  do not need, aorta
#masks of AAA, calcifications, ILT
#masks of AAA only?
#cropped regions of the same
#how many images have all 4?

In [23]:
#image resampling
def resample_image_standardize(itk_image, out_size = (64,64,64), is_label = False):
  """Resample `itk_image` onto a grid of `out_size` voxels.

  Along each of the three axes the output spacing is
  original_size * original_spacing / out_size, so size * spacing is the same
  before and after. Origin and direction are copied from the input image.

  Args:
    itk_image: image to resample.
    out_size: number of voxels per axis of the output.
    is_label: if True use nearest-neighbour interpolation, otherwise B-spline.

  Returns:
    The result of running the configured ResampleImageFilter on `itk_image`.
  """
  in_spacing = itk_image.GetSpacing()
  in_size = itk_image.GetSize()
  #per-axis spacing that spreads out_size voxels over the original size * spacing
  out_spacing = [in_size[axis] * (in_spacing[axis] / out_size[axis]) for axis in range(3)]

  #nearest neighbour for label images, B-spline for everything else
  interpolator = sitk.sitkNearestNeighbor if is_label else sitk.sitkBSpline

  resampler = sitk.ResampleImageFilter()
  resampler.SetOutputSpacing(out_spacing)
  resampler.SetOutputOrigin(itk_image.GetOrigin())
  resampler.SetSize(out_size)
  resampler.SetOutputDirection(itk_image.GetDirection())
  #default-constructed transform, i.e. no extra spatial transform is configured here
  resampler.SetTransform(sitk.Transform())
  resampler.SetInterpolator(interpolator)
  return resampler.Execute(itk_image)

#generate a binary mask
def binarize(lower, upper, image, binary_filter):
  """Set the lower and upper thresholds on `binary_filter` and run it on `image`.

  The filter object passed in is modified by the two setter calls; the value
  returned is whatever `binary_filter.Execute(image)` produces.
  """
  binary_filter.SetLowerThreshold(lower)
  binary_filter.SetUpperThreshold(upper)
  return binary_filter.Execute(image)

In [19]:
#reload the saved table and take its first row as a working example
df = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-DICOM/dataset/dataset.pkl')
row = df.iloc[0]

In [20]:
#params: target grid size handed to the resampling helpers
out_size = (64, 64, 64)
#display the example row
row

Original CTA      /content/gdrive/MyDrive/AAA_Project/Masters-Th...
Original SegGT    /content/gdrive/MyDrive/AAA_Project/Masters-Th...
Status                                              Elective Repair
Hospital                                                        NMH
Patient                                              GEO-PI-003-NMH
CTA               /content/gdrive/MyDrive/AAA_Project/Masters-Th...
SegGT             /content/gdrive/MyDrive/AAA_Project/Masters-Th...
Name: 0, dtype: object

In [29]:
#crop the data
def cropper(CTA, GT, AAA, out_size):
  """Crop CTA and AAA to the bounding box of label 1 in GT and resample both crops.

  Args:
    CTA: image to crop.
    GT: label image whose label 1 defines the bounding box.
    AAA: second image to crop with the same box.
    out_size: target size passed to resample_image_standardize.

  Returns:
    Tuple (resampled CTA crop, resampled AAA crop, CTA crop, AAA crop). The CTA
    crop is resampled with is_label False and the AAA crop with is_label True.
  """
  shape_stats = sitk.LabelShapeStatisticsImageFilter()
  shape_stats.Execute(GT)
  #bounding box of label 1 in pixel coordinates
  box = shape_stats.GetBoundingBox(1)
  #the first half of the box is passed as the ROI index and the second half as the ROI size
  half = len(box) // 2
  roi_index, roi_size = box[:half], box[half:]
  CTA_crop = sitk.RegionOfInterest(CTA, roi_size, roi_index)
  AAA_crop = sitk.RegionOfInterest(AAA, roi_size, roi_index)
  #standardize both crops to out_size
  CTA_resampled = resample_image_standardize(CTA_crop, out_size, False)
  AAA_resampled = resample_image_standardize(AAA_crop, out_size, True)
  return CTA_resampled, AAA_resampled, CTA_crop, AAA_crop

In [33]:
#process one row (the example `row`): build the resampled and cropped inputs
#read the exported CTA
CTA = sitk.ReadImage(row['CTA'])
#read the exported ground-truth segmentation
GT = sitk.ReadImage(row['SegGT'])

#one threshold filter, reused for every mask below (binarize resets its bounds on each call)
binary_filter = sitk.BinaryThresholdImageFilter()
#z-norm filter
z = sitk.NormalizeImageFilter()
#resample the whole CTA to out_size
CTA_64 = resample_image_standardize(CTA, out_size, is_label = False)
#z-norm after resampling
CTA_64 = z.Execute(CTA_64) #step 1

#BB-AAA

#binarize the AAA, ILT, and calcifications (labels 2 through 4 of GT)
AAA_ILT_Calc = binarize(2, 4, GT, binary_filter)
#binarize the AAA only (label 2 of GT)
AAA = binarize(2, 2, GT, binary_filter)
#resample the AAA, ILT, and calcifications mask to out_size
AAA_ILT_Calc_64 = resample_image_standardize(AAA_ILT_Calc, out_size, is_label = True) #step 1
#crop the CTA and the AAA mask to the bounding box of the AAA_ILT_Calc mask, then resample the crops
crop_CTA_64, crop_AAA_64, crop_CTA, crop_AAA = cropper(CTA, AAA_ILT_Calc, AAA, out_size) #step 2

#z-norm the cropped CTA after resampling
crop_CTA_64 = z.Execute(crop_CTA_64) #step 1

#Control (no insertion of prediction): the uncropped AAA mask resampled to out_size
AAA_64 = resample_image_standardize(AAA, out_size, is_label = True)

#return
#CTA_64, crop_CTA_64, AAA_ILT_Calc_64, crop_CTA_64, crop_AAA_64, crop_CTA, crop_AAA, AAA_64 



In [39]:
#size of the source image this row was exported from
CTA = sitk.ReadImage(row['Original CTA'])
CTA.GetSize()

(42, 42, 46)

In [40]:
#size of the exported copy for the same row
#NOTE(review): the recorded output here is (66, 49, 192) while row['Original CTA'] was recorded as (42, 42, 46),
#although export() writes the source image unchanged -- verify that no other row wrote to the same row['CTA'] path
CTA = sitk.ReadImage(row['CTA'])
CTA.GetSize()

(66, 49, 192)