In [None]:
# Mount Google Drive and authenticate the Colab user so that both the Drive
# file tree and Google Sheets (through gspread) are reachable from this notebook.
from google.colab import drive
# force_remount=True re-mounts the drive even when it is already mounted
drive.mount('/content/gdrive/', force_remount=True)
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default
# default() yields a (credentials, second value) pair; only the credentials are used
creds, _ = default()
# gc is the notebook-global gspread client; quality_check_sheet() reads it as a global
gc = gspread.authorize(creds)

Mounted at /content/gdrive/


In [None]:
#pip installations
!pip install --quiet SimpleITK

[K     |████████████████████████████████| 52.8 MB 16.8 MB/s 
[?25h

In [None]:
#imports
import SimpleITK as sitk
import numpy as np
import os
import sklearn
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
#functions
#resampling
def resample_image_standardize_3D(itk_image, out_size = (64,64,64), is_label = False):
  """Resample a 3D SimpleITK image onto a grid of `out_size` voxels.

  The output spacing is chosen per axis as size * spacing / out_size, so the
  resampled grid spans the same physical extent as the input. Origin and
  direction are copied from the input image.

  Args:
    itk_image: SimpleITK image to resample (its first three axes are used).
    out_size: target number of voxels per axis.
    is_label: when True use nearest-neighbour interpolation (keeps label
      values intact); otherwise use B-spline interpolation.

  Returns:
    The resampled SimpleITK image.
  """
  in_size = itk_image.GetSize()
  in_spacing = itk_image.GetSpacing()
  # physical extent per axis (size * spacing) divided over the new voxel count
  new_spacing = [in_size[axis] * (in_spacing[axis] / out_size[axis]) for axis in range(3)]

  interpolator = sitk.sitkNearestNeighbor if is_label else sitk.sitkBSpline

  resampler = sitk.ResampleImageFilter()
  # default-constructed transform: no spatial transform beyond the regridding
  resampler.SetTransform(sitk.Transform())
  resampler.SetSize(out_size)
  resampler.SetOutputSpacing(new_spacing)
  resampler.SetOutputOrigin(itk_image.GetOrigin())
  resampler.SetOutputDirection(itk_image.GetDirection())
  resampler.SetInterpolator(interpolator)
  return resampler.Execute(itk_image)

#get the SegGT and CTA folders
def getListOfFiles(dirName):
    """Return a sorted list of the full paths of every file below `dirName`.

    Sub-directories are descended into recursively; only non-directory
    entries end up in the result.
    """
    collected = []
    for name in os.listdir(dirName):
        candidate = os.path.join(dirName, name)
        if not os.path.isdir(candidate):
            collected.append(candidate)
            continue
        # directory: pull in everything found underneath it
        collected.extend(getListOfFiles(candidate))
    collected.sort()
    return collected

#image resampling
def resample_image_standardize(itk_image, out_size=(512, 512), is_label=False):
    """Resample a 2D SimpleITK image onto a grid of `out_size` pixels.

    The output spacing per axis is size * spacing / out_size, so the new grid
    spans the same physical extent as the input. Origin and direction are
    copied from the input image.

    Args:
        itk_image: SimpleITK image to resample (its first two axes are used).
        out_size: target number of pixels per axis.
        is_label: when True use nearest-neighbour interpolation; otherwise
            use B-spline interpolation.

    Returns:
        The resampled SimpleITK image.
    """
    in_size = itk_image.GetSize()
    in_spacing = itk_image.GetSpacing()
    new_spacing = [in_size[axis] * (in_spacing[axis] / out_size[axis]) for axis in range(2)]

    interpolator = sitk.sitkNearestNeighbor if is_label else sitk.sitkBSpline

    resampler = sitk.ResampleImageFilter()
    resampler.SetTransform(sitk.Transform())
    resampler.SetSize(out_size)
    resampler.SetOutputSpacing(new_spacing)
    resampler.SetOutputOrigin(itk_image.GetOrigin())
    resampler.SetOutputDirection(itk_image.GetDirection())
    resampler.SetInterpolator(interpolator)
    return resampler.Execute(itk_image)

#prepare the asymtomatic data
def get_ad(dataset_path):
  """Index the asymptomatic patients below `dataset_path`.

  Expects `dataset_path + 'Asymptomatic/'` to hold one folder per patient,
  each containing exactly two entries; after sorting, the first is treated as
  the CT folder and the second as the mask folder. The files found in the two
  folders are paired in sorted order.

  Returns:
    DataFrame with one row per (CT file, mask file) pair and the columns
    Class, Class_Label, Wall, Wall_Label, Patient, CT_im, GT_im, CT, GT.
    Class_Label and Wall_Label are 0 and Wall is 'NA' for every row.
  """
  label = 'Asymptomatic'
  class_root = dataset_path + label + '/'
  columns = ['Class', 'Class_Label', 'Wall', 'Wall_Label', 'Patient', 'CT_im', 'GT_im', 'CT', 'GT']

  rows = []
  for patient in sorted(os.listdir(class_root)):
    patient_root = class_root + patient
    # unpacking fails unless the patient folder holds exactly two entries
    ct_dir, mask_dir = sorted(os.listdir(patient_root))
    ct_files = getListOfFiles(patient_root + '/' + ct_dir + '/')
    mask_files = getListOfFiles(patient_root + '/' + mask_dir + '/')
    for ct_file, mask_file in zip(ct_files, mask_files):
      # file name without directory and without anything after the first dot
      ct_stem = ct_file.split('/')[-1].split('.')[0]
      mask_stem = mask_file.split('/')[-1].split('.')[0]
      rows.append([label, 0, 'NA', 0, patient, ct_stem, mask_stem, ct_file, mask_file])

  return pd.DataFrame(rows, columns = columns)

#preapre the symptomatic data
def get_sd(dataset_path):
  """Index the symptomatic patients below `dataset_path`.

  Every entry of `dataset_path + 'Symptomatic/'` is a wall-type folder. The
  folder named 'Unruptred Wall' (spelling as used in the comparison below,
  presumably matching the folder on disk) holds patient folders directly and
  is labelled Unruptured (1); any other folder is read through its
  'Ruptured_1/' sub-folder and labelled Ruptured (2). Each patient folder must
  contain exactly two entries: after sorting, the first is treated as the CT
  folder and the second as the mask folder.

  Returns:
    DataFrame with one row per (CT file, mask file) pair and the columns
    Class, Class_Label, Wall, Wall_Label, Patient, CT_im, GT_im, CT, GT.
    Class_Label is 1 for every row.
  """
  label = 'Symptomatic'
  class_root = dataset_path + label + '/'
  columns = ['Class', 'Class_Label', 'Wall', 'Wall_Label', 'Patient', 'CT_im', 'GT_im', 'CT', 'GT']

  rows = []
  for wall in sorted(os.listdir(class_root)):
    if wall == 'Unruptred Wall':
      wall_root, wall_label, wall_encode = class_root + wall + '/', 'Unruptured', 1
    else:
      wall_root, wall_label, wall_encode = class_root + wall + '/Ruptured_1/', 'Ruptured', 2

    for patient in sorted(os.listdir(wall_root)):
      patient_root = wall_root + patient
      # unpacking fails unless the patient folder holds exactly two entries
      ct_dir, mask_dir = sorted(os.listdir(patient_root))
      ct_files = getListOfFiles(patient_root + '/' + ct_dir + '/')
      mask_files = getListOfFiles(patient_root + '/' + mask_dir + '/')
      for ct_file, mask_file in zip(ct_files, mask_files):
        # file name without directory and without anything after the first dot
        ct_stem = ct_file.split('/')[-1].split('.')[0]
        mask_stem = mask_file.split('/')[-1].split('.')[0]
        rows.append([label, 1, wall_label, wall_encode, patient, ct_stem, mask_stem, ct_file, mask_file])

  return pd.DataFrame(rows, columns = columns)

#generate a binary mask
def binarize(lower, upper, image, binary_filter):
  """Configure `binary_filter` with the [lower, upper] thresholds and run it on `image`.

  The filter object is mutated (its thresholds are overwritten) and the result
  of its Execute call is returned.
  """
  binary_filter.SetLowerThreshold(lower)
  binary_filter.SetUpperThreshold(upper)
  thresholded = binary_filter.Execute(image)
  return thresholded

#prepare 2d image
def prepare_image_2d(image, mode, resample_size = (512, 512)):
  """Bring a 2D slice to the standard size and turn it into model-ready data.

  Args:
    image: 2D SimpleITK image (a CTA slice or its ground-truth label slice).
    mode: 'CTA' to z-normalize the slice, 'GT' to split the label slice into
      binary masks.
    resample_size: target size; the image is resampled only when its size
      differs from this.

  Returns:
    For mode 'CTA': the normalized image.
    For mode 'GT': the tuple (mask_wall, mask_ILT, mask_AAA, mask_all), where
    mask_all encodes wall as 1, intraluminal thrombus as 2 and aneurysm as 3.

  Raises:
    ValueError: if `mode` is neither 'CTA' nor 'GT'.
  """
  if mode not in ('CTA', 'GT'):
    raise ValueError("mode must be 'CTA' or 'GT', got {!r}".format(mode))

  # resample only when needed; label slices must use nearest-neighbour
  # interpolation, otherwise B-spline blending invents grey values between the
  # 84/168/255 labels and the thresholds below select the wrong pixels
  if tuple(resample_size) != tuple(image.GetSize()):
    image = resample_image_standardize(image, resample_size, is_label = (mode == 'GT'))

  if mode == 'CTA':
    z = sitk.NormalizeImageFilter()
    return z.Execute(image)

  # mode == 'GT': one binary mask per grey value used in the label image
  binary_filter = sitk.BinaryThresholdImageFilter()
  #wall 1 (84)
  mask_wall = binarize(83, 85, image, binary_filter)
  #intraluminal thrombus 2 (168)
  mask_ILT = binarize(167, 169, image, binary_filter)
  #aneurysm 3 (255)
  mask_AAA = binarize(254, 256, image, binary_filter)
  # combined label map: wall + 2 * ILT + 3 * AAA
  mask_all = mask_wall + sitk.Multiply(mask_ILT, 2) + sitk.Multiply(mask_AAA, 3)
  return mask_wall, mask_ILT, mask_AAA, mask_all

#crop the data
def cropper(CTA, GT_wall, GT_ILT, GT_AAA, GT_all):
  """Crop a 2D slice and its masks to the bounding box of the labelled region.

  The three component masks are summed and the bounding box of label value 1
  in that sum defines the region of interest. Every input is cropped to that
  region and then resampled back to the default size of
  resample_image_standardize (nearest-neighbour for the masks).

  Returns:
    The tuple (CTA, GT_wall, GT_ILT, GT_AAA, GT_all) after cropping and
    resampling.
  """
  label_shape_filter = sitk.LabelShapeStatisticsImageFilter()
  # union of the component masks
  GT = GT_wall + GT_ILT + GT_AAA
  label_shape_filter.Execute(GT)
  # bounding box in pixel coordinates: first half start index, second half size
  bbox = label_shape_filter.GetBoundingBox(1)
  half = len(bbox) // 2
  roi_index, roi_size = bbox[:half], bbox[half:]
  #get ROI
  CTA = sitk.RegionOfInterest(CTA, roi_size, roi_index)
  GT_wall = sitk.RegionOfInterest(GT_wall, roi_size, roi_index)
  GT_ILT = sitk.RegionOfInterest(GT_ILT, roi_size, roi_index)
  GT_AAA = sitk.RegionOfInterest(GT_AAA, roi_size, roi_index)
  GT_all = sitk.RegionOfInterest(GT_all, roi_size, roi_index)
  #standardize
  CTA = resample_image_standardize(CTA)
  GT_wall = resample_image_standardize(GT_wall, is_label = True)
  GT_ILT = resample_image_standardize(GT_ILT, is_label = True)
  GT_AAA = resample_image_standardize(GT_AAA, is_label = True)
  GT_all = resample_image_standardize(GT_all, is_label = True)
  # hand the results back to the caller (previously the function fell off the
  # end and returned None)
  return CTA, GT_wall, GT_ILT, GT_AAA, GT_all

#crpo the data in 3D
def cropper_3D(CTA_3D, GT_wall_3D, GT_ILT_3D, GT_AAA_3D, GT_all_3D):
  """Crop a 3D volume and its masks to the bounding box of the labelled region.

  The three component masks are summed and the bounding box of label value 1
  in that sum defines the region of interest. No resampling is done here.

  Returns:
    The tuple (CTA_3D, GT_wall_3D, GT_ILT_3D, GT_AAA_3D, GT_all_3D) after
    cropping.
  """
  label_shape_filter = sitk.LabelShapeStatisticsImageFilter()
  # union of the component masks
  GT = GT_wall_3D + GT_ILT_3D + GT_AAA_3D
  label_shape_filter.Execute(GT)
  # bounding box in pixel coordinates: first half start index, second half size
  bbox = label_shape_filter.GetBoundingBox(1)
  half = len(bbox) // 2
  roi_index, roi_size = bbox[:half], bbox[half:]
  #get ROI
  CTA_3D = sitk.RegionOfInterest(CTA_3D, roi_size, roi_index)
  GT_wall_3D = sitk.RegionOfInterest(GT_wall_3D, roi_size, roi_index)
  GT_ILT_3D = sitk.RegionOfInterest(GT_ILT_3D, roi_size, roi_index)
  GT_AAA_3D = sitk.RegionOfInterest(GT_AAA_3D, roi_size, roi_index)
  GT_all_3D = sitk.RegionOfInterest(GT_all_3D, roi_size, roi_index)
  return CTA_3D, GT_wall_3D, GT_ILT_3D, GT_AAA_3D, GT_all_3D

#save image
def save_image(path, type_name, patient, tag, image):
  """Write a 2D image to `<path><type_name>/<patient>-<tag>.nii.gz`.

  The output folder is created when missing (including missing parents), and
  an already existing folder is not an error.

  Returns:
    The path the image was written to.
  """
  out_dir = path + type_name
  # makedirs with exist_ok avoids the check-then-create race of isdir + mkdir
  os.makedirs(out_dir, exist_ok = True)
  save_path = out_dir + '/' + patient + '-' + tag + '.nii.gz'
  sitk.WriteImage(image, save_path)
  return save_path

def preprocess_2D(row, path = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/'):
  """Preprocess one CT slice and its label slice and save the results under `path`.

  Reads the files referenced by row['CT'] and row['GT'], normalizes the CT
  slice, splits the label slice into wall / ILT / AAA / combined masks, stamps
  all five images with the spacing of the original CT slice, and writes them
  to the Norm-CTA, Mask-Wall, Mask-ILT, Mask-AAA and Mask-All folders.

  Returns:
    The tuple (CTA_path, Wall_path, ILT_path, AAA_path, All_path) of saved files.
  """
  raw_cta = sitk.ReadImage(row['CT']) #tiff
  # remembered here because the array round-trip below drops the spacing
  cta_spacing = raw_cta.GetSpacing()
  raw_gt = sitk.ReadImage(row['GT']) #png

  # round-trip through numpy to get float pixels; only index 0 of the label
  # array's last axis is kept (presumably the first colour channel - TODO confirm)
  cta_pixels = sitk.GetArrayFromImage(raw_cta).astype(float)
  gt_pixels = sitk.GetArrayFromImage(raw_gt).astype(float)[:,:,0]
  cta_image = sitk.GetImageFromArray(cta_pixels)
  gt_image = sitk.GetImageFromArray(gt_pixels)

  cta_image = prepare_image_2d(cta_image, mode = 'CTA')
  wall, ilt, aaa, combined = prepare_image_2d(gt_image, mode = 'GT')

  # keep image and masks on the same spacing
  for prepared in (cta_image, wall, ilt, aaa, combined):
    prepared.SetSpacing(cta_spacing)

  patient = row['Patient']
  CTA_path = save_image(path, 'Norm-CTA', patient, row['CT_im'], cta_image)
  mask_outputs = (('Mask-Wall', wall), ('Mask-ILT', ilt), ('Mask-AAA', aaa), ('Mask-All', combined))
  mask_paths = [save_image(path, folder, patient, row['GT_im'], mask) for folder, mask in mask_outputs]

  # the cropped 2D outputs (cropper + '*-Cropped' folders) are intentionally not produced here
  return (CTA_path, *mask_paths)

#get all the 2D data
def get_2D(row):
  """Load the five preprocessed 2D images of one row as numpy arrays.

  Returns:
    Tuple of arrays read from the files in the row's Norm-CTA, Mask-Wall,
    Mask-ILT, Mask-AAA and Mask-All columns, in that order.
  """
  columns = ('Norm-CTA', 'Mask-Wall', 'Mask-ILT', 'Mask-AAA', 'Mask-All')
  return tuple(sitk.GetArrayFromImage(sitk.ReadImage(row[column])) for column in columns)

#save image
def save_image_3D(path, type_name, patient, image):
  """Write a 3D volume to `<path><type_name>/<patient>.nii.gz`.

  The output folder is created when missing (including missing parents), and
  an already existing folder is not an error.

  Returns:
    The path the volume was written to.
  """
  out_dir = path + type_name
  # makedirs with exist_ok avoids the check-then-create race of isdir + mkdir
  os.makedirs(out_dir, exist_ok = True)
  save_path = out_dir + '/' + patient + '.nii.gz'
  sitk.WriteImage(image, save_path)
  return save_path

#process 3d
def process_3D(df, path = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/', spacing = (0.935546875, 0.935546875, 3.5)): #retry with 3.5 mm 0.9355 mm is what was displayed in the CT or the mask?
  """Stack each patient's preprocessed 2D slices into 3D volumes and save four variants.

  For every patient in `df` the slices listed in the Norm-CTA / Mask-* columns
  are loaded, stacked in row order, given `spacing`, and written as:
    * the full-size volumes                          (folder '<kind>')
    * the volumes resampled to 64x64x64              (folder '<kind>-64')
    * the volumes cropped to the label bounding box  (folder '<kind>-Cropped')
    * the cropped volumes resampled to 64x64x64      (folder '<kind>-Cropped-64')
  The CTA volume is z-normalized before each save; masks are resampled with
  nearest-neighbour interpolation.

  Args:
    df: per-slice DataFrame as produced by the 2D preprocessing (must contain
      the per-slice path columns dropped below plus the patient metadata).
    path: root output folder for the 3D data.
    spacing: voxel spacing stamped on every stacked volume.

  Returns:
    DataFrame with one row per patient: the five metadata columns followed by
    the twenty saved file paths.
  """
  meta_cols = ['Class', 'Class_Label', 'Wall', 'Wall_Label', 'Patient']
  kinds = ['Norm-CTA', 'Mask-Wall', 'Mask-ILT', 'Mask-AAA', 'Mask-All']
  suffixes = ('', '-64', '-Cropped', '-Cropped-64')
  cube = (64, 64, 64)

  # one metadata row per patient: drop everything that varies per slice
  per_patient = df.drop(['CT_im', 'GT_im', 'CT', 'GT'] + kinds, axis = 1).drop_duplicates()
  meta_rows = zip(*(per_patient[col].to_list() for col in meta_cols))

  normalizer = sitk.NormalizeImageFilter()

  def save_all(suffix, patient, volumes):
    # writes CTA first, then wall, ILT, AAA, combined mask
    return [save_image_3D(path, kind + suffix, patient, volume) for kind, volume in zip(kinds, volumes)]

  def to_cube(volumes):
    # CTA: B-spline resample followed by re-normalization; masks: nearest neighbour
    cta_small = resample_image_standardize_3D(volumes[0], out_size = cube, is_label = False)
    cta_small = normalizer.Execute(cta_small)
    masks_small = [resample_image_standardize_3D(mask, out_size = cube, is_label = True) for mask in volumes[1:]]
    return [cta_small] + masks_small

  records = []
  for meta in meta_rows:
    patient = meta[-1]
    patient_slices = df[df['Patient'] == patient]
    # five tuples of 2D arrays, one entry per slice (order follows the rows of df)
    cta_stack, wall_stack, ilt_stack, aaa_stack, all_stack = zip(*patient_slices.apply(get_2D, axis = 1))

    volumes = []
    for stack in (cta_stack, wall_stack, ilt_stack, aaa_stack, all_stack):
      volume = sitk.GetImageFromArray(np.stack(stack))
      volume.SetSpacing(spacing)
      volumes.append(volume)
    volumes[0] = normalizer.Execute(volumes[0])

    full_paths = save_all(suffixes[0], patient, volumes)
    small_paths = save_all(suffixes[1], patient, to_cube(volumes))

    cropped = list(cropper_3D(*volumes))
    cropped[0] = normalizer.Execute(cropped[0])
    cropped_paths = save_all(suffixes[2], patient, cropped)
    cropped_small_paths = save_all(suffixes[3], patient, to_cube(cropped))

    records.append(list(meta) + full_paths + small_paths + cropped_paths + cropped_small_paths)

  columns = meta_cols + [kind + suffix for suffix in suffixes for kind in kinds]
  return pd.DataFrame(records, columns = columns).reset_index(drop = True)

#get the quality check sheet
def quality_check_sheet(url, sheet):
  """Load one worksheet of a Google Sheets workbook into a DataFrame.

  Uses the notebook-global gspread client `gc`. The first row of the sheet
  becomes the column names; every remaining row becomes a data row.
  """
  worksheet = gc.open_by_url(url).worksheet(sheet)
  values = worksheet.get_all_values()
  frame = pd.DataFrame(values[1:])
  frame.columns = values[0]
  return frame

#perform the quality check
def perform_quality_check(df_quality, df_2D, df_3D, status = 'Filter Status'):
  """Keep only the patients marked 'Good' in the quality sheet.

  Args:
    df_quality: quality sheet with a 'Patient' column and the `status` column.
    df_2D, df_3D: data frames with a 'Patient' column to be filtered.
    status: name of the quality column to filter on; None disables filtering.

  Returns:
    The tuple (df_2D, df_3D); the inputs themselves when `status` is None.
  """
  if status is None:
    return df_2D, df_3D

  is_good = df_quality[status] == 'Good'
  good_patients = df_quality[is_good]['Patient'].to_list()
  kept_2D = df_2D[df_2D['Patient'].isin(good_patients)]
  kept_3D = df_3D[df_3D['Patient'].isin(good_patients)]
  return kept_2D, kept_3D

#label the dataframes
def df_label_split(df, train_patients, test_patients):
  """Tag the rows of `df` as TRAIN or TEST according to their patient.

  Args:
    df: DataFrame with a 'Patient' column.
    train_patients: patient ids that belong to the training set.
    test_patients: patient ids that belong to the test set.

  Returns:
    New DataFrame holding the training rows followed by the test rows, with an
    added 'DATA' column set to 'TRAIN' or 'TEST'. Rows whose patient is in
    neither list are dropped. `df` itself is not modified.
  """
  # .assign builds a new frame, so no column is ever set on a filtered slice
  # (which is what triggered pandas' SettingWithCopyWarning)
  df_train = df[df['Patient'].isin(train_patients)].assign(DATA = 'TRAIN')
  df_test = df[df['Patient'].isin(test_patients)].assign(DATA = 'TEST')
  return pd.concat([df_train, df_test])


In [None]:
%%time
#generate all the 2D data (~54 min)
# NOTE(review): the recorded wall time of this cell's last run is far longer than the estimate above

# index the asymptomatic patients (one row per CT/mask slice pair)
df_ad = get_ad('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/dataset/')
# index the symptomatic patients
df_sd = get_sd('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/dataset/')
# combine both classes; the original row indices of each frame are kept as-is
df = pd.concat([df_ad, df_sd])
# preprocess every slice pair and record the five saved file paths per row
# (earlier variant that also produced the '*-Cropped' 2D columns, kept for reference:)
#df['Norm-CTA'], df['Mask-Wall'], df['Mask-ILT'], df['Mask-AAA'], df['Mask-All'], df['Norm-CTA-Cropped'], df['Mask-Wall-Cropped'], df['Mask-ILT-Cropped'], df['Mask-AAA-Cropped'], df['Mask-All-Cropped'] = zip(*df.apply(preprocess_2D, axis = 1))
df['Norm-CTA'], df['Mask-Wall'], df['Mask-ILT'], df['Mask-AAA'], df['Mask-All'] = zip(*df.apply(preprocess_2D, axis = 1))
# persist the per-slice index so later cells can reload it without recomputing
df.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data.pkl')

CPU times: user 6min 39s, sys: 1min 16s, total: 7min 55s
Wall time: 2h 26min 40s


In [None]:
#select patients were missing?
#df[df['Patient'] == 'GU001'].apply(preprocess_2D, axis = 1)

In [None]:
# reload the per-slice 2D index written by the 2D generation cell (avoids re-running it)
df = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data.pkl')

In [None]:
# reload the per-patient 3D index; this file is written by the process_3D cell further down,
# so on a fresh run that cell has to be executed before this one
df_3D = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

In [None]:
%%time
# 2 hours
# build the 3D dataset (including the cropped versions) from the 2D dataset
# TODO: this section of the script still needs debugging (~2 h per run)

# reload the per-slice 2D index
df = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data.pkl')
# stack the slices per patient and save all 3D variants
df_3D = process_3D(df)
# persist the per-patient 3D index
df_3D.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

CPU times: user 11min 34s, sys: 2min 46s, total: 14min 21s
Wall time: 1h 55min 3s


In [None]:
# sanity check: every saved Norm-CTA path must contain the id of the patient on the same row;
# any patient id printed here has a mismatching path
patient_list = df_3D['Norm-CTA']
for patient1, patient2 in zip(df_3D['Patient'].to_list(), patient_list):
  if patient1 not in patient2:
    print(patient1)


In [None]:
#from the cropped 3d data generate the cropped 2d data (ensures the same frame of reference)
#df_2D = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data.pkl')
#df_3D = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

In [None]:
#for 1 row
#row = df_3D.iloc[0]
#get z
#z = sitk.NormalizeImageFilter()
#get the image
#image = sitk.GetArrayFromImage(sitk.ReadImage(row['Norm-CTA-Cropped']))
#convert to ls
#ls_image = np.split(image, indices_or_sections=image.shape[0], axis = 0)

In [None]:
def process_CTA(ls_image, path, type_name, patient, tag, df):
  """Save the slices of one cropped CTA volume as normalized 512x512 2D images.

  Args:
    ls_image: sequence of arrays, one per slice; index 0 of each entry is the
      2D slice (e.g. the pieces produced by np.split along axis 0).
    path: root output folder passed on to save_image.
    type_name: sub-folder name passed on to save_image.
    patient: patient id used to select rows of `df` and to name the files.
    tag: name of the `df` column holding the per-slice file tags.
    df: per-slice DataFrame with a 'Patient' column and the `tag` column.

  Returns:
    List with the saved path of every processed slice, in slice order.
    Slices and tags are paired positionally; surplus entries on either side
    are ignored.
  """
  # per-slice tags of this patient, in the row order of df
  slice_tags = df[df['Patient'] == patient][tag].to_list()
  # local filter instead of relying on a notebook-global `z`
  normalizer = sitk.NormalizeImageFilter()

  saved_paths = []
  for image, slice_tag in zip(ls_image, slice_tags):
    im = sitk.GetImageFromArray(image[0])
    im = resample_image_standardize(im, (512, 512), is_label = False)
    im = normalizer.Execute(im)
    # save the processed SimpleITK image (not the raw array) and keep going
    # through all slices instead of returning after the first one
    saved_paths.append(save_image(path, type_name, patient, slice_tag, im))
  return saved_paths


In [None]:
#some comments

#DATA holds the actual model inputs (all z-normalized)
#2D images of size 512x512 belong there; cropped 2D images must also be 512x512 and be based on the 3D frame of reference
#3D images of size 64x64x64 belong there, as do the cropped 3D images at 64x64x64
#there are also 3D volumes at the original input dimensions, including the original uncropped ones --> TorchIO

#DATASET
#holds configurations of the data that cannot be used directly for model training,
#though TorchIO is a potential avenue for remedying this

In [None]:
#get new sizes
def get_new_sizes(row, input_size = (128, 128, 64), path = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/', cropped = True):
  """Resample one patient's CTA volume and wall mask to `input_size` and save them.

  Args:
    row: per-patient row with 'Patient', 'Norm-CTA' and 'Mask-Wall' (plus the
      '-Cropped' counterparts when `cropped` is True).
    input_size: target volume size.
    path: root output folder for the 3D data.
    cropped: also process the cropped volume and mask.

  Returns:
    (CTA_path, GT_path, cropped_CTA_path, cropped_GT_path) when `cropped` is
    True, otherwise (CTA_path, GT_path).
  """
  CTA = sitk.ReadImage(row['Norm-CTA'])
  GT = sitk.ReadImage(row['Mask-Wall'])
  # process_image_sizes takes (CTA, GT, GT_all, input_size) and returns three
  # images; only one mask is needed here, so it fills both mask slots and the
  # duplicate result is discarded
  CTA, GT, _unused = process_image_sizes(CTA, GT, GT, input_size)
  CTA_path = save_image_3D(path, 'Norm-CTA-128-128-64', row['Patient'], CTA)
  GT_path = save_image_3D(path, 'Mask-Wall-128-128-64', row['Patient'], GT)

  if cropped:
    CTA = sitk.ReadImage(row['Norm-CTA-Cropped'])
    GT = sitk.ReadImage(row['Mask-Wall-Cropped'])
    CTA, GT, _unused = process_image_sizes(CTA, GT, GT, input_size)
    cropped_CTA_path = save_image_3D(path, 'Norm-CTA-Cropped-128-128-64', row['Patient'], CTA)
    cropped_GT_path = save_image_3D(path, 'Mask-Cropped-Wall-128-128-64', row['Patient'], GT)

    return CTA_path, GT_path, cropped_CTA_path, cropped_GT_path
  else:
    return CTA_path, GT_path

#proces image sizes
def process_image_sizes(CTA, GT, GT_all, input_size, z = sitk.NormalizeImageFilter()):
  """Resample a CTA volume and two masks to `input_size`.

  The CTA is resampled with B-spline interpolation, normalized with `z` and
  cast to 32-bit float; both masks are resampled with nearest-neighbour
  interpolation.

  Returns:
    The tuple (CTA as float32, GT, GT_all), all of size `input_size`.
  """
  resized_cta = resample_image_standardize_3D(CTA, out_size = input_size, is_label = False)
  resized_gt, resized_gt_all = (
      resample_image_standardize_3D(mask, out_size = input_size, is_label = True)
      for mask in (GT, GT_all)
  )
  normalized = z.Execute(resized_cta)
  as_float32 = sitk.Cast(normalized, sitk.sitkFloat32) #important for torchio!!!
  as_float32.CopyInformation(normalized)
  return as_float32, resized_gt, resized_gt_all

#get new sizes
def get_new_sizes2(row, input_size = (128, 128, 64), path = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/', cropped = True):
  """Resample one patient's CTA volume and combined mask to `input_size` and save them.

  Args:
    row: per-patient row with 'Patient', 'Norm-CTA' and 'Mask-All' (plus the
      '-Cropped' counterparts when `cropped` is True).
    input_size: target volume size.
    path: root output folder for the 3D data.
    cropped: also process the cropped volume and mask.

  Returns:
    (CTA_path, GT_path, cropped_CTA_path, cropped_GT_path) when `cropped` is
    True, otherwise (CTA_path, GT_path).
  """
  CTA = sitk.ReadImage(row['Norm-CTA'])
  GT = sitk.ReadImage(row['Mask-All'])
  # process_image_sizes takes (CTA, GT, GT_all, input_size) and returns three
  # images; only one mask is needed here, so it fills both mask slots and the
  # duplicate result is discarded
  CTA, GT, _unused = process_image_sizes(CTA, GT, GT, input_size)
  CTA_path = save_image_3D(path, 'Norm-CTA-128-128-64', row['Patient'], CTA)
  GT_path = save_image_3D(path, 'Mask-All-128-128-64', row['Patient'], GT)

  if cropped:
    CTA = sitk.ReadImage(row['Norm-CTA-Cropped'])
    GT = sitk.ReadImage(row['Mask-All-Cropped'])
    CTA, GT, _unused = process_image_sizes(CTA, GT, GT, input_size)
    cropped_CTA_path = save_image_3D(path, 'Norm-CTA-Cropped-128-128-64', row['Patient'], CTA)
    cropped_GT_path = save_image_3D(path, 'Mask-Cropped-All-128-128-64', row['Patient'], GT)

    return CTA_path, GT_path, cropped_CTA_path, cropped_GT_path
  else:
    return CTA_path, GT_path

#get new sizes
def get_new_sizes3(row, input_size = (128, 128, 64), path = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/', cropped = False):
  """Resample one patient's wall mask to `input_size` and save it.

  Args:
    row: per-patient row with 'Patient', 'Norm-CTA' and 'Mask-Wall'.
    input_size: target volume size.
    path: root output folder for the 3D data.
    cropped: accepted for signature parity with the sibling helpers; not used.

  Returns:
    Path of the saved wall mask.
  """
  CTA = sitk.ReadImage(row['Norm-CTA'])
  GT = sitk.ReadImage(row['Mask-Wall'])
  # process_image_sizes takes (CTA, GT, GT_all, input_size) and returns three
  # images; only the wall mask result is kept here
  _cta_unused, GT, _mask_unused = process_image_sizes(CTA, GT, GT, input_size)
  GT_path = save_image_3D(path, 'Mask-Wall-128-128-64', row['Patient'], GT)
  return GT_path

#get new sizes dont want any cropping
def get_new_sizes4(row, input_size = (512, 512, 64), path = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/'):
  """Resample one patient's uncropped CTA, wall mask and combined mask to `input_size` and save them.

  Returns:
    The tuple (CTA_path, GT_path, GT_all_path) of saved files.
  """
  sources = [sitk.ReadImage(row[column]) for column in ('Norm-CTA', 'Mask-Wall', 'Mask-All')]
  cta, wall, combined = process_image_sizes(*sources, input_size)

  outputs = (
      ('Norm-CTA-512-512-64', cta),
      ('Mask-Wall-512-512-64', wall),
      ('Mask-All-512-512-64', combined),
  )
  return tuple(save_image_3D(path, folder, row['Patient'], volume) for folder, volume in outputs)

#get new sizes dont want any cropping
def get_new_sizes5(row, input_size = (256, 256, 64), path = '/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/'):
  """Resample one patient's uncropped CTA, wall mask and combined mask to `input_size` and save them.

  Returns:
    The tuple (CTA_path, GT_path, GT_all_path) of saved files.
  """
  CTA = sitk.ReadImage(row['Norm-CTA'])
  GT = sitk.ReadImage(row['Mask-Wall'])
  GT_all = sitk.ReadImage(row['Mask-All'])
  CTA, GT, GT_all = process_image_sizes(CTA, GT, GT_all, input_size)
  CTA_path = save_image_3D(path, 'Norm-CTA-256-256-64', row['Patient'], CTA)
  GT_path = save_image_3D(path, 'Mask-Wall-256-256-64', row['Patient'], GT)
  # folder name corrected from 'Mask-All-256-56-64' so it matches the other
  # 256-256-64 outputs; files from earlier runs remain under the old folder name
  GT_all_path = save_image_3D(path, 'Mask-All-256-256-64', row['Patient'], GT_all)

  return CTA_path, GT_path, GT_all_path

In [None]:
%%time
#resized (add some additional sizes)
#df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')
#get new sizes and save in the data file
#df_3d['Norm-CTA-128-128-64'], df_3d['Mask-Wall-128-128-64'], df_3d['Norm-CTA-Cropped-128-128-64'], df_3d['Mask-Cropped-Wall-128-128-64']  = zip(*df_3d.apply(get_new_sizes, axis = 1))
#save
#df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 10.7 µs


In [None]:
%%time
# add 128x128x64 versions of the CTA and the combined mask (full and cropped); partly redundant with other resize cells
df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')
# resample every patient and record the four saved paths as new columns
df_3d['Norm-CTA-128-128-64'], df_3d['Mask-All-128-128-64'], df_3d['Norm-CTA-Cropped-128-128-64'], df_3d['Mask-Cropped-All-128-128-64']  = zip(*df_3d.apply(get_new_sizes2, axis = 1))
# overwrite the 3D index in place with the extended frame
df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

CPU times: user 6min 54s, sys: 5.5 s, total: 6min 59s
Wall time: 7min 48s


In [None]:
# the 128x128x64 wall-mask column was not written by the earlier run, so add it here
df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')
df_3d['Mask-Wall-128-128-64'] = df_3d.apply(get_new_sizes3, axis = 1)
# overwrite the 3D index in place with the extended frame
df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

In [None]:
# larger volumes (512x512x64) of the uncropped CTA, wall mask and combined mask; ~23 min
df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')
df_3d['Norm-CTA-512-512-64'], df_3d['Mask-Wall-512-512-64'], df_3d['Mask-All-512-512-64'] = zip(*df_3d.apply(get_new_sizes4, axis = 1))
# overwrite the 3D index in place with the extended frame
df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

In [None]:
# medium-sized volumes (256x256x64) of the uncropped CTA, wall mask and combined mask
df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')
df_3d['Norm-CTA-256-256-64'], df_3d['Mask-Wall-256-256-64'], df_3d['Mask-All-256-256-64'] = zip(*df_3d.apply(get_new_sizes5, axis = 1))
# overwrite the 3D index in place with the extended frame
df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

In [None]:
#do not delete this! the train/test split cells below read this notebook-global seed
random_state = 24 #set to random state 24 for the final round of experiments

In [None]:
#generate the 3 different datasets (can rerun assuming same stratification inputs & data order & random state)/ do not alter!!
# dataset 1 of 3: no quality filtering (status = None), 60% of the patients go to training

# load the quality sheet and the 2D / 3D indices
df_quality = quality_check_sheet('https://docs.google.com/spreadsheets/d/10TMbEwcVMtO6Ly3U1pIzpaxRiPrdTrI7YNm-Bxsi5YU/edit#gid=0', 'Final')
df_2d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data.pkl')
df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

# with status = None the frames are passed through unfiltered
df_2d, df_3d = perform_quality_check(df_quality, df_2d, df_3d, status = None) #status changes

# split at the patient level (one row per patient in df_3d), stratified on class and wall label;
# patients not selected for training form the test set
train_patients, test_patients = train_test_split(
    df_3d['Patient'].to_list(),
    train_size = 0.6,
    stratify = df_3d[['Class_Label', 'Wall_Label']],
    random_state = random_state,
    shuffle = True
    )

# add the TRAIN / TEST marker column to both frames
df_2d = df_label_split(df_2d, train_patients, test_patients)
df_3d = df_label_split(df_3d, train_patients, test_patients)

# persist the labelled splits
df_2d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data_split.pkl')
df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data_split.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
#generate the 3 different datasets
# dataset 2 of 3: keep only patients marked 'Good' in the 'Status' column, 54% of them go to training

# load the quality sheet and the 2D / 3D indices
df_quality = quality_check_sheet('https://docs.google.com/spreadsheets/d/10TMbEwcVMtO6Ly3U1pIzpaxRiPrdTrI7YNm-Bxsi5YU/edit#gid=0', 'Final')
df_2d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data.pkl')
df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

# filter both frames on the 'Status' quality column
df_2d, df_3d = perform_quality_check(df_quality, df_2d, df_3d, status = 'Status') #status changes

# split at the patient level (one row per patient in df_3d), stratified on class and wall label;
# patients not selected for training form the test set
train_patients, test_patients = train_test_split(
    df_3d['Patient'].to_list(),
    train_size = 0.54,
    stratify = df_3d[['Class_Label', 'Wall_Label']],
    random_state = random_state,
    shuffle = True
    )

# add the TRAIN / TEST marker column to both frames
df_2d = df_label_split(df_2d, train_patients, test_patients)
df_3d = df_label_split(df_3d, train_patients, test_patients)

# persist the labelled splits
df_2d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data_status_split.pkl')
df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data_status_split.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
#generate the 3 different datasets
# dataset 3 of 3: keep only patients marked 'Good' in the 'Filter Status' column, 55% of them go to training

# load the quality sheet and the 2D / 3D indices
df_quality = quality_check_sheet('https://docs.google.com/spreadsheets/d/10TMbEwcVMtO6Ly3U1pIzpaxRiPrdTrI7YNm-Bxsi5YU/edit#gid=0', 'Final')
df_2d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data.pkl')
df_3d = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data.pkl')

# filter both frames on the 'Filter Status' quality column
df_2d, df_3d = perform_quality_check(df_quality, df_2d, df_3d, status = 'Filter Status') #status changes

# split at the patient level (one row per patient in df_3d), stratified on class and wall label;
# patients not selected for training form the test set
train_patients, test_patients = train_test_split(
    df_3d['Patient'].to_list(),
    train_size = 0.55,
    stratify = df_3d[['Class_Label', 'Wall_Label']],
    random_state = random_state,
    shuffle = True
    )

# add the TRAIN / TEST marker column to both frames
df_2d = df_label_split(df_2d, train_patients, test_patients)
df_3d = df_label_split(df_3d, train_patients, test_patients)

# persist the labelled splits
df_2d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/2D/data_filter_status_split.pkl')
df_3d.to_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data_filter_status_split.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# inspect the additional image resolutions of the wall mask and the original CTA
# reload the unfiltered 3D split; note that this rebinds the notebook-global `df` to the per-patient 3D frame
df = pd.read_pickle('/content/gdrive/MyDrive/AAA_Project/Masters-Thesis/AAA-Wall/data/3D/data_split.pkl')

In [None]:
df.columns

Index(['Class', 'Class_Label', 'Wall', 'Wall_Label', 'Patient', 'Norm-CTA',
       'Mask-Wall', 'Mask-ILT', 'Mask-AAA', 'Mask-All', 'Norm-CTA-64',
       'Mask-Wall-64', 'Mask-ILT-64', 'Mask-AAA-64', 'Mask-All-64',
       'Norm-CTA-Cropped', 'Mask-Wall-Cropped', 'Mask-ILT-Cropped',
       'Mask-AAA-Cropped', 'Mask-All-Cropped', 'Norm-CTA-Cropped-64',
       'Mask-Wall-Cropped-64', 'Mask-ILT-Cropped-64', 'Mask-AAA-Cropped-64',
       'Mask-All-Cropped-64', 'Norm-CTA-128-128-64', 'Mask-All-128-128-64',
       'Norm-CTA-Cropped-128-128-64', 'Mask-Cropped-All-128-128-64',
       'Mask-Wall-128-128-64', 'Norm-CTA-512-512-64', 'Mask-Wall-512-512-64',
       'Mask-All-512-512-64', 'Norm-CTA-256-256-64', 'Mask-Wall-256-256-64',
       'Mask-All-256-256-64', 'DATA'],
      dtype='object')

In [None]:
df[df['DATA'] == 'TEST']['Wall_Label'].value_counts()

0    25
1    21
2     4
Name: Wall_Label, dtype: int64

In [None]:
# sanity check: every saved 128x128x64 CTA path must contain the id of the patient on the same row;
# any patient id printed here has a mismatching path
patient_list = df['Norm-CTA-128-128-64']
for patient1, patient2 in zip(df['Patient'].to_list(), patient_list):
  if patient1 not in patient2:
    print(patient1)

In [None]:
64*64*64

262144

In [None]:
512*512/(64*64)

64.0