# Main Script

In [None]:
#installations
#!pip install --quiet SimpleITK
#Anaconda Powershell
#:L
#conda activate data_processing
#jupyter serverextension enable --py jupyter_http_over_ws
#jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8892 --NotebookApp.port_retries=0

In [None]:
#imports
import SimpleITK as sitk
import os
import zipfile
import shutil
from zipfile import ZipFile
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
import glob

In [None]:
#functions

#get duplicates
def get_dupes(ls):
  """Return, as a list, the values that occur more than once in ``ls``.

  Thin wrapper that materialises the generator produced by ``keep_dupes``;
  each repeated value appears exactly once in the result.
  """
  repeated = [value for value in keep_dupes(ls)]
  return repeated

#keep duplicates
def keep_dupes(iterable):
    """Yield every value of ``iterable`` that is repeated, once per value.

    A value is yielded the first time it is encountered again (i.e. at its
    second occurrence); further repeats of the same value are ignored.
    Values must be hashable.
    """
    first_seen = set()
    already_reported = set()
    for item in iterable:
        is_new_duplicate = item in first_seen and item not in already_reported
        if not is_new_duplicate:
            first_seen.add(item)
            continue
        yield item
        already_reported.add(item)

#unzip
def unzip(zipped_folder, unzipped_folder):
  """Extract every member of the zip archive ``zipped_folder`` into ``unzipped_folder``."""
  archive = ZipFile(zipped_folder, 'r')
  try:
    archive.extractall(unzipped_folder)
  finally:
    #always release the file handle, also when extraction fails
    archive.close()

#get info
def _get_tag_or_na(reader_file, tag):
  """Return the metadata value stored under ``tag`` or ``pd.NA`` when it cannot be read."""
  try:
    return reader_file.GetMetaData(tag)
  except Exception:
    #a missing tag makes GetMetaData raise; treat any read failure as "not available"
    return pd.NA

def get_info(row, add_columns):
  """Collect per-series DICOM metadata for one patient.

  The archive at ``row['Medcave03_DCM']`` is extracted into a temporary folder
  next to it (``....zip`` -> ``.../``), every series id reported by GDCM is
  inspected through its first file, and the temporary folder is removed again,
  also when one of the steps raises.

  Parameters
  ----------
  row : mapping (e.g. pandas.Series)
      Must provide 'Patient', 'Medcave03_DCM' (path to the zipped DICOMs) and
      'Medcave03_Report' (path to the report text file, may be missing/NA).
  add_columns : list of str
      Names of the (empty) annotation columns appended to the result.

  Returns
  -------
  pandas.DataFrame
      One row per series with the columns 'Patient', 'Medcave03_DCM',
      'Medcave03_Report', 'Medcave03_Series_Files', 'Series_ID', 'Study',
      'Scan', 'Series', 'Num_DICOM', 'Full_Report', 'Max_Patient_DICOM'
      (True for the series with the most files) followed by ``add_columns``.
  """
  #the zipped dicom path
  dcm_path = row['Medcave03_DCM']

  #the report path
  report_path = row['Medcave03_Report']

  #temporary directory the archive is extracted into (quick-fix)
  unzipped_patient = dcm_path.replace('.zip','/')

  #the report is identical for every series, so read it once instead of once per series
  try:
    with open(report_path, 'r') as f:
      full_report = f.read()
  except (OSError, TypeError, ValueError):
    #unreadable file, a missing path (NA is not a valid path) or undecodable text
    full_report = pd.NA

  #data
  data = []
  try:
    #now unzip the images (temporary)
    unzip(dcm_path, unzipped_patient)

    #get the number of series and relate study description and number of slices in each patient's image series
    reader = sitk.ImageSeriesReader()
    #get the series ids
    id_names = reader.GetGDCMSeriesIDs(unzipped_patient) #file subfolder

    #for each series id
    for id_name in id_names:
      #all files that belong to this series
      series_file_names = sitk.ImageSeriesReader.GetGDCMSeriesFileNames(
          unzipped_patient,
          id_name
          )
      #num dicom
      num_dicom = len(series_file_names)
      #only the header of the first file is read to describe the series
      reader_file = sitk.ImageFileReader()
      reader_file.SetFileName(series_file_names[0])
      reader_file.LoadPrivateTagsOn()
      reader_file.ReadImageInformation()
      #study description
      study_des = _get_tag_or_na(reader_file, '0008|1030')
      #manufacturer (stored in the 'Scan' column)
      scan_des = _get_tag_or_na(reader_file, '0008|0070')
      #series description
      series_des = _get_tag_or_na(reader_file, '0008|103e')
      #collect all the relevant info
      data.append([row['Patient'], row['Medcave03_DCM'], row['Medcave03_Report'], series_file_names, id_name, study_des, scan_des, series_des, num_dicom, full_report])
  finally:
    #delete the temporary folder even if reading failed, so no extracted DICOMs are left behind
    if os.path.isdir(unzipped_patient):
      shutil.rmtree(unzipped_patient)

  #prepare the individual pandas to later be concatenated (reorder)
  patient_info = pd.DataFrame(data, columns = ['Patient', 'Medcave03_DCM', 'Medcave03_Report', 'Medcave03_Series_Files',
                                               'Series_ID', 'Study', 'Scan', 'Series', 'Num_DICOM', 'Full_Report'])
  #flag the series with the largest number of files (ties are all flagged)
  patient_info['Max_Patient_DICOM'] = patient_info['Num_DICOM'] == patient_info['Num_DICOM'].max()
  #add the annotation columns
  patient_info = patient_info.reindex(columns = patient_info.columns.tolist() + add_columns)
  #print
  print('Info On:', row['Patient'])
  #return
  return patient_info

#get series info
def get_series_info(row, add_columns, df_refer):
  """Return the series table of one patient, reusing a reference table when possible.

  When ``df_refer`` is given and already holds rows for ``row['Patient']``,
  those rows are returned as they are; in every other case the information is
  collected from the DICOM archive through ``get_info(row, add_columns)``.
  """
  #guard: the patient is already described in the reference table
  if df_refer is not None and row['Patient'] in df_refer['Patient'].to_list():
    return df_refer[df_refer['Patient'] == row['Patient']]
  #no reference table, or the patient is new
  return get_info(row, add_columns)

#get CTAs
def get_CTAs(row, save_loc, over_ride):
  """Convert one DICOM series to ``<save_loc>CTA_<Patient>.nii.gz`` and return that path.

  Parameters
  ----------
  row : mapping (e.g. pandas.Series)
      Must provide 'Patient'; when a conversion is performed also
      'Medcave03_DCM' (path to the zipped DICOMs) and 'Series_ID'.
  save_loc : str
      Output folder, concatenated as-is with the file name (so it has to end
      with a path separator).
  over_ride : bool
      True rewrites an existing output file, False only writes missing ones.

  Returns
  -------
  str
      The output path, whether or not the file was (re)written.
  """
  #save path MedCave03
  save_path = save_loc + 'CTA_' + str(row['Patient']) + '.nii.gz'
  #check if it already exists first (assumes consistent initialization location for base path!!!)
  needs_write = (os.path.exists(save_path) == False) or (over_ride == True)
  if not needs_write:
    return save_path

  #temporary folder next to the archive (can vary)
  unzipped_patient = row['Medcave03_DCM'].replace('.zip','/')
  try:
    #temp
    os.makedirs(unzipped_patient, exist_ok = True)
    #now unzip the images (temporary)
    unzip(row['Medcave03_DCM'], unzipped_patient)
    #files of the requested series
    series_file_names = sitk.ImageSeriesReader.GetGDCMSeriesFileNames(
        unzipped_patient,
        row['Series_ID']
        )
    #reader
    reader = sitk.ImageSeriesReader()
    #set
    reader.SetFileNames(series_file_names)
    #execute
    output = reader.Execute()
    #save as nii.gz
    sitk.WriteImage(output, save_path)
  finally:
    #remove the temporary files even when reading/writing failed, so no extracted folder is left behind
    if os.path.isdir(unzipped_patient):
      shutil.rmtree(unzipped_patient)
  #prepared
  print('Prepared:', row['Patient'])
  #return
  return save_path

In [None]:
%%time
#main script (initial run): index every patient folder and collect the per-series DICOM metadata

#specify the basepath (this can be changed depending on which cd into at start)
base = 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/'

#upload folders; sorted because glob returns them in arbitrary order and the row order of the output should be reproducible
base_dir = sorted(glob.glob(base + 'CTA_Lower*' + '/'))

#need to get all the patient folder names
print('Patients with Missing Reports are Included. Patients with Missing Images should be Noted and Excluded.')
#iterate
patient_name = []
for base_path in base_dir:
  for name in sorted(os.listdir(base_path)):
    dcm_path = os.path.join(base_path, name + '/images.zip')
    report_path = os.path.join(base_path, name + '/report.txt')
    has_dcm = os.path.isfile(dcm_path)
    has_report = os.path.isfile(report_path)
    if has_dcm and has_report:
      #append
      patient_name.append([int(name), dcm_path, report_path])
    elif has_dcm:
      #images without a report are kept, the report is marked as missing
      print('Missing Report:', report_path)
      #append
      patient_name.append([int(name), dcm_path, pd.NA])
    elif has_report:
      #no images: note and exclude
      print('Missing DCM:', dcm_path) #identify any missing entries
    else:
      #neither file is present (this branch could never be reached before the conditions were reordered)
      print('Missing DCM & Report:', os.path.join(base_path, name))

#place inside the pandas dataframe
df = pd.DataFrame(patient_name, columns = ['Patient', 'Medcave03_DCM', 'Medcave03_Report'])

#specify the annotation columns (manual selection // needs to be specified if rerunning from previous) need to add here!!!!
add_columns = ['Manual_DICOM_Selection'] #'PAD_50_Annotation', 'PAD_50_Annotation_Notes'] #add more column info here (first specify in the pandas being edited)

#specify the reference csv if it exists
#df_refer = pd.read_csv('')
df_refer = None

#can remove the repeats across the upload folders(for exact same case)
# df = df.drop_duplicates(subset = ['Patient', 'Series_ID', 'Study', 'Scan', 'Series', 'Num_DICOM', 'Full_Report'])

#get the meta info (one DataFrame per patient)
ls_df = df.apply(get_series_info, axis = 1, args = (add_columns, df_refer))
#concat
df_meta = pd.concat(tuple(ls_df), ignore_index = True)
#save with a timestamp so earlier exports are never overwritten
df_meta.to_excel(base + 'Dataset_Series/Dataset_Series' + datetime.now().strftime("-%Y-%m-%d-%H-%M-%S") + '.xlsx', index = False)

In [None]:
%%time
#main script (initial run): convert the selected series of every patient to nii.gz
#add_columns = ['PAD_50_Annotation', 'PAD_50_Annotation_Notes']
over_ride = True #controls whether all files are rewritten (True) or only the missing files will be processed (False)
#base
base_path = 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/'
#save
save_path = 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/Dataset_CTAs/'
#read in the most up to date
df = pd.read_excel(base_path + 'Dataset_Series/Dataset_Series-2023-01-20-01-55-35.xlsx')
#filter based on selection criteria; .copy() makes the result an independent frame so the
#column assignment below does not write into a slice of the frame read from disk (SettingWithCopyWarning)
df = df[df['Max_Patient_DICOM'] == True].copy()
#create
df['Medcave03_CTA'] = df.apply(get_CTAs, axis = 1, args = (save_path, over_ride))
#add in annotation columns
#df = df.reindex(columns = df.columns.tolist() + add_columns)
#save the new table with a timestamp in the file name
df.to_excel(save_path + 'Data/vRAD_Dataset_CTA' + datetime.now().strftime("-%Y-%m-%d-%H-%M-%S") + '.xlsx', index = False)

Prepared: 10261071
Prepared: 1083212565
Prepared: 1083388177
Prepared: 1351791995
Prepared: 1620136241
Prepared: 1620462782
Prepared: 278666760
Prepared: 815136778
Prepared: 815640931
Prepared: 10385082
Prepared: 1083181062
Prepared: 1083289226
Prepared: 1083651355
Prepared: 1084009745
Prepared: 1084034689
Prepared: 1331806000
Prepared: 1332457232
Prepared: 1332482978
Prepared: 1332482978
Prepared: 1332667581
Prepared: 1332709117
Prepared: 1352419296
Prepared: 1600761592
Prepared: 1601017995
Prepared: 1620120392
Prepared: 1620121687
Prepared: 1620259252
Prepared: 1620542154
Prepared: 1620772926
Prepared: 1621034905
Prepared: 1621046091
Prepared: 1888763137
Prepared: 277903873
Prepared: 277951322
Prepared: 278045217
Prepared: 278185830
Prepared: 278818864
Prepared: 278877199
Prepared: 546756428
Prepared: 547045606
Prepared: 814812480
Prepared: 9615231
Prepared: 10273970
Prepared: 10280081
Prepared: 10385082
Prepared: 10439107
Prepared: 1083181062
Prepared: 1083289226
Prepared: 108357417

In [None]:
#manual corrections: find the patients whose maximum-size selection is ambiguous
#CTA table written by the conversion cell
df = pd.read_excel('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/Dataset_CTAs/Data/vRAD_Dataset_CTA-2023-01-21-07-56-13.xlsx')
#Series table (one row per series of every patient)
df_series = pd.read_excel('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/Dataset_Series/Dataset_Series-2023-01-20-01-55-35.xlsx')
#remove if patients and series are the same // will not affect file writing!
#this patient has different reports across the upload folder dates (addendum)
#print(df[df['Patient'] == 278185830]['Full_Report'].iloc[0]) #1
#print(df[df['Patient'] == 278185830]['Medcave03_Report'].iloc[0]) #1

#columns that identify an exact repeat of the same converted series (will not affect file writing)
dedup_columns = ['Patient', 'Series_ID', 'Study', 'Scan', 'Series', 'Num_DICOM', 'Max_Patient_DICOM', 'Medcave03_CTA']
df = df.drop_duplicates(subset = dedup_columns)

#patients that still occur more than once//this will affect file writing//max dicom has two or more series per patient!!!
patients = get_dupes(df['Patient'].to_list())
#print
print(patients)

In [None]:
#all patients to correct
#each entry is [patient id, exact series description to use] // must be specified like this
over_patient = [
    [388127763, 'CTA THORACIC'],
    [1222486634, 'DIAPHRAGM TO TOE-C+ '],
    [1332482978, '1mm cta CTA 1.0 CE  CTA 1mm cta '],
    [1351643295, 'RUNOFF 2.5'],
    [1601022171, 'stnd 2.5'],
    [546714979, '1mm cta CTA 1.0 CE  CTA 1mm cta '],
    [1489963600, 'ARTERIAL AXIAL, iDose (4) '],
    [1730229748, '1mm cta CTA 1.0 CE  CTA 1mm cta '],
    [1998587672, 'abd/pelv mips '],
    [924844907, 'CTA Lower ext ']
]
#over
#all_patients = patients + over_patient
#ids of the overridden patients
all_patients = [patient for patient, _ in over_patient]

#CTAs that are already correct (every patient that is not overridden)
df_correct = df[~df['Patient'].isin(all_patients)]

#all series rows of the overridden patients (for inspection)
df_overlap = df_series[df_series['Patient'].isin(all_patients)]

#post processing: pick the requested series of each overridden patient, without exact repeats
dedup_columns = ['Patient', 'Series_ID', 'Study', 'Scan', 'Series', 'Num_DICOM', 'Max_Patient_DICOM']
post_frames = []
for patient, series in over_patient:
  is_selected = (df_series['Patient'] == patient) & (df_series['Series'] == series)
  post_frames.append(df_series[is_selected].drop_duplicates(subset = dedup_columns))
df_post = pd.concat(post_frames)

In [None]:
%%time
#apply to CTAs AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_Inputs/

#add_columns = ['PAD_50_Annotation', 'PAD_50_Annotation_Notes']
#always rewrite: the files of the overridden patients were produced from the wrong series
over_ride = True
#save
save_path = 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/Dataset_CTAs/'
#convert the manually selected series and keep the written paths
cta_paths = df_post.apply(get_CTAs, axis = 1, args = (save_path, over_ride))
df_post['Medcave03_CTA'] = cta_paths
#untouched patients followed by the corrected ones
df_final = pd.concat([df_correct, df_post])
#save (the timestamp is taken after the conversion has finished)
final_name = save_path + 'Data/vRAD_Dataset_CTA_Final' + datetime.now().strftime("-%Y-%m-%d-%H-%M-%S") + '.xlsx'
df_final.to_excel(final_name, index = False)

Prepared: 388127763
Prepared: 1222486634
Prepared: 1332482978
Prepared: 1351643295
Prepared: 1601022171
Prepared: 546714979
Prepared: 1489963600
Prepared: 1730229748
Prepared: 1998587672
Prepared: 924844907
CPU times: total: 15min 21s
Wall time: 24min 28s


# Debug

In [None]:
len(df_final['Patient'].to_list())

194

In [None]:
len(set(df_series['Patient'].to_list()))

194

In [None]:
#corrected (remove the unneccesary files)
df = pd.read_excel('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/Dataset_CTAs/Data/vRAD_Dataset_CTA-2023-01-21-07-56-13.xlsx')

In [None]:
#remove the CTAs
def rm_CTAs(row):
  """Delete the extracted DICOM folder of one row, if it still exists.

  The folder is derived from ``row['Medcave03_DCM']`` by replacing '.zip'
  with '/'; its path is printed before removal. Returns None.
  """
  leftover_dir = row['Medcave03_DCM'].replace('.zip','/')
  #nothing to clean up
  if not os.path.isdir(leftover_dir):
    return
  print(leftover_dir)
  shutil.rmtree(leftover_dir)

In [None]:
df = pd.read_excel('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/Dataset_CTAs/Data/vRAD_Dataset_CTA_Final-2023-01-29-02-20-18.xlsx')

In [None]:
df.apply(rm_CTAs, axis = 1)

0      None
1      None
2      None
3      None
4      None
       ... 
191    None
192    None
193    None
194    None
195    None
Length: 196, dtype: object

In [None]:
unzipped_patient

'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA_Lower_Extremity_Runoff_2022-12-13\\10261071/images/'

In [None]:
#remove if patients and series are the same // will not affect file
#this patient has different reports across the upload folder dates (addendum)
#print(df[df['Patient'] == 278185830]['Full_Report'].iloc[0]) #1
#print(df[df['Patient'] == 278185830]['Medcave03_Report'].iloc[0]) #1
df2 = df.drop_duplicates(subset = ['Patient', 'Series_ID', 'Study', 'Scan', 'Series', 'Num_DICOM', 'Full_Report', 'Max_Patient_DICOM', 'Medcave03_CTA'])
#evidence one patient had different notes
#df2 = df.drop_duplicates(subset = ['Patient', 'Series_ID', 'Study', 'Scan', 'Series', 'Num_DICOM', 'Max_Patient_DICOM'])
#final
#get the duplicates//this will affect file writing//max dicom has two or more series per patient!!!
patients = get_dupes(df2['Patient'].to_list())
#identify the patients to remove based on patient and series name
df2 = df2[~df2['Patient'].isin(patients)]
#next
#TODO: an assignment `df_series = ` was left here without a right-hand side, which is a SyntaxError
#that prevented the whole cell from running; fill in the intended expression before relying on it

In [None]:
#iterate
patient = patients[0]
#subset
df_sub = df[df['Patient'] == patient]
#identify
row = df_sub.iloc[1]

In [None]:
row['Series']

'1mm venous CTA 1.0 CE  venous 1mm venous'

In [None]:
row['Series'].upper()

'1MM VENOUS CTA 1.0 CE  VENOUS 1MM VENOUS'

In [None]:
rm_check = ['VENOUS', 'BONE', 'LOWER LEG', 'MIPS']
series = row['Series']

False


In [None]:
#checker
def checker(row, rm_check = ['VENOUS', 'BONE', 'LEG', 'MIPS']):
  """Return True when the series description of ``row`` contains none of ``rm_check``.

  The comparison is done on ``str(row['Series']).upper()``, so it is
  case-insensitive with respect to the description as long as the entries of
  ``rm_check`` are upper case. The default list is only read, never modified.
  """
  #evaluate every keyword (no short-circuit), then reject on any hit
  hits = [rm in str(row['Series']).upper() for rm in rm_check]
  return not any(hits)

In [None]:
len(set(df_series[df_series.apply(checker, axis = 1) == False]['Patient'].to_list()))

49

In [None]:
df_series[df_series['Patient'] == 1462633036]

Unnamed: 0,Patient,Medcave03_DCM,Medcave03_Report,Medcave03_Series_Files,Series_ID,Study,Scan,Series,Num_DICOM,Full_Report,Max_Patient_DICOM,Manual_DICOM_Selection
1191,1462633036,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.40.0.13.1.17220746704661976583236389967624...,CTA Abdominal Aorta and Bilateral Iliofemoral ...,SIEMENS,SAGITTAL,220,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,False,
1192,1462633036,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.40.0.13.1.23556615124612193628745919404618...,CTA Abdominal Aorta and Bilateral Iliofemoral ...,SIEMENS,SAGITTAL,199,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,False,
1193,1462633036,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.40.0.13.1.28075492554474604697215796230233...,CTA Abdominal Aorta and Bilateral Iliofemoral ...,SIEMENS,CORONAL,102,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,False,
1194,1462633036,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.40.0.13.1.28883217196072182034302981323641...,CTA Abdominal Aorta and Bilateral Iliofemoral ...,SIEMENS,RUN-OFF 3.0 Bv38 2,334,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,False,
1195,1462633036,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.40.0.13.1.32209296587863748537036527816970...,CTA Abdominal Aorta and Bilateral Iliofemoral ...,SIEMENS,LOWER LEGS 3.0 Bv38 2,336,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
1196,1462633036,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.40.0.13.1.33893996227050482297465294009411...,CTA Abdominal Aorta and Bilateral Iliofemoral ...,SIEMENS,CORONAL,102,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,False,


In [None]:
df['Patient'].to_list()

In [None]:
len(df2[df2['Patient'].isin(patients)])

18

In [None]:
#need to process separately
#venous, bone, leg, mips (case insensitive) #the filter needs to be more sensitive to the labeling scheme!!!!

In [None]:
df['Series']

0                     CTA RUNOFF
3           CTA Abdomen w/Runoff
11                  AngioRunOff 
14      CTA RUNOFF  1.0  BR38  4
19                    ABD RUNOFF
                  ...           
1715                        Bone
1722      Angio RunOff  1.0 3DR 
1730                    RUNOFFS 
1733          Run Off  1.0  B26f
1739          STD ARTERIAL 1.25 
Name: Series, Length: 235, dtype: object

In [None]:
df_series[df_series['Patient'] == 1083212565]['Series']

3                   CTA Abdomen w/Runoff
4                       Venous  w/Runoff
5       Coronal Venous Trifurcation MIPs
6     Sagittal Venous Trifurcation MIPs 
7                     Coronal Aorta MIPs
8                     Coronal Iliac MIPs
9                   Sagittal Aorta MIPs 
10                  Sagittal Iliac MIPs 
Name: Series, dtype: object

In [None]:
df_series['Patient']

0         10261071
1         10261071
2         10261071
3       1083212565
4       1083212565
           ...    
1743     926229147
1744     926229147
1745     926229147
1746     926229147
1747     926229147
Name: Patient, Length: 1748, dtype: int64

In [None]:
df['Series'].to_list()

['CTA RUNOFF',
 'CTA Abdomen w/Runoff',
 'AngioRunOff ',
 'CTA RUNOFF  1.0  BR38  4',
 'ABD RUNOFF',
 'Run Off  2.0  Br38  1 ',
 'AXIAL_W ARTERIAL',
 'WITH',
 'CTA RUNOFF  3.0  Br40 ',
 'CTA ',
 'HELICAL RUNOFF WITH ',
 'STANDARD THIN ',
 'CTA Lower ext ',
 'RUNOFF-CONTRAST ',
 'thins delay ',
 '1.25 DMPR ',
 'STD 2.5 ANGIO ',
 '1mm cta CTA 1.0 CE  CTA 1mm cta ',
 '1mm venous CTA 1.0 CE  venous 1mm venous',
 'CTA RUNOFF 1.25X.625 STND ',
 '3 ART THIN AX ',
 'CTA ABD thins for reformat',
 'Runoff Bolus',
 'Renals-Toes  2.0  Br38  3 ',
 'RUNOFF',
 'THINS ',
 'ENHANCED ISOVUE 370 150ML ',
 '0.625 STD ',
 'CTA RUNOFF',
 'AXIAL ',
 '2.5MM STANDARD',
 'ART THIN AX ',
 'CTA ABD PLV RUNOFF ART',
 'RUNOFF-CONTRAST ',
 ' CTA 2.0 CE ',
 '1mm cta CTA 1.0 CE  CTA 1mm cta ',
 'Thins AngioRunOff  1.0  Bv38  3 ',
 'CTA RUNOFF',
 'RUNOFF',
 'AX THIN ',
 '1.25mm CTA',
 'ART THIN AX ',
 'AX RUNOFF, iDose (3)',
 'THINS LEGS',
 'CTA ',
 'ART THIN AX ',
 'HELICAL RUNOFF WITH ',
 'STANDARD THIN ',
 'CTA Runo

In [None]:
df_series[df_series['Series'] == 'CTA RUNOFF']

Unnamed: 0,Patient,Medcave03_DCM,Medcave03_Report,Medcave03_Series_Files,Series_ID,Study,Scan,Series,Num_DICOM,Full_Report,Max_Patient_DICOM,Manual_DICOM_Selection
0,10261071,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,CTA RUNOFF,665,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
192,1620772926,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.840.113619.2.491.3.17438141.80.1671348465.163,CT ANGIOGRAPHY CHEST ABD AORTA W BILATERAL RUN...,GE MEDICAL SYSTEMS,CTA RUNOFF,676,PROCEDURE INFORMATION: \nExam:?CTA Chest With ...,True,
265,278877199,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.3.12.2.1107.5.1.4.105578.3000002212161222440...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,CTA RUNOFF,747,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
383,1331727410,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.3.46.670589.33.1.63806793314615696600001.532...,CT ANGIO AORTA BIL RUN W/WO CO,Philips,CTA RUNOFF,612,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
421,1332645787,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.3.46.670589.33.1.63806707949058066000001.552...,CT ANGIO AORTA BIL RUN W/WO CO,Philips,CTA RUNOFF,629,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
607,1620772926,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.840.113619.2.491.3.17438141.80.1671348465.163,CT ANGIOGRAPHY CHEST ABD AORTA W BILATERAL RUN...,GE MEDICAL SYSTEMS,CTA RUNOFF,676,PROCEDURE INFORMATION: \nExam:?CTA Chest With ...,True,
656,1888719504,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.2.840.113619.2.278.3.587870793.548.167091197...,Abd Aorta CTA w Contr + Runof,GE MEDICAL SYSTEMS,CTA RUNOFF,502,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
752,278877199,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.3.12.2.1107.5.1.4.105578.3000002212161222440...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,CTA RUNOFF,747,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
756,278884141,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.3.46.670589.33.1.63806689592804148300001.509...,CT ANGIO AORTA BIL RUN W/WO CO,Philips,CTA RUNOFF,628,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,
944,1194280913,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\CTA_Lo...,('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset\\CTA...,1.3.12.2.1107.5.1.4.83522.30000023010616320016...,Vascular^CTA_RUNOFF (Adult),SIEMENS,CTA RUNOFF,558,PROCEDURE INFORMATION: \nExam:?CTA Chest With ...,True,


In [None]:
df_series[df_series['Patient'] == 1194349496]['Series']

960    CTA RUNOFF  RTD 
961    LOWER LEGS  RTD 
962          CTA RUNOFF
963            PROX COR
964            PROX SAG
965            MID COR 
966            MID SAG 
967            DIST COR
968            DIST SAG
969    CTA RUNOFF THINS
Name: Series, dtype: object

In [None]:
df_series[df_series['Patient'] == 1194349496	]['Num_DICOM']

960     319
961     140
962     531
963     142
964     177
965     173
966     177
967     173
968     177
969    1146
Name: Num_DICOM, dtype: int64

In [None]:
df['Series']

0                     CTA RUNOFF
3           CTA Abdomen w/Runoff
11                  AngioRunOff 
14      CTA RUNOFF  1.0  BR38  4
19                    ABD RUNOFF
                  ...           
1715                        Bone
1722      Angio RunOff  1.0 3DR 
1730                    RUNOFFS 
1733          Run Off  1.0  B26f
1739          STD ARTERIAL 1.25 
Name: Series, Length: 235, dtype: object

In [None]:
# initializing substring
subs = 'mip'

# using list comprehension
# to get the series descriptions that contain the substring;
# entries that are not strings (e.g. the missing value read back for an empty description)
# are skipped, because `subs in <non-string>` raises the TypeError recorded below
res = [i for i in df_series['Series'].to_list() if isinstance(i, str) and subs in i]

TypeError: ignored

In [None]:
df_series[df_series['Patient'] == patients[7]]['Series']

1409            AXIAL WO
1410              RUNOFF
1411      abd/pelv mips 
1412            leg mips
1413    RUNOFF 2nd PASS 
1414            SAG 2X2 
1415            COR 2X2 
1416            SAG 2X2 
1417            COR 2X2 
1418        COR MIP 10X2
1419        SAG MIP 10X2
1420        COR MIP 10X2
1421        SAG MIP 10X2
Name: Series, dtype: object

In [None]:
df2[df2['Patient'] == patients[7]]['Series']

1411    abd/pelv mips 
1412          leg mips
Name: Series, dtype: object

In [None]:
patients[3]

278185830

[1332482978, 1351643295, 1601022171, 278185830, 546714979, 1489963600, 1730229748, 1998587672, 924844907]


In [None]:
#scratch: re-list the series of one already-extracted patient folder (mirrors the first half of get_info)
#rows of the metadata table for this patient (only referenced by the commented-out line at the end)
l = df_meta[df_meta['Patient'] == 9511168]
#folder with the extracted DICOM files; NOTE(review): must already exist on disk, nothing is unzipped here
unzipped_patient = 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/'
#get the number of series and relate study description and number of slices in each patient's image series
reader = sitk.ImageSeriesReader()
#get the series ids
id_names = reader.GetGDCMSeriesIDs(unzipped_patient) #file subfolder

#data (never filled in this cell)
data = []
#max dicom (unused in this cell)
max_dicom = True
#for each series id
for id_name in id_names:
  #file names of the series; after the loop only the list of the last series remains in series_file_names
  series_file_names = sitk.ImageSeriesReader.GetGDCMSeriesFileNames(
      unzipped_patient,
      id_name
      )
  #l['Series_ID'].to_list()

In [None]:
row

Patient                                                             9511168
Medcave03_DCM             AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9...
Medcave03_Series_Files    ('AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp...
Medcave03_Report          AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9...
Series_ID                 1.2.840.113619.2.358.3.3540674098.890.16711095...
Study                                CT ANGIO AORTA ILIOFEMORAL LEG RUN OFF
Scan                                                     GE MEDICAL SYSTEMS
Series                                                               RUNOFF
Num_DICOM                                                              1043
Full_Report               PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...
Max_Patient_DICOM                                                      True
Manual_DICOM_Selection                                                    h
Name: 1, dtype: object

In [None]:
#scratch: try to rebuild the image from the file list stored in the metadata table
#NOTE(review): this cell ended in the RuntimeError recorded below. Elsewhere in this notebook
#GetGDCMSeriesFileNames is called with (folder, series id); here it receives the value of the
#'Medcave03_Series_Files' column instead, which is presumably the cause - confirm before reuse.
#reader
reader = sitk.ImageSeriesReader()
#convert to nii.gz
dicom_names = reader.GetGDCMSeriesFileNames(row['Medcave03_Series_Files']) #file subfolder
#set
reader.SetFileNames(dicom_names)
#execute
output = reader.Execute()

RuntimeError: ignored

In [None]:
row['Series_ID']

'1.2.840.113619.2.358.3.3540674098.890.1671109505.329'

In [None]:
row['Medcave03_Series_Files']

"nishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1043.dcm', 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1042.dcm', 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1041.dcm', 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1040.dcm', 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1039.dcm', 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1038.dcm', 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1037.dcm', 'AnishSalvi/ImageRx/PAD-Net/

In [None]:
res = tuple(map(str, ))

In [None]:
res = list(map(str, row['Medcave03_Series_Files'].split(', ')))

In [None]:
#convert to nii.gz
dicom_names = reader.GetGDCMSeriesFileNames(res) #file subfolder
#set
reader.SetFileNames(dicom_names)
#execute
output = reader.Execute()

TypeError: ignored

In [None]:
res = [ele.replace('(', '') for ele in res]

In [None]:
res

["'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1043.dcm'",
 "'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1042.dcm'",
 "'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1041.dcm'",
 "'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1040.dcm'",
 "'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1039.dcm'",
 "'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1038.dcm'",
 "'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset_temp/9511168/images/\\\\s53018917-1.2.840.113619.2.358.3.3540674098.890.1671109505.331.1037.dcm'",
 "'Ani

In [None]:


  #get the MedCave03 path (can vary)
  patient_folder = base_path + row['Patient'] + '/'

  #the zipped dicom path
  dcm_path = patient_folder + 'images.zip'

  #the report path
  report_path = patient_folder + 'report.txt'

  #save path MedCave03
  save_path = patient_folder + 'CTA_' + row['Patient'] + '.nii.gz'

  #create temporary directroy
  unzipped_patient = patient_folder + 'images/'

  #only if the CTA does not already exist (should remove!)
  if os.path.isdir(unzipped_patient) == False:
    #create
    os.mkdir(unzipped_patient)

    #now unzip the images (temporary)
    unzip(dcm_path, unzipped_patient)

    #read in the full report (can modify later)
    with open(report_path, 'r') as f:
      full_report = f.read()

    #get the number of series and relate study description and number of slices in each patient's image series
    reader = sitk.ImageSeriesReader()

    #get the series ids
    id_names = reader.GetGDCMSeriesIDs(unzipped_patient) #file subfolder

    #data
    data = []

    #max dicom
    max_dicom = True

    #for each series id
    for id_name in id_names:

      #obtain one instance of the file
      series_file_names = sitk.ImageSeriesReader.GetGDCMSeriesFileNames(
          unzipped_patient,
          id_name
          )

      #specify
      reader_file = sitk.ImageFileReader()
      reader_file.SetFileName(series_file_names[0])
      reader_file.LoadPrivateTagsOn()
      reader_file.ReadImageInformation()
      #get
      study_des = reader_file.GetMetaData('0008|1030')
      series_des = reader_file.GetMetaData('0008|103e')
      scan_des = reader_file.GetMetaData('0008|0070')
      num_dicom = len(series_file_names)

      #collect all the relevant info
      data.append([row['Patient'], dcm_path, series_file_names, report_path, save_path, id_name, study_des, scan_des, series_des, num_dicom, full_report])

    #delete the temporary folder
    shutil.rmtree(unzipped_patient)

    #prepare the individual pandas to later be concatenated (reorder)
    patient_info = pd.DataFrame(data, columns = ['Patient', 'Medcave03_DCM', 'Medcave03_Series_Files', 'Medcave03_Report', 'Medcave03_CTA', 'Series_ID', 'Study', 'Scan', 'Series', 'Num_DICOM', 'Full_Report'])

    #maximize
    patient_info['Max_Patient_DICOM'] = np.where(patient_info['Num_DICOM'] == patient_info['Num_DICOM'].max(), True, False)

    #add the annotation columns
    patient_info = patient_info.reindex(columns = patient_info.columns.tolist() + add_columns)

    #return
    patient_info


SyntaxError: ignored

In [None]:
patient_info

Unnamed: 0,Patient,Medcave03_DCM,Medcave03_Series_Files,Medcave03_Report,Medcave03_CTA,Series_ID,Study,Scan,Series,Num_DICOM,Full_Report,Max_Patient_DICOM,Manual_DICOM_Selection,PAD_50_Annotation,PAD_50_Annotation_Notes
0,10261071,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,CTA RUNOFF,665,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,True,,,
1,10261071,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,COR,191,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,False,,,
2,10261071,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,SAG,211,PROCEDURE INFORMATION: \nExam:?CTA Abdominal A...,False,,,


In [None]:
#if there is reference then do following

#if the patient row already exists // then update that csv with his information

#else use the above script and insert where required

#else skip over and execute the rest None case

#can update row-wise but not columns

#implement for selection of the runoff sequence then annotation (x2)


In [None]:
ls = (patient_info, patient_info)

In [None]:
pd.concat(ls, ignore_index = True)

Unnamed: 0,Medcave03_DCM,Medcave03_Series_Files,Patient,Series_ID,Study,Scan,Series,Num_DICOM
0,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,10261071,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,CTA RUNOFF,665
1,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,10261071,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,COR,191
2,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,10261071,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,SAG,211
3,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,10261071,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,CTA RUNOFF,665
4,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,10261071,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,COR,191
5,AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/102610...,(AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/10261...,10261071,1.3.12.2.1107.5.1.4.105578.3000002212122339456...,CT ANGIOGRAM ABDOMINAL AORTA AND BILATERAL ILI...,SIEMENS,SAG,211


In [None]:
%%time

#main script

#specify the basepath (this can be changed depending on which cd into at start)
base_path = 'AnishSalvi/ImageRx/PAD-Net/vRAD_Dataset/'

#need to get all the patient folder names
patient_name = [name for name in sorted(os.listdir(base_path)) if os.path.isdir(os.path.join(base_path, name))]

#place inside the pandas dataframe
df = pd.DataFrame(patient_name, columns = ['Patient'])

#specify the reference csv if it exists
#df_refer = pd.read_csv('')

CPU times: total: 0 ns
Wall time: 6 ms
