In [1]:
import os
from pathlib import Path
import matplotlib.pyplot as plt
import pydicom
import numpy as np
import pandas as pd
import SimpleITK as sitk
from tqdm.auto import tqdm
import glob
import pandas as pd
from platipy.imaging import ImageVisualiser
from platipy.dicom.io.rtstruct_to_nifti import convert_rtstruct, read_dicom_image

In [2]:
# Root folders used throughout the notebook: the DICOM export that is read,
# plus the NIfTI and snapshot folders that are written to.
data_directory = '../../data/UTSW_HNC/dicom_free'
nifti_dir = '../../data/UTSW_HNC/Nii_free'
vis_dir = '../../data/UTSW_HNC/vis_snapshots'

# Everything directly under the DICOM root, as plain strings from glob.
patient_dirs = glob.glob(data_directory + "/*")

# pathlib views of the same three locations.
data_path, nii_path, vis_path = (
    Path(folder) for folder in (data_directory, nifti_dir, vis_dir)
)

# Create both output folders (and any missing parents) up front.
for _out_dir in (nii_path, vis_path):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Markers looked for in a CT StudyDescription to flag a head-and-neck study.
# NOTE(review): the entries are joined into a regular expression by the
# selection loop, so the '.' in 'h.n' matches ANY character there - confirm
# that this wildcard behaviour is intended.
study_descriptions = ['h/n', 'h.n', 'neck', 'h&n']
# least_recent = pd.to_datetime(df['StudyDate']).min()

# Patient IDs are the names of the entries found directly under nii_path.
patients = [entry.name for entry in nii_path.glob('*')]

# Worksheets read from each per-patient metadata workbook.
sheet_names = ['CT', 'RTSTRUCT']

In [70]:
# Manual spot check: resolve the CT folder and the RTSTRUCT file of a single,
# hand-picked patient/study from that patient's metadata workbook.
patient = "71915981"
study_id = "1.2.840.113704.1.111.4620.1383670451.6"

# One DataFrame per requested sheet, each indexed by StudyInstanceUID so a
# study can be looked up directly with .loc.
meta_df = {
    sheet: table.set_index('StudyInstanceUID')
    for sheet, table in pd.read_excel(
        data_path / patient / f"patient.{patient}.xlsx",
        sheet_name=sheet_names,
    ).items()
}

# Locations relative to the patient folder.
ct_folder = "{}/CT.{}".format(
    study_id, meta_df['CT'].loc[study_id, 'SeriesInstanceUID']
)
# Alternative layout with a per-series RTSTRUCT sub-folder (disabled):
#rtstruct_file = f"{study_id}/RTSTRUCT.{meta_df['RTSTRUCT'].loc[study_id]['SeriesInstanceUID']}/{meta_df['RTSTRUCT'].loc[study_id]['SOPInstanceUID']}.dcm"
rtstruct_file = "{}/{}.dcm".format(
    study_id, meta_df['RTSTRUCT'].loc[study_id, 'SOPInstanceUID']
)

# Full paths under the DICOM root.
ct_path = data_path / patient / ct_folder
rtstruct_path = data_path / patient / rtstruct_file
#rtstruct = pydicom.dcmread(rtstruct_path)
#ct_dir = Path("../../data/UTSW_HNC/DIMCOM_Data02-22-2024-142556/70718318/1.2.840.113704.1.111.4620.1383670451.6/CT.1.2.840.113704.1.111.5892.1383673054.8")

In [9]:
# Select, for every patient, the initial head-and-neck study and locate its
# CT folder and RTSTRUCT file.
#
# Per patient the loop:
#   1. opens patient.<id>.xlsx once and reads the 'CT' and 'RTSTRUCT' sheets,
#      both indexed by StudyInstanceUID;
#   2. keeps the CT rows whose StudyDescription matches `study_descriptions`
#      (case-insensitive regular-expression search);
#   3. takes the study with the earliest StudyDate;
#   4. resolves the CT series folder and the RTSTRUCT file (the structure set
#      with the largest StructureSetTime when the study has several);
#   5. prints both paths and whether they exist on disk.
#
# Patients failing a step are printed with the reason and collected in
# `pats_to_remove`; the chosen study per remaining patient is stored in
# `study_ids` and written to recurrence_free_study_ids.csv under data_path.


def _exclude(pat, reason):
    """Print why `pat` is skipped and record it in the global pats_to_remove."""
    print(f"    {reason}")
    pats_to_remove.append(pat)


pats_to_remove = []
study_ids = {}

# Loop-invariant regex: alternation of all head-and-neck markers.
hn_pattern = '|'.join(study_descriptions)

for pat in patients:
    print(pat)
    patient_data_path = data_path.joinpath(pat)
    patient_nii_path = nii_path.joinpath(pat)
    patient_nii_path.mkdir(exist_ok=True, parents=True)
    # Only referenced by the disabled conversion step at the end of this cell.
    patient_image_path = patient_nii_path.joinpath('image.nii.gz')
    workbook_path = patient_data_path.joinpath(f"patient.{pat}.xlsx")

    # Open the workbook once and read both sheets from the same handle.
    # `except Exception` keeps the best-effort skipping of broken patients
    # without swallowing KeyboardInterrupt/SystemExit like a bare except would.
    try:
        workbook = pd.ExcelFile(workbook_path)
    except Exception as err:
        _exclude(pat, f"no CT available ({type(err).__name__}: {err})")
        continue
    with workbook:
        try:
            ct_df = pd.read_excel(workbook, sheet_name='CT').set_index('StudyInstanceUID')
        except Exception as err:
            _exclude(pat, f"no CT available ({type(err).__name__}: {err})")
            continue
        try:
            struct_df = pd.read_excel(workbook, sheet_name='RTSTRUCT').set_index('StudyInstanceUID')
        except Exception as err:
            _exclude(pat, f"no RTStruct available ({type(err).__name__}: {err})")
            continue

    # 1-D boolean mask: True for CT rows whose description matches a H/N marker.
    # A missing column raises KeyError; a column without string values raises
    # AttributeError from the .str accessor.
    try:
        hn_flag = (
            ct_df['StudyDescription']
            .str.contains(hn_pattern, case=False, regex=True, na=False)
            .to_numpy()
        )
    except (AttributeError, KeyError) as err:
        _exclude(pat, f"no relevant StudyDescription ({type(err).__name__}: {err})")
        continue

    if not hn_flag.any():
        _exclude(pat, "No HN entry in metadata")
        continue
    ct_df = ct_df[hn_flag]

    # Earliest matching study (to get the initial scan).  If every StudyDate is
    # missing the comparison selects nothing, so skip instead of crashing.
    earliest_rows = ct_df[ct_df['StudyDate'] == ct_df['StudyDate'].min()]
    if earliest_rows.empty:
        _exclude(pat, "no usable StudyDate")
        continue
    study_id = earliest_rows.index[0]
    print(study_id)

    # A study can hold several CT series; selecting with a list always yields a
    # Series, so a single UID is used instead of a Series repr in the path.
    ct_series_ids = ct_df.loc[[study_id], 'SeriesInstanceUID']
    if len(ct_series_ids) > 1:
        print(f"    {len(ct_series_ids)} CT series in this study, using the first listed")
    ct_folder = f"{study_id}/CT.{ct_series_ids.iloc[0]}"
    ct_path = patient_data_path.joinpath(ct_folder)

    if study_id not in struct_df.index:
        _exclude(pat, "relevant study does not have an RTStruct")
        continue

    study_structs = struct_df.loc[[study_id]]
    if len(study_structs) > 1:
        # Several structure sets: keep the one with the largest StructureSetTime.
        # NOTE(review): only the time column is compared - if structure sets can
        # be created on different days a date column should be considered too.
        set_times = study_structs['StructureSetTime']
        latest = study_structs[(set_times == set_times.max()).to_numpy()]
        if latest.empty:
            print("    no usable StructureSetTime, using the first listed RTSTRUCT")
            latest = study_structs.iloc[[0]]
        series_id = latest['SeriesInstanceUID'].iloc[0]
        file_id = latest['SOPInstanceUID'].iloc[0]
        print(series_id, file_id)
    else:
        series_id = study_structs['SeriesInstanceUID'].iloc[0]
        file_id = study_structs['SOPInstanceUID'].iloc[0]

    rtstruct_file = f"{study_id}/RTSTRUCT.{series_id}/{file_id}.dcm"
    rtstruct_path = patient_data_path.joinpath(rtstruct_file)
    print(ct_path.as_posix())
    print(f"ct good: {os.path.exists(ct_path)}")
    print(rtstruct_path.as_posix())
    print(f"rtstruct good: {os.path.exists(rtstruct_path)}")

    study_ids[pat] = study_id

# Persist the patient -> study mapping (patient ID as index, one 'study_id' column).
study_id_df = pd.DataFrame(
    {'study_id': list(study_ids.values())},
    index=list(study_ids.keys()),
)
study_id_df.to_csv(data_path.joinpath('recurrence_free_study_ids.csv'))

# Disabled conversion step, kept for reference (it belongs inside the loop above):
    #try:
    #    rtstruct = pydicom.dcmread(rtstruct_path)
    #except:
    #    print(f"    unable to read RTSTRUCT")
    #    continue
    #    #rtstruct_file = f"{study_id}/{file_id}.dcm"
    #    #rtstruct_path = patient_data_path.joinpath(rtstruct_file)
    #    #rtstruct = pydicom.dcmread(rtstruct_path)
    #    
    #bad_specials = ['?', '\\', '>', '<', '+', '=', '|']
    #for seq in rtstruct.StructureSetROISequence:
    #    if np.any([spec in seq.ROIName for spec in bad_specials]):
    #        seq.ROIName = ''.join(filter(str.isalnum, seq.ROIName))
    #rtstruct.save_as(rtstruct_path)
    #try:
    #    convert_rtstruct(ct_path, rtstruct_path, output_dir=patient_nii_path, output_img=patient_image_path)
    #except:
    #    print(f"    unable to read CT data")
    #    continue
    #    #ct_path = patient_data_path.joinpath(f"{study_id}/CT")
    #    #convert_rtstruct(ct_path, rtstruct_path, output_dir=patient_nii_path, output_img=patient_image_path)

40014212
    No HN entry in metadata
5725276
1.2.840.113704.1.111.2084.1532090425.5
../../data/UTSW_HNC/dicom_free/5725276/1.2.840.113704.1.111.2084.1532090425.5/CT.1.2.840.113704.1.111.7888.1532092823.17
ct good: True
../../data/UTSW_HNC/dicom_free/5725276/1.2.840.113704.1.111.2084.1532090425.5/RTSTRUCT.1.2.246.352.71.2.973432337593.1085895.20180731134456/1.2.246.352.71.4.973432337593.102704.20180731134455.dcm
rtstruct good: True
5744449
    no relevant StudyDescription
5759052
1.2.840.113704.1.111.1360.1468437286.13
../../data/UTSW_HNC/dicom_free/5759052/1.2.840.113704.1.111.1360.1468437286.13/CT.1.2.840.113704.1.111.6668.1468443981.8
ct good: True
../../data/UTSW_HNC/dicom_free/5759052/1.2.840.113704.1.111.1360.1468437286.13/RTSTRUCT.2.16.840.1.113669.2.931128.175282973.20170719124746.621339/2.16.840.1.113669.2.931128.175282973.20170719124747.120583.dcm
rtstruct good: True
5772213
1.2.840.113704.1.111.5776.1390948818.1
../../data/UTSW_HNC/dicom_free/5772213/1.2.840.113704.1.111.5776

In [36]:
# Keep only the target-volume masks in every patient's NIfTI folder.
#
# WARNING: this cell is destructive - every Struct*.nii.gz file that is not
# kept is deleted from disk, and patient folders without any Struct mask are
# removed when they are empty.

# Patients for which masks with 'ctv' in the file name are kept as well.
# NOTE(review): presumably these patients have no usable GTV contour - confirm.
# ('94942437' was listed twice in the original list; membership is unchanged.)
use_ctv = ['70346638',
           '70366494',
           '72517194',
           '73057886',
           '90570798',
           '90639305',
           '90888477',
           '91326601',
           '92396528',
           '92635309',
           '92686560',
           '93001432',
           '93756298',
           '94268676',
           '94362734',
           '94942437',
           '93508071',
           '93573753',
           '93592484',
           '94313318',
           '94352737',]

patients = [pth.name for pth in nii_path.glob('*')]
for pat in patients:
    print(pat)
    gtvs = []
    patient_nii_path = nii_path.joinpath(pat)
    # Sorted so that the printed listing is deterministic across runs.
    struct_files = sorted(patient_nii_path.glob('Struct*.nii.gz'))

    if len(struct_files) < 1:
        # os.rmdir only removes empty directories; a folder that still holds
        # other files (or an entry that is not a directory) is reported and
        # left in place instead of aborting the whole loop.
        try:
            os.rmdir(patient_nii_path)
            print(f"    patient removed")
        except OSError as err:
            print(f"    no Struct masks, but folder could not be removed ({err})")
        continue

    for struct in struct_files:
        # Match on the file name only, so that directory names containing
        # 'gtv' or 'ctv' cannot cause every mask to be kept.
        mask_name = struct.name
        lowered = mask_name.lower()
        if 'gtv' in lowered or (pat in use_ctv and 'ctv' in lowered):
            gtvs.append(struct)
            print(f"    {mask_name}")
        else:
            # Irreversible: the mask file is deleted from disk.
            os.remove(struct)
    if len(gtvs) < 1:
        print("no gtv masks available")
    #print(gtvs)

5725276
    Struct_GTVn_Ltneck.nii.gz
    Struct_GTVn_RtNeck.nii.gz
    Struct_GTVp.nii.gz
5759052
    Struct_GTV_66.nii.gz
5772213
    Struct_GTVLN3cm.nii.gz
    Struct_GTV_Primary.nii.gz
70031056
    Struct_GTV-Nodes.nii.gz
    Struct_GTV-Primary.nii.gz
70092288
    Struct_GTV-post-chemo-node.nii.gz
    Struct_GTV_SGL_Larynx.nii.gz
70156435
    Struct_GTV.nii.gz
70240815
    Struct_GTV-neck.nii.gz
70346638
    Struct_CTVp_56Gy.nii.gz
    Struct_CTVp_56Gy_-_Grow.nii.gz
    Struct_CTVp_60Gy.nii.gz
    Struct_CTVp_60Gy_-_Grow.nii.gz
    Struct_Union_-_CTVp_60G.nii.gz
70366494
    Struct_CTV1_6930.nii.gz
    Struct_CTV2_5610.nii.gz
70368270
    Struct_GTV-R_level_II.nii.gz
    Struct_GTV_L_JD.nii.gz
    Struct_GTV_L_neck.nii.gz
    Struct_GTV_pre-chemo-L_tonsil.nii.gz
70368603
    Struct_GTV_NODE.nii.gz
    Struct_GTV_PRIMARY.nii.gz
70381565
    Struct_niGTV70.nii.gz
    Struct_nsGTV66.5.nii.gz
    Struct_pGTV.nii.gz
70391763
    Struct_GTVn.nii.gz
    Struct_GTVns.nii.gz
    Struct_GTVp

In [74]:
# Merge the three nodal GTV mask files of one patient into a single mask that
# is written to Struct_GTVns.nii.gz in the same folder.
pat = '97085621'
patient_nii_path = nii_path.joinpath(pat)
gtvn1_path = patient_nii_path.joinpath('Struct_GTVns-5600.nii.gz')
gtvn2_path = patient_nii_path.joinpath('Struct_GTVns-6300.nii.gz')
gtvn3_path = patient_nii_path.joinpath('Struct_GTVni-7000.nii.gz')
out_path = patient_nii_path.joinpath('Struct_GTVns.nii.gz')
gtvn1 = sitk.ReadImage(gtvn1_path)
gtvn2 = sitk.ReadImage(gtvn2_path)
gtvn3 = sitk.ReadImage(gtvn3_path)

# Voxel-wise union rather than a sum: adding the masks produces values above 1
# wherever the contours overlap (and can wrap around for 0/255 uint8 masks).
# Each `> 0` comparison yields a 0/1 image and `|` combines them with a
# logical OR, so the result is always binary.
# NOTE(review): assumes foreground is encoded as any value > 0 and that a 0/1
# output is what downstream code expects - confirm.
gtvns = (gtvn1 > 0) | (gtvn2 > 0) | (gtvn3 > 0)
# Keep the pixel type of the input masks for the written file.
gtvns = sitk.Cast(gtvns, gtvn1.GetPixelID())

sitk.WriteImage(gtvns, out_path)

In [41]:
# Number of entries in `patients`, i.e. the names listed under nii_path by
# whichever cell assigned `patients` most recently.
len(patients)

124