In [1]:
import asyncio
import os
import subprocess
import tempfile
from pathlib import Path

import aiohttp
import ipywidgets as widgets
from idc_index import index
from imgtools.dicom.sort import DICOMSorter
from imgtools.logging import logger as imgtools_logger
from pydicom import dcmread
import random # noqa

# Set the imgtools logger level to WARNING for the rest of the notebook
imgtools_logger.setLevel("WARNING")

# SETUP 

In [2]:
# Instantiate the IDC index client used by every later cell
client = index.IDCClient()
idc_version = client.get_idc_version()
print(f'Current IDC Version: {idc_version}')

# Collect the collection IDs known to the client, in sorted order
collections = list(client.get_collections())
collections.sort()
print(f"Found {len(collections)} collections")

# Bare last expression so the notebook renders the per-collection summary
client.collection_summary

Current IDC Version: v17
Found 142 collections


Unnamed: 0_level_0,Modality,series_size_MB
collection_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4d_lung,"[RTSTRUCT, CT]",183054.14
acrin_6698,"[MR, SEG]",841956.27
acrin_contralateral_breast_mr,"[MR, CR]",199592.57
acrin_flt_breast,"[PT, CT, OT]",74235.64
acrin_nsclc_fdg_pet,"[PT, CT, DX, CR, NM, MR, SC, SEG]",145677.88
...,...,...
tcga_uvm,[SM],102250.70
upenn_gbm,[MR],139399.35
vestibular_schwannoma_mc_rc,[MR],10015.97
vestibular_schwannoma_seg,"[RTDOSE, RTSTRUCT, RTPLAN, MR]",28194.20


# Filter Collections that have both RTSTRUCTS and CTs

In [3]:
# For every collection, gather the distinct modalities that appear in the IDC index
modalities_per_collection = (
  client.index
  .groupby('collection_id')['Modality']
  .agg(set)
)

# Keep only the collections whose modality set contains both RTSTRUCT and CT
required_modalities = {'RTSTRUCT', 'CT'}
has_required_modalities = modalities_per_collection.map(required_modalities.issubset)
rt_ct_collections = modalities_per_collection.loc[has_required_modalities]

print(f"Found {len(rt_ct_collections)} collections with both RTSTRUCT and CT")
rt_ct_collections

Found 13 collections with both RTSTRUCT and CT


collection_id
4d_lung                                           {CT, RTSTRUCT}
cc_tumor_heterogeneity               {RTSTRUCT, REG, MR, CT, PT}
cptac_ccrcc                               {SM, CT, MR, RTSTRUCT}
cptac_pda                         {US, SM, RTSTRUCT, MR, CT, PT}
cptac_ucec                            {SM, RTSTRUCT, MR, CT, PT}
lctsc                                             {CT, RTSTRUCT}
nsclc_radiomics                          {SEG, CT, SR, RTSTRUCT}
nsclc_radiomics_interobserver1               {SEG, CT, RTSTRUCT}
pancreatic_ct_cbct_seg                    {RTDOSE, CT, RTSTRUCT}
pediatric_ct_seg                                  {CT, RTSTRUCT}
prostate_anatomical_edge_cases                    {CT, RTSTRUCT}
rider_lung_ct                        {PR, RTSTRUCT, SEG, SR, CT}
soft_tissue_sarcoma                       {CT, PT, MR, RTSTRUCT}
Name: Modality, dtype: object

# Select a Collection

The default is `nsclc_radiomics`.

For the sake of the demonstration, the dropdown options are disabled.

In [4]:
# Collection ID used below to filter the IDC index and to name the output directories
COLLECTION_ID = 'nsclc_radiomics'

In [5]:

# Restrict the IDC index to the chosen collection, keeping only the columns used below
matching_series = client.index.loc[
  client.index.collection_id == COLLECTION_ID,
  ['SeriesInstanceUID', 'Modality', "series_size_MB"],
]
print(f"Found {len(matching_series)} series in collection {COLLECTION_ID}")

# Build one (label, SeriesInstanceUID) pair per RTSTRUCT series.
# Rows are filtered with a boolean mask first so only RTSTRUCT rows are iterated.
# The subscripts inside the f-string use double quotes: reusing the enclosing
# single quote inside a replacement field is a SyntaxError before Python 3.12.
rtstruct_series = matching_series[matching_series['Modality'] == 'RTSTRUCT']
options = [
  (
    f'SeriesUID-{row["SeriesInstanceUID"][-10:]} [Modality: {row["Modality"]}; Size: {row["series_size_MB"]}MB]',
    row["SeriesInstanceUID"],
  )
  for _, row in rtstruct_series.iterrows()
]
print(f"Found {len(options)} RTSTRUCT series in collection {COLLECTION_ID}")

Found 4926 series in collection nsclc_radiomics
Found 422 RTSTRUCT series in collection nsclc_radiomics


## Setting up Directories To Download


In [6]:
# s5cmd path as reported by the IDC client
s5cmd = client.s5cmdPath

# Local (relative) directory that receives the final data
DATA_DIR = Path('data')

# Scratch directory for raw downloads before they are moved into DATA_DIR.
# mkdtemp() already creates the directory; the mkdir call is kept as a harmless guard.
tmp_dir_name = tempfile.mkdtemp()
TMP_DIR = Path(tmp_dir_name)
TMP_DIR.mkdir(parents=True, exist_ok=True)



## Download chosen RTSTRUCT and CT files

1. Download the RTSTRUCT
2. Query the RTSTRUCT's metadata for the CT `SeriesInstanceUID` it references
3. Download the CT files corresponding to the `SeriesInstanceUID`

In [7]:
async def download_file(s3_url: str, output_dir: str, progress) -> None:
	"""
	Download a file from a public S3 bucket URL to the specified directory.

	The file is saved under the last path component of the URL. If the server
	answers with anything other than HTTP 200, the failure is printed and
	nothing is written, so an error response body is never saved to disk under
	the name of the requested file.

	Parameters
	----------
	s3_url : str
		The S3 URL of the file (e.g., s3://bucket/key).
	output_dir : str
		The directory where the file will be saved. Created if missing.
	progress
		Object with an integer ``value`` attribute (e.g. an
		``ipywidgets.IntProgress``). It is incremented by one after the file has
		been written successfully; failed downloads do not advance it.
	"""
	# Convert the S3 URL to a path-style HTTPS URL
	http_url = s3_url.replace("s3://", "https://s3.amazonaws.com/")

	filename = os.path.basename(http_url)
	output_path = os.path.join(output_dir, filename)

	# Ensure the output directory exists
	os.makedirs(output_dir, exist_ok=True)

	async with aiohttp.ClientSession() as session:
		async with session.get(http_url) as response:
			if response.status != 200:
				# Best effort: report and skip this file instead of aborting the
				# whole batch, but do NOT fall through and save the error body.
				print(f"Failed to fetch {s3_url}. HTTP status: {response.status}")
				return

			# Stream the body to disk in small chunks
			with open(output_path, "wb") as file:
				async for chunk in response.content.iter_chunked(1024):
					file.write(chunk)

	# Only successfully saved files count towards the progress bar
	progress.value += 1

async def download_files_from_list(s3_paths: list[str], output_dir: str) -> None:
	"""
	Concurrently download every S3 URL in a list into one directory.

	A notebook progress bar sized to the number of URLs is displayed and handed
	to each individual download so it can be advanced as files complete.

	Parameters
	----------
	s3_paths : list[str]
		List of S3 URLs to download.
	output_dir : str
		The directory where all files will be saved.
	"""
	progress_bar = widgets.IntProgress(
		min=0,
		max=len(s3_paths),
		value=0,
		description="Downloading:",
		bar_style="info",
		orientation="horizontal",
	)
	display(progress_bar)

	# One coroutine per URL; gather() runs them concurrently and waits for all of them
	downloads = []
	for s3_url in s3_paths:
		downloads.append(download_file(s3_url, output_dir, progress_bar))
	await asyncio.gather(*downloads)

In [8]:
random.seed(42)

NUM_RTSTRUCTS = 3

rtstruct_uids = [options[i][1] for i in random.sample(range(len(options)), NUM_RTSTRUCTS)]

rt_paths = [client.get_series_file_URLs(uid)[0] for uid in rtstruct_uids]
await download_files_from_list(rt_paths, TMP_DIR)

patients = client.index[client.index.SeriesInstanceUID.isin(rtstruct_uids)].PatientID.unique()

IntProgress(value=0, bar_style='info', description='Downloading:', max=3)

In [9]:

# Collect the file URLs of every CT series referenced by a downloaded RTSTRUCT
ct_paths = []
for rt in TMP_DIR.iterdir():
    # Header-only read, limited to the tags needed here
    ds = dcmread(rt, stop_before_pixels=True, specific_tags=['ReferencedFrameOfReferenceSequence', 'StructureSetROISequence', 'Modality'])
    if ds.Modality != 'RTSTRUCT':
        # Without `continue` the code below would run on a non-RTSTRUCT dataset
        # (e.g. CT slices already in TMP_DIR when this cell is re-run) and read
        # attributes that only an RTSTRUCT carries.
        print(f"Skipping non-RTSTRUCT file: {rt}")
        continue
    # Follow frame of reference -> study -> series, taking the first entry at each level
    referenced_ct = ds.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].SeriesInstanceUID
    ct_paths.extend(client.get_series_file_URLs(referenced_ct))

print(f'Found {len(ct_paths)} CT files to download for {len(rt_paths)} RTSTRUCT series')


Found 412 CT files to download for 3 RTSTRUCT series


In [10]:
print("Downloading CT files...")
await download_files_from_list(ct_paths, TMP_DIR)
print("CT files downloaded")

Downloading CT files...


IntProgress(value=0, bar_style='info', description='Downloading:', max=412)

CT files downloaded


# Sort the dicom files into an appropriate structure

Each downloaded DICOM file is named with a unique UUID.
This makes it difficult to tell which files are related to each other.

We will sort the files into a directory structure that makes it easier to understand the relationships between the files.

This uses `Med-ImageTools`' `DICOMSorter` class to sort the files into a directory structure.

The structure we are aiming for is:

```console
./data/images/<collection_name>/dicoms/
└── Patient-<PatientID>
    └── StudyUID-<StudyInstanceUID>
        ├── <Modality>_SeriesUID-<SeriesInstanceUID>
        └── <Modality2>_SeriesUID-<SeriesInstanceUID>
            ├── DICOM-FILE
            └── DICOM-FILE
```

**Note:**
Earlier, we downloaded the data to a temporary directory, so we will perform a `move` operation with the sorter.
If you do not want to move your input data, you can use the `symlink` option to create symbolic links to the files instead of moving them.

In [11]:
# Both the sorted DICOMs and the NIfTI outputs live under data/images/<collection>/
collection_image_dir = DATA_DIR / "images" / COLLECTION_ID
sorted_path = collection_image_dir / "dicoms"
NIFTI_OUTPUT_DIR = collection_image_dir / "niftis"

# Target layout; each %Keyword names a DICOM attribute used in the folder names
target_pattern = (
  sorted_path
  / "Patient-%PatientID/StudyUID-%StudyInstanceUID/%Modality_SeriesUID-%SeriesInstanceUID/"
).as_posix()

dicomsorter = DICOMSorter(
  source_directory=TMP_DIR.absolute(),
  target_pattern=target_pattern,
)
# "move" relocates the files out of the temporary download directory
dicomsorter.execute(action="move")


Output()

Output()

In [12]:
print("New Directory Structure: ")
# `tree -d` lists directories only; requires the `tree` executable on PATH
tree_command = ["tree", "-d", sorted_path.absolute()]
subprocess.run(tree_command)

New Directory Structure: 
[01;34m/Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms[0m
├── [01;34mPatient-LUNG1-101[0m
│   └── [01;34mStudyUID-27911[0m
│       ├── [01;34mCT_SeriesUID-55665[0m
│       └── [01;34mRTSTRUCT_SeriesUID-25865[0m
├── [01;34mPatient-LUNG1-108[0m
│   └── [01;34mStudyUID-62453[0m
│       ├── [01;34mCT_SeriesUID-81484[0m
│       └── [01;34mRTSTRUCT_SeriesUID-99496[0m
└── [01;34mPatient-LUNG1-162[0m
    └── [01;34mStudyUID-21249[0m
        ├── [01;34mCT_SeriesUID-72433[0m
        └── [01;34mRTSTRUCT_SeriesUID-38612[0m

13 directories


CompletedProcess(args=['tree', '-d', PosixPath('/Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms')], returncode=0)

In [13]:
# Modalities passed to the dataset loader
MODALITIES = ["CT", "RTSTRUCT"]

# pyradiomics.yaml is expected one level above the current working directory
PYRADIOMICS_CONFIG = Path.cwd().parent / "pyradiomics.yaml"
RANDOM_SEED = 10

# Negative-control image types generated for every CT
IMAGE_TYPES = [
  "shuffled_full",
  "shuffled_roi",
  "shuffled_non_roi",
  "randomized_sampled_full",
  "randomized_sampled_roi",
  "randomized_sampled_non_roi",
]

ROI_NAME = "GTV"
# Regex so that ROIs such as "GTV 1" or "GTV 2" are all matched under ROI_NAME
roi_match_pattern = {ROI_NAME: "^(GTV.*)$"}

In [14]:
from imgtools.autopipeline import ImageAutoInput

# Index the sorted DICOM directory for the modalities of interest.
# The loader takes the modalities as one comma-separated string.
modalities_csv = ",".join(MODALITIES)
dicom_root = sorted_path.absolute()

dataset = ImageAutoInput(
  dir_path=dicom_root,
  modalities=modalities_csv,
  update=True,  # NOTE(review): presumably forces the directory to be re-indexed; confirm
  n_jobs=-1,  # NOTE(review): presumably "use all available cores"; confirm
)


  0%|                                                                                                                                                                                                 | 0/3 [00:00<?, ?it/s]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 224.47it/s]




  relevant_study_id = self.df_new.loc[(self.df_new.edge_type.str.contains(regex_term)), "study_x"].unique()


In [15]:
from functools import partial
from readii import loaders as rdloaders
from readii.feature_extraction import generateNegativeControl
from readii.io.writers.nifti_writer import NIFTIWriter

# Output layout below NIFTI_OUTPUT_DIR. Every {placeholder} has to be supplied as a
# keyword argument to `save`; IMAGE_ID tells apart the images written for one series.
filename_format = "SubjectID-{PatientID}/StudyUID-{StudyInstanceUID}/{Modality}_SeriesUID-{SeriesInstanceUID}/{IMAGE_ID}.nii.gz"
neg_nifti_writer = NIFTIWriter(
  root_directory=NIFTI_OUTPUT_DIR,
  filename_format=filename_format,
  overwrite=True,
)

for patient in dataset.df_combined.itertuples():
  print(f"Loading data for subject {patient.Index} : patient {patient.patient_ID}")

  # --- Load the CT volume -------------------------------------------------------------
  ct_path = patient.folder_CT
  print(f"Loading CT {ct_path}")
  base_image = rdloaders.loadDicomSITK(ct_path)

  # --- Load the ROI mask from the RTSTRUCT that references this CT --------------------
  mask_path = patient.folder_RTSTRUCT_CT
  print(f"Loading mask {mask_path}")
  roi_masks = rdloaders.loadRTSTRUCTSITK(
    rtstructPath=mask_path,
    baseImageDirPath=ct_path,
    roiNames=roi_match_pattern,
  )
  mask_image = roi_masks.get(ROI_NAME)

  # Only the last five characters of each UID are used in the output folder names
  ct_series_suffix = patient.series_CT[-5:]
  rtstruct_series_suffix = patient.series_RTSTRUCT_CT[-5:]

  # Pre-bind the placeholders shared by every image saved for this subject
  save_image = partial(
    neg_nifti_writer.save,
    PatientID=patient.Index,
    StudyInstanceUID=patient.study[-5:],
  )

  # --- Save the original CT and the ROI mask ------------------------------------------
  save_image(
    image=base_image,
    SeriesInstanceUID=ct_series_suffix,
    Modality="CT",
    IMAGE_ID="original",
  )
  save_image(
    image=mask_image,
    SeriesInstanceUID=rtstruct_series_suffix,
    Modality="RTSTRUCT",
    IMAGE_ID=ROI_NAME,
  )

  # --- Generate and save every negative control, next to the original CT --------------
  for NEGATIVE_CONTROL in IMAGE_TYPES:
    print(f"Generating negative control {NEGATIVE_CONTROL}")
    neg_control_image = generateNegativeControl(
      ctImage=base_image,
      alignedROIImage=mask_image,
      randomSeed=RANDOM_SEED,
      negativeControl=NEGATIVE_CONTROL,
    )
    save_image(
      image=neg_control_image,
      SeriesInstanceUID=ct_series_suffix,
      Modality="CT",
      IMAGE_ID=NEGATIVE_CONTROL,
    )
  

Loading data for subject 0_LUNG1-162 : patient LUNG1-162
Loading CT /Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms/Patient-LUNG1-162/StudyUID-21249/CT_SeriesUID-72433
Loading mask /Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms/Patient-LUNG1-162/StudyUID-21249/RTSTRUCT_SeriesUID-38612/d547a179-83d6-4c38-ac2f-62875388f17a.dcm


Generating negative control shuffled_full


Generating negative control shuffled_roi


Generating negative control shuffled_non_roi


Generating negative control randomized_sampled_full


Generating negative control randomized_sampled_roi


Generating negative control randomized_sampled_non_roi


Loading data for subject 1_LUNG1-101 : patient LUNG1-101
Loading CT /Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms/Patient-LUNG1-101/StudyUID-27911/CT_SeriesUID-55665
Loading mask /Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms/Patient-LUNG1-101/StudyUID-27911/RTSTRUCT_SeriesUID-25865/e2867194-0e89-4adf-a99b-22c7c0e73b15.dcm


Generating negative control shuffled_full


Generating negative control shuffled_roi


Generating negative control shuffled_non_roi


Generating negative control randomized_sampled_full


Generating negative control randomized_sampled_roi


Generating negative control randomized_sampled_non_roi


Loading data for subject 2_LUNG1-108 : patient LUNG1-108
Loading CT /Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms/Patient-LUNG1-108/StudyUID-62453/CT_SeriesUID-81484


Loading mask /Users/bhklab/dev/radiomics/readii-idc-notebooks/notebooks/data/images/nsclc_radiomics/dicoms/Patient-LUNG1-108/StudyUID-62453/RTSTRUCT_SeriesUID-99496/6191cd77-c19b-4531-8e7d-b39069390a03.dcm


Generating negative control shuffled_full


Generating negative control shuffled_roi


Generating negative control shuffled_non_roi


Generating negative control randomized_sampled_full


Generating negative control randomized_sampled_roi


Generating negative control randomized_sampled_non_roi


In [16]:
# Print the directory structure rooted at the parent of NIFTI_OUTPUT_DIR, so both the
# `dicoms/` and `niftis/` trees are shown. `-I "*.dcm"` hides the individual DICOM files
# to avoid clutter. Requires the `tree` executable to be available in the shell.
! tree -F $NIFTI_OUTPUT_DIR.parent -I "*.dcm"

[01;34mdata/images/nsclc_radiomics[0m/
├── [01;34mdicoms[0m/
│   ├── [01;34mPatient-LUNG1-101[0m/
│   │   └── [01;34mStudyUID-27911[0m/
│   │       ├── [01;34mCT_SeriesUID-55665[0m/
│   │       └── [01;34mRTSTRUCT_SeriesUID-25865[0m/
│   ├── [01;34mPatient-LUNG1-108[0m/
│   │   └── [01;34mStudyUID-62453[0m/
│   │       ├── [01;34mCT_SeriesUID-81484[0m/
│   │       └── [01;34mRTSTRUCT_SeriesUID-99496[0m/
│   └── [01;34mPatient-LUNG1-162[0m/
│       └── [01;34mStudyUID-21249[0m/
│           ├── [01;34mCT_SeriesUID-72433[0m/
│           └── [01;34mRTSTRUCT_SeriesUID-38612[0m/
└── [01;34mniftis[0m/
    ├── [01;34mSubjectID-0_LUNG1-162[0m/
    │   └── [01;34mStudyUID-21249[0m/
    │       ├── [01;34mCT_SeriesUID-72433[0m/
    │       │   ├── [01;31moriginal.nii.gz[0m
    │       │   ├── [01;31mrandomized_sampled_full.nii.gz[0m
    │       │   ├── [01;31mrandomized_sampled_non_roi.nii.gz[0m
    │       │   ├── [01;31mrandomized_