In [13]:
import subprocess
from pathlib import Path

from fmcib.utils import download_LUNG1
from imgtools.autopipeline import ImageAutoInput

from readii import loaders as rdloaders
from readii.feature_extraction import generateNegativeControl
from readii.io.writers.nifti_writer import NIFTIWriter
import re

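# NOTE: the output directory structure is defined by the writer's filename format
# (NIFTI_FILENAME_FORMAT in the setup cell), which uses placeholders such as PatientID and SeriesInstanceUID.
# The values for these (along with StudyInstanceUID) must be passed in as keyword arguments to the writer's save method.
# Each saved image additionally gets a custom IMAGE_ID.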
import pandas as pd

def generate_and_save_negative_controls(
  patient,
  roi_match_pattern : dict,
  negative_control_list : list,
  writer : NIFTIWriter,
  random_seed : int
):
  """Save one patient's CT, ROI mask and negative-control CTs through ``writer``.

  Parameters
  ----------
  patient
    A single row of the dataset index as yielded by ``DataFrame.itertuples()``
    (a namedtuple, not a ``pd.Series``). The attributes read here are ``Index``,
    ``patient_ID``, ``study``, ``folder_CT``, ``series_CT``,
    ``folder_RTSTRUCT_CT`` and ``series_RTSTRUCT_CT``.
  roi_match_pattern : dict
    Mapping of ROI name to matching pattern, forwarded to
    ``rdloaders.loadRTSTRUCTSITK`` as ``roiNames``. Only the FIRST key is used
    to select the mask and to name the saved mask file.
  negative_control_list : list
    Negative control names; each is passed to ``generateNegativeControl`` and
    reused as the ``IMAGE_ID`` of the saved result.
  writer : NIFTIWriter
    Writer used for every saved image.
  random_seed : int
    Seed forwarded to ``generateNegativeControl``.

  Raises
  ------
  ValueError
    If ``roi_match_pattern`` is empty, or if the requested ROI is not present
    in the loaded RTSTRUCT.
  """
  if not roi_match_pattern:
    raise ValueError("roi_match_pattern must contain at least one ROI name")
  roi_name = next(iter(roi_match_pattern))

  print(f"Loading data for subject {patient.Index} : patient {patient.patient_ID}")
  base_image = rdloaders.loadDicomSITK(patient.folder_CT)
  roi_images = rdloaders.loadRTSTRUCTSITK(
    rtstructPath=patient.folder_RTSTRUCT_CT,
    baseImageDirPath=patient.folder_CT,
    roiNames=roi_match_pattern
  )
  mask_image = roi_images.get(roi_name) if roi_images else None
  # Fail before anything is written: a missing mask would otherwise only
  # surface as an opaque error inside the writer or the negative control step.
  if mask_image is None:
    raise ValueError(
      f"ROI '{roi_name}' was not found in RTSTRUCT {patient.folder_RTSTRUCT_CT} "
      f"for patient {patient.patient_ID}"
    )

  # Only the last five characters of each UID are used in the output paths.
  study_uid = patient.study[-5:]
  ct_save_kwargs = {
    "PatientID": patient.Index,
    "StudyInstanceUID": study_uid,
    "SeriesInstanceUID": patient.series_CT[-5:],
    "Modality": "CT",
  }

  writer.save(image=base_image, IMAGE_ID="original", **ct_save_kwargs)
  writer.save(
    image=mask_image,
    PatientID=patient.Index,
    StudyInstanceUID=study_uid,
    SeriesInstanceUID=patient.series_RTSTRUCT_CT[-5:],
    Modality="RTSTRUCT",
    IMAGE_ID=roi_name
  )

  for negative_control in negative_control_list:
    print(f"Generating negative control {negative_control}")
    neg_control_image = generateNegativeControl(
      ctImage=base_image,
      alignedROIImage=mask_image,
      randomSeed=random_seed,
      negativeControl=negative_control
    )
    # Negative controls are derived from the CT, so they share its save metadata.
    writer.save(image=neg_control_image, IMAGE_ID=negative_control, **ct_save_kwargs)

def index_and_submit_saves(
  input_dir,
  modalities,
  roi_match_pattern,
  update_imgtools_index,
  n_jobs,
  nifti_output_dir,
  filename_format,
  overwrite,
  random_seed,
  negative_control_list,
):
  """Convert every indexed patient to NIfTI and write a CSV index of the results.

  Parameters
  ----------
  input_dir
    Directory handed to ``ImageAutoInput`` as ``dir_path``.
  modalities
    Iterable of modality names; joined with "," for ``ImageAutoInput``.
  roi_match_pattern
    ROI name -> pattern mapping, forwarded to
    ``generate_and_save_negative_controls``.
  update_imgtools_index
    Forwarded to ``ImageAutoInput`` as ``update``.
  n_jobs
    Forwarded to ``ImageAutoInput`` as ``n_jobs``.
  nifti_output_dir
    Root directory for the NIfTI files and the CSV index (str or Path).
  filename_format
    Filename format given to ``NIFTIWriter``.
  overwrite
    Forwarded to ``NIFTIWriter`` as ``overwrite``.
  random_seed
    Seed forwarded to negative control generation.
  negative_control_list
    Negative control names to generate for every patient.

  Returns
  -------
  Path
    Location of the written ``dataset_index.csv``.
  """
  # Accept plain strings as well: .rglob() and the "/" operator below need a Path.
  nifti_output_dir = Path(nifti_output_dir)

  neg_nifti_writer = NIFTIWriter(
    root_directory=nifti_output_dir,
    filename_format=filename_format,
    overwrite=overwrite,
  )
  dataset = ImageAutoInput(
    dir_path=input_dir,
    modalities=",".join(modalities),
    update=update_imgtools_index,
    n_jobs=n_jobs,
  )

  for patient in dataset.df_combined.itertuples():
    generate_and_save_negative_controls(
      patient=patient,
      roi_match_pattern=roi_match_pattern,
      negative_control_list=negative_control_list,
      writer=neg_nifti_writer,
      random_seed=random_seed,
    )

  # Turn the writer's "%(key)s" pattern into a regex with one named group per
  # placeholder. Splitting on the placeholders (instead of blind string
  # replacement) lets the literal text in between be escaped, so e.g. the dots
  # in ".nii.gz" only match real dots.
  pattern_parts = re.split(
    r"%\((\w+)\)s", neg_nifti_writer.pattern_resolver.formatted_pattern
  )
  # With one capture group, re.split puts placeholder names at the odd positions.
  filename_pattern = "".join(
    f"(?P<{part}>.*?)" if position % 2 else re.escape(part)
    for position, part in enumerate(pattern_parts)
  )

  output_root = nifti_output_dir.absolute()
  datafiles = []
  for file_path in nifti_output_dir.rglob("*.nii.gz"):
    # Normalise Windows separators so the "/"-based pattern still matches.
    if (match := re.search(filename_pattern, str(file_path).replace("\\", "/"))):
      relative_path = file_path.absolute().relative_to(output_root)
      datafiles.append({**match.groupdict(), "filepath": relative_path})
  datafiles_df = pd.DataFrame(datafiles)

  # If nothing was saved (empty dataset) the directory may not exist yet, and
  # to_csv does not create parent directories.
  nifti_output_dir.mkdir(parents=True, exist_ok=True)
  csv_path = nifti_output_dir / "dataset_index.csv"
  datafiles_df.to_csv(csv_path, index=False)
  return csv_path


# SETUP 

In [14]:
# Name of the image collection; also used as the local sub-directory name
COLLECTION_ID = "nsclc_radiomics"

# Local directory the collection is stored under
DATA_DIR = Path("data", "images", COLLECTION_ID)

# Converted NIfTI files are written to a "niftis" sub-directory of the data directory
NIFTI_OUTPUT_DIR = DATA_DIR.joinpath("niftis")

# Modalities of interest
MODALITIES = ["CT", "RTSTRUCT"]

# Seed passed to negative control generation
RANDOM_SEED = 10

# Negative control image types to generate for every patient
IMAGE_TYPES = [
  "shuffled_full",
  "shuffled_roi",
  "shuffled_non_roi",
  "randomized_sampled_full",
  "randomized_sampled_roi",
  "randomized_sampled_non_roi",
]

# Relative output path of each saved image; the {placeholders} are filled in at save time
NIFTI_FILENAME_FORMAT = "SubjectID-{PatientID}/{Modality}_SeriesUID-{SeriesInstanceUID}/{IMAGE_ID}.nii.gz"

# Regex matching ROI names such as "GTV 1", "GTV 2", keyed by the name the matched ROI is saved under
ROI_NAME = "GTV"
roi_match_pattern = {ROI_NAME: "^(GTV.*)$"}

In [19]:
# Download LUNG1 DICOM data into DATA_DIR
# (the second argument is presumably the number of samples to fetch; confirm against fmcib)
download_LUNG1(str(DATA_DIR), 1)

# Show only the directories of the downloaded data
print("New Directory Structure: ")
tree_command = ["tree", "-d", DATA_DIR]
subprocess.run(tree_command)

2024-12-19 16:46:07.604 | INFO     | fmcib.utils.idc_helper:download_LUNG1:155 - Downloading LUNG1 manifest from Dropbox ...
2024-12-19 16:46:09.155 | INFO     | fmcib.utils.idc_helper:download_from_manifest:99 - Downloading DICOM data from IDC (Imaging Data Commons) ...
100%|██████████| 113/113 [00:02<00:00, 53.91it/s]

New Directory Structure: 
[01;34mdata/images/nsclc_radiomics[0m
└── [01;34mdicom[0m
    └── [01;34mLUNG1-175[0m
        └── [01;34m1.3.6.1.4.1.32722.99.99.276242784190608938349755555331454709152[0m

4 directories





CompletedProcess(args=['tree', '-d', PosixPath('data/images/nsclc_radiomics')], returncode=0)

In [20]:
# Gather every pipeline option in one place, then run the indexing and NIfTI export
pipeline_options = {
  "input_dir": DATA_DIR.absolute(),
  "modalities": MODALITIES,
  "roi_match_pattern": roi_match_pattern,
  "update_imgtools_index": True,
  "n_jobs": -1,
  "nifti_output_dir": NIFTI_OUTPUT_DIR,
  "filename_format": NIFTI_FILENAME_FORMAT,
  "overwrite": True,
  "random_seed": RANDOM_SEED,
  "negative_control_list": IMAGE_TYPES,
}
csv_path = index_and_submit_saves(**pipeline_options)

print(f"Saved dataset index to {csv_path}")

100%|██████████| 3/3 [00:00<00:00, 2240.95it/s]
  relevant_study_id = self.df_new.loc[(self.df_new.edge_type.str.contains(regex_term)), "study_x"].unique()


Loading data for subject 0_LUNG1-175 : patient LUNG1-175
Generating negative control shuffled_full
Generating negative control shuffled_roi
Generating negative control shuffled_non_roi
Generating negative control randomized_sampled_full
Generating negative control randomized_sampled_roi
Generating negative control randomized_sampled_non_roi
Saved dataset index to data/images/nsclc_radiomics/niftis/dataset_index.csv


In [11]:
# Print the directory structure but hide the actual dicom files to avoid clutter
# ($NIFTI_OUTPUT_DIR.parent is expanded by IPython from the notebook namespace, i.e. the collection's data directory)
! tree -F $NIFTI_OUTPUT_DIR.parent -I "*.dcm"

[01;34mdata/images/nsclc_radiomics[0m/
├── [01;34mdicom[0m/
│   └── [01;34mLUNG1-232[0m/
│       └── [01;34m1.3.6.1.4.1.32722.99.99.68529125696580382731828145030595983266[0m/
├── [01;34mniftis[0m/
│   ├── [01;34mSubjectID-0_LUNG1-232[0m/
│   │   ├── [01;34mCT_SeriesUID-92625[0m/
│   │   │   ├── [01;31moriginal.nii.gz[0m
│   │   │   ├── [01;31mrandomized_sampled_full.nii.gz[0m
│   │   │   ├── [01;31mrandomized_sampled_non_roi.nii.gz[0m
│   │   │   ├── [01;31mrandomized_sampled_roi.nii.gz[0m
│   │   │   ├── [01;31mshuffled_full.nii.gz[0m
│   │   │   ├── [01;31mshuffled_non_roi.nii.gz[0m
│   │   │   └── [01;31mshuffled_roi.nii.gz[0m
│   │   └── [01;34mRTSTRUCT_SeriesUID-02754[0m/
│   │       └── [01;31mGTV.nii.gz[0m
│   └── [00mdataset_index.csv[0m
└── [00mnsclc_radiomics.csv[0m

8 directories, 10 files
