[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BHKLAB-DataProcessing/readii-idc-notebooks/blob/jjjermiah-dev/notebooks/1_RunPyradiomics.ipynb)

In [None]:
# Detect whether this notebook is running inside Google Colab by probing for the
# `google.colab` module. Only a failed import means "not in Colab"; a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit and unrelated errors.
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Install pixi by piping the install script served at pixi.sh into bash.
    !curl -fsSL https://pixi.sh/install.sh | bash

    # Download pixi.toml and pixi.lock from the head of the `jjjermiah/dev` branch on GitHub.
    # NOTE: these URLs reference a branch (refs/heads/...), not a pinned commit, so the
    # downloaded files can change whenever the branch is updated.
    pixifile = 'https://raw.githubusercontent.com/BHKLAB-DataProcessing/readii-idc-notebooks/refs/heads/jjjermiah/dev/pixi.toml'
    lockfile = 'https://raw.githubusercontent.com/BHKLAB-DataProcessing/readii-idc-notebooks/refs/heads/jjjermiah/dev/pixi.lock'
    !curl -fsSL $pixifile > pixi.toml
    !curl -fsSL $lockfile > pixi.lock

    # Path to the pixi executable; presumably where the install script places it
    # for the root user on Colab -- TODO confirm. Only defined when IN_COLAB is True.
    pixi = '/root/.pixi/bin/pixi'


In [8]:
from __future__ import annotations

from pathlib import Path
from rich import print
import pandas as pd
import re
from dataclasses import dataclass, field
from typing import Any, Dict, Iterator, List, Union, NamedTuple
from collections import namedtuple

from readii.io.utils.pattern_resolver import PatternResolver
from readii.utils import logger

from pathlib import Path
import logging

# Raise the threshold of the "radiomics" logger to ERROR so that records of
# lower severity emitted through it are dropped.
pyradiomics_logger = logging.getLogger(name="radiomics")
pyradiomics_logger.setLevel(level=logging.ERROR)

# Setup and Configuration

In [9]:
# Local root directory under which this notebook reads and writes its data.
DATA_DIR = Path("data")

# Must match the collection chosen in the setup notebook; change this value
# if a different collection was selected there.
COLLECTION_ID = "nsclc_radiomics"

# Resolves to data/images/<COLLECTION_ID>/niftis
NIFTI_OUTPUT_DIR = DATA_DIR.joinpath("images", COLLECTION_ID, "niftis")

# pyradiomics.yaml is looked up one level above the current working directory.
PYRADIOMICS_CONFIG = Path.cwd().parent / "pyradiomics.yaml"

# Names of the image variants handled by this notebook, in addition to the original image.
IMAGE_TYPES = [
    "shuffled_full", "shuffled_roi", "shuffled_non_roi",
    "randomized_sampled_full", "randomized_sampled_roi", "randomized_sampled_non_roi",
]

In [10]:
# Show the directory tree of the NIfTI output folder. IPython substitutes the Python
# variable NIFTI_OUTPUT_DIR into the shell command; the `tree` CLI must be installed.
! tree -F $NIFTI_OUTPUT_DIR

[01;34mdata/images/nsclc_radiomics/niftis[0m/
├── [01;34mSubjectID-0_LUNG1-162[0m/
│   └── [01;34mStudyUID-21249[0m/
│       ├── [01;34mCT_SeriesUID-72433[0m/
│       │   ├── [01;31moriginal.nii.gz[0m
│       │   ├── [01;31mrandomized_sampled_full.nii.gz[0m
│       │   ├── [01;31mrandomized_sampled_non_roi.nii.gz[0m
│       │   ├── [01;31mrandomized_sampled_roi.nii.gz[0m
│       │   ├── [01;31mshuffled_full.nii.gz[0m
│       │   ├── [01;31mshuffled_non_roi.nii.gz[0m
│       │   └── [01;31mshuffled_roi.nii.gz[0m
│       └── [01;34mRTSTRUCT_SeriesUID-38612[0m/
│           └── [01;31mGTV.nii.gz[0m
├── [01;34mSubjectID-1_LUNG1-101[0m/
│   └── [01;34mStudyUID-27911[0m/
│       ├── [01;34mCT_SeriesUID-55665[0m/
│       │   ├── [01;31moriginal.nii.gz[0m
│       │   ├── [01;31mrandomized_sampled_full.nii.gz[0m
│       │   ├── [01;31mrandomized_sampled_non_roi.nii.gz[0m
│       │   ├── [01;31mrandomized_sampled_roi.nii.gz[0m
│       │   ├── [01;31mshuffl

In [18]:
# Directory/filename layout of the NIfTI files below NIFTI_OUTPUT_DIR (not used in this cell).
# NOTE(review): the first placeholder is called `SubjectID` here, while the regex below
# captures the same path segment as `PatientID` -- confirm which name downstream code expects.
filename_format = "SubjectID-{SubjectID}/StudyUID-{StudyInstanceUID}/{Modality}_SeriesUID-{SeriesInstanceUID}/{IMAGE_ID}.nii.gz"

# Regex mirroring the filename format; every named group becomes a DataFrame column.
filename_pattern = (
    r"SubjectID-(?P<PatientID>[^/]+)/"
    r"StudyUID-(?P<StudyInstanceUID>[^/]+)/"
    r"(?P<Modality>[^/]+)_SeriesUID-(?P<SeriesInstanceUID>[^/]+)/"
    r"(?P<IMAGE_ID>[^/]+)\.nii\.gz"
)
# Compile once instead of re-parsing the pattern for every file.
filename_regex = re.compile(filename_pattern)

# One record per matching file: the captured path components plus the file path itself.
data = []

# rglob() yields entries in filesystem-dependent order; sorting makes the row order
# (and therefore the DataFrame index) reproducible across machines and runs.
for file_path in sorted(NIFTI_OUTPUT_DIR.rglob("*.nii.gz")):
    # as_posix() gives forward slashes on every platform, which the pattern expects.
    match = filename_regex.search(file_path.as_posix())
    if match:
        data.append({**match.groupdict(), "filepath": str(file_path)})

# Pass the columns explicitly (named groups in definition order, then "filepath") so the
# frame keeps its schema even when no file matched; a column-less empty frame would make
# the groupby("PatientID") below raise a KeyError.
df = pd.DataFrame(
    data,
    columns=[
        *sorted(filename_regex.groupindex, key=filename_regex.groupindex.get),
        "filepath",
    ],
)

if df.empty:
    print(f"No NIfTI files matching the expected layout were found under {NIFTI_OUTPUT_DIR}")

# Print a short per-patient summary for verification
for PatientID, patientdf in df.groupby("PatientID"):
    print(PatientID)
    # DataFrame.info() writes its report itself and returns None, so wrapping it in
    # print() would emit an extra "None" line.
    patientdf.info()
    # Show only the first row of each patient as a sample (groups are never empty).
    row = next(patientdf.itertuples())
    print(row)


<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 0 to 7
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          8 non-null      object
 1   StudyInstanceUID   8 non-null      object
 2   Modality           8 non-null      object
 3   SeriesInstanceUID  8 non-null      object
 4   IMAGE_ID           8 non-null      object
 5   filepath           8 non-null      object
dtypes: object(6)
memory usage: 448.0+ bytes


<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 8 to 15
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          8 non-null      object
 1   StudyInstanceUID   8 non-null      object
 2   Modality           8 non-null      object
 3   SeriesInstanceUID  8 non-null      object
 4   IMAGE_ID           8 non-null      object
 5   filepath           8 non-null      object
dtypes: object(6)
memory usage: 448.0+ bytes


<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, 16 to 23
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PatientID          8 non-null      object
 1   StudyInstanceUID   8 non-null      object
 2   Modality           8 non-null      object
 3   SeriesInstanceUID  8 non-null      object
 4   IMAGE_ID           8 non-null      object
 5   filepath           8 non-null      object
dtypes: object(6)
memory usage: 448.0+ bytes
