In [5]:
import os

dir = r'Z:\pancancer\temp_rhpc\structure_use'

# One accumulator list per expected subfolder of `dir`.
nifti_files = []
old_gt_seg_files = []
ai_seg_files = []
new_gt_seg_files = []

# Subfolder name -> the list that receives every file path found beneath it.
folder_mapping = dict(
    scan=nifti_files,
    old=old_gt_seg_files,
    ai=ai_seg_files,
    new=new_gt_seg_files,
)

# Walk each mapped subfolder recursively and record the full path of each file.
for subfolder, file_list in folder_mapping.items():
    subfolder_path = os.path.join(dir, subfolder)

    # A subfolder that does not exist is skipped; its list simply stays empty.
    if not os.path.isdir(subfolder_path):
        continue

    for root, _, files in os.walk(subfolder_path):
        file_list.extend(os.path.join(root, file) for file in files)


### Function: Compare All Three Folders and Include Missing Files Report ###
def compare_all_folders(scan_files, old_gt_files, ai_files):
    """
    Compare files across the scan, old GT, and AI folders by base name.

    A file's base name is its filename with one recognised trailing suffix
    removed. Suffixes are checked in this order, so the most specific wins:
    `.ai.seg.nrrd`, `.seg.nrrd`, `_0000.nrrd`, `.nrrd`. Files ending in none
    of these are ignored. The same suffix rules are applied to every folder.

    Parameters:
    - scan_files: iterable of file paths from the scan folder.
    - old_gt_files: iterable of file paths from the old GT folder.
    - ai_files: iterable of file paths from the AI folder.

    Returns a tuple of four index-aligned lists:
    - Sorted base names found in any of the three folders.
    - Scan path for each base name (None when missing).
    - Old GT path for each base name (None when missing).
    - AI path for each base name (None when missing).

    Also prints a missing files report to stdout.
    """
    # (suffix, overwrite_existing). A plain `.nrrd` file never replaces an
    # entry already recorded for the same base (e.g. from `_0000.nrrd`),
    # while the more specific suffixes always take the slot.
    suffix_rules = (
        ('.ai.seg.nrrd', True),
        ('.seg.nrrd', True),
        ('_0000.nrrd', True),
        ('.nrrd', False),
    )

    def get_base_names(folder_files):
        """
        Map each recognised file to its base name.
        Returns a dictionary {base_name: full_path}.
        """
        base_names = {}
        for file_path in folder_files:
            filename = os.path.basename(file_path)
            for suffix, overwrite in suffix_rules:
                if not filename.endswith(suffix):
                    continue
                # Slice off exactly the trailing suffix. Splitting on the
                # suffix text would cut at its first occurrence and truncate
                # names like `case.nrrd_resampled.nrrd` to `case`.
                base = filename[:-len(suffix)]
                if overwrite or base not in base_names:
                    base_names[base] = file_path
                break  # only the first (most specific) matching suffix applies
        return base_names

    # Extract base names from all three folders
    map_scan = get_base_names(scan_files)
    map_old_gt = get_base_names(old_gt_files)
    map_ai = get_base_names(ai_files)

    # Sorted union of every base name seen in any folder
    all_base_names = sorted(set(map_scan) | set(map_old_gt) | set(map_ai))

    # Per-folder paths aligned with all_base_names; None marks a missing file
    scan_paths = [map_scan.get(base) for base in all_base_names]
    old_gt_paths = [map_old_gt.get(base) for base in all_base_names]
    ai_paths = [map_ai.get(base) for base in all_base_names]

    # Print the missing files report
    print("\n### Missing Files Report ###")

    report_sections = (
        ("\n❌ Missing in Scan Folder:", scan_paths),
        ("\n❌ Missing in Old GT Folder:", old_gt_paths),
        ("\n❌ Missing in AI Folder:", ai_paths),
    )
    any_missing = False
    for header, paths in report_sections:
        missing = [base for base, path in zip(all_base_names, paths) if path is None]
        if not missing:
            continue
        any_missing = True
        print(header)
        for base in missing:
            print(f"  - {base}")

    if not any_missing:
        print("\n✅ All files are correctly matched across all folders!")

    # Return the 4 lists
    return all_base_names, scan_paths, old_gt_paths, ai_paths


### 🚀 **Run Full Comparison & Get Lists**
# Run the three-way comparison and keep the four lists it returns.
base_names, scan_paths, old_gt_paths, ai_paths = compare_all_folders(
    nifti_files, old_gt_seg_files, ai_seg_files
)

# Print each returned list under its own heading.
for heading, values in (
    ("\n### Base Names List ###", base_names),
    ("\n### Corresponding Scan Paths ###", scan_paths),
    ("\n### Corresponding Old GT Paths ###", old_gt_paths),
    ("\n### Corresponding AI Paths ###", ai_paths),
):
    print(heading)
    print(values)



### Missing Files Report ###

✅ All files are correctly matched across all folders!

### Base Names List ###
['MEGA_0010--CRLM-CT--CRLM-CT-1077--101.000000-NA-07517', 'MEGA_0011--CRLM-CT--CRLM-CT-1078--101.000000-NA-24180', 'MEGA_0012--CRLM-CT--CRLM-CT-1079--2.000000-ChestAbdomenPelvis-59040', 'MEGA_0013--CRLM-CT--CRLM-CT-1080--101.000000-NA-99240', 'MEGA_0014--CRLM-CT--CRLM-CT-1081--101.000000-NA-43584']

### Corresponding Scan Paths ###
['Z:\\pancancer\\temp_rhpc\\structure_use\\scan\\MEGA_0010--CRLM-CT--CRLM-CT-1077--101.000000-NA-07517_0000.nrrd', 'Z:\\pancancer\\temp_rhpc\\structure_use\\scan\\MEGA_0011--CRLM-CT--CRLM-CT-1078--101.000000-NA-24180_0000.nrrd', 'Z:\\pancancer\\temp_rhpc\\structure_use\\scan\\MEGA_0012--CRLM-CT--CRLM-CT-1079--2.000000-ChestAbdomenPelvis-59040_0000.nrrd', 'Z:\\pancancer\\temp_rhpc\\structure_use\\scan\\MEGA_0013--CRLM-CT--CRLM-CT-1080--101.000000-NA-99240_0000.nrrd', 'Z:\\pancancer\\temp_rhpc\\structure_use\\scan\\MEGA_0014--CRLM-CT--CRLM-CT-1081--101