## LungLobeCT_SPECT (Ottawa chest CT) Dataset Preprocessing

### Download link for the SPECT-CT lung dataset we are currently working on:
https://zenodo.org/records/12690803

## Extracting all the ZIP archives

In [None]:
import os
import zipfile
import shutil
import numpy as np
import nibabel as nib
from scipy.ndimage import label as cc_label, binary_dilation
from collections import Counter

# Folder that holds the downloaded ZIP archives ----- INSERT PATH HERE -------
directory = r""

# Unpack every ZIP archive into a sibling folder named after the archive
for filename in os.listdir(directory):
    # Anything that is not a ZIP archive is left untouched
    if not filename.endswith(".zip"):
        continue

    archive_stem = os.path.splitext(filename)[0]  # archive name without ".zip"
    zip_path = os.path.join(directory, filename)
    extract_folder = os.path.join(directory, archive_stem)

    # Create the target folder up front; harmless if it already exists
    os.makedirs(extract_folder, exist_ok=True)

    # The context manager closes the archive even if extraction fails
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

    print(f"Extracted: {filename} to {extract_folder}")

print("All ZIP files have been extracted.")


## Deleting all folders that contain .gz files

In [None]:
# Running tally of the folders removed by this cell
deleted_folders = 0

# NOTE(review): the suffix test below matches every name ending in '.gz',
# which includes '*.nii.gz'. The later cells read '*_CT.nii.gz' and
# '*_LungLobes.nii.gz' from the 'CT_NIFTI' / 'Segmentations' subfolders --
# confirm this cell is meant to be run against that layout.

# Visit every extracted case folder directly under the main directory
for folder_name in os.listdir(directory):
    folder_path = os.path.join(directory, folder_name)

    # Only real directories whose name does not end in '.zip' are processed
    is_candidate = os.path.isdir(folder_path) and not folder_name.endswith('.zip')
    if not is_candidate:
        continue

    # Walk top-down so that pruning `dirs` stops os.walk from descending further
    for root, dirs, files in os.walk(folder_path):
        # Iterate over a snapshot because `dirs` is modified inside the loop
        for dir_name in list(dirs):
            subfolder_path = os.path.join(root, dir_name)

            # A subfolder qualifies when one of its direct entries ends in '.gz'
            entries = os.listdir(subfolder_path)
            has_gz_files = any(entry.endswith('.gz') for entry in entries)
            if not has_gz_files:
                continue

            # Remove the whole subfolder, then keep os.walk away from it
            shutil.rmtree(subfolder_path)
            deleted_folders += 1
            print(f"Deleted folder: {subfolder_path}")
            dirs.remove(dir_name)

print(f"Operation complete. Deleted {deleted_folders} folders containing GZ files.")

## Removing the trachea and the SPECT files

In [None]:
# For every case folder, delete the trachea segmentation file and any
# SPECT_NIFTI folder found anywhere below it.
for case_folder in os.listdir(directory):
    case_path = os.path.join(directory, case_folder)
    if not os.path.isdir(case_path):
        continue

    # Walk top-down so nested subfolders (like C:\...\LLS0020\LLS0020) are covered
    for root, dirs, files in os.walk(case_path):
        # --- Delete the trachea file, named after the top-level case folder ---
        trachea_file = os.path.join(root, f"{case_folder}_Trachea.nii.gz")
        if os.path.isfile(trachea_file):
            os.remove(trachea_file)
            print(f"Deleted file: {trachea_file}")

        # --- Delete SPECT_NIFTI folder ---
        if "SPECT_NIFTI" in dirs:
            spect_path = os.path.join(root, "SPECT_NIFTI")
            shutil.rmtree(spect_path)
            print(f"Deleted folder: {spect_path}")
            # Prune the removed folder so os.walk does not try to descend into
            # a path that no longer exists (it would otherwise only be skipped
            # because os.walk ignores listing errors by default).
            dirs.remove("SPECT_NIFTI")

## Changing the folder structures

In [None]:
# Move the lobe mask and the CT of every case from their nested subfolders
# up into the top-level case folder.
for case_folder in os.listdir(directory):
    case_path = os.path.join(directory, case_folder)
    if not os.path.isdir(case_path):
        continue

    # Files are expected under <case>/<case>/<subfolder>/<file name>
    nested_root = os.path.join(case_path, case_folder)
    relocations = (
        ("Segmentations", f"{case_folder}_LungLobes.nii.gz"),
        ("CT_NIFTI", f"{case_folder}_CT.nii.gz"),
    )

    for subfolder, file_name in relocations:
        source = os.path.join(nested_root, subfolder, file_name)
        target = os.path.join(case_path, file_name)  # same name, top-level folder

        # Missing files are skipped silently
        if os.path.isfile(source):
            shutil.move(source, target)
            print(f"Moved: {source} -> {target}")

## Additional preprocessing of the CT_Lung_Lobe_SPECT dataset: filling gaps/holes left by vessels in the lobe masks (a gap is kept only when ≥75% of its area has HU below -500)

In [None]:
# Configuration shared by the gap-filling step
min_gap_area = 5           # px; islands smaller than this are left alone
area_gap_threshold = 300   # px; islands larger than this are left alone
hu_threshold = -500        # HU; values below this count as air/emphysema
percentage_threshold = 75  # %; at or above this share of air pixels, do not fill
structure = np.full((3, 3), True)  # 3x3 all-True element -> 8-connectivity

def _fill_slice_gaps(slice_mask, slice_ct, out_slice):
    """Fill small, non-air holes of every lobe label in one 2-D slice.

    Labels are read from ``slice_mask`` and CT values from ``slice_ct``; the
    filled labels are written into ``out_slice`` in place. The thresholds come
    from the module-level settings ``min_gap_area``, ``area_gap_threshold``,
    ``hu_threshold``, ``percentage_threshold`` and ``structure``.
    """
    # Process each present label (ignore background 0)
    for lbl in np.unique(slice_mask):
        if lbl == 0:
            continue

        # Islands are connected components of everything that is NOT this lobe
        labeled_islands, n_islands = cc_label(slice_mask != lbl)
        if n_islands == 0:
            continue

        # Sizes of all components in a single pass (index 0 is the lobe itself),
        # so a full-slice mask is only built for islands that pass the area test.
        areas = np.bincount(labeled_islands.ravel(), minlength=n_islands + 1)

        for idx in range(1, n_islands + 1):
            area = int(areas[idx])
            if area < min_gap_area or area > area_gap_threshold:
                continue

            island = labeled_islands == idx
            vals = slice_ct[island]

            # Skip filling if the island is mostly air/emphysema
            pct_below = 100.0 * (vals < hu_threshold).sum() / vals.size
            if pct_below >= percentage_threshold:
                continue

            # One-pixel ring around the island (8-connectivity via `structure`)
            dilated = binary_dilation(island, structure=structure)
            border = dilated & (~island)
            border_labels = slice_mask[border]
            border_labels = border_labels[border_labels != 0]
            if border_labels.size == 0:
                continue

            # Fill the gap with the most common neighboring lobe label
            target_label = Counter(border_labels.tolist()).most_common(1)[0][0]
            out_slice[island] = target_label


def process_case(ct_path, mask_path, out_path, case_id):
    """Fill small vessel-like gaps in a lobe mask and save the result.

    The CT and the label mask are loaded, and each slice along the third axis
    is processed independently. For every non-zero label, a connected component
    of "not this label" is filled when its area lies between ``min_gap_area``
    and ``area_gap_threshold`` (inclusive) and fewer than
    ``percentage_threshold`` percent of its CT values are below
    ``hu_threshold``. Filled pixels receive the most common non-zero label on
    the one-pixel border of the component. Labels are always read from the
    original mask, so fills do not influence each other.

    Args:
        ct_path: Path of the CT NIfTI file.
        mask_path: Path of the lobe-label NIfTI file.
        out_path: Path the gap-filled mask is written to.
        case_id: Identifier used only in the printed messages.

    Returns:
        None. Load errors, shape mismatches and non-3-D volumes are reported
        with ``print`` and the case is skipped without writing a file.
    """
    try:
        ct_img   = nib.load(ct_path)
        mask_img = nib.load(mask_path)

        # Load arrays
        ct_data   = np.asanyarray(ct_img.get_fdata())             # float
        mask_data = np.asanyarray(mask_img.get_fdata())           # may be float
        # Round any float labels to nearest int before processing
        if np.issubdtype(mask_data.dtype, np.floating):
            mask_data = np.rint(mask_data)

        mask_data = mask_data.astype(np.int32)                    # work in int
    except Exception as e:
        print(f"‚ùå [{case_id}] Load error: {e}")
        return

    if ct_data.shape != mask_data.shape:
        print(f"‚ö†Ô∏è [{case_id}] Shape mismatch CT{ct_data.shape} vs Mask{mask_data.shape}; skipping.")
        return

    # The slice loop indexes axis 2 and uses a 2-D structuring element, so
    # anything other than a 3-D volume cannot be processed.
    if mask_data.ndim != 3:
        print(f"‚ö†Ô∏è [{case_id}] Expected a 3-D volume, got shape {mask_data.shape}; skipping.")
        return

    modified = mask_data.copy()

    for sl in range(mask_data.shape[2]):
        # `modified[:, :, sl]` is a view, so the helper writes straight into `modified`
        _fill_slice_gaps(mask_data[:, :, sl], ct_data[:, :, sl], modified[:, :, sl])

    # Ensure integer labels; clamp to uint8 range
    modified = np.clip(modified, 0, 255).astype(np.uint8)

    # Save with the original affine and a copy of the original header whose
    # data type is set to uint8; otherwise the file would be written with the
    # source header's dtype instead of the uint8 array prepared above.
    out_header = mask_img.header.copy()
    out_header.set_data_dtype(np.uint8)
    out_img = nib.Nifti1Image(modified, mask_img.affine, out_header)
    nib.save(out_img, out_path)
    print(f"‚úÖ [{case_id}] Saved: {out_path}")

# === Run the gap-filling step for every case folder ===
for case_id in os.listdir(directory):
    case_dir = os.path.join(directory, case_id)
    if not os.path.isdir(case_dir):
        continue

    # Inputs and output all live directly inside the case folder
    ct_path = os.path.join(case_dir, f"{case_id}_CT.nii.gz")
    mask_path = os.path.join(case_dir, f"{case_id}_LungLobes.nii.gz")
    out_path = os.path.join(case_dir, f"{case_id}_LungLobes_gapfilled.nii.gz")

    # Both the CT and the mask must exist before the case can be processed
    inputs_present = os.path.exists(ct_path) and os.path.exists(mask_path)
    if inputs_present:
        process_case(ct_path, mask_path, out_path, case_id)
    else:
        print(f"‚ö†Ô∏è [{case_id}] Missing CT or mask; skipping.")


## Confirming how much mask area was added for each lobe in each CT

In [None]:
# Label values whose voxel counts are compared between the original and the
# gap-filled mask (adjust the bounds as needed)
labels_to_check = range(1, 7 + 1)  # labels 1 through 7, inclusive

def load_int_mask(path):
    """Load a label mask and return it together with its int32 label array.

    Args:
        path: Path of the mask file, passed to ``nib.load``.

    Returns:
        A tuple ``(img, arr)``: the loaded image object and its data as an
        ``int32`` array. Floating-point data is rounded to the nearest integer
        before the cast so label values stay exact.
    """
    img = nib.load(path)
    voxels = np.asanyarray(img.get_fdata())
    needs_rounding = np.issubdtype(voxels.dtype, np.floating)
    labels = np.rint(voxels) if needs_rounding else voxels
    return img, labels.astype(np.int32)

# For every case, compare the voxel count of each label in the original mask
# with the count in the gap-filled mask and print the difference.
for case_id in os.listdir(directory):
    case_dir = os.path.join(directory, case_id)
    if not os.path.isdir(case_dir):
        continue

    orig_path = os.path.join(case_dir, f"{case_id}_LungLobes.nii.gz")
    gapf_path = os.path.join(case_dir, f"{case_id}_LungLobes_gapfilled.nii.gz")

    if not (os.path.exists(orig_path) and os.path.exists(gapf_path)):
        print(f"‚ö†Ô∏è [{case_id}] Missing original or gapfilled mask; skipping.")
        continue

    try:
        orig_img, orig = load_int_mask(orig_path)
        gapf_img, gapf = load_int_mask(gapf_path)
    except Exception as e:
        print(f"‚ùå [{case_id}] Load error: {e}")
        continue

    if orig.shape != gapf.shape:
        print(f"‚ö†Ô∏è [{case_id}] Shape mismatch {orig.shape} vs {gapf.shape}; skipping.")
        continue

    # Try to get voxel volume (mm^3); fall back to None if unavailable
    try:
        z = gapf_img.header.get_zooms()[:3]
        voxel_mm3 = float(z[0] * z[1] * z[2])
    except Exception:
        voxel_mm3 = None

    print(f"\nüîµ {case_id}:")
    for lbl in labels_to_check:
        orig_area  = int((orig == lbl).sum())
        filled_area = int((gapf == lbl).sum())
        added = filled_area - orig_area

        if orig_area > 0:
            pct_inc = 100.0 * added / orig_area
        else:
            # No voxels originally: report 100% if any appeared, else 0%
            pct_inc = 100.0 if added > 0 else 0.0

        # Use the '+' sign option instead of a literal "+": gap filling can
        # overwrite voxels of another label, so a label may lose voxels and a
        # hard-coded prefix would print "+-N".
        line = f"  Label {lbl}: {added:+d} voxels ({pct_inc:.4f}% increase)"
        if voxel_mm3 is not None:
            added_ml = added * voxel_mm3 / 1000.0  # mm^3 -> mL
            line += f" | {added_ml:+.3f} mL"
        print(line)


## Unique labels in each mask

In [None]:
# Print the set of label values present in every gap-filled mask
for case_id in os.listdir(directory):
    case_dir = os.path.join(directory, case_id)
    if not os.path.isdir(case_dir):
        continue

    mask_path = os.path.join(case_dir, f"{case_id}_LungLobes_gapfilled.nii.gz")
    if os.path.exists(mask_path):
        try:
            raw_values = np.asanyarray(nib.load(mask_path).get_fdata())
            # Round to the nearest integer before casting so labels stay exact
            mask_data = np.rint(raw_values).astype(np.int32)
            unique_labels = np.unique(mask_data)
            print(f"üîµ {case_id}: Unique labels = {unique_labels}")
        except Exception as e:
            # Report the failure and carry on with the next case
            print(f"‚ùå Error reading {case_id}: {e}")
    else:
        print(f"‚ö†Ô∏è [{case_id}] No gapfilled mask found.")

## Renaming the files

In [None]:
# Final clean-up of every case folder: drop the original mask, rename the CT
# to "<case>_0000.nii.gz" and the gap-filled mask to "<case>.nii.gz".
# Uses `directory` like every other cell (the previous name `root_dir` is not
# defined anywhere in this notebook).
for case_id in os.listdir(directory):
    case_dir = os.path.join(directory, case_id)
    if not os.path.isdir(case_dir):
        continue

    orig_mask = os.path.join(case_dir, f"{case_id}_LungLobes.nii.gz")
    gapfilled_mask = os.path.join(case_dir, f"{case_id}_LungLobes_gapfilled.nii.gz")
    new_mask_file = os.path.join(case_dir, f"{case_id}.nii.gz")
    has_gapfilled = os.path.exists(gapfilled_mask)

    # --- 1. Delete original mask ---
    # Only when the gap-filled replacement exists; otherwise the case would be
    # left without any mask at all.
    if has_gapfilled and os.path.exists(orig_mask):
        os.remove(orig_mask)
        print(f"üóëÔ∏è Deleted original mask: {orig_mask}")

    # --- 2. Rename CT file ---
    ct_file = os.path.join(case_dir, f"{case_id}_CT.nii.gz")
    new_ct_file = os.path.join(case_dir, f"{case_id}_0000.nii.gz")
    if os.path.exists(ct_file):
        os.rename(ct_file, new_ct_file)
        print(f"üîÑ Renamed CT: {ct_file} ‚Üí {new_ct_file}")
    else:
        print(f"‚ö†Ô∏è CT not found for {case_id}")

    # --- 3. Rename gapfilled mask ---
    if has_gapfilled:
        os.rename(gapfilled_mask, new_mask_file)
        print(f"üîÑ Renamed gapfilled mask: {gapfilled_mask} ‚Üí {new_mask_file}")
    else:
        print(f"‚ö†Ô∏è Gapfilled mask not found for {case_id}")