In [None]:
### Download the X-Ray
!curl -L -o Xray_Hip_dataset.zip https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/zm6bxzhmfz-1.zip

!mkdir /content/datasets/

!unzip /content/Xray_Hip_dataset.zip -d /content/datasets/Xray_Hip/

!rm -rf /content/Xray_Hip_dataset.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  156M  100  156M    0     0  10.6M      0  0:00:14  0:00:14 --:--:-- 13.1M
Archive:  /content/Xray_Hip_dataset.zip
   creating: /content/datasets/Xray_Hip/images/
  inflating: /content/datasets/Xray_Hip/images/image_010.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_089.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_001.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_100.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_098.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_117.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_036.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_072.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_135.nii.gz  
  inflating: /content/datasets/Xray_Hip/images/image_054.nii.gz  
  inflating: /c

In [None]:
## Preprocess dataset

import os
import glob
import csv
import nibabel as nib
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# --- Configuration ---
# Location of the raw NIfTI dataset; images and labels sit in sibling subfolders
BASE_INPUT_DIR = '/content/datasets/Xray_Hip'
IMAGE_INPUT_DIR, LABEL_INPUT_DIR = (
    os.path.join(BASE_INPUT_DIR, subdir) for subdir in ('images', 'labels')
)

# Destination for the exported 2D PNG images and masks
BASE_OUTPUT_DIR = '/content/preprocessed_datasets/xrayhip'
IMAGE_OUTPUT_DIR, MASK_OUTPUT_DIR = (
    os.path.join(BASE_OUTPUT_DIR, subdir) for subdir in ('images', 'masks')
)

# Split ratios
TEST_SIZE = 0.2    # fraction of all volumes held out for testing (20%)
VAL_SIZE = 0.125   # fraction of the remaining 80% -> 10% of the total (0.1 / 0.8 = 0.125)

# --- Main Script ---

def _to_uint8(img_slice):
    """Min-max rescale a 2D intensity array to the 0-255 range and cast to uint8.

    A slice without dynamic range (max == min) cannot be min-max scaled without
    dividing by zero, so it is mapped to an all-zero image of the same shape.
    """
    lo, hi = img_slice.min(), img_slice.max()
    if hi > lo:
        return ((img_slice - lo) / (hi - lo) * 255.0).astype(np.uint8)
    return np.zeros(img_slice.shape, dtype=np.uint8)


def preprocess_xray_hip_dataset():
    """
    Convert the Xray-Hip NIfTI dataset into 2D PNG slices and write
    train/val/test CSV splits.

    Reads every ``*.nii.gz`` file in ``IMAGE_INPUT_DIR`` together with its label
    from ``LABEL_INPUT_DIR`` (same file name with 'image' replaced by 'label').
    Volumes are split by ID with ``TEST_SIZE`` / ``VAL_SIZE`` (fixed seed 42), so
    all slices of one volume land in the same split. For each slice along the
    third axis that has at least one foreground mask pixel, the image is
    min-max normalized to uint8 and the mask binarized to 0/255; both are saved
    as PNG under ``IMAGE_OUTPUT_DIR`` / ``MASK_OUTPUT_DIR``.

    Finally ``train.csv``, ``val.csv`` and ``test.csv`` are written to
    ``BASE_OUTPUT_DIR``; each line is ``<relative image path>,<relative mask path>``
    with no header.

    Returns:
        None. If no input images are found, an error is printed and the
        function returns early. Volumes that fail to load or process are
        reported and skipped.
    """
    print("--- Starting Dataset Preprocessing for Xray-Hip ---")

    # 1. Create output directories
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
    os.makedirs(MASK_OUTPUT_DIR, exist_ok=True)
    print(f"Created output directories at: {BASE_OUTPUT_DIR}")

    # 2. Find all image files and get unique patient/volume IDs
    image_files = sorted(glob.glob(os.path.join(IMAGE_INPUT_DIR, '*.nii.gz')))
    if not image_files:
        print(f"Error: No .nii.gz files found in {IMAGE_INPUT_DIR}. Please check the path.")
        return

    # Extracting IDs like 'image_001' from the full path
    patient_ids = [os.path.basename(f).replace('.nii.gz', '') for f in image_files]
    print(f"Found {len(patient_ids)} unique volumes/patients.")

    # 3. Split patient IDs into train, validation, and test sets (7:1:2 split)
    train_val_ids, test_ids = train_test_split(
        patient_ids, test_size=TEST_SIZE, random_state=42
    )
    train_ids, val_ids = train_test_split(
        train_val_ids, test_size=VAL_SIZE, random_state=42
    )

    print(f"Train IDs: {len(train_ids)}, Validation IDs: {len(val_ids)}, Test IDs: {len(test_ids)}")

    # One dictionary lookup per volume instead of scanning the ID lists each time
    split_of = {pid: 'test' for pid in test_ids}
    split_of.update((pid, 'val') for pid in val_ids)
    split_of.update((pid, 'train') for pid in train_ids)

    # Store data for CSV files
    data_splits = {
        'train': [],
        'val': [],
        'test': []
    }

    # Folder name used as the prefix of the relative paths written to the CSVs
    dataset_name = os.path.basename(BASE_OUTPUT_DIR)

    # 4. Process each volume: load, slice, normalize, and save
    print("\nProcessing volumes and creating 2D slices...")
    for patient_id in tqdm(patient_ids, desc="Processing Volumes"):
        img_path = os.path.join(IMAGE_INPUT_DIR, f"{patient_id}.nii.gz")
        mask_path = os.path.join(LABEL_INPUT_DIR, f"{patient_id.replace('image', 'label')}.nii.gz")
        split = split_of[patient_id]

        try:
            # Load NIfTI files
            img_nii = nib.load(img_path)
            mask_nii = nib.load(mask_path)

            # Get data as numpy arrays. atleast_3d turns a plain 2D array (H, W)
            # into (H, W, 1) so the slice loop below also handles 2D inputs.
            img_data = np.atleast_3d(img_nii.get_fdata())
            mask_data = np.atleast_3d(mask_nii.get_fdata())

            # e.g. 'image_001' -> '001'
            base_name = patient_id.split('_')[1]

            # Iterate through slices (assuming slices are on the 3rd axis)
            for i in range(img_data.shape[2]):
                img_slice = img_data[:, :, i]

                # Foreground is defined as "label > 0"; the same definition is used
                # both to decide whether to keep the slice and to build the mask.
                mask_foreground = mask_data[:, :, i] > 0

                # If the mask has no foreground pixels, skip this slice entirely.
                if not mask_foreground.any():
                    continue

                # Normalize image slice to 0-255 for PNG (constant slices become
                # all zeros rather than reusing a previous slice's pixels)
                img_slice_uint8 = _to_uint8(img_slice)

                # Convert mask to 0 and 255
                mask_uint8 = mask_foreground.astype(np.uint8) * 255

                # Create output filenames
                slice_filename = f"image_{base_name}.ni_z{i:03d}.png"

                img_out_path = os.path.join(IMAGE_OUTPUT_DIR, slice_filename)
                mask_out_path = os.path.join(MASK_OUTPUT_DIR, slice_filename)

                # Save as PNG
                Image.fromarray(img_slice_uint8).save(img_out_path)
                Image.fromarray(mask_uint8).save(mask_out_path)

                # --- Store relative paths for CSV ---
                # Example: xrayhip/images/image_044.ni_z000.png
                relative_img_path = os.path.join(dataset_name, 'images', slice_filename)
                relative_mask_path = os.path.join(dataset_name, 'masks', slice_filename)

                # Using forward slashes for cross-platform compatibility
                data_splits[split].append(
                    f"{relative_img_path.replace(os.sep, '/')},{relative_mask_path.replace(os.sep, '/')}"
                )

        except FileNotFoundError:
            print(f"Warning: Could not find corresponding label for {patient_id}. Skipping.")
        except Exception as e:
            # Best effort: report the failing volume and continue with the next one
            print(f"An error occurred processing {patient_id}: {e}")

    # 5. Write the train/val/test CSV files
    print("\nSaving train/val/test CSV files...")
    for split_name, data in data_splits.items():
        csv_path = os.path.join(BASE_OUTPUT_DIR, f"{split_name}.csv")
        with open(csv_path, 'w', newline='') as f:
            # No header, just write the data lines
            f.write('\n'.join(data))
        print(f"Saved {csv_path} with {len(data)} entries.")

    print("\n--- Preprocessing Complete! ---")

In [None]:
# Only start preprocessing when both raw-data subfolders are present
if os.path.exists(IMAGE_INPUT_DIR) and os.path.exists(LABEL_INPUT_DIR):
    preprocess_xray_hip_dataset()
else:
    print(f"Error: Input directory {BASE_INPUT_DIR} not found or is missing 'images'/'labels' subfolders.")
    print("Please ensure your dataset is located at the correct path.")

--- Starting Dataset Preprocessing for Xray-Hip ---
Created output directories at: /content/preprocessed_datasets/xrayhip
Found 140 unique volumes/patients.
Train IDs: 98, Validation IDs: 14, Test IDs: 28

Processing volumes and creating 2D slices...


Processing Volumes: 100%|██████████| 140/140 [00:08<00:00, 16.11it/s]


Saving train/val/test CSV files...
Saved /content/preprocessed_datasets/xrayhip/train.csv with 98 entries.
Saved /content/preprocessed_datasets/xrayhip/val.csv with 14 entries.
Saved /content/preprocessed_datasets/xrayhip/test.csv with 28 entries.

--- Preprocessing Complete! ---





In [None]:
import os
import zipfile

def zip_folder(folder_path, output_path):
    """Recursively compress every file under ``folder_path`` into a zip archive.

    Args:
        folder_path: Directory whose files are archived. Entries are stored with
            paths relative to this directory, so the archive contains no
            absolute paths.
        output_path: Path of the zip file to create (opened in write mode, so an
            existing file at this path is replaced).

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
            Checked before the archive is created, so no empty zip is left behind.
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(
            f"Folder to zip does not exist or is not a directory: {folder_path}"
        )

    # Used to recognise the archive itself when it is created inside folder_path
    output_abs = os.path.abspath(output_path)

    # Create a ZipFile object in write mode
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the folder structure
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Never add the archive being written to itself
                if os.path.abspath(file_path) == output_abs:
                    continue
                # Store file paths relative to the folder to avoid full paths in zip
                arcname = os.path.relpath(file_path, folder_path)
                zipf.write(file_path, arcname)

# Bundle the preprocessed output folder into a single zip archive
folder_to_zip, output_zip = "/content/preprocessed_datasets/xrayhip", "xray_hip_dataset.zip"
zip_folder(folder_to_zip, output_zip)

print(f"Zipped {folder_to_zip} → {output_zip}")

Zipped /content/preprocessed_datasets/xrayhip → xray_hip_dataset.zip
