In [3]:
import os
import shutil
import random
from PIL import Image
import math

# --- Configuration ---
# Root folder holding one subdirectory per class, and the root the split
# copies are written under.
SOURCE_BASE_DIR = 'archive/bloodcells_dataset'
TARGET_BASE_DIR = 'data'

# Fraction of each class's valid images assigned to every split.
SPLIT_RATIOS = dict(train=0.8, test=0.1, validate=0.1)

# Only images of exactly this size are kept. Pillow reports size as
# (width, height).
EXPECTED_DIMS = (360, 363)

# --- Sanity Checks ---
# Abort with a non-zero exit status when the configuration is unusable.
# The bare `exit()` helper is installed by the `site` module for interactive
# sessions and, called without an argument, terminates with status 0, which
# makes a failed run look successful to a calling shell or pipeline.
if not os.path.isdir(SOURCE_BASE_DIR):
    print(f"Error: Source directory '{SOURCE_BASE_DIR}' not found.")
    raise SystemExit(1)

# The ratios are floats, so their sum is compared to 1.0 with an absolute
# tolerance instead of with ==.
ratio_sum = sum(SPLIT_RATIOS.values())
if not math.isclose(ratio_sum, 1.0, rel_tol=0.0, abs_tol=1e-9):
    print(f"Error: Split ratios must sum to 1.0. Current sum: {ratio_sum}")
    raise SystemExit(1)

# --- Setup ---
# Echo the run configuration so the log records what produced the split.
print(f"Source directory: {SOURCE_BASE_DIR}")
print(f"Target directory: {TARGET_BASE_DIR}")
train_pct = SPLIT_RATIOS['train'] * 100
test_pct = SPLIT_RATIOS['test'] * 100
validate_pct = SPLIT_RATIOS['validate'] * 100
print(f"Splitting data: Train {train_pct}%, Test {test_pct}%, Validate {validate_pct}%")
print(f"Filtering images to keep only dimensions (WxH): {EXPECTED_DIMS}")
print("-" * 30)

# Make sure one top-level folder per split exists under the target root.
for split_name in SPLIT_RATIOS:
    split_root = os.path.join(TARGET_BASE_DIR, split_name)
    os.makedirs(split_root, exist_ok=True)

# Run-wide tallies, accumulated across all classes and printed in the final
# report. All four start at zero (ints are immutable, so chaining is safe).
total_files_processed = total_files_kept = 0
total_files_removed_dimension = total_files_error_reading = 0

# One (filepath, reason, details) tuple per rejected file.
removed_file_details = []

# --- Get Class Names ---
# Every immediate subdirectory of the source directory is treated as a class.
# Only the directory listing itself is guarded: it is the one call here that
# can raise OSError.
try:
    source_entries = os.listdir(SOURCE_BASE_DIR)
except OSError as e:
    print(f"Error accessing source directory {SOURCE_BASE_DIR}: {e}")
    # Non-zero status: bare exit() would report success (status 0).
    raise SystemExit(1) from e

# os.listdir() returns entries in arbitrary order; sort so classes are
# processed and reported in the same order on every run and every platform.
class_names = sorted(
    d for d in source_entries
    if os.path.isdir(os.path.join(SOURCE_BASE_DIR, d))
)
if not class_names:
    print(f"Error: No subdirectories (classes) found in {SOURCE_BASE_DIR}")
    raise SystemExit(1)
print(f"Found classes: {', '.join(class_names)}")

# --- Process Each Class ---
# For every class: list candidate images, keep only those with the expected
# dimensions, split the survivors into train/test/validate and copy them
# into the matching target folders.

# Lower-case file extensions that mark a file as a candidate image.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff')

# Seed for the per-class shuffle. The target directory is never cleared, so
# an unseeded shuffle would let a re-run place the same image in a different
# split than before, leaking training images into test/validate. A fixed seed
# plus a sorted file list makes re-runs reproduce the same assignment.
# NOTE(review): output left behind by an earlier, unseeded run should still be
# removed once before relying on this.
SPLIT_SEED = 42

for class_name in class_names:
    print(f"\nProcessing class: {class_name}...")
    source_class_dir = os.path.join(SOURCE_BASE_DIR, class_name)

    # --- Create Target Class Directories ---
    target_dirs = {}
    for split_name in SPLIT_RATIOS:
        target_dir = os.path.join(TARGET_BASE_DIR, split_name, class_name)
        os.makedirs(target_dir, exist_ok=True)
        target_dirs[split_name] = target_dir

    # --- List All Potential Image Files ---
    try:
        # Sorted because os.listdir() order is arbitrary and the seeded
        # shuffle below is only reproducible from a stable starting order.
        all_files_in_class = sorted(
            f for f in os.listdir(source_class_dir)
            if f.lower().endswith(IMAGE_EXTENSIONS)
            and os.path.isfile(os.path.join(source_class_dir, f))
        )
    except OSError as e:
        print(f"  Error listing files in {source_class_dir}: {e}. Skipping class.")
        continue

    if not all_files_in_class:
        print(f"  Warning: No image files found in {source_class_dir}. Skipping.")
        continue

    print(f"  Found {len(all_files_in_class)} potential image files.")
    total_files_processed += len(all_files_in_class)

    # --- Filter Files by Dimension ---
    valid_files = []
    class_removed_dimension = 0
    class_error_reading = 0

    print(f"  Validating dimensions (target: {EXPECTED_DIMS})...")
    for filename in all_files_in_class:
        source_path = os.path.join(source_class_dir, filename)
        try:
            with Image.open(source_path) as img:
                image_size = img.size
        except Exception as e:
            # Deliberately broad: any file the image library cannot open is
            # recorded and skipped instead of aborting the whole run.
            print(f"    ERROR: Cannot read image '{filename}': {e}. Removing from consideration.")
            class_error_reading += 1
            removed_file_details.append((source_path, "Read Error", str(e)))
            continue

        if image_size == EXPECTED_DIMS:
            valid_files.append(filename)
        else:
            class_removed_dimension += 1
            removed_file_details.append((source_path, "Dimension Mismatch", image_size))

    total_files_kept += len(valid_files)
    total_files_removed_dimension += class_removed_dimension
    total_files_error_reading += class_error_reading

    if not valid_files:
        print(f"  No valid images found for class '{class_name}' after filtering. Skipping splitting for this class.")
        print(f"  Summary for '{class_name}': Removed={class_removed_dimension} (dimension), Error={class_error_reading} (read).")
        continue  # Skip to the next class

    print(f"  Finished validation: Kept={len(valid_files)}, Removed={class_removed_dimension} (dimension), Error={class_error_reading} (read).")

    # --- Shuffle and Split Only Valid Files ---
    # A dedicated generator seeded per class keeps each class's split
    # independent of the other classes and of the global random state.
    random.Random(f"{SPLIT_SEED}:{class_name}").shuffle(valid_files)
    total_valid_files = len(valid_files)

    # --- Calculate Split Sizes based on valid files ---
    # Train and test are floored; validate takes whatever remains, so the
    # three counts always add up to total_valid_files without any correction.
    train_count = math.floor(total_valid_files * SPLIT_RATIOS['train'])
    test_count = math.floor(total_valid_files * SPLIT_RATIOS['test'])
    validate_count = max(0, total_valid_files - train_count - test_count)

    print(f"  Splitting {total_valid_files} valid images into: Train={train_count}, Test={test_count}, Validate={validate_count}")

    # --- Assign Files to Splits ---
    test_end = train_count + test_count
    split_files = {
        'train': valid_files[:train_count],
        'test': valid_files[train_count:test_end],
        'validate': valid_files[test_end:],
    }

    # --- Copy Valid Files to Target Directories ---
    files_copied_counts = {'train': 0, 'test': 0, 'validate': 0}
    for split_name, files in split_files.items():
        target_dir = target_dirs[split_name]
        for filename in files:
            # Already validated above, so just copy.
            source_path = os.path.join(source_class_dir, filename)
            target_path = os.path.join(target_dir, filename)
            try:
                shutil.copy2(source_path, target_path)  # copy2 preserves metadata
            except OSError as e:
                # A failed copy is reported and skipped; the count check
                # below flags the shortfall for this class.
                print(f"    ERROR: Failed to copy {source_path} to {target_path}: {e}")
            else:
                files_copied_counts[split_name] += 1

    print(f"  Files copied: Train={files_copied_counts['train']}, Test={files_copied_counts['test']}, Validate={files_copied_counts['validate']}")

    # Sanity check counts after potential copy errors
    copied_total = sum(files_copied_counts.values())
    expected_total = train_count + test_count + validate_count
    if copied_total != expected_total:
        print(f"  NOTE: Copied {copied_total} files, expected {expected_total}. Check for copy errors above.")


# --- Final Report ---
# Print the run-wide totals gathered while processing the classes.
banner = "=" * 30
print("\n" + banner)
print("Processing Complete!")
print(banner)
print(f"Total potential image files processed: {total_files_processed}")
print(f"Total files removed (dimension mismatch): {total_files_removed_dimension}")
print(f"Total files removed (error reading):    {total_files_error_reading}")
print(f"Total valid files kept and split:       {total_files_kept}")
print("-" * 30)

# Every processed file should have been either kept or rejected; if the
# tallies do not reconcile, warn so the log gets inspected.
total_files_rejected = total_files_removed_dimension + total_files_error_reading
if total_files_processed - total_files_rejected != total_files_kept:
    print("Warning: Discrepancy in file counts - check logs.")
print(f"\nDataset successfully split into Train/Test/Validate sets within the '{TARGET_BASE_DIR}' directory using only images with dimensions {EXPECTED_DIMS}.")

# Optional: list every rejected file with its reason (output can be long).
# print("\nDetails of removed/error files:")
# for path, reason, detail in removed_file_details:
#     print(f"  - {path} | Reason: {reason} | Details: {detail}")

Source directory: archive/bloodcells_dataset
Target directory: data
Splitting data: Train 80.0%, Test 10.0%, Validate 10.0%
Filtering images to keep only dimensions (WxH): (360, 363)
------------------------------
Found classes: basophil, eosinophil, erythroblast, ig, lymphocyte, monocyte, neutrophil, platelet

Processing class: basophil...
  Found 1218 potential image files.
  Validating dimensions (target: (360, 363))...
  Finished validation: Kept=1168, Removed=50 (dimension), Error=0 (read).
  Splitting 1168 valid images into: Train=934, Test=116, Validate=118
  Files copied: Train=934, Test=116, Validate=118

Processing class: eosinophil...
  Found 3117 potential image files.
  Validating dimensions (target: (360, 363))...
  Finished validation: Kept=3067, Removed=50 (dimension), Error=0 (read).
  Splitting 3067 valid images into: Train=2453, Test=306, Validate=308
  Files copied: Train=2453, Test=306, Validate=308

Processing class: erythroblast...
  Found 1551 potential image fi