# In [1]:
import os
import shutil
import random

# Project root: the parent of the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Where the preprocessed images are read from, and where the splits are written.
source_dir = os.path.join(BASE_DIR, "data", "preprocessed")
output_dir = os.path.join(BASE_DIR, "data", "split_data")

# Fraction of each category assigned to train / val / test.
train_ratio, val_ratio, test_ratio = 0.7, 0.1, 0.2

# Create <output_dir>/<split>/<category> for every combination up front;
# exist_ok makes this a no-op when the folders are already there.
for split in ("train", "val", "test"):
    for category in ("Tumor", "NORMAL"):
        target = os.path.join(output_dir, split, category)
        os.makedirs(target, exist_ok=True)

# Function to split and copy files
def split_data(category, seed=42):
    """Copy one category's images into train/val/test folders.

    Image files (``.png``, ``.jpg``, ``.jpeg``; case-insensitive) found
    directly in ``source_dir/<category>`` are shuffled and partitioned using
    the module-level ``train_ratio`` and ``val_ratio``; everything left after
    the train and val slices goes to test. Each file is copied to
    ``output_dir/<split>/<category>/`` and a per-split count is printed.

    The directory listing is sorted and then shuffled with a dedicated seeded
    generator, so running this again over an unchanged source folder produces
    the same assignment. Without that, a second run would copy a different
    shuffle on top of the first and the same image could end up in more than
    one split.

    Args:
        category: Name of the class sub-folder under ``source_dir``
            (also used as the sub-folder name under each split).
        seed: Seed for the shuffle. Pass ``None`` to get a different,
            non-reproducible shuffle on every call.

    Raises:
        FileNotFoundError: If ``source_dir/<category>`` does not exist
            (raised by ``os.listdir``).
    """
    src_path = os.path.join(source_dir, category)

    # os.listdir returns entries in arbitrary order; sort first so the seeded
    # shuffle below always starts from the same sequence.
    files = sorted(
        f for f in os.listdir(src_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    # A private generator keeps the split reproducible without touching the
    # global random state used elsewhere.
    random.Random(seed).shuffle(files)

    total = len(files)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    splits = {
        "train": files[:train_end],
        "val": files[train_end:val_end],
        "test": files[val_end:],  # remainder, so no file is dropped by rounding
    }

    for split, split_files in splits.items():
        dst_dir = os.path.join(output_dir, split, category)
        # Do not rely on the folders having been created elsewhere.
        os.makedirs(dst_dir, exist_ok=True)
        for f in split_files:
            shutil.copy(os.path.join(src_path, f), os.path.join(dst_dir, f))
        print(f"{category} {split}: {len(split_files)} images copied.")

# Split both categories
# Run the split once per class folder, then report completion.
for class_name in ("Tumor", "NORMAL"):
    split_data(class_name)
print("Data splitting completed!")


# Cell output:
# Tumor train: 1627 images copied.
# Tumor val: 232 images copied.
# Tumor test: 466 images copied.
# NORMAL train: 1610 images copied.
# NORMAL val: 230 images copied.
# NORMAL test: 460 images copied.
# Data splitting completed!
