In [24]:
import os
import random
import shutil

# --- Configuration ---
# Root folder of the original, unsplit images. The split code looks for one
# subfolder per category name directly underneath it.
source_data_root = "./data/dataSets"

# Output roots for the generated train and test sets.
train_dir, test_dir = "./data/train", "./data/test"

# Category names; each one is used as a subfolder name under the source root
# and under both output roots.
categories = [
    "ALGAL_LEAF_SPOT",
    "ALLOCARIDARA_ATTACK",
    "HEALTHY_LEAF",
    "LEAF_BLIGHT",
    "PHOMOPSIS_LEAF_SPOT",
]

# Fraction of each category's files that goes to the test set
# (0.138 means 13.8% test; the remainder is used for training).
test_ratio = 0.138

# Seed Python's global random generator so the random selection is repeatable.
random.seed(42)

In [25]:
# --- Clean up previous train and test directories ---
# Delete each output tree (train first, then test) if it is already on disk.
# Paths that do not exist are skipped without error.
print("Cleaning up existing train and test directories...")
for stale_dir in (train_dir, test_dir):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)
print("Cleanup complete.")

Cleaning up existing train and test directories...
Cleanup complete.


In [26]:
# --- Create new train and test base directories ---
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

print("\n--- Processing Data Split ---")

# Split every category on its own so that each one contributes the same
# fraction (test_ratio) of its files to the test set.
for category in categories:
    source_category_path = os.path.join(source_data_root, category)
    train_category_path = os.path.join(train_dir, category)
    test_category_path = os.path.join(test_dir, category)

    # Create category subdirectories in train and test
    os.makedirs(train_category_path, exist_ok=True)
    os.makedirs(test_category_path, exist_ok=True)

    try:
        # Collect regular files only (subdirectories are ignored).
        # os.listdir() returns names in arbitrary order, so sort them first:
        # without a stable order the seeded sample below could select
        # different files on another machine or filesystem.
        all_files = sorted(
            f for f in os.listdir(source_category_path)
            if os.path.isfile(os.path.join(source_category_path, f))
        )
        total_files_in_category = len(all_files)

        if total_files_in_category == 0:
            print(f"Warning: No files found in {source_category_path}. Skipping this category.")
            continue

        # Number of files for the test set; a non-empty category always
        # contributes at least one test file even if rounding yields zero.
        num_test_files = round(total_files_in_category * test_ratio)
        if num_test_files == 0:
            num_test_files = 1

        # Randomly select files for the test set. This draws from the global
        # `random` generator, so the result depends on its current state.
        test_files = random.sample(all_files, num_test_files)

        # Everything not sampled goes to the training set. Filtering the
        # sorted list (instead of a set difference) keeps a stable order.
        test_file_set = set(test_files)
        train_files = [f for f in all_files if f not in test_file_set]

        # Copy (not move) the files so the original dataset stays intact.
        for f in test_files:
            src_path = os.path.join(source_category_path, f)
            dst_path = os.path.join(test_category_path, f)
            shutil.copy(src_path, dst_path)

        for f in train_files:
            src_path = os.path.join(source_category_path, f)
            dst_path = os.path.join(train_category_path, f)
            shutil.copy(src_path, dst_path)

        # Report what was done; the files are copied, so say "Copied".
        print(f"Category '{category}':")
        print(f"  Total files: {total_files_in_category}")
        print(f"  Copied {len(test_files)} files to {test_category_path}")
        print(f"  Copied {len(train_files)} files to {train_category_path}")

    except FileNotFoundError:
        # Typically raised by os.listdir() when the category folder is missing.
        print(f"Error: Source directory not found for {category}: {source_category_path}")
    except Exception as e:
        # Best effort: report the problem and continue with the next category.
        print(f"An error occurred while processing {category}: {e}")


--- Processing Data Split ---
Category 'ALGAL_LEAF_SPOT':
  Total files: 733
  Moved 101 files to ./data/test\ALGAL_LEAF_SPOT
  Moved 632 files to ./data/train\ALGAL_LEAF_SPOT
Category 'ALLOCARIDARA_ATTACK':
  Total files: 913
  Moved 126 files to ./data/test\ALLOCARIDARA_ATTACK
  Moved 787 files to ./data/train\ALLOCARIDARA_ATTACK
Category 'HEALTHY_LEAF':
  Total files: 976
  Moved 135 files to ./data/test\HEALTHY_LEAF
  Moved 841 files to ./data/train\HEALTHY_LEAF
Category 'LEAF_BLIGHT':
  Total files: 937
  Moved 129 files to ./data/test\LEAF_BLIGHT
  Moved 808 files to ./data/train\LEAF_BLIGHT
Category 'PHOMOPSIS_LEAF_SPOT':
  Total files: 878
  Moved 121 files to ./data/test\PHOMOPSIS_LEAF_SPOT
  Moved 757 files to ./data/train\PHOMOPSIS_LEAF_SPOT


In [27]:
# --- Final verification of counts in new directories ---
# Read the output folders back from disk and print, per category, how many
# directory entries the train and test folders contain.
print("\n--- Final File Counts in Train and Test Sets ---")
for category in categories:
    # Build both per-category paths from their respective output roots.
    train_category_path, test_category_path = (
        os.path.join(split_root, category) for split_root in (train_dir, test_dir)
    )

    try:
        # The train folder is listed first, then the test folder; a missing
        # folder on either side ends up in the FileNotFoundError branch.
        train_count = len(os.listdir(train_category_path))
        test_count = len(os.listdir(test_category_path))
        print(f"Category '{category}': Train={train_count}, Test={test_count}")
    except FileNotFoundError:
        print(f"Warning: Directory not found for counts: {train_category_path} or {test_category_path}")
    except Exception as e:
        # Any other failure is reported and the loop moves on to the next category.
        print(f"An error occurred during final count for {category}: {e}")


--- Final File Counts in Train and Test Sets ---
Category 'ALGAL_LEAF_SPOT': Train=632, Test=101
Category 'ALLOCARIDARA_ATTACK': Train=787, Test=126
Category 'HEALTHY_LEAF': Train=841, Test=135
Category 'LEAF_BLIGHT': Train=808, Test=129
Category 'PHOMOPSIS_LEAF_SPOT': Train=757, Test=121
