In [36]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split


#### Configuration
---

In [38]:

# Dataset locations. The defaults are the original machine-specific folders;
# set the THESIS_ORIGINAL_DATASET / THESIS_PROCESSED_DATASET environment
# variables to run the notebook on another machine without editing this cell.
ORIGINAL_DATASET = os.environ.get(
    "THESIS_ORIGINAL_DATASET",
    r"C:\Users\USER\Documents\Thesis Dataset\Original Dataset",
)

# Root of the generated dataset and its three split sub-folders
BASE_OUTPUT = os.environ.get(
    "THESIS_PROCESSED_DATASET",
    r"C:\Users\USER\Documents\Thesis Dataset\Processed Dataset",
)
TRAIN_DIR = os.path.join(BASE_OUTPUT, "train")
VAL_DIR   = os.path.join(BASE_OUTPUT, "val")
TEST_DIR  = os.path.join(BASE_OUTPUT, "test")

# Proportion of each class assigned to every split (must add up to 1)
train_ratio = 0.70
val_ratio   = 0.15
test_ratio  = 0.15

# Fail early on a mistyped ratio instead of producing a silently skewed split.
# A tolerance is used because the ratios are floats.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError(
        "train_ratio, val_ratio and test_ratio must sum to 1.0, got "
        f"{train_ratio + val_ratio + test_ratio}"
    )

# File extensions (compared case-insensitively by the main loop) treated as images
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png")

#### Clean and Create Directories 

Remove any previously processed data to avoid duplicates, then create the output directories for training, validation, and testing.

In [39]:

# Safety check before the destructive step below: refuse to wipe the output
# folder if the original dataset is that folder or lives inside it, because
# rmtree would then delete the source images as well.
_output_root = os.path.normcase(os.path.abspath(BASE_OUTPUT))
_source_root = os.path.normcase(os.path.abspath(ORIGINAL_DATASET))
if _source_root == _output_root or _source_root.startswith(
    _output_root.rstrip(os.sep) + os.sep
):
    raise ValueError(
        f"Refusing to delete {BASE_OUTPUT!r}: it contains the original dataset "
        f"{ORIGINAL_DATASET!r}"
    )

# Delete the output folder left over from a previous run, if there is one
if os.path.exists(BASE_OUTPUT):
    shutil.rmtree(BASE_OUTPUT)

# Create the (now empty) train / val / test root directories
for dir_path in [TRAIN_DIR, VAL_DIR, TEST_DIR]:
    os.makedirs(dir_path, exist_ok=True)


#### SPLIT & COPY FUNCTION

In [40]:

def split_and_copy(class_images, train_path, val_path, test_path, seed=42):
    """Split one class's image paths into train/val/test and copy each file once.

    The proportions come from the module-level ``train_ratio``, ``val_ratio``
    and ``test_ratio`` settings.

    Args:
        class_images: List of image file paths belonging to a single class.
            The list itself is left unmodified.
        train_path: Destination handed to ``shutil.copy2`` for training images.
        val_path: Destination handed to ``shutil.copy2`` for validation images.
        test_path: Destination handed to ``shutil.copy2`` for test images.
        seed: ``random_state`` used for both ``train_test_split`` calls, so the
            same input paths always produce the same split.

    Returns:
        None. When fewer than 3 images are supplied a message is printed and
        nothing is copied.
    """
    if len(class_images) < 3:
        print(
            f"Skipping {os.path.basename(train_path)}: "
            f"found {len(class_images)} image(s), need at least 3 to split"
        )
        return

    # Work on a sorted copy: the caller's list stays untouched and the result
    # depends only on the set of paths and `seed`, not on directory listing
    # order. train_test_split already shuffles, so no extra shuffle is needed.
    ordered_images = sorted(class_images)

    # Splitting Data: Train vs Temp (Val+Test)
    train_imgs, temp_imgs = train_test_split(
        ordered_images, test_size=(1 - train_ratio), random_state=seed
    )
    train_imgs, temp_imgs = list(train_imgs), list(temp_imgs)

    # train_test_split raises ValueError when one side of a split would be
    # empty, which happens if only a single image was held out (e.g. exactly
    # 3 images with a 70% train ratio). Move one training image over so that
    # validation and test each receive one.
    if len(temp_imgs) < 2:
        temp_imgs.append(train_imgs.pop())

    # Splitting the Temp: Validation and Test
    val_imgs, test_imgs = train_test_split(
        temp_imgs,
        test_size=(test_ratio / (test_ratio + val_ratio)),
        random_state=seed,
    )

    # Copy each image (with metadata) into the destination of its split
    for subset, destination in (
        (train_imgs, train_path),
        (val_imgs, val_path),
        (test_imgs, test_path),
    ):
        for img in subset:
            shutil.copy2(img, destination)


#### Main loop
Loop over every crop type and disease class, split its images, and copy them into the processed folder.

The `split_and_copy` function defined above does the splitting and copying.

In [41]:

# Walk ORIGINAL_DATASET/<crop_type>/<disease_class>/ and split every class folder.
for crop_type in sorted(os.listdir(ORIGINAL_DATASET)):
    crop_path = os.path.join(ORIGINAL_DATASET, crop_type)
    if not os.path.isdir(crop_path):
        continue  # ignore stray files next to the crop folders

    for disease_class in sorted(os.listdir(crop_path)):
        class_path = os.path.join(crop_path, disease_class)
        if not os.path.isdir(class_path):
            continue  # ignore stray files next to the class folders

        # Collect image files in each disease class.
        # Sorted because os.listdir returns entries in arbitrary order, and
        # restricted to regular files so a folder named like an image is not
        # passed on to be copied.
        images = sorted(
            os.path.join(class_path, f)
            for f in os.listdir(class_path)
            if f.lower().endswith(IMG_EXTENSIONS)
            and os.path.isfile(os.path.join(class_path, f))
        )

        # Output directories mirror the <crop_type>/<disease_class> layout
        train_out = os.path.join(TRAIN_DIR, crop_type, disease_class)
        val_out   = os.path.join(VAL_DIR, crop_type, disease_class)
        test_out  = os.path.join(TEST_DIR, crop_type, disease_class)

        for out_dir in (train_out, val_out, test_out):
            os.makedirs(out_dir, exist_ok=True)

        # Using the copy and splitting function
        split_and_copy(images, train_out, val_out, test_out)

        # Progress line per disease class in each crop type
        print(f"{crop_type}/{disease_class}: {len(images)} images processed")

# Report the configured ratios rather than hard-coded percentages so the
# message stays truthful if the ratios are changed.
print(
    f"✅ Dataset successfully split into {train_ratio:.0%} Train, "
    f"{val_ratio:.0%} Validation, {test_ratio:.0%} Test!"
)

Banana_Leaf_Disease_Dataset_Bangladesh/Healthy: 1108 images processed
Banana_Leaf_Disease_Dataset_Bangladesh/Panama Disease: 835 images processed
Banana_Leaf_Disease_Dataset_Bangladesh/Yellow and Black Sigatoka: 2597 images processed
Banana_Leaf_Disease_Dataset_Bangladesh/cordana: 442 images processed
Banana_Leaf_Disease_Dataset_Bangladesh/pestalotiopsis: 117 images processed
Coconut Tree Disease Dataset/Bud Root Dropping: 514 images processed
Coconut Tree Disease Dataset/Bud Rot: 470 images processed
Coconut Tree Disease Dataset/Gray Leaf Spot: 2135 images processed
Coconut Tree Disease Dataset/Healthy_Leaves: 123 images processed
Coconut Tree Disease Dataset/Leaf Rot: 1673 images processed
Coconut Tree Disease Dataset/WCLWD_DryingofLeaflets: 1078 images processed
Coconut Tree Disease Dataset/WCLWD_Flaccidity: 1069 images processed
Coconut Tree Disease Dataset/WCLWD_Yellowing: 1084 images processed
Sugarcane Leaf Disease Dataset/Healthy: 522 images processed
Sugarcane Leaf Disease Dat