In [1]:
import os
import shutil
import random
from google.colab import drive

# Mount Google Drive at /content/drive so that Drive folders can be accessed
# through ordinary filesystem paths (requires the Colab-only `google.colab` package).
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# --- Configuration ---------------------------------------------------------
# Root folder of the original dataset: one subfolder per class.
dataset_path = "/content/drive/My Drive/Data"

# Destination folders for the split copy of the dataset.
output_base = "/content/drive/My Drive/split_dataset"
train_dir = os.path.join(output_base, "train")
test_dir = os.path.join(output_base, "test")

# Fraction of every class assigned to each split (must sum to 1).
train_ratio = 0.8
test_ratio = 0.2

# Fixed seed: re-running this cell must reproduce the exact same split.
# Otherwise a second run would copy some images into the *other* split
# folder and silently leak training images into the test set.
SPLIT_SEED = 42

# --- Sanity checks ---------------------------------------------------------
if abs(train_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError(
        f"train_ratio + test_ratio must equal 1, got {train_ratio} + {test_ratio}"
    )
if not os.path.isdir(dataset_path):
    raise FileNotFoundError(f"Dataset folder not found: {dataset_path!r}")

# Create the output roots only after the input has been validated.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# --- Split every class (subfolder) -----------------------------------------
for class_name in sorted(os.listdir(dataset_path)):
    class_path = os.path.join(dataset_path, class_name)

    if not os.path.isdir(class_path):
        continue  # Skip stray files next to the class folders

    # Keep regular files only (a nested folder would make shutil.copy fail),
    # and sort them because directory listing order is arbitrary: the seeded
    # shuffle below is only reproducible when it starts from a stable order.
    with os.scandir(class_path) as entries:
        images = sorted(entry.name for entry in entries if entry.is_file())

    # A dedicated RNG per class keeps each class's split independent of the
    # other classes and leaves the global `random` state untouched.
    random.Random(SPLIT_SEED).shuffle(images)

    # The first `num_train` shuffled files go to train, the remainder to test.
    num_train = int(len(images) * train_ratio)
    train_images = images[:num_train]
    test_images = images[num_train:]

    # Per-class destination folders
    train_class_path = os.path.join(train_dir, class_name)
    test_class_path = os.path.join(test_dir, class_name)

    os.makedirs(train_class_path, exist_ok=True)
    os.makedirs(test_class_path, exist_ok=True)

    # Fail fast if an earlier run (with a different split) already placed one
    # of these files in the opposite folder: copying now would leave the same
    # image in both train and test.
    misplaced = (set(train_images) & set(os.listdir(test_class_path))) | (
        set(test_images) & set(os.listdir(train_class_path))
    )
    if misplaced:
        raise RuntimeError(
            f"{len(misplaced)} file(s) of class {class_name!r} already exist in the "
            f"opposite split under {output_base!r}; delete that folder and re-run "
            "to avoid train/test leakage."
        )

    # Copy (not move) the images so the original dataset stays intact
    for dest_class_path, split_images in (
        (train_class_path, train_images),
        (test_class_path, test_images),
    ):
        for img in split_images:
            shutil.copy(
                os.path.join(class_path, img),
                os.path.join(dest_class_path, img),
            )

    print(f"Processed {class_name}: {num_train} train, {len(images) - num_train} test images.")

print("Dataset successfully split and stored in Google Drive!")


Processed Bacterial_blight: 841 train, 211 test images.
Processed Bacterial_pustule: 850 train, 213 test images.
Processed Brown_Spot: 800 train, 200 test images.
Processed Caterpillar: 2676 train, 669 test images.
Processed Diabrotica_speciosa: 1764 train, 441 test images.
Processed Downy_mildew: 952 train, 239 test images.
Processed Ferrugen: 852 train, 213 test images.
Processed Frogeye_Leaf_Spot: 937 train, 235 test images.
Processed Healthy: 5132 train, 1284 test images.
Processed Mossaic_Virus: 817 train, 205 test images.
Processed Powdery_mildew: 947 train, 237 test images.
Processed Rust: 888 train, 222 test images.
Processed Septoria: 921 train, 231 test images.
Processed Southern_blight: 843 train, 211 test images.
Processed Sudden_Death_Syndrome: 888 train, 222 test images.
Processed Target_Leaf_Spot: 887 train, 222 test images.
Processed Yellow_Mosaic: 888 train, 222 test images.
Dataset successfully split and stored in Google Drive!
