In [9]:
import os
import shutil
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator

import json

In [2]:
# Define Paths
data_path = '../data/raw/PlantVillage'
processed_data_path = '../data/processed/'

In [3]:
# Create directories for processed data
os.makedirs(os.path.join(processed_data_path, 'train'), exist_ok=True)
os.makedirs(os.path.join(processed_data_path, 'val'), exist_ok=True)
os.makedirs(os.path.join(processed_data_path, 'test'), exist_ok=True)

In [4]:
# Define function to split data
def split_data(base_path, classes, train_size=0.7, val_size=0.15):
    for class_name in classes:
        class_path = os.path.join(base_path, class_name)
        image_files = os.listdir(class_path)
        
        # Split image files into train, val, test
        train_files, test_files = train_test_split(image_files, test_size=1-train_size)
        val_files, test_files = train_test_split(test_files, test_size=1-(val_size / (1-train_size)))
        
        # Move files to respective directories
        for file_set, split_type in zip([train_files, val_files, test_files], ['train', 'val', 'test']):
            dest_path = os.path.join(processed_data_path, split_type, class_name)
            os.makedirs(dest_path, exist_ok=True)
            for file in file_set:
                shutil.copy(os.path.join(class_path, file), dest_path)

In [5]:
# List classes
classes = [d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))]

In [6]:
classes

['Tomato_healthy',
 'Potato___Early_blight',
 'Tomato__Tomato_YellowLeaf__Curl_Virus',
 'Tomato_Early_blight',
 'Tomato__Target_Spot',
 'Potato___Late_blight',
 'Tomato_Leaf_Mold',
 'Tomato_Spider_mites_Two_spotted_spider_mite',
 'Tomato_Septoria_leaf_spot',
 'Tomato__Tomato_mosaic_virus',
 'Pepper__bell___Bacterial_spot',
 'Tomato_Bacterial_spot',
 'Tomato_Late_blight',
 'Pepper__bell___healthy',
 'Potato___healthy']

In [7]:
# Split data
split_data(data_path, classes)

In [12]:
# Set up data generators
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Data generators
train_generator = train_datagen.flow_from_directory(
    os.path.join(processed_data_path, 'train'),
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'
)

val_generator = val_datagen.flow_from_directory(
    os.path.join(processed_data_path, 'val'),
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'
)

test_generator = test_datagen.flow_from_directory(
    os.path.join(processed_data_path, 'test'),
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical'
)

Found 14437 images belonging to 15 classes.
Found 3089 images belonging to 15 classes.
Found 3112 images belonging to 15 classes.


In [10]:
# Save paths for data generators
data_paths = {
    'train': os.path.join(processed_data_path, 'train'),
    'val': os.path.join(processed_data_path, 'val'),
    'test': os.path.join(processed_data_path, 'test')
}

# Save to a JSON file
with open('data_paths.json', 'w') as f:
    json.dump(data_paths, f)