# **Data Preparation**

# Objectives

* Clean and remove non images.
* Prepare the dataset for further processes.
* Find average image shape for original images.
* Resize the images from the datasets.
* Split the datasets into Train, Validation and Test sets.

# Outputs

* inputs/leaves_dataset/cherry-leaves/resized_images
* inputs/leaves_dataset/cherry-leaves/processed_images
* static/validation_images

# Additional Comments

* No Additional Comments

In [2]:
import os
import shutil
import random
import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
from matplotlib.image import imread
from tensorflow import keras
from keras._tf_keras.keras.preprocessing.image import ImageDataGenerator
from keras._tf_keras.keras.preprocessing.image import array_to_img

2025-05-06 07:48:09.032711: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-06 07:48:10.373587: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-06 07:48:13.493465: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [None]:
# `os` is also imported in the notebook's main imports cell; this repeat is harmless.
import os
# Capture the kernel's working directory before any chdir happens.
current_dir = os.getcwd()
# Bare last expression so the notebook displays the path as the cell output.
current_dir

'/workspaces/PP5-Cherry-Leaves/jupyter_notebooks'

In [4]:
# Move the kernel from the notebook folder up to its parent so that the
# relative paths used later in this notebook (e.g. "inputs/...", "static/...")
# resolve from the project root.
#
# The move is guarded: an unconditional chdir to the parent directory is not
# idempotent, so re-running the cell would climb one level higher each time.
NOTEBOOK_DIR_NAME = "jupyter_notebooks"  # folder this notebook is run from — adjust if it is moved

if os.path.basename(os.getcwd()) == NOTEBOOK_DIR_NAME:
    os.chdir(os.path.dirname(os.getcwd()))
    print("You set a new current directory")
else:
    print(f"Current directory left unchanged: {os.getcwd()}")

You set a new current directory


In [5]:
# Re-read the working directory after the chdir so `current_dir` reflects the new location.
current_dir = os.getcwd()
# Bare last expression so the notebook displays the path as the cell output.
current_dir

'/workspaces/PP5-Cherry-Leaves'

# Image Removal

Checks for corrupt images and for files that are not '.png', '.jpg' or '.jpeg' images, and removes them. It also removes images whose file name does not contain the resized suffix (i.e. the original-sized images), which prevents duplicates and conflicts between different versions of the same image (original vs. resized) before processing.

In [14]:
def remove_invalid_and_original_images(my_data_dir, resized_suffix="resized"):
    """Delete every file that is not a readable, resized image.

    For each sub-folder of ``my_data_dir`` a file is deleted when it
    * does not end in .png/.jpg/.jpeg (case-insensitive), or
    * does not contain ``resized_suffix`` in its name, or
    * cannot be read and decoded by TensorFlow.

    A one-line summary per sub-folder is printed. Entries of ``my_data_dir``
    that are not directories, and non-file entries inside a sub-folder, are
    left untouched. Returns ``None``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            continue  # only class sub-folders are scanned

        kept, removed = 0, 0

        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(file_path):
                continue

            has_image_extension = file_name.lower().endswith(image_extensions)
            if not has_image_extension or resized_suffix not in file_name:
                # Either not an image at all, or an original-sized image.
                os.remove(file_path)
                removed += 1
                continue

            try:
                # Decoding is the actual validity check; the decoded tensor is discarded.
                raw_bytes = tf.io.read_file(file_path)
                tf.io.decode_image(raw_bytes, channels=3)
            except Exception as e:
                print(f"Removing corrupted image: {file_path} - Error: {str(e)}")
                os.remove(file_path)
                removed += 1
            else:
                kept += 1

        print(f"Folder: {folder} - Valid Resized Images: {kept}, Removed: {removed}")

Adds a check that every processed image has the desired size (100x100).

In [15]:
processed_images_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/processed_images'

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
EXPECTED_SIZE = (100, 100)  # (width, height)


def _collect_image_sizes(root_dir):
    """Return ``[(file_name, (width, height)), ...]`` for images one level below ``root_dir``.

    Only entries of ``root_dir`` that are directories are scanned, and only
    files with a .png/.jpg/.jpeg extension (case-insensitive) are opened.
    """
    sizes = []
    for class_folder in os.listdir(root_dir):
        class_folder_path = os.path.join(root_dir, class_folder)
        if not os.path.isdir(class_folder_path):
            continue
        for image_name in os.listdir(class_folder_path):
            if not image_name.lower().endswith(IMAGE_EXTENSIONS):
                continue
            # Context manager closes the file handle; a bare Image.open() keeps
            # it open, which adds up over thousands of images.
            with Image.open(os.path.join(class_folder_path, image_name)) as img:
                sizes.append((image_name, img.size))
    return sizes


# Walk the folder tree once and reuse the result for both reports below.
image_sizes = _collect_image_sizes(processed_images_dir)

# Report the size of every image.
for image_name, img_size in image_sizes:
    print(f"{image_name}: {img_size}")

# Report only the images that do not have the expected size.
for image_name, img_size in image_sizes:
    if img_size != EXPECTED_SIZE:
        print(f"{image_name} is not 100x100, it's {img_size}")


processed_image_2715.png: (100, 100)
processed_image_3028.png: (100, 100)
processed_image_3029.png: (100, 100)
processed_image_2692.png: (100, 100)
processed_image_4084.png: (100, 100)
processed_image_2746.png: (100, 100)
processed_image_2733.png: (100, 100)
processed_image_2359.png: (100, 100)
processed_image_3976.png: (100, 100)
processed_image_2918.png: (100, 100)
processed_image_2811.png: (100, 100)
processed_image_3223.png: (100, 100)
processed_image_2932.png: (100, 100)
processed_image_3483.png: (100, 100)
processed_image_3959.png: (100, 100)
processed_image_3871.png: (100, 100)
processed_image_3997.png: (100, 100)
processed_image_2767.png: (100, 100)
processed_image_2193.png: (100, 100)
processed_image_2646.png: (100, 100)
processed_image_2229.png: (100, 100)
processed_image_3031.png: (100, 100)
processed_image_3121.png: (100, 100)
processed_image_3806.png: (100, 100)
processed_image_3932.png: (100, 100)
processed_image_3153.png: (100, 100)
processed_image_3394.png: (100, 100)
p

In [16]:
# Location of the intermediate resized-image folder.
# NOTE(review): cells further down this notebook read from this same folder;
# confirm this cell is meant to run only after they have finished.
resized_images_path = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/resized_images'

# isdir() is already False for a path that does not exist, so it covers the
# existence check as well.
if not os.path.isdir(resized_images_path):
    print(f"The folder {resized_images_path} does not exist.")
else:
    shutil.rmtree(resized_images_path)  # deletes the folder and everything inside it
    print(f"Removed the folder: {resized_images_path}")


Removed the folder: /workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/resized_images


Image resizing using ImageDataGenerator. The cell prints the shape of one batch (batch size, image height and width, and number of colour channels) to confirm that the images are loaded as 100x100 RGB.

In [10]:
# Generator that multiplies every pixel value by 1/255.
datagen = ImageDataGenerator(rescale=1.0 / 255)

# Stream images from the class sub-folders, resizing each one to 100x100.
data_generator = datagen.flow_from_directory(
    directory='/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/resized_images',
    target_size=(100, 100),
    batch_size=10,
    class_mode='binary',  # one scalar label per image (two classes)
)

# Pull a single batch to inspect it.
sample_batch = next(data_generator)
images, labels = sample_batch

# Expected: (batch_size, 100, 100, 3) for RGB images.
print(images.shape)


Found 4208 images belonging to 2 classes.


(10, 100, 100, 3)


Saving images to 'processed_images' folder.

In [11]:
# Read the resized images, resize them to 100x100 and write them as PNG files
# into one sub-folder per class under `save_dir`.
source_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/resized_images'  # Use resized images as source
save_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/processed_images'  # Save final processed images

# Generator that multiplies every pixel value by 1/255.
datagen = ImageDataGenerator(rescale=1./255)

# Create the save directory if it doesn't exist.
os.makedirs(save_dir, exist_ok=True)

# shuffle=False keeps a stable image order, so the running index used in the
# file names below is the same on every run.
data_generator = datagen.flow_from_directory(
    source_dir,
    target_size=(100, 100),
    batch_size=32,
    class_mode='binary',
    shuffle=False
)

# class_indices maps class name -> integer label. Invert it once here instead
# of rebuilding a list of class names for every single image.
index_to_class = {index: name for name, index in data_generator.class_indices.items()}

# Ensure a sub-directory exists for each class.
for class_name in index_to_class.values():
    os.makedirs(os.path.join(save_dir, class_name), exist_ok=True)

# Iterate over the batches of images and save them.
batch_index = 0
for images, labels in data_generator:
    first_image_number = batch_index * data_generator.batch_size

    for i, (img, label) in enumerate(zip(images, labels)):
        # NOTE(review): array_to_img rescales each array to 0-255 by default,
        # which may stretch the contrast of every image individually — confirm
        # that this is intended for the saved dataset.
        pil_img = array_to_img(img)

        class_name = index_to_class[int(label)]
        img_filename = f"processed_image_{first_image_number + i}.png"
        pil_img.save(os.path.join(save_dir, class_name, img_filename))

    batch_index += 1

    # The loop is ended explicitly once every sample has been written.
    if batch_index * data_generator.batch_size >= data_generator.samples:
        break

print(f"Processed images saved to {save_dir}")

Found 4208 images belonging to 2 classes.


Processed images saved to /workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/processed_images


# Split the data into train, validation and test sets

Splitting the 'processed_images' data into train, validation and test subsets at a ratio of 0.7, 0.15 and 0.15, which sums to 1.0.

In [12]:
def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio, seed=None):
    """Move the images of each class folder into train/validation/test folders.

    ``my_data_dir`` is expected to contain one sub-folder per class. After the
    call it contains ``train/<class>``, ``validation/<class>`` and
    ``test/<class>`` instead, and the original class folders are removed.

    Args:
        my_data_dir: Directory holding the class sub-folders.
        train_set_ratio: Fraction of each class moved to ``train``.
        validation_set_ratio: Fraction of each class moved to ``validation``.
        test_set_ratio: Fraction for ``test``; in practice ``test`` receives
            whatever is left after the other two (rounded down) are taken.
        seed: Optional seed for a reproducible shuffle. ``None`` (default)
            uses the global ``random`` state, as before.

    Returns:
        None. If the ratios do not sum to 1.0 a message is printed and nothing
        is moved. If a ``test`` entry already exists the data is treated as
        already split and nothing is moved.
    """
    import math  # local import: `math` is not part of the notebook's imports cell

    # Exact float comparison rejects valid inputs such as 0.6 + 0.3 + 0.1,
    # which evaluates to 0.9999999999999999.
    if not math.isclose(train_set_ratio + validation_set_ratio + test_set_ratio, 1.0):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    split_names = ('train', 'validation', 'test')

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        return  # already split

    # Only directories are class labels; stray files (and leftover split
    # folders) must not be treated as classes.
    labels = [
        entry for entry in entries
        if entry not in split_names and os.path.isdir(os.path.join(my_data_dir, entry))
    ]

    # Create train, validation, and test folders with class labels as subfolders.
    for folder in split_names:
        for label in labels:
            os.makedirs(name=os.path.join(my_data_dir, folder, label), exist_ok=True)

    shuffler = random.Random(seed) if seed is not None else random

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)

        # Sort first so that a given seed always yields the same split,
        # independent of the file system's listing order.
        files = sorted(os.listdir(label_dir))
        shuffler.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        assignments = (
            ('train', files[:train_end]),
            ('validation', files[train_end:validation_end]),
            ('test', files[validation_end:]),
        )
        for folder, file_names in assignments:
            for file_name in file_names:
                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(my_data_dir, folder, label, file_name))

        # Remove the now-empty class directory.
        os.rmdir(label_dir)

# Split the images under `save_dir` into train, validation and test folders (70/15/15).
split_train_validation_test_images(
    my_data_dir=save_dir,
    train_set_ratio=0.7,
    validation_set_ratio=0.15,
    test_set_ratio=0.15,
)

print("Data split into train, validation, and test sets.")

Data split into train, validation, and test sets.


Pandas DataFrames listing each image file and its label are created and saved as comma-separated values (CSV) files, which store tabular data as plain text and can be loaded back into DataFrames to enable data plotting.

In [13]:
# Code from linx02 - genderpredictor project to create and save dataframes.

def create_dataframe(data_dir):
    """Build a DataFrame with one row per file found in the class folders of ``data_dir``.

    Args:
        data_dir: Directory whose sub-folders are named after the class labels.

    Returns:
        pandas.DataFrame with two columns: ``file`` (path made by joining
        ``data_dir``, the label and the file name) and ``label`` (the
        sub-folder name). Rows are ordered by label, then by file name.
    """
    data = {'file': [], 'label': []}
    # Sorted so the row order does not depend on the file system's listing order.
    for label in sorted(os.listdir(data_dir)):
        label_dir = os.path.join(data_dir, label)
        if not os.path.isdir(label_dir):
            # A stray file next to the class folders is not a label; listing
            # it as a directory would raise NotADirectoryError.
            continue
        for file in sorted(os.listdir(label_dir)):
            data['file'].append(os.path.join(label_dir, file))
            data['label'].append(label)
    return pd.DataFrame(data)

data_dir = "/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/processed_images"

# One DataFrame per split folder, built in the order train, validation, test.
train_df, validation_df, test_df = [
    create_dataframe(os.path.join(data_dir, split_name))
    for split_name in ('train', 'validation', 'test')
]

In [14]:
# Write each split's DataFrame to <output_dir>/<split>_dataframe.csv.
output_dir = "/workspaces/PP5-Cherry-Leaves/outputs/"

# to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

dataframes = {'train': train_df, 'validation': validation_df, 'test': test_df}
for dataset, df in dataframes.items():
    csv_path = os.path.join(output_dir, f"{dataset}_dataframe.csv")
    df.to_csv(csv_path, index=False)  # index=False: the row index is not written as a column
    print(f"{dataset.capitalize()} DataFrame saved as {csv_path}")

Train DataFrame saved as /workspaces/PP5-Cherry-Leaves/outputs/train_dataframe.csv
Validation DataFrame saved as /workspaces/PP5-Cherry-Leaves/outputs/validation_dataframe.csv
Test DataFrame saved as /workspaces/PP5-Cherry-Leaves/outputs/test_dataframe.csv


In [6]:
def move_images(src_dir, dest_dir, max_images_per_class=5):
    """Move up to ``max_images_per_class`` images per class folder to ``dest_dir``.

    Args:
        src_dir: Directory containing one sub-folder per class.
        dest_dir: Destination root; a sub-folder with the same class name is
            created under it when missing.
        max_images_per_class: Maximum number of images moved from each class.

    Files are recognised as images by a .png/.jpg/.jpeg extension, matched
    case-insensitively. Within a class the images are taken in sorted
    file-name order, so the same files are picked on every run. One summary
    line is printed per class. Returns ``None``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    # List all class folders.
    class_folders = [f for f in os.listdir(src_dir) if os.path.isdir(os.path.join(src_dir, f))]

    for class_folder in class_folders:
        src_class_folder = os.path.join(src_dir, class_folder)
        dest_class_folder = os.path.join(dest_dir, class_folder)

        # Create the destination folder if it doesn't exist.
        os.makedirs(dest_class_folder, exist_ok=True)

        # lower() so that e.g. "leaf.JPG" is treated as an image too.
        image_files = sorted(
            f for f in os.listdir(src_class_folder)
            if f.lower().endswith(image_extensions)
        )

        # Move a limited number of images.
        image_files = image_files[:max_images_per_class]
        for image_file in image_files:
            shutil.move(os.path.join(src_class_folder, image_file),
                        os.path.join(dest_class_folder, image_file))
        print(f"Moved {len(image_files)} images from {src_class_folder} to {dest_class_folder}")

# Move four images per class out of the validation split into the static folder.
# NOTE(review): these files are moved, not copied, so any file list built from
# the validation folder before this cell runs will point at paths that no
# longer exist — confirm the intended cell order.
src_dir = "inputs/leaves_dataset/cherry-leaves/processed_images/validation"
dest_dir = "static/validation_images"
move_images(src_dir=src_dir, dest_dir=dest_dir, max_images_per_class=4)

Moved 4 images from inputs/leaves_dataset/cherry-leaves/processed_images/validation/powdery_mildew to static/validation_images/powdery_mildew
Moved 4 images from inputs/leaves_dataset/cherry-leaves/processed_images/validation/healthy to static/validation_images/healthy
