# **Data Preparation**

# Objectives

* Clean and remove non images.
* Prepare the dataset for further processes.
* Find average image shape for original images.
* Resize the images from the datasets.
* Split the datasets into Train, Validation and Test sets.

# Outputs

Resized images from the datasets for further processing

# Additional Comments

* No Additional Comments

In [1]:
import os
import shutil
import random
import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
sns.set_style("white")

In [2]:
cwd= os.getcwd()

In [4]:
os.chdir(os.path.dirname(cwd))
print("You set a new current directory")

You set a new current directory


In [5]:
current_dir = os.getcwd()
current_dir

'/workspaces/PP5-Cherry-Leaves'

# Checks and Removes Corrupt Images and Image Types

In [1]:
import os
import tensorflow as tf

def remove_invalid_images(my_data_dir):
    image_extensions = ('.png', '.jpg', '.jpeg')
    folders = os.listdir(my_data_dir)

    for folder in folders:
        folder_path = os.path.join(my_data_dir, folder)
        
        if not os.path.isdir(folder_path):  
            continue  # Skip if not a folder

        i, j = 0, 0  # Counters

        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            if os.path.isfile(file_path):
                if not file_name.lower().endswith(image_extensions):  
                    os.remove(file_path)  # Remove non-image files
                    i += 1
                else:
                    try:
                        # Try opening with TensorFlow to verify it's an actual image
                        img = tf.io.read_file(file_path)
                        img = tf.io.decode_image(img, channels=3)
                        j += 1  # Valid image
                    except Exception as e:
                        print(f"Removing corrupted image: {file_path} - Error: {str(e)}")
                        os.remove(file_path)
                        i += 1

        print(f"Folder: {folder} - Valid Images: {j}, Removed: {i}")

# Example usage:
data_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves'
remove_invalid_images(data_dir)

2025-03-27 13:04:05.883814: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-27 13:04:07.505536: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Folder: powdery_mildew - Valid Images: 2104, Removed: 0
Folder: healthy - Valid Images: 2104, Removed: 0


In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create an ImageDataGenerator instance
datagen = ImageDataGenerator(rescale=1./255)  # Normalization (optional)

# Load images from a directory and resize them to 100x100
data_generator = datagen.flow_from_directory(
    '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves',  # Replace with your image directory
    target_size=(100, 100),  # Resize to 100x100
    batch_size=10,
    class_mode='binary'  # Change based on your task (e.g., 'binary' for 2 classes)
)

# Fetch a batch of images and labels
images, labels = next(data_generator)

# Check the shape
print(images.shape)  # Should be (batch_size, 100, 100, 3) for RGB images


Found 4208 images belonging to 2 classes.
(10, 100, 100, 3)


In [4]:
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array
import numpy as np

# Define source and destination directories
source_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves'  # Path to original images
save_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/resized_images'  # Path where resized images will be saved

# Create ImageDataGenerator for resizing
datagen = ImageDataGenerator(rescale=1./255)  # Normalize the images, optional

# Create the save directory if it doesn't exist
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Set up the generator with target size and batch size
data_generator = datagen.flow_from_directory(
    source_dir,
    target_size=(100, 100),  # Resize to 100x100
    batch_size=32,
    class_mode='binary',  # Change to 'binary' for binary classification
    shuffle=False  # So that we can process the images in order
)

# Subdirectories for each class
for class_name in data_generator.class_indices:
    class_dir = os.path.join(save_dir, class_name)
    if not os.path.exists(class_dir):
        os.makedirs(class_dir)


# Iterate over the batches of images and save them
batch_index = 0
for batch in data_generator:
    images, labels = batch
    for i, img in enumerate(images):
        # Convert the image array to a PIL image
        pil_img = array_to_img(img)
        
        # Determine the class name based on the label
        label = labels[i] 
        class_name = list(data_generator.class_indices.keys())[int(label)]
        
        # Create a filename
        img_filename = f"resized_image_{batch_index * data_generator.batch_size + i}.png"
        
        # Save the image in the appropriate class subdirectory
        pil_img.save(os.path.join(save_dir, class_name, img_filename))
    
    batch_index += 1

    # If you want to stop after a certain number of batches (for testing, etc.)
    if batch_index >= data_generator.samples // data_generator.batch_size:
        break

print(f"Resized images saved to {save_dir}")

Found 4208 images belonging to 3 classes.


Resized images saved to /workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/resized_images


# split the data into train, validation and test sets

In [6]:
import random

def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    if train_set_ratio + validation_set_ratio + test_set_ratio != 1.0:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # Get class labels
    labels = os.listdir(my_data_dir)
    if 'test' in labels:
        pass
    else:
        # Create train, validation, and test folders with class labels as subfolders
        for folder in ['train', 'validation', 'test']:
            for label in labels:
                os.makedirs(name=os.path.join(my_data_dir, folder, label), exist_ok=True)

        for label in labels:
            files = os.listdir(os.path.join(my_data_dir, label))
            random.shuffle(files)

            train_set_files_qty = int(len(files) * train_set_ratio)
            validation_set_files_qty = int(len(files) * validation_set_ratio)

            count = 1
            for file_name in files:
                if count <= train_set_files_qty:
                    # Move a file to the train set
                    shutil.move(os.path.join(my_data_dir, label, file_name),
                                os.path.join(my_data_dir, 'train', label, file_name))

                elif count <= (train_set_files_qty + validation_set_files_qty):
                    # Move a file to the validation set
                    shutil.move(os.path.join(my_data_dir, label, file_name),
                                os.path.join(my_data_dir, 'validation', label, file_name))

                else:
                    # Move a file to the test set
                    shutil.move(os.path.join(my_data_dir, label, file_name),
                                os.path.join(my_data_dir, 'test', label, file_name))

                count += 1

            # Remove empty class directories after splitting
            os.rmdir(os.path.join(my_data_dir, label))

# Split the resized images into train, validation, and test sets
split_train_validation_test_images(save_dir, train_set_ratio=0.7, validation_set_ratio=0.15, test_set_ratio=0.15)

print(f"Data split into train, validation, and test sets.")

Data split into train, validation, and test sets.
