# **Data Preparation**

# Objectives

* Clean and remove non images.
* Prepare the dataset for further processes.
* Find average image shape for original images.
* Resize the images from the datasets.
* Split the datasets into Train, Validation and Test sets.

# Outputs

Resized images from the datasets for further processing

# Additional Comments

* No Additional Comments

In [1]:
import os
import shutil
import random
import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
sns.set_style("white")

In [2]:
cwd= os.getcwd()

In [4]:
os.chdir(os.path.dirname(cwd))
print("You set a new current directory")

You set a new current directory


In [5]:
current_dir = os.getcwd()
current_dir

'/workspaces/PP5-Cherry-Leaves'

# Checks and Removes Corrupt Images and Image Types

In [8]:
import os
import tensorflow as tf

def remove_invalid_images(my_data_dir):
    image_extensions = ('.png', '.jpg', '.jpeg')
    folders = os.listdir(my_data_dir)

    for folder in folders:
        folder_path = os.path.join(my_data_dir, folder)
        
        if not os.path.isdir(folder_path):  
            continue  # Skip if not a folder

        i, j = 0, 0  # Counters

        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            if os.path.isfile(file_path):
                if not file_name.lower().endswith(image_extensions):  
                    os.remove(file_path)  # Remove non-image files
                    i += 1
                else:
                    try:
                        # Try opening with TensorFlow to verify it's an actual image
                        img = tf.io.read_file(file_path)
                        img = tf.io.decode_image(img, channels=3)
                        j += 1  # Valid image
                    except Exception as e:
                        print(f"Removing corrupted image: {file_path} - Error: {str(e)}")
                        os.remove(file_path)
                        i += 1

        print(f"Folder: {folder} - Valid Images: {j}, Removed: {i}")

# Example usage:
data_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves'
remove_invalid_images(data_dir)

Folder: powdery_mildew - Valid Images: 2104, Removed: 0
Folder: healthy - Valid Images: 2104, Removed: 0


In [18]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create an ImageDataGenerator instance
datagen = ImageDataGenerator(rescale=1./255)  # Normalization (optional)

# Load images from a directory and resize them to 100x100
data_generator = datagen.flow_from_directory(
    '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves',  # Replace with your image directory
    target_size=(100, 100),  # Resize to 100x100
    batch_size=10,
    class_mode='binary'  # Change based on your task (e.g., 'binary' for 2 classes)
)

# Fetch a batch of images and labels
images, labels = next(data_generator)

# Check the shape
print(images.shape)  # Should be (batch_size, 100, 100, 3) for RGB images


Found 4208 images belonging to 2 classes.
(10, 100, 100, 3)


In [19]:
import os
from tensorflow.keras.preprocessing.image import array_to_img

# Define save_dir
save_dir = '/workspaces/PP5-Cherry-Leaves/inputs/leaves_dataset/cherry-leaves/resized-images'

# Create the save directory if it doesn't exist
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

processed_images = 0
total_images = data_generator.samples

for batch_index, (images, labels) in enumerate(data_generator):
    for i, img in enumerate(images):
        # Process and save the image
        pil_img = array_to_img(img)
        img_filename = f"resized_image_{batch_index * len(images) + i}.png"
        pil_img.save(os.path.join(save_dir, img_filename))
        processed_images += 1
    print(f"Processed {processed_images}/{total_images} images")

    # Stop when all images are processed
    if processed_images >= total_images:
        print("All images processed. Exiting loop.")
        break

Processed 10/4208 images
Processed 20/4208 images
Processed 30/4208 images
Processed 40/4208 images
Processed 50/4208 images
Processed 60/4208 images
Processed 70/4208 images
Processed 80/4208 images
Processed 90/4208 images
Processed 100/4208 images
Processed 110/4208 images
Processed 120/4208 images
Processed 130/4208 images
Processed 140/4208 images
Processed 150/4208 images
Processed 160/4208 images
Processed 170/4208 images
Processed 180/4208 images
Processed 190/4208 images
Processed 200/4208 images
Processed 210/4208 images
Processed 220/4208 images
Processed 230/4208 images
Processed 240/4208 images
Processed 250/4208 images
Processed 260/4208 images
Processed 270/4208 images
Processed 280/4208 images
Processed 290/4208 images
Processed 300/4208 images
Processed 310/4208 images
Processed 320/4208 images
Processed 330/4208 images
Processed 340/4208 images
Processed 350/4208 images
Processed 360/4208 images
Processed 370/4208 images
Processed 380/4208 images
Processed 390/4208 im