# Data Preparation for Mildew Detection in Cherry Leaves

## Introduction

This notebook prepares the dataset for a mildew detection model in cherry leaves. Data cleaning and splitting into train, validation, and test sets are performed.

In [11]:
# Import necessary libraries for file manipulation and data splitting.

import os
import shutil
import random

In [12]:
# Define the path to the raw data directory

raw_data_dir = "/workspaces/mildew-detection-in-cherry-leaves/data/cherry-leaves"

In [13]:
# Function to remove non-image files from the specified directory. This ensures only image data is used for model training

def remove_non_image_files(directory):
    """
    Removes non-image files from the specified directory.

    Args:
        directory: The directory to clean.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    for class_name in os.listdir(directory):
        class_path = os.path.join(directory, class_name)
        if not os.path.isdir(class_path):
            continue
        files = os.listdir(class_path)
        removed = 0
        kept = 0
        for file in files:
            file_path = os.path.join(class_path, file)
            if os.path.isfile(file_path) and not file.lower().endswith(image_extensions):
                os.remove(file_path)
                removed += 1
            else:
                kept += 1
        print(f"Class: {class_name} - Image files: {kept}, Non-image files: {removed}")

In [14]:
# Execute the function to remove non-image files from the raw data directory

raw_data_dir = "/workspaces/mildew-detection-in-cherry-leaves/data/cherry-leaves"
remove_non_image_files(raw_data_dir)

Class: train - Image files: 2, Non-image files: 0
Class: test - Image files: 2, Non-image files: 0
Class: validation - Image files: 2, Non-image files: 0


In [None]:
# Function to split the data into training, validation, and test sets based on specified ratios

def split_data_sets(directory, train_ratio, validation_ratio, test_ratio):
    """
    Splits data into train, validation, and test sets within the specified directory.
    """
    if train_ratio + validation_ratio + test_ratio != 1.0:
        print("Sum of ratios must be 1.0")
        return

    # Get class names (directories that are not 'train', 'validation' or 'test')
    classes = [c for c in os.listdir(directory) 
               if os.path.isdir(os.path.join(directory, c)) 
               and c not in ['train', 'validation', 'test']]

    train_counts = {class_name: 0 for class_name in classes}
    validation_counts = {class_name: 0 for class_name in classes}
    test_counts = {class_name: 0 for class_name in classes}

    for data_set in ['train', 'validation', 'test']:
        for class_name in classes:
            os.makedirs(os.path.join(directory, data_set, class_name), exist_ok=True)

    for class_name in classes:
        class_path = os.path.join(directory, class_name)
        if not os.path.isdir(class_path):
            continue
        files = os.listdir(class_path)
        random.shuffle(files)

        train_size = int(len(files) * train_ratio)
        validation_size = int(len(files) * validation_ratio)

        for i, file in enumerate(files):
            source_path = os.path.join(class_path, file)
            if i < train_size:
                destination_path = os.path.join(directory, 'train', class_name, file)
                train_counts[class_name] += 1
            elif i < train_size + validation_size:
                destination_path = os.path.join(directory, 'validation', class_name, file)
                validation_counts[class_name] += 1
            else:
                destination_path = os.path.join(directory, 'test', class_name, file)
                test_counts[class_name] += 1
            shutil.move(source_path, destination_path)

    # Remove original class directories
    for class_name in classes:
        class_path = os.path.join(directory, class_name)
        if os.path.isdir(class_path):
            shutil.rmtree(class_path)

    print("Data distribution:")
    for class_name in classes:
        print(f"Class: {class_name}")
        print(f"  Train: {train_counts[class_name]}")
        print(f"  Validation: {validation_counts[class_name]}")
        print(f"  Test: {test_counts[class_name]}")

In [16]:
# Execute the function to split the data into training, validation, and test sets with the specified ratios

raw_data_dir = "/workspaces/mildew-detection-in-cherry-leaves/data/cherry-leaves"
split_data_sets(raw_data_dir, 0.7, 0.15, 0.15)

Data distribution:


# Conclusions:

* Data has been cleaned and split into training, validation, and test sets.
* Prepared data is ready for further processing.

## Next Steps:

* Data Visualization: Verify data quality and distribution.
* Exploratory Data Analysis (EDA): Explore image characteristics.
* Modeling: Select and train a model.
* Evaluation: Verify model performance.