In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install rawpy

Collecting rawpy
  Downloading rawpy-0.25.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (6.4 kB)
Downloading rawpy-0.25.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rawpy
Successfully installed rawpy-0.25.1


In [3]:
import os
from PIL import Image, ImageFile
import numpy as np
import random
import rawpy

from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, array_to_img

# Handle truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

def process_dataset(input_path, output_path, img_size=(224, 224), augment=True, augment_count=5):
    """
    Processes a dataset by converting images to JPEG, resizing, compressing,
    and applying augmentation if specified.

    Parameters:
    - input_path: str, path to the raw dataset folder
    - output_path: str, path to save processed images
    - img_size: tuple, target image size (width, height)
    - augment: bool, whether to apply augmentation
    - augment_count: int, number of augmented images to create per original image
    """

    datagen = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    counter = 0  # Count processed images
    skipped_files = []

    for class_name in os.listdir(input_path):
        class_input_dir = os.path.join(input_path, class_name)
        class_output_dir = os.path.join(output_path, class_name)

        if not os.path.isdir(class_input_dir):
            continue

        os.makedirs(class_output_dir, exist_ok=True)

        for filename in os.listdir(class_input_dir):
            file_path = os.path.join(class_input_dir, filename)

            try:
                if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.dng')):
                    print(f"Skipping unsupported file format: {filename}")
                    skipped_files.append(filename)
                    continue

                # Load image
                if filename.lower().endswith('.dng'):
                    with rawpy.imread(file_path) as raw:
                        rgb = raw.postprocess()
                        img = Image.fromarray(rgb)
                else:
                    with Image.open(file_path) as img:
                        if img.mode != 'RGB':
                            img = img.convert('RGB')

                # Resize image
                img = img.resize(img_size, Image.Resampling.LANCZOS)

                # Save original image as JPEG
                save_path = os.path.join(class_output_dir, f"{counter}.jpeg")
                img.save(save_path, format='JPEG', quality=85)

                # Apply augmentation if specified
                if augment:
                    x = img_to_array(img)
                    x = x.reshape((1,) + x.shape)

                    aug_iter = datagen.flow(x, batch_size=1)

                    for i in range(augment_count):
                        batch = next(aug_iter)
                        augmented_img = array_to_img(batch[0])
                        aug_name = os.path.join(class_output_dir, f"{counter}_aug{i}.jpeg")
                        augmented_img.save(aug_name, format='JPEG', quality=85)

                counter += 1

            except Exception as e:
                print(f"Error processing {file_path}: {e}")
                skipped_files.append(filename)
                continue

    print("Dataset processing complete.")
    print(f"Total files processed: {counter}")
    if skipped_files:
        print(f"Files skipped or errored ({len(skipped_files)}): {skipped_files}")


In [4]:
input_folder = '/content/drive/MyDrive/sugercane-dataset/raw_dataset'
output_folder = '/content/drive/MyDrive/sugercane-dataset/dataset3'

process_dataset(input_folder, output_folder, img_size=(224, 224), augment=True, augment_count=5)



Error processing /content/drive/MyDrive/sugercane-dataset/raw_dataset/Normal leaf/IMG_20250908_165803.jpg: 'NoneType' object has no attribute 'seek'
Error processing /content/drive/MyDrive/sugercane-dataset/raw_dataset/Normal leaf/IMG_20250908_165802.jpg: 'NoneType' object has no attribute 'seek'
Error processing /content/drive/MyDrive/sugercane-dataset/raw_dataset/Normal leaf/IMG_20250908_165801_1.jpg: 'NoneType' object has no attribute 'seek'
Error processing /content/drive/MyDrive/sugercane-dataset/raw_dataset/Normal leaf/IMG_20250908_165801.jpg: 'NoneType' object has no attribute 'seek'
Error processing /content/drive/MyDrive/sugercane-dataset/raw_dataset/Normal leaf/IMG_20250908_165800.jpg: 'NoneType' object has no attribute 'seek'
Error processing /content/drive/MyDrive/sugercane-dataset/raw_dataset/Normal leaf/IMG_20250908_165759_1.jpg: 'NoneType' object has no attribute 'seek'
Error processing /content/drive/MyDrive/sugercane-dataset/raw_dataset/Normal leaf/IMG_20250908_165759.