In [1]:
import os
import shutil
import random

def copy_limited_images(base_dir, target_dir, selected_classes, max_images=500, seed=None):
    """
    Copy a random sample of at most `max_images` images per class folder.

    Every sub-directory of `base_dir` whose name starts with one of the
    prefixes in `selected_classes` is mirrored into `target_dir`, and a random
    subset of its image files (.jpg / .jpeg / .png, matched case-insensitively)
    is copied over.

    Args:
    - base_dir (str): Path to the base directory containing all the class folders.
    - target_dir (str): Path to the target directory where selected folders will be copied.
    - selected_classes (list): List of class prefixes to copy (e.g., ["Apple", "Banana", "Orange"]).
    - max_images (int): Maximum number of images to copy per class. Negative values are treated as 0.
    - seed (int | None): Optional seed for the random selection. With a seed, the
      same source tree always yields the same subset; with None the module-level
      `random` state is used, as before.
    """
    os.makedirs(target_dir, exist_ok=True)  # Ensure the target directory exists

    image_suffixes = ('.jpg', '.jpeg', '.png')
    class_prefixes = tuple(selected_classes)  # str.startswith accepts a tuple of prefixes
    rng = random.Random(seed) if seed is not None else random
    # A negative slice bound would mean "all but the last N", which is never intended here.
    limit = max(0, max_images)

    # Sorted so that a seeded run does not depend on the OS directory listing order
    for folder_name in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, folder_name)

        # Only real directories are class folders; a stray file such as
        # "Apple_notes.txt" would otherwise crash os.listdir() below.
        if not folder_name.startswith(class_prefixes) or not os.path.isdir(folder_path):
            continue

        target_folder_path = os.path.join(target_dir, folder_name)
        os.makedirs(target_folder_path, exist_ok=True)
        print(f"Copying up to {max_images} images from {folder_name}...")

        # Collect the regular files that look like images
        image_files = sorted(
            f for f in os.listdir(folder_path)
            if f.lower().endswith(image_suffixes)
            and os.path.isfile(os.path.join(folder_path, f))
        )

        # Shuffle and keep the first `limit` entries
        rng.shuffle(image_files)
        for file_name in image_files[:limit]:
            shutil.copy(
                os.path.join(folder_path, file_name),
                os.path.join(target_folder_path, file_name),
            )

    print("Copying completed.")

# Where the full dataset lives and where the reduced copy is written
base_dir = r'C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\data'  # Source directory
target_dir = r'C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\advanced_version_for_accuracy_faster_RCNN\data'  # Target directory

# Folder-name prefixes to keep, and the cap on images copied from each matching folder
selected_classes = ["Apple", "Banana", "Orange"]
max_images_per_class = 500

# Build the reduced dataset
copy_limited_images(
    base_dir=base_dir,
    target_dir=target_dir,
    selected_classes=selected_classes,
    max_images=max_images_per_class,
)


Copying up to 500 images from Apple_Bad...
Copying up to 500 images from Apple_Good...
Copying up to 500 images from Banana_Bad...
Copying up to 500 images from Banana_Good...
Copying up to 500 images from Orange_Bad...
Copying up to 500 images from Orange_Good...
Copying completed.


Now use YOLOv5 to preprocess the raw data: generate the bounding boxes that are needed as input for the CNN.

In [1]:
import os
import shutil
from PIL import Image
import torch

# Paths
base_image_dir = r'C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\advanced_version_for_accuracy_faster_RCNN\data'  # Folder with class images
output_label_dir = r'C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\advanced_version_for_accuracy_faster_RCNN\labeled_data'  # Folder to save annotations
os.makedirs(output_label_dir, exist_ok=True)

# Folder name -> integer class ID written into every label line of that folder
class_mapping = {
    "Apple_Bad": 0, "Apple_Good": 1,
    "Banana_Bad": 2, "Banana_Good": 3,
    "Orange_Bad": 4, "Orange_Good": 5
}

# Same suffixes, matched case-insensitively, that the copy step accepts, so
# every image that was copied (including *.jpeg and *.JPG) gets a label file.
image_extensions = ('.jpg', '.jpeg', '.png')

# Load YOLOv5 model
print("Loading YOLOv5 model...")
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Loop through each subfolder (each class)
for class_folder in os.listdir(base_image_dir):
    class_folder_path = os.path.join(base_image_dir, class_folder)

    if not os.path.isdir(class_folder_path):  # Only directories are class folders
        continue

    # Retrieve the class ID from class_mapping
    class_id = class_mapping.get(class_folder)
    if class_id is None:
        print(f"Warning: No class ID found for folder '{class_folder}', skipping...")
        continue

    # Make a corresponding folder in the output labels directory
    class_output_dir = os.path.join(output_label_dir, class_folder)
    os.makedirs(class_output_dir, exist_ok=True)

    # Loop through each image in the class folder
    for image_file in os.listdir(class_folder_path):
        if not image_file.lower().endswith(image_extensions):
            continue

        img_path = os.path.join(class_folder_path, image_file)

        # Run YOLOv5 inference; the context manager releases the file handle
        # as soon as the model has consumed the image.
        with Image.open(img_path) as img:
            results = model(img)

        label_filename = os.path.splitext(image_file)[0] + ".txt"
        label_path = os.path.join(class_output_dir, label_filename)

        # results.xywhn holds (x_center, y_center, width, height, conf, cls) rows
        # already normalised to [0, 1] by the size of the image as the model saw
        # it. The model wrapper applies the EXIF orientation before inference, so
        # dividing by PIL's raw img.size can swap width and height for rotated
        # camera photos.
        with open(label_path, "w") as f:
            for x_center, y_center, box_width, box_height, _conf, _cls in results.xywhn[0].tolist():
                # The detector's own class is ignored: the folder decides the class ID.
                f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}\n")

print("Preprocessing completed. Annotations saved.")


Loading YOLOv5 model...


Using cache found in C:\Users\cheng/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-11-15 Python-3.9.10 torch-2.5.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp

Preprocessing completed. Annotations saved.


  with amp.autocast(autocast):


Saving annotated images to a folder that can be used for fine-tuning later


Deleting all images without annotations (something went wrong)

In [None]:
import os
from PIL import Image
import torch

# Directories
image_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\data"  # Directory with images
label_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\labeled_data"  # Directory with labels
os.makedirs(label_dir, exist_ok=True)

# Folder name -> class ID. Labels must carry the dataset's own class IDs (0-5),
# not the detector's COCO class index, otherwise files written here disagree
# with the label files produced by the initial annotation pass.
class_mapping = {
    "Apple_Bad": 0, "Apple_Good": 1,
    "Banana_Bad": 2, "Banana_Good": 3,
    "Orange_Bad": 4, "Orange_Good": 5
}

# Load YOLOv5 pre-trained model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# Find unannotated images: list of (image_path, label_path, class_id)
unannotated_images = []

for class_folder in os.listdir(image_dir):
    class_image_dir = os.path.join(image_dir, class_folder)
    class_label_dir = os.path.join(label_dir, class_folder)

    if not os.path.isdir(class_image_dir):
        continue

    class_id = class_mapping.get(class_folder)
    if class_id is None:
        print(f"Warning: No class ID found for folder '{class_folder}', skipping...")
        continue

    os.makedirs(class_label_dir, exist_ok=True)  # Ensure corresponding label folder exists

    for image_file in os.listdir(class_image_dir):
        if image_file.lower().endswith(('.jpg', '.png', '.jpeg')):
            # An image counts as unannotated when its .txt label file is missing
            label_file = os.path.splitext(image_file)[0] + ".txt"
            label_path = os.path.join(class_label_dir, label_file)
            if not os.path.exists(label_path):
                unannotated_images.append((os.path.join(class_image_dir, image_file), label_path, class_id))

print(f"Found {len(unannotated_images)} unannotated images.")

# Annotate unannotated images using YOLOv5
for image_path, label_path, class_id in unannotated_images:
    # Run YOLOv5 inference directly on the file path
    results = model(image_path)

    # xywhn rows are (x_center, y_center, width, height, conf, cls), already
    # normalised by the size of the image as the model processed it.
    # .tolist() also works when the tensor lives on a GPU, unlike .numpy().
    predictions = results.xywhn[0].tolist()

    # Save predictions in YOLO format
    with open(label_path, "w") as f:
        for x_center, y_center, box_width, box_height, _conf, _cls in predictions:
            # Write to file: class_id x_center y_center width height
            f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}\n")

    print(f"Annotated: {image_path} -> {label_path}")

print("Annotation process completed!")


In [4]:
import os

# Directories
image_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\data"
label_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\labeled_data"

# File suffixes (matched case-insensitively)
image_extensions = ('.jpg', '.png', '.jpeg')
label_extension = '.txt'


def collect_relative_stems(root_dir, extensions):
    """
    Return the set of extension-less file paths, relative to `root_dir`, for
    every file below `root_dir` whose lower-cased name ends with `extensions`.

    Keeping the class sub-folder in the key ("Apple_Bad/img1") means that equal
    file names in different class folders stay distinct, and that a label
    stored under the wrong class folder is not reported as a match.

    Args:
        root_dir (str): Directory to walk recursively.
        extensions (str | tuple[str, ...]): Lower-case suffix or suffixes to accept.

    Returns:
        set[str]: Relative paths without file extension.
    """
    stems = set()
    for current_dir, _, files in os.walk(root_dir):
        relative_dir = os.path.relpath(current_dir, root_dir)
        for file_name in files:
            if file_name.lower().endswith(extensions):
                stem = os.path.splitext(file_name)[0]
                stems.add(os.path.normpath(os.path.join(relative_dir, stem)))
    return stems


# Collect class-relative filenames without extensions
image_files = collect_relative_stems(image_dir, image_extensions)
label_files = collect_relative_stems(label_dir, label_extension)

# Find matches and mismatches
matched_files = image_files & label_files  # Files present in both
unmatched_images = image_files - label_files  # Images without labels
unmatched_labels = label_files - image_files  # Labels without images

# Output results
print(f"Total images: {len(image_files)}")
print(f"Total annotations: {len(label_files)}")
print(f"Matched files: {len(matched_files)}")
print(f"Unmatched images (no annotations): {len(unmatched_images)}")
print(f"Unmatched labels (no images): {len(unmatched_labels)}")

# List examples of unmatched files (if any); sorted so repeated runs show the same ones
if unmatched_images:
    print("\nExamples of unmatched images:")
    print("\n".join(sorted(unmatched_images)[:5]))

if unmatched_labels:
    print("\nExamples of unmatched labels:")
    print("\n".join(sorted(unmatched_labels)[:5]))

Total images: 1984
Total annotations: 1984
Matched files: 1984
Unmatched images (no annotations): 0
Unmatched labels (no images): 0


In [5]:
import os
import cv2

def save_annotated_image(image_path, label_path, save_dir):
    """
    Save an image with bounding boxes drawn on it based on YOLO annotations.

    Each label line is expected to be "class_id x_center y_center width height"
    with the four box values normalised to the image size. Blank lines are
    ignored; lines with a wrong field count or non-numeric fields are reported
    and skipped. Nothing is written when the label file is missing or the image
    cannot be read.

    Args:
        image_path (str): Path to the image file.
        label_path (str): Path to the corresponding label file.
        save_dir (str): Directory to save the annotated image (created if needed).
    """
    # Verify if the label file exists
    if not os.path.exists(label_path):
        print(f"No annotation found for: {image_path}. Skipping...")
        return

    # Load the image
    img = cv2.imread(image_path)
    if img is None:
        print(f"Failed to load image: {image_path}")
        return
    height, width = img.shape[:2]

    # Read and parse the label file
    with open(label_path, "r") as f:
        lines = f.readlines()

    # Draw bounding boxes
    for line in lines:
        values = line.strip().split()
        if not values:
            continue  # blank line, nothing to draw
        if len(values) != 5:
            print(f"Unexpected label format in {label_path}: {line}")
            continue

        try:
            class_id, x_center, y_center, w, h = map(float, values)
        except ValueError:
            # Five fields, but at least one is not a number
            print(f"Unexpected label format in {label_path}: {line}")
            continue

        # Convert the normalised centre/size to pixel corner coordinates
        x_center, y_center = int(x_center * width), int(y_center * height)
        w, h = int(w * width), int(h * height)
        x1, y1 = int(x_center - w / 2), int(y_center - h / 2)
        x2, y2 = int(x_center + w / 2), int(y_center + h / 2)

        # Draw the bounding box and label; keep the caption inside the image
        # when the box touches the top edge.
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(img, f"Class {int(class_id)}", (x1, max(y1 - 10, 15)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Prepare the save directory
    os.makedirs(save_dir, exist_ok=True)

    # Save the annotated image; imwrite reports failure through its return
    # value, so only announce success when the file was actually written.
    save_path = os.path.join(save_dir, os.path.basename(image_path))
    if not cv2.imwrite(save_path, img):
        print(f"Failed to save annotated image: {save_path}")
        return
    print(f"Annotated image saved to: {save_path}")


# Directories
image_base_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\data"
label_base_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\labeled_data"
save_base_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\annotated_images"

# Iterate through all class subfolders
for class_folder in os.listdir(image_base_dir):
    class_image_dir = os.path.join(image_base_dir, class_folder)
    class_label_dir = os.path.join(label_base_dir, class_folder)
    class_save_dir = os.path.join(save_base_dir, class_folder)

    if os.path.isdir(class_image_dir) and os.path.isdir(class_label_dir):
        print(f"Processing class: {class_folder}")
        # Process each image in the class folder
        for image_file in os.listdir(class_image_dir):
            # Case-insensitive match including .jpeg, so that every image the
            # copy step accepted is also rendered here.
            if image_file.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_path = os.path.join(class_image_dir, image_file)
                label_path = os.path.join(class_label_dir, os.path.splitext(image_file)[0] + ".txt")
                save_annotated_image(image_path, label_path, class_save_dir)
    else:
        print(f"Skipping {class_folder} as either images or labels are missing.")

Processing class: Apple_Bad
Annotated image saved to: C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\annotated_images\Apple_Bad\IMG20200728175856.jpg
Annotated image saved to: C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\annotated_images\Apple_Bad\IMG20200728175909.jpg
Annotated image saved to: C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\annotated_images\Apple_Bad\IMG20200728175910.jpg
Annotated image saved to: C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\annotated_images\Apple_Bad\IMG20200728175919.jpg
Annotated image saved to: C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\annotated_images\Apple_Bad\IMG20200728175922.jpg
Annotated image saved to: C:\Users\cheng\Documents\VSC\Higher_

In [8]:

import os

# Root folder whose class subfolders hold the annotated images
image_dir = r"C:\Users\cheng\Documents\VSC\Higher_level_CV\Examination_project\Advanced_version_for_accuracy_faster_RCNN\annotated_images"

# File suffixes that count as images
image_extensions = ('.jpg', '.png', '.jpeg')


def count_files_in_subfolders(directory, extensions):
    """
    Count the files anywhere below `directory` whose lower-cased name ends
    with `extensions` (a suffix string or a tuple of suffixes).
    """
    return sum(
        1
        for _root, _dirs, file_names in os.walk(directory)
        for file_name in file_names
        if file_name.lower().endswith(extensions)
    )


# Tally the annotated images across all class folders
num_images = count_files_in_subfolders(image_dir, image_extensions)

print(f"Total number of images: {num_images}")



Total number of images: 1984
