# Optimisation Technique Notebook: Removing Similar-Looking Images


 


This notebook demonstrates how to remove similar-looking images from a dataset.

In [1]:
# Import necessary libraries
import os
from PIL import Image
import numpy as np
import cv2
import imutils

## Functions for Image Comparison

This function draws a color mask on the input image to exclude non-essential regions.

Parameters: <br>
- img: Input image (BGR format).
- borders: Tuple of percentage values (left, top, right, bottom) specifying the mask borders.
- color: Color of the mask (default is black). 

Returns: <br>
- Image with color mask applied.

In [2]:
def draw_color_mask(img, borders, color=(0, 0, 0)):
    
    h = img.shape[0]
    w = img.shape[1]

    x_min = int(borders[0] * w / 100)
    x_max = w - int(borders[2] * w / 100)
    y_min = int(borders[1] * h / 100)
    y_max = h - int(borders[3] * h / 100)

    img = cv2.rectangle(img, (0, 0), (x_min, h), color, -1)
    img = cv2.rectangle(img, (0, 0), (w, y_min), color, -1)
    img = cv2.rectangle(img, (x_max, 0), (w, h), color, -1)
    img = cv2.rectangle(img, (0, y_max), (w, h), color, -1)

    return img

This function preprocesses an image for change detection.

Parameters:<br>
- img: Input image (BGR format).
- gaussian_blur_radius_list: List of radius values for Gaussian blur (optional).
- black_mask: Tuple of percentage values (left, top, right, bottom) for the color mask (optional).

Returns:<br>
- Preprocessed grayscale image.


In [3]:
def preprocess_image_change_detection(img, gaussian_blur_radius_list=None, black_mask=(5, 10, 5, 0)):
    
    gray = img.copy()
    gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
    if gaussian_blur_radius_list is not None:
        for radius in gaussian_blur_radius_list:
            gray = cv2.GaussianBlur(gray, (radius, radius), 0)

    gray = draw_color_mask(gray, black_mask)

    return gray

This function compares two frames for change detection.

Parameters:
- prev_frame: Previous frame (grayscale image).
- next_frame: Next frame (grayscale image).
- min_contour_area: Minimum contour area threshold for change detection.

Returns:
- Similarity score, list of contours, and thresholded image.

In [4]:
def compare_frames_change_detection(prev_frame, next_frame, min_contour_area):

    frame_delta = cv2.absdiff(prev_frame, next_frame)
    thresh = cv2.threshold(frame_delta, 45, 255, cv2.THRESH_BINARY)[1]

    thresh = cv2.dilate(thresh, None, iterations=2)
    cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)

    score = 0
    res_cnts = []
    for c in cnts:
        if cv2.contourArea(c) < min_contour_area:
            continue

        res_cnts.append(c)
        score += cv2.contourArea(c)

    return score, res_cnts, thresh

## Removing Similar-Looking Images

This function removes similar-looking images from the specified folder.

Parameters:
- folder_path: Path to the folder containing the images.
- min_similarity_score: Threshold for similarity between images.

Returns:
- None. Removes duplicate images from the folder.

In [5]:
import os
import shutil

def remove_similar_images(folder_path, min_similarity_score):
    image_files = os.listdir(folder_path)

    # Keep track of unique images
    unique_images = []
    duplicate_images = []

    # Function to resize an image to a specific size
    def resize_image(img, target_size):
        return cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)

    # Create a new folder for unique images
    new_folder_path = os.path.join(folder_path, "unique_images")
    os.makedirs(new_folder_path, exist_ok=True)

    for file_name in image_files:
        image_path = os.path.join(folder_path, file_name)

        # Check if the file is an image (with supported format)
        if not os.path.isfile(image_path) or not image_path.lower().endswith(('.png', '.jpg', '.jpeg')):
            print(f"Skipping non-image file: {file_name}")
            continue

        try:
            # Load the image using PIL
            image = Image.open(image_path)

            # Convert the PIL image to a NumPy array (required for OpenCV compatibility)
            image = np.array(image)

            # Resize the image to a common size for comparison
            target_size = (640, 480)  # Adjust the size as per your preference
            image = resize_image(image, target_size)

            # Preprocess image for comparison
            preprocessed_image = preprocess_image_change_detection(image)

            # Compare current image with unique images
            is_duplicate = False
            for unique_image, _ in unique_images:
                score, _, _ = compare_frames_change_detection(unique_image, preprocessed_image, min_contour_area=500)
                if score >= min_similarity_score:
                    is_duplicate = True
                    break

            # If the image is not a duplicate, save it to the new folder
            if not is_duplicate:
                unique_images.append((preprocessed_image, file_name))
                new_image_path = os.path.join(new_folder_path, file_name)
                shutil.copy(image_path, new_image_path)
            else:
                duplicate_images.append(file_name)

        except Exception as e:
            print(f"Failed to process image '{file_name}': {e}")
            continue

    print(f"Removed {len(duplicate_images)} similar-looking images.")
    print(f"Saved {len(unique_images)} unique images to '{new_folder_path}'.")

## Example Usage

In [6]:
folder_path = "D:/Téléchargements/dataset"  # Path of my folder
min_similarity_score = 500  # Adjusted based on my preference

remove_similar_images(folder_path, min_similarity_score)

Skipping non-image file: .DS_Store
Failed to process image 'c21_2021_03_27__10_36_36.png': cannot identify image file 'D:/Téléchargements/dataset\\c21_2021_03_27__10_36_36.png'
Removed 1078 similar-looking images.
Saved 1 unique images to 'D:/Téléchargements/dataset\unique_images'.
