In [None]:
# Filter out duplicate images: files whose perceptual hash (pHash) matches an earlier file are moved to a separate folder

In [None]:
import os
from PIL import Image
import imagehash
import shutil
import cv2
# Loads Images 
def load_images_from_folder(folder_path):
    """Open every image file found directly inside ``folder_path``.

    Files are selected by extension (``.png``, ``.jpg``, ``.jpeg``, ``.bmp``,
    ``.gif``). The comparison is case-insensitive so that names such as
    ``IMG_1.JPG`` are not silently skipped. Entries are visited in sorted
    name order because ``os.listdir`` returns them in arbitrary order; a
    stable order keeps the result reproducible between runs.

    Args:
        folder_path: Directory to scan (sub-directories are not descended).

    Returns:
        A ``(images, file_names)`` tuple of two parallel lists: the objects
        returned by ``Image.open`` and the matching bare file names
        (e.g. ``544.jpg``).

    Files that cannot be opened are reported on stdout and left out of both
    lists.

    Note:
        Per Pillow's documentation ``Image.open`` is lazy and keeps the
        underlying file open, so callers should close the returned images
        once they are done with them.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    images = []
    file_names = []
    # Sorted so the visiting order does not depend on the filesystem.
    for filename in sorted(os.listdir(folder_path)):
        # Lower-case the name so '.JPG', '.Png', ... are accepted as well.
        if not filename.lower().endswith(image_extensions):
            continue
        image_path = os.path.join(folder_path, filename)
        try:
            image = Image.open(image_path)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(f"Failed to load image {filename}: {e}")
            continue
        # Keep both lists in step: index i of each refers to the same file.
        images.append(image)
        file_names.append(filename)
    return images, file_names

def find_duplicate_images(folder_path, output_folder):
    """Move images whose perceptual hash repeats into ``output_folder``.

    Every image returned by ``load_images_from_folder(folder_path)`` is
    hashed with ``imagehash.phash``. The first file seen with a given hash is
    kept in place; each later file with an equal hash is treated as a
    duplicate and moved to ``output_folder``. Only exactly equal hashes
    count as duplicates.

    Args:
        folder_path: Directory containing the images to check.
        output_folder: Directory that receives the duplicates; it is created
            if it does not exist.

    Returns:
        None. Progress and results are printed to stdout.

    Images that cannot be hashed (for example, files whose data fails to
    decode) are reported on stdout and left where they are.
    """
    # Load images and filenames
    images, file_names = load_images_from_folder(folder_path)
    # Maps each hash seen so far to the first file name that produced it
    image_hashes = {}
    duplicates = []

    for image, file_name in zip(images, file_names):
        try:
            # Generate pHash for each image
            hash_value = imagehash.phash(image)
        except Exception as e:
            # Best effort, like the loader: skip files that cannot be decoded.
            print(f"Failed to hash image {file_name}: {e}")
            continue
        finally:
            # Release the file handle before any move happens; an open handle
            # leaks descriptors and blocks moving the file on Windows.
            image.close()

        if hash_value in image_hashes:
            # Same hash as an earlier file: record it as a duplicate
            duplicates.append(file_name)
            print(f"Duplicate found: {file_name} and {image_hashes[hash_value]}")
        else:
            image_hashes[hash_value] = file_name
    print(f'Duplicates: {duplicates}')

    # Make sure the destination exists (no error if it already does)
    os.makedirs(output_folder, exist_ok=True)

    # Move duplicates to the output folder
    for duplicate in duplicates:
        shutil.move(os.path.join(folder_path, duplicate), os.path.join(output_folder, duplicate))

    print(f"Moved {len(duplicates)} duplicate images to {output_folder}")


In [None]:

# Folder to scan for images, and the folder that receives any duplicates found
folder_path = "input_folder"
output_folder = "Duplicates"

# Scan the input folder and relocate the duplicate images
find_duplicate_images(folder_path=folder_path, output_folder=output_folder)
