In [1]:
import os
import hashlib
import shutil

# Helper function to calculate the hash of a file
def calculate_hash(file_path, chunk_size=1 << 20):
    """Return the MD5 hex digest of the contents of ``file_path``.

    The file is opened in binary mode and fed to the hasher in pieces of
    ``chunk_size`` bytes, so memory use stays bounded no matter how large
    the file is. The digest is identical to hashing the whole file at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 1 MiB).

    Returns:
        The hexadecimal MD5 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # read() returns b'' at end of file, which is the sentinel that stops iter().
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# Detect duplicates
def find_duplicates(images_dir):
    """Find files in ``images_dir`` whose contents hash to the same value.

    Every regular file directly inside ``images_dir`` is hashed with
    ``calculate_hash`` (sub-directories are skipped; no extension filter is
    applied). The first file seen with a given hash is treated as the
    original; every later file with the same hash is reported as a duplicate.

    File names are visited in sorted order so that the choice of "original"
    is the same on every run and every platform.

    Args:
        images_dir: Directory to scan (not recursive).

    Returns:
        A list of ``(duplicate_name, original_name)`` tuples of bare file
        names (not full paths). Empty when no duplicates are found.
    """
    hash_map = {}  # content hash -> name of the first file seen with that hash
    duplicates = []

    # os.listdir() yields entries in arbitrary order; sort for reproducible results.
    for image_file in sorted(os.listdir(images_dir)):
        image_path = os.path.join(images_dir, image_file)
        if not os.path.isfile(image_path):
            continue

        file_hash = calculate_hash(image_path)
        if file_hash in hash_map:
            duplicates.append((image_file, hash_map[file_hash]))
        else:
            hash_map[file_hash] = image_file

    return duplicates

# Remove duplicates and their labels
def remove_duplicates(images_dir, labels_dir):
    """Delete duplicate images in ``images_dir`` and their label files.

    Duplicates are determined by ``find_duplicates``. For each duplicate the
    image file is removed, and the label file with the same base name and a
    ``.txt`` extension is removed from ``labels_dir`` if it exists. One line
    is printed per removed duplicate, followed by a total.

    This is destructive: files are deleted from disk, not moved.

    Args:
        images_dir: Directory containing the image files.
        labels_dir: Directory containing the matching ``.txt`` label files.

    Returns:
        None.
    """
    duplicates = find_duplicates(images_dir)
    for duplicate, original in duplicates:
        # Paths for duplicate image and its label.
        duplicate_image_path = os.path.join(images_dir, duplicate)
        # Swap only the final extension: str.replace('.jpg', '.txt') would also
        # rewrite '.jpg' occurring mid-name and would miss .png/.jpeg/.JPG files,
        # leaving their labels orphaned.
        label_name = os.path.splitext(duplicate)[0] + '.txt'
        duplicate_label_path = os.path.join(labels_dir, label_name)

        # Remove the duplicate image and its label if they exist
        if os.path.exists(duplicate_image_path):
            os.remove(duplicate_image_path)
        if os.path.exists(duplicate_label_path):
            os.remove(duplicate_label_path)

        print(f"Removed duplicate: {duplicate} (Original: {original})")

    print(f"Total duplicates removed: {len(duplicates)}")


# Image folder and matching label folder for the data/train split
# (replace both paths with your own directories)
images_dir, labels_dir = "data/train/images", "data/train/labels"

# Delete duplicate images from this split together with their label files
remove_duplicates(images_dir=images_dir, labels_dir=labels_dir)


Removed duplicate: download-3-_jpeg_jpg.rf.f99d87c14619d35ceb5da05bdb7b40dc.jpg (Original: download-3-_jpeg_jpg.rf.7eae169e810ddec11b970a4a3f0a5019.jpg)
Removed duplicate: image_181_jpg.rf.52cf3de191a31cd77ead055a01b73c97.jpg (Original: image_181_jpg.rf.4693995f3fe0e9db43aa6be023f6c812.jpg)
Removed duplicate: image_183_jpg.rf.dff546ab6afb41f292edc0776dc632d3.jpg (Original: image_183_jpg.rf.868bfb2f11d0dbb63d3b95c0d8e0ba83.jpg)
Removed duplicate: image_199_jpg.rf.c19112d285b280f14569100c26b7a949.jpg (Original: image_199_jpg.rf.9fdc6cef7a44cfd4e79faa2dc6553aea.jpg)
Removed duplicate: image_212_jpg.rf.6d673ce8c60719058af1a34dfb05f97d.jpg (Original: image_212_jpg.rf.31fdb53d746d40965f156acac6451d23.jpg)
Removed duplicate: image_212_jpg.rf.db0cc7d579a6dd55cf98a9cc51704d0f.jpg (Original: image_212_jpg.rf.0abb5832f665e59fdfdea826a26e76b4.jpg)
Removed duplicate: KakaoTalk_20221018_001425789_06_jpg.rf.a4657976a595b2d49226a1104a7fdd98.jpg (Original: KakaoTalk_20221018_001425789_06_jpg.rf.817a8fe

In [2]:
# Image folder and matching label folder for the data/test split
# (replace both paths with your own directories)
images_dir, labels_dir = "data/test/images", "data/test/labels"

# Delete duplicate images from this split together with their label files
remove_duplicates(images_dir=images_dir, labels_dir=labels_dir)


Total duplicates removed: 0


In [3]:
# Image folder and matching label folder for the data/valid split
# (replace both paths with your own directories)
images_dir, labels_dir = "data/valid/images", "data/valid/labels"

# Delete duplicate images from this split together with their label files
remove_duplicates(images_dir=images_dir, labels_dir=labels_dir)


Removed duplicate: image_202_jpg.rf.8dcc18c2a64fee258e34732d9c625a97.jpg (Original: image_202_jpg.rf.5b279d2106ba531ef5104d660aedfc7c.jpg)
Total duplicates removed: 1
