In [3]:
import os
import hashlib
from pathlib import Path

def compute_hash(filepath, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in fixed-size chunks so that large files are never
    loaded into memory in one piece. MD5 is used here purely as a content
    fingerprint for duplicate detection, not for any security purpose.

    Args:
        filepath: Path (str or path-like) of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 1 MiB).

    Returns:
        The digest as a 32-character lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as f:
        # iter(callable, sentinel) keeps calling f.read until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def get_image_hashes(directory):
    """Walk ``directory`` recursively and fingerprint every image file found.

    A file counts as an image when its name ends, case-insensitively, with
    one of the extensions ``.png``, ``.jpg``, ``.jpeg``, ``.bmp`` or ``.gif``.
    The leading dot is part of the match, so a name such as ``catpng`` is
    not mistaken for an image.

    Args:
        directory: Root directory to scan (str or path-like).

    Returns:
        A dict mapping each content hash (as returned by ``compute_hash``)
        to the ``Path`` of a file with that content. When several files in
        the tree share the same content, only the last one visited is kept.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    hashes = {}
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.lower().endswith(image_suffixes):
                continue
            full_path = Path(root) / file
            hashes[compute_hash(full_path)] = full_path
    return hashes

# Dataset roots: the training images and the separate test images.
train_dir = "data/flower_photos"
test_dir = "data/flower_photos_test"

# Build a content-hash -> file-path mapping for each dataset (train first).
train_hashes, test_hashes = map(get_image_hashes, (train_dir, test_dir))

# A hash present in both mappings means the same image bytes exist in both datasets.
duplicates_train = set(train_hashes).intersection(set(test_hashes))

print(f"Found {len(duplicates_train)} duplicates in training set.")

# Show the training-side path of every image that also appears in the test set.
for shared_hash in duplicates_train:
    print("Duplicate in train:", train_hashes[shared_hash])


Found 0 duplicates in training set.


In [2]:
# Remove duplicates safely
def remove_duplicates(duplicates, hashes_dict, *, dry_run=False):
    """Delete from disk the file associated with each duplicate hash.

    Deletion is permanent. Files that no longer exist (for example because
    this cell was already run once) are reported and skipped, so the
    function can be re-run without failing part-way through.

    Args:
        duplicates: Iterable of content hashes whose files should be removed.
        hashes_dict: Mapping from content hash to the path of the file to
            delete; every hash in ``duplicates`` must be a key.
        dry_run: When True, only print what would be removed and leave the
            filesystem untouched.

    Raises:
        KeyError: If a hash in ``duplicates`` is missing from ``hashes_dict``.
        OSError: If a file exists but cannot be deleted (e.g. permissions).
    """
    for img_hash in duplicates:
        file_path = hashes_dict[img_hash]
        if dry_run:
            print(f"Would remove {file_path}")
            continue
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already gone: nothing left to do for this entry.
            print(f"Already missing, skipped {file_path}")
            continue
        print(f"Removed {file_path}")

# DESTRUCTIVE: permanently deletes from disk (via os.remove) every training file
# whose hash is in duplicates_train. Both arguments are produced by the
# duplicate-detection cell, so run that cell first and check its printed list
# of paths before executing this one.
remove_duplicates(duplicates_train, train_hashes)


Removed data/flower_photos/tulips/434146736_310a42d9cb_m.jpg
Removed data/flower_photos/dandelion/7162551630_3647eb9254.jpg
Removed data/flower_photos/tulips/4579128789_1561575458_n.jpg
Removed data/flower_photos/dandelion/2330339852_fbbdeb7306_n.jpg
Removed data/flower_photos/roses/14414100710_753a36fce9.jpg
Removed data/flower_photos/sunflowers/6953297_8576bf4ea3.jpg
Removed data/flower_photos/roses/1775233884_12ff5a124f.jpg
Removed data/flower_photos/sunflowers/253586685_ee5b5f5232.jpg
Removed data/flower_photos/roses/1446097778_97149b8362.jpg
Removed data/flower_photos/tulips/4574785121_5d8ec4626e.jpg
Removed data/flower_photos/tulips/3422915985_9bf7264d36.jpg
Removed data/flower_photos/daisy/9161647994_e39b65cb9c_n.jpg
Removed data/flower_photos/tulips/8712244311_da8e90bf8e_n.jpg
Removed data/flower_photos/sunflowers/4869189730_f47c124cda_n.jpg
Removed data/flower_photos/sunflowers/9410186154_465642ed35.jpg
Removed data/flower_photos/daisy/10559679065_50d2b16f6d.jpg
Removed data/f