# Duplicate Picture Identifier
##### by Collin Heist
## Package Imports

In [2]:
# Import the necessary packages
from PIL import Image
import imagehash
import os
import time
import numpy as np
import pandas as pd

## Adjustable Variables
- `time_per_compare` is how long each comparison takes, in seconds (measured experimentally); it is only used to estimate the run time
- `hash_threshold` is the exclusive upper bound on the difference between two hashes: two images are counted as similar only when their hash difference is strictly less than this value
- `progress_report_amount` is how often (in images) to update the user
- `move_extensions` is a tuple of valid file formats that will be moved when sorting files
- `dupe_extensions` is a tuple of valid file formats that are hashed and compared for duplicate identification

In [3]:
# Seconds spent per hash comparison (measured experimentally); only feeds the time estimate
time_per_compare = 0.0001

# Two images count as duplicates when their hash difference is strictly below this value
hash_threshold = 3

# Print a progress line every this-many images
progress_report_amount = 100

# File types that are picked up when renaming / moving files
move_extensions = (
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".mp4",
    ".mov",
)

# File types that are hashed and compared when looking for duplicates
dupe_extensions = (
    ".jpg",
    ".jpeg",
    ".png",
)

## Function to get the image list
This function takes a given directory and returns all files that fit either the move or duplicate extension criteria, as set by the value of `move`.

In [119]:
def get_image_list(path, move=True):
    """Recursively list the media files found under ``path``.

    Parameters
    ----------
    path : str
        Directory to scan. Sub-directories are scanned as well.
    move : bool, optional
        When truthy, keep files whose (lower-cased) name ends with one of
        ``move_extensions``; otherwise keep those ending with one of
        ``dupe_extensions``.

    Returns
    -------
    list of str
        Paths built with ``os.path.join(path, entry)``, in the order
        ``os.listdir`` reports them, with each sub-directory's files placed
        where that sub-directory was encountered.
    """
    # The extension whitelist is the same at every recursion depth, so choose it once
    wanted_extensions = move_extensions if move else dupe_extensions

    collected = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isdir(candidate):
            # Sub-folder: splice in its (already filtered) results
            collected.extend(get_image_list(candidate, move))
        elif candidate.lower().endswith(wanted_extensions):
            # Plain file with an accepted extension
            collected.append(candidate)

    return collected

## Function to estimate the number of comparisons performed

In [5]:
def num_comparisons(image_list):
    """Estimate how many hash comparisons ``image_list`` will need.

    Uses the triangular number ``n * (n + 1) / 2`` of the list length ``n``
    and returns it as an ``int``.
    """
    image_count = len(image_list)
    triangular = image_count * (image_count + 1) / 2
    return int(triangular)

## Function to check all images in a given path for duplicates
This is the centerpiece of the notebook: all images in the provided path are hashed and compared, duplicates are identified, and a convenient `DataFrame` is returned.

In [115]:
def check_duplicates(path, verbose=True):
    """Find visually similar images below ``path`` using average hashes.

    Every file returned by ``get_image_list(path, move=False)`` is hashed with
    ``imagehash.average_hash``. Two images are reported as duplicates when the
    difference between their hashes is strictly less than ``hash_threshold``.
    A file that has already been listed as a duplicate of an earlier file does
    not get a duplicate list of its own, so each group is reported once.

    Parameters
    ----------
    path : str
        Root directory to scan (sub-directories included).
    verbose : bool, optional
        When True, print the size of the job, periodic progress (every
        ``progress_report_amount`` images) and the total run time.

    Returns
    -------
    pandas.DataFrame
        Columns ``File`` and ``Duplicates``; one row per file that has at least
        one duplicate. ``File`` is the path relative to ``path`` and
        ``Duplicates`` is the " OR "-joined list of the matching relative paths.
    """
    image_list = get_image_list(path, move=False)
    if verbose:
        print("%i images, requiring %i comparisons" % (len(image_list), num_comparisons(image_list)))
        print("This should take about %f minutes" % (time_per_compare * num_comparisons(image_list) / 60.0))
        start_time = time.time() # Mark the start time for user-notification

    # Hash every image, remembering its path relative to the scanned root
    name_list, hash_list = [], []
    for count, image in enumerate(image_list):
        name_list.append(os.path.relpath(image, path))
        # The context manager closes the file handle as soon as the hash is computed,
        # so large folders do not exhaust the open-file limit
        with Image.open(image) as opened_image:
            hash_list.append(imagehash.average_hash(opened_image))
        if verbose and count % progress_report_amount == 0:
            print("Hashing image #{}, {} hashes to go.".format(count, len(image_list) - count))

    # Compare every image against all others. Files already reported as somebody's
    # duplicate are tracked by exact name, so names that merely contain one another
    # (or contain regex metacharacters) cannot be mistaken for each other.
    duplicate_list = [""] * len(name_list)
    already_listed = set()
    for row_num, (name, image_hash) in enumerate(zip(name_list, hash_list)):
        if name not in already_listed:
            matches = [
                other_name
                for other_name, other_hash in zip(name_list, hash_list)
                if other_name != name and abs(other_hash - image_hash) < hash_threshold
            ]
            duplicate_list[row_num] = " OR ".join(matches)
            already_listed.update(matches)
        if verbose and row_num % progress_report_amount == 0:
            print("Computed hash differences on {} images. {} comparisons to go.".format(row_num+1, num_comparisons(image_list[row_num:])))

    # If applicable, notify the user of how long the operations took
    if verbose:
        print("Took {:.3} minutes".format((time.time() - start_time) / 60.0))

    # Return the DataFrame - ignoring all entries that do not have duplicates
    df = pd.DataFrame({"File": name_list, "Duplicates": duplicate_list}, columns=["File", "Duplicates"])
    return df[df["Duplicates"] != ""][["File", "Duplicates"]]

## Specific function to rename the `/grouped/` subfolders

In [7]:
# Rename the /Grouped/ subfolder
def rename_grouped(path):
    """Renumber grouped images below ``path`` as ``<group>-<member><ext>``.

    Images whose file names share the same text before the first "-" form a
    group. Groups are numbered 1, 2, 3... in the order their first image is
    returned by ``get_image_list`` and the images inside a group are numbered
    1, 2, 3... in that same order. Each image keeps its own extension.

    Files are first moved into a temporary ``new`` sub-folder of ``path`` and
    then moved back, so a freshly generated name cannot collide with an image
    that has not been renamed yet. Because ``get_image_list`` is recursive,
    images found in sub-folders end up directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory whose images are renamed.
    """
    staging_dir = os.path.join(path, "new")
    staging_prefix = os.path.normpath(staging_dir) + os.sep

    # get_image_list returns full paths and recurses, so ignore anything that
    # is already sitting in the staging folder (e.g. left over from an aborted run)
    image_list = [
        image for image in get_image_list(path)
        if not os.path.normpath(image).startswith(staging_prefix)
    ]
    originalLen = len(image_list)

    if not os.path.isdir(staging_dir): # Make the /new/ subfolder if it doesn't exist
        os.makedirs(staging_dir)
        print ("Created /new/ folder")

    # Bucket the images by the part of the *file name* before the first "-".
    # dicts keep insertion order, so groups stay in first-appearance order.
    groups = {}
    for image in image_list:
        group_key = os.path.basename(image).split("-")[0]
        groups.setdefault(group_key, []).append(image)

    for group_number, members in enumerate(groups.values(), start=1):
        for member_number, member in enumerate(members, start=1):
            # Use the member's own extension (handles .jpeg and 3-letter extensions alike)
            extension = os.path.splitext(member)[1]
            new_name = os.path.join(staging_dir, "%s-%s%s" % (group_number, member_number, extension))
            os.rename(member, new_name) # Move to the /new/ folder (with the new name)
            time.sleep(0.08) # Pause kept from the original, which added it "to avoid losing files"

    # Move the images back from /new/ into the target folder
    for image in os.listdir(staging_dir):
        os.rename(os.path.join(staging_dir, image), os.path.join(path, image))
        time.sleep(0.08)

    os.rmdir(staging_dir) # Delete the temporary /new/ subfolder
    print ("%i images renamed." % originalLen)

## Function to rename all images in a given directory (1, 2, 3...)

In [8]:
def renameAll(path, verbose=True):
    """Rename every image below ``path`` to a running number (1, 2, 3...).

    Paths ending in ``Grouped/`` or ``Known/`` are delegated to
    ``rename_grouped`` so their group numbering is preserved. For any other
    path, images are numbered in the order ``get_image_list`` returns them and
    keep their own extension.

    Files are first moved into a temporary ``new`` sub-folder of ``path`` and
    then moved back, so a new numeric name cannot collide with an image that
    has not been renamed yet. Because ``get_image_list`` is recursive, images
    found in sub-folders end up directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory whose images are renamed.
    verbose : bool, optional
        When True, report folder creation and the number of renamed images.
    """
    if path.endswith(("Grouped/", "Known/")): # Prevent accidentally renaming the grouped folder
        rename_grouped(path)
        return

    staging_dir = os.path.join(path, "new")
    staging_prefix = os.path.normpath(staging_dir) + os.sep

    # get_image_list returns full paths and recurses, so ignore anything that
    # is already sitting in the staging folder (e.g. left over from an aborted run)
    imageList = [
        image for image in get_image_list(path)
        if not os.path.normpath(image).startswith(staging_prefix)
    ]

    if not os.path.isdir(staging_dir): # Make the /new/ subfolder if it doesn't exist
        os.makedirs(staging_dir)
        if verbose:
            print ("Created /new/ folder.")

    for count, image in enumerate(imageList, start=1):
        # splitext keeps the original extension regardless of its length (.jpg vs .jpeg)
        extension = os.path.splitext(image)[1]
        # `image` is already a full path, so it is used as the rename source directly
        os.rename(image, os.path.join(staging_dir, "%s%s" % (count, extension)))
        time.sleep(0.08) # Pause kept from the original, which added it "to avoid losing files"

    # Move images back from /new/ into the target folder
    for image in os.listdir(staging_dir):
        os.rename(os.path.join(staging_dir, image), os.path.join(path, image))
        time.sleep(0.08)

    os.rmdir(staging_dir) # Delete the temporary /new/ subfolder
    if verbose:
        print ("%i images renamed." % len(imageList))

# Function Calls

In [121]:
# Folder to process; the functions above expect a trailing slash
image_path = "/Users/CollinHeist/Documents/test" + "/"

# Flip to True to renumber the files instead of searching for duplicates
rename = False

if not rename:
    # Widen the display limits so long duplicate lists are shown in full
    with pd.option_context('display.max_rows', 1000, 'display.max_colwidth', 10000):
        display(check_duplicates(image_path, True))
else:
    renameAll(image_path, False)

4 images, requiring 10 comparisons
This should take about 0.000017 minutes
Hashing image #0, 4 hashes to go.
Computed hash differences on 1 images. 10 comparisons to go.
Took 0.00315 minutes


Unnamed: 0,File,Duplicates
0,test1/2846.png,test1/3279.png OR test2/2846.png OR test2/3279.png
