# Duplicate Picture Identifier
##### by Collin Heist
## Package Imports

In [2]:
# Import the necessary packages
from PIL import Image
import imagehash
import os
import time
import numpy as np
import pandas as pd

## Adjustable Variables
- `time_per_compare` is how long each comparison takes, in seconds (measured experimentally)
- `hash_threshold` is the maximum acceptable value for a difference between two hashes that will be counted as similar
- `progress_report_frequency` is how often (in images) to update the user
- `image_extensions` and `video_extensions` are tuples of valid file formats that will be moved when sorting images and videos, respectively
- `dupe_extensions` is a tuple of valid file formats that are hashed and compared for duplicate identification

In [3]:
# Seconds per hash comparison (measured experimentally); drives the time estimate
time_per_compare = 1e-4
# Hash differences below this value are reported as duplicates
hash_threshold = 3
# Print a progress message once every this many images
progress_report_frequency = 500
# File formats collected when moving videos
video_extensions = ('.mov', '.mp4', '.gif')
# File formats collected when moving images
image_extensions = ('.jpg', '.jpeg', '.png', '.heic')
# File formats that are hashed and compared for duplicate identification
dupe_extensions = ('.jpg', '.jpeg', '.png')

## Function to get the image list
This function takes a given directory and returns all files in it (searched recursively) that match the duplicate, image, or video extension criteria, as selected by the values of `move` and `videos`.

In [4]:
def get_image_list(path, ignore_list=None, move=True, videos=False):
    """Recursively list the files under `path` that match one extension set.

    The extension set is chosen from the module-level tuples:
    `dupe_extensions` when `move` is false, `video_extensions` when both
    `move` and `videos` are true, and `image_extensions` otherwise. Matching
    is case-insensitive.

    Args:
        path: Directory to search.
        ignore_list: Names of subdirectories that are not searched. `None`
            (the default) searches every subdirectory.
        move: False selects the duplicate-detection formats; True selects the
            formats used when moving files.
        videos: With `move` true, selects videos instead of images.

    Returns:
        A list of full paths (each built with `os.path.join` from `path`), in
        directory-listing order.
    """
    ignored = () if ignore_list is None else ignore_list

    # Pick the extension tuple once instead of re-testing the flags per file
    if not move:
        extensions = dupe_extensions
    elif videos:
        extensions = video_extensions
    else:
        extensions = image_extensions

    matches = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            # Ignored directories are skipped outright; they are never
            # treated as files, whatever their name looks like
            if entry not in ignored:
                matches.extend(get_image_list(full_path, ignore_list, move, videos))
        elif full_path.lower().endswith(extensions):
            matches.append(full_path)
    return matches

## Function to estimate the number of comparisons performed

In [5]:
def num_comparisons(image_list):
    """Return the estimated number of comparisons needed for `image_list`.

    For n items the estimate is n * (n + 1) / 2. Integer floor division keeps
    the result exact for any length (the product of two consecutive integers
    is always even), where float division would round large values.

    Args:
        image_list: Any sized collection; only its length is used.

    Returns:
        The estimate as an int.
    """
    count = len(image_list)
    return count * (count + 1) // 2

## Function to check all images in a given path for duplicates
This is the pièce de résistance; all images in the provided path are compared, duplicates are identified and a convenient `DataFrame` is returned.

In [6]:
def identify_duplicates(path, ignore_list=None, verbose=True):
    """Hash every candidate image under `path` and report near-identical ones.

    Each file returned by `get_image_list(..., move=False)` is hashed with
    `imagehash.average_hash`. Two images count as duplicates when the
    difference between their hashes is below the module-level
    `hash_threshold`. A file that was already listed as a duplicate of an
    earlier file gets an empty duplicate list of its own, so a pair is not
    reported a second time from the other side. Aliases to the duplicated
    files are then created through `create_duplicate_aliases`.

    Images that cannot be opened are announced with a "Delete ..." message
    and left out of the comparison.

    Args:
        path: Root directory to search.
        ignore_list: Directory names to skip while searching; `None` (the
            default) skips nothing.
        verbose: When true, print the size/time estimate, progress messages
            and the total run time.

    Returns:
        A DataFrame with the columns "File" and "Duplicates" that holds only
        the files with at least one duplicate. Every "Duplicates" cell is an
        array of file paths.
    """
    # Always hand the lister a real list; None stands for "ignore nothing"
    if ignore_list is None:
        ignore_list = []
    image_list = get_image_list(path, ignore_list=ignore_list, move=False)

    start_time = time.time()  # Mark the start time for user-notification
    if verbose:
        estimate = num_comparisons(image_list)
        print("{} images, requiring {} comparisons".format(len(image_list), estimate))
        print("This should take about {} minutes".format(int(time_per_compare * estimate / 60.0)))

    # Hash every image; names and hashes are kept in two parallel lists
    name_list, hash_list = [], []
    for count, image in enumerate(image_list):
        if verbose and count % progress_report_frequency == 0:
            print("Hashing image #{}, {} hashes to go.".format(count, len(image_list) - count))
        try:
            # The context manager releases the file handle once hashing is done
            with Image.open(image) as opened_image:
                hash_val = imagehash.average_hash(opened_image)
        except OSError:
            # An unreadable file has no hash to compare against the others,
            # so it is reported and skipped instead of entering the
            # comparison with a placeholder value
            print("Delete {}".format(image))
            continue
        name_list.append(image)
        hash_list.append(hash_val)

    if verbose:
        print("\nFinished hashing {} images. Starting comparisons.".format(len(name_list)))

    # One array of duplicate paths per file, collected in an object array so
    # every cell can hold a different number of entries
    duplicates = np.empty(len(name_list), dtype=object)
    # Exact paths that already appear in some earlier file's duplicate list
    already_listed = set()
    for row_num, (file_name, file_hash) in enumerate(zip(name_list, hash_list)):
        if file_name in already_listed:
            matches = []
        else:
            matches = [
                other_name
                for other_name, other_hash in zip(name_list, hash_list)
                if other_name != file_name
                and abs(other_hash - file_hash) < hash_threshold
            ]
        already_listed.update(matches)
        duplicates[row_num] = np.array(matches, dtype=object)

        if verbose and row_num % progress_report_frequency == 0:
            print("Computed hash differences on {} images. {} comparisons to go.".format(row_num + 1, num_comparisons(name_list[row_num:])))

    # Create a DataFrame for handing the results on and for display
    df = pd.DataFrame({"File": name_list, "Hash": hash_list})
    df["Duplicates"] = duplicates

    # Create the duplicated entries folder that has aliases to each duplicated file
    create_duplicate_aliases(path, df)

    # If applicable, notify the user of how long the operations took
    if verbose:
        print("Took {:.2f} minutes".format((time.time() - start_time) / 60.0))

    # Return only the files that have duplicates, just for display purposes
    has_duplicates = df["Duplicates"].map(len) > 0
    return df[has_duplicates][["File", "Duplicates"]].reset_index(drop=True)

## Create a `/Duplicates/` folder that contains aliases to each duplicated picture

In [21]:
def _alias_label(path, file_name):
    """Return the first path component of `file_name` below `path`."""
    return file_name[len(path):].split('/')[0]


def _make_alias(target, link_path):
    """Create a symlink at `link_path` pointing to `target`.

    A symlink left at `link_path` by an earlier run is replaced. Anything
    else found there is left alone, in which case `os.symlink` raises.
    """
    if os.path.islink(link_path):
        os.remove(link_path)
    # An absolute target keeps the link valid even when `target` was given
    # relative to the working directory
    os.symlink(os.path.abspath(target), link_path)


def create_duplicate_aliases(path, df):
    """Create a `Duplicates` folder under `path` with symlinks to duplicated files.

    Every row of `df` with a non-empty "Duplicates" cell gets its own numbered
    subfolder (0, 1, 2, ...). The subfolder receives one link to the row's
    "File" and one link per entry of its "Duplicates". Links are named after
    the first path component of the linked file below `path`, followed by the
    file's extension; duplicate links also carry their position in
    parentheses. Nothing is created when no row has duplicates.

    Running the function again reuses existing folders and replaces links it
    created before.

    Args:
        path: Root directory; the `Duplicates` folder is created inside it.
            NOTE(review): link names are derived by slicing `len(path)`
            characters off each file path, so `path` presumably needs its
            trailing '/' -- confirm against the caller.
        df: DataFrame with a "File" column of paths and a "Duplicates" column
            whose cells are sequences of paths.
    """
    # Filter the DataFrame to ignore all entries without duplicates
    has_duplicates = df["Duplicates"].map(len) > 0
    df = df[has_duplicates][["File", "Duplicates"]].reset_index(drop=True)

    # Return if no duplicates are found
    if len(df) == 0:
        return

    # Create the folder where duplicates will be placed
    dupes_root = os.path.join(path, "Duplicates")
    try:
        os.mkdir(dupes_root)
    except FileExistsError:
        print("/Duplicates/ folder already exists.")

    for row_num, row in df.iterrows():
        # Numbered folder that contains aliases to all matched duplicates
        group_dir = os.path.join(dupes_root, str(row_num))
        os.makedirs(group_dir, exist_ok=True)

        # Alias to the source file that there are duplicates of
        source = row["File"]
        source_name = "{}.{}".format(_alias_label(path, source), source.split('.')[-1])
        _make_alias(source, os.path.join(group_dir, source_name))

        # Aliases for each found duplicate
        for dupe_num, dupe in enumerate(row["Duplicates"]):
            dupe_name = '{} ({}).{}'.format(_alias_label(path, dupe), dupe_num, dupe.split(".")[-1])
            _make_alias(dupe, os.path.join(group_dir, dupe_name))

## Specific function to rename the `/grouped/` subfolders

In [8]:
# Rename the /Grouped/ subfolder
def rename_grouped(path):
    """Rename the grouped images under `path` to `<group>-<member><ext>` names.

    Every image is moved into a `new/` staging subfolder of `path` under its
    new name, then all staged files are moved back into `path` and the
    staging folder is removed. The number of images found at the start is
    printed at the end.

    `path` is combined with file names by plain string concatenation, so it
    is expected to end with a path separator.

    NOTE(review): `get_image_list` builds its entries with
    `os.path.join(path, entry)` and recurses into subfolders, so the entries
    already include `path` (and can include files inside `new/`). The
    `path + sub_image` rename source below therefore looks like it prefixes
    `path` twice -- confirm that this function still works with the current
    `get_image_list`.
    """
    image_list = get_image_list(path)
    originalLen = len(image_list)  # Kept for the summary message at the end
    if not os.path.isdir(path + "new/"): # Make the /new/ subfolder if it doesn't exist
        os.makedirs(path + "new/")
        print ("Created /new/ folder")
    adjust_val = 0  # How many loop entries found no group members left to rename
    # The loop walks the list captured here; `image_list` itself is refreshed
    # at the end of every pass, so `group_list` is built from the fresh listing
    for count, image in enumerate(image_list):
        group_count = image.split("-")[0] # Everything before the first "-" identifies the group
        # group_list is the list of all images in the same group
        group_list = [img for img in image_list if img.split("-")[0] == group_count]
        # No entry of the refreshed list shares this prefix (presumably the
        # group was already renamed), so shift the numbering down by one
        if len(group_list) == 0:
            adjust_val += 1
        # Loop through all images of this same subgroup
        for sub_count, sub_image in enumerate(group_list):
            # The extension is sliced off `image`: 5 characters for ".jpeg",
            # 4 characters otherwise.
            # NOTE(review): it is taken from `image`, not `sub_image` -- confirm
            if image.lower().endswith(".jpeg"): # New name for longer file names
                new_name = "{}new/{}-{}{}".format(path, count+1-adjust_val, sub_count+1, image[-5:])
            else:
                new_name = "{}new/{}-{}{}".format(path, count+1-adjust_val, sub_count+1, image[-4:])
            os.rename(path + sub_image, new_name) # Move to the /new/ folder (with the new name)
            time.sleep(0.08) # Sleep between each command to avoid losing files
            
        image_list = get_image_list(path) # Reset the image list now that some have been moved
        
    # Move the images back from /path/new/ to /path/
    for image in os.listdir(path + "new/"):
        os.rename(path + "new/" + image, path + image)
        time.sleep(0.08)
        
    os.rmdir(path + "new") # Delete /path/new/ subfolder
    print ("{} images renamed.".format(originalLen))

## Function to rename all files in a given directory (1, 2, 3...)

In [9]:
def rename_all(path, verbose=True):
    """Renumber every image and video found under `path`.

    Images found by `get_image_list` are renamed to `1.<ext>`, `2.<ext>`, ...
    and end up directly in `path`. Videos are renamed to `V1.<ext>`,
    `V2.<ext>`, ... and end up in `path/Videos`. Both kinds pass through a
    `temp` subfolder of `path` while they are renamed; it is removed
    afterwards, and so is `Videos` when no videos were found.

    Paths ending in "Grouped/" or "Known/" are handed to `rename_grouped`
    instead, so that those folders are not renumbered by accident.

    Args:
        path: Directory whose contents are renamed.
        verbose: When true, print progress messages and the number of
            renamed items (images plus videos).
    """
    # Prevent accidentally renaming the grouped folder
    if path.endswith(("Grouped/", "Known/")):
        rename_grouped(path)
        return

    temp_dir = os.path.join(path, 'temp')
    videos_dir = os.path.join(path, 'Videos')

    ## Move Images
    if verbose:
        print('Moving images.')
    # Make the /temp/ subfolder if it doesn't exist
    if not os.path.isdir(temp_dir):
        os.makedirs(temp_dir)
        if verbose:
            print("Created /temp/ folder.")

    # Move all the images to the /temp/ subfolder under their new number
    image_list = get_image_list(path, move=True, videos=False)
    for count, image in enumerate(image_list):
        new_name = "{}.{}".format(count + 1, image.split('.')[-1])
        os.rename(image, os.path.join(temp_dir, new_name))
        time.sleep(0.10)  # Pause between moves, as the rest of the file does

    # Move all the images back from /path/temp/ to /path/
    for image in get_image_list(temp_dir, move=True, videos=False):
        # basename keeps just the new file name, whatever the path separator
        os.rename(image, os.path.join(path, os.path.basename(image)))
        time.sleep(0.10)

    ## Move Videos
    if verbose:
        print('Moving videos.')
    # Make the /Videos/ subfolder if it doesn't exist
    if not os.path.isdir(videos_dir):
        os.makedirs(videos_dir)
        if verbose:
            print('Created /Videos/ folder.')

    # Move all the videos to the /temp/ subfolder under their new number
    video_list = get_image_list(path, move=True, videos=True)
    for count, video in enumerate(video_list):
        new_name = "V{}.{}".format(count + 1, video.split('.')[-1])
        os.rename(video, os.path.join(temp_dir, new_name))
        time.sleep(0.15)

    # Move videos back from /path/temp/ to /path/Videos/
    for video in get_image_list(temp_dir, move=True, videos=True):
        os.rename(video, os.path.join(videos_dir, os.path.basename(video)))
        time.sleep(0.15)

    # Delete the /temp/ directory
    os.rmdir(temp_dir)
    if verbose:
        print("Deleting temporary subfolder.")

    # If there were no videos, delete the /Videos/ subfolder
    if not get_image_list(videos_dir, move=True, videos=True):
        os.rmdir(videos_dir)
        if verbose:
            print("No videos found - deleting subfolder.")

    if verbose:
        print("{} items renamed.".format(len(image_list) + len(video_list)))

# Function Calls

In [1]:
# Folder to process; the trailing '/' is appended separately
image_path = '/Volumes/Seagate Backup Plus Drive/Miscellaneous/Google Drive'
image_path += '/'
# Subfolder names that are skipped while looking for duplicates
ignore_list = ['2018', '2019']
# True renames the files; False searches for duplicates instead
rename = False

if not rename:
    # Widen the display limits so long file lists are shown in full
    with pd.option_context('display.max_rows', 1000, 'display.max_colwidth', 10000):
        display(identify_duplicates(image_path, ignore_list, verbose=True))
else:
    rename_all(image_path, verbose=True)

NameError: name 'pd' is not defined