# Duplicate Picture Identifier
##### by Collin Heist

## `FileSorter` Class

### Initialization Parameters
- `primary_dir`: _String_
 - This is the primary directory, relative to which all file sorting takes place.
 - This directory should be empty on its own, but contain the relevant subfolders for sorting.
- `sort`: _String, optional_
 - Specific directory (relative to `primary_dir`) where the to-be-sorted files are found.
 - If unspecified, defaults to `/primary_dir/Sort/`.
- `named`: _String, optional_
 - Specified directory where all _named_ (either by filename or username OCR) files will attempt to be placed.
 - If unspecified, defaults to `/primary_dir/Named/`.
 - This should contain subfolders corresponding to each named instance (trip, person, etc.).
 - Within each named subfolder there should be a text document containing the label to match unsorted files to.
 - An example is `/primary_dir/Named/Trip to Chicago 2020/label.txt` that says: `@chicago2020` - thus all files named `chicago2020-*` will be placed within this subfolder.
- `unnamed`: _String, optional_
 - Specified directory where all unnamed files (files labelled as `Ungrouped-*.*`) will be placed, inside its `Ungrouped` subfolder.
 - If unspecified, defaults to `/primary_dir/Unnamed/`.
- `tt`: _String, optional_
 - Specified directory where all named (via filename, not OCR) videos are placed.
 - If unspecified, defaults to `/primary_dir/TT/`.
 - The purpose of this is to separate named _videos_ from pictures, primarily because I cannot perform OCR on videos (and thus they're hard to map).
- `hash_threshold`: _Integer, optional_
 - This is the maximum difference between hashes that will be counted as a duplicate (with regard to duplicate identification).
 - If unspecified, defaults to `5`.
 - A value between 0 and 7 seems reasonable, with 0 leading to a lot of missed duplicates, and 7 giving a lot of false positives.

## Typical File Structure

Files from `/Sort/` are placed into their appropriate directory within `/Named/`, or `/Unnamed/` when `.sort()` is called.

    Primary Directory
    ├── Named
    |   ├── Named Folder 1
    |   |   └── mapping.txt
    |   └── Named Folder 2
    |       └── mapping.txt
    ├── Sort 
    |   ├── random_image.jpeg
    |   ├── named_image1-1.png
    |   ├── named_image1-2.png
    |   ├── named_image2-1.png
    |   ├── named_image2-2.png
    |   ├── Ungrouped-1.heic
    |   └── ...
    ├── TT
    ├── Unnamed
    |   └── Ungrouped
    |       ├── 1.jpeg
    |       ├── 2.jpg
    |       └── ...
    └──

In [1]:
from PIL import Image, ImageOps
import imagehash               # Hashing images for duplicate identification
import os                      # File management (directories, files, etc.)
import time                    # Time delays between file movements
import pandas as pd            # Dataframe (how all data is parsed)
from tqdm.notebook import tqdm # Progress Bars on sorting functionality
import re                      # Regex searches on filenames and username identification
import cv2                     # More optical character recognition
import pytesseract             # Optical Character Recognition (OCR) for usernames
import numpy as np             # Array manipulation

In [2]:
class FileSorter():
    """Sort images and videos from a staging folder into per-username folders.

    Files in the sort directory are matched to a username, either from a
    ``username-number.extension`` filename or (images only) by OCR of a label
    box in one corner of the picture. Each username is then mapped to a folder
    inside the named directory via the text files found there.

    Args:
        primary_dir: Directory that all other directories are relative to.
        sort: Sub-directory holding the files to be sorted.
        named: Sub-directory holding one folder (with a label text file) per name.
        unnamed: Sub-directory whose ``Ungrouped`` folder receives ``Ungrouped-*`` files.
        tt: Sub-directory receiving identified videos that have no named folder.
        hash_threshold: Stored on the instance; not used by any method of this class.
    """
    video_extensions = ('.mov', '.mp4', '.gif', '.webm', '.mkv')
    image_extensions = ('.jpg', '.jpeg', '.png', '.heic')
    dupe_extensions = (".jpg", ".jpeg", ".png")

    format_string = '{desc}: {percentage:05.2f}%{bar}{n_fmt}/{total_fmt} [{elapsed}, {rate:05.2f}]'

    # Raw strings: '\d' and '\S' are invalid escapes in normal string literals
    sort_regex = re.compile(r'^.*-\d+.') # Match named files [username-number.extension]
    username_regex = re.compile(r'(@|\/[^u][^\/])\S+') # Match @username and /u/username

    ocr_tesseract_config = '--psm 6 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._' # Valid characters in usernames
    box_threshold = 250 # 0 (black) - 255 (white) for average of the 92 pixels being looked at
    item_move_delay = 0.15 # How many seconds to wait between calls of os.rename()

    def __init__(self, primary_dir, sort='Sort', named='Named', unnamed='Unnamed', tt='TT', hash_threshold=5):
        self.dir = primary_dir
        self.sort_dir = os.path.join(primary_dir, sort)
        self.named_dir = os.path.join(primary_dir, named)
        self.ungrouped_dir = os.path.join(primary_dir, unnamed, 'Ungrouped')
        self.tt_dir = os.path.join(primary_dir, tt)
        self.hash_threshold = hash_threshold

    def __str__(self):
        """Return the hash threshold followed by the primary directory and its sub-directories."""
        prefix_length = len(self.dir)
        lines = [f'Hash Threshold: {self.hash_threshold}', f'{self.dir}/']
        lines.extend(f' - {sub_dir[prefix_length:]}/' for sub_dir in (self.sort_dir, self.named_dir, self.tt_dir))
        return '\n'.join(lines)

    def __repr__(self):
        """Return the class name with every stored directory and the threshold, one per line."""
        values = (self.dir, self.sort_dir, self.named_dir, self.ungrouped_dir, self.tt_dir, self.hash_threshold)
        body = ',\n'.join(f'    {value!r}' for value in values)
        return f'{self.__class__.__name__}(\n{body}\n)'

    # Function to sort self.sort_dir to their respective folders in self.named_dir
    def sort(self, sort_named_after=True):
        """Identify, move and rename every image and video inside self.sort_dir.

        Args:
            sort_named_after: When True, renumber all folders of self.named_dir afterwards.

        Returns:
            A DataFrame with the shortened source, username and shortened destination
            of every file, or the tuple ('No files in ', sort_dir) when nothing was found.
        """
        # Get list of all images + videos for moving within this directory
        file_list = self.get_images(self.sort_dir) + self.get_videos(self.sort_dir)
        if not file_list: # Return before any string operation touches an empty column
            return ('No files in ', str(self.sort_dir))

        df = pd.DataFrame(columns=['Source', 'S-Source', 'Username', 'Destination', 'S-Destination', 'Extension'])
        df['Source'] = file_list
        df['S-Source'] = [file[len(self.dir) + 1:] for file in file_list]
        df['Extension'] = df['Source'].str.split('.').str[-1].str.lower() # Get the extension of all images

        # Get the username of all images; either through OCR or looking at the filename
        username_list = []
        for _, row in tqdm(df.iterrows(), total=len(df), desc='Recognizing Usernames', ncols='100%', bar_format=FileSorter.format_string):
            image = row['Source'][len(self.sort_dir) + 1:]
            match = FileSorter.sort_regex.match(image)
            if match is not None: # If the file is prenamed
                username_list.append(image[:match.group().rfind('-')])
            elif ('.' + row['Extension']) in FileSorter.image_extensions:
                username_list.append(self.username_ocr(row['Source'])) # If the file isn't prenamed, perform OCR
            else: # All unnamed videos
                username_list.append('No Username Found')

        df['Username'] = username_list # Assign found username to each image
        df = self.__find_destinations(df) # Find corresponding destination files for all identified socials
        df['S-Destination'] = df['Destination'].str.slice(len(self.dir) + 1) # Shorten filepath
        df.loc[df['Destination'] == 'No Destination Found', 'S-Destination'] = 'No Destination Found'

        # Get the DataFrame of the files that have a valid, sorted destination
        move_df = df[df['Destination'] != 'No Destination Found']
        if len(move_df) != 0:
            for _, row in tqdm(move_df.iterrows(), total=len(move_df), desc='Moving Socials', ncols='100%', bar_format=FileSorter.format_string):
                self.move_file(row['Source'], row['Destination'], row['Extension'], is_video=False)
                time.sleep(FileSorter.item_move_delay)

        # Move all username'd, but unsortable files
        rename_df = df[(df['Username'] != 'No Username Found') & (df['Destination'] == 'No Destination Found')]
        if len(rename_df) != 0:
            usernames = rename_df['Username'].unique()
            for username in tqdm(usernames, unit='Usernames', desc='Renaming Identified Files', ncols='100%', bar_format=FileSorter.format_string):
                self.__rename_unsorted_username(df[df['Username'] == username])

        # Now sort all sub-folders of self.named_dir
        if sort_named_after:
            self.sort_all_named()

        return df[['S-Source', 'Username', 'S-Destination']]

    # Function to go through all items of self.named_dir and sort them
    def sort_all_named(self):
        """Renumber every folder below self.named_dir that check_if_sorted() reports as unsorted."""
        all_folders = self.get_folders(self.named_dir, recursion=True)
        for folder in tqdm(all_folders, desc='Sorting Named Folders', ncols='100%', bar_format=FileSorter.format_string):
            if not self.check_if_sorted(folder): # Only sort if unsorted already
                self.rename_directory(folder)

    # Function to rename all files of a given username that were not able to be placed in self.named_dir
    def __rename_unsorted_username(self, df):
        """Rename the rows of df (all one username) to 'username-N.extension'.

        The files go to self.tt_dir when every row is a video, otherwise they stay in self.sort_dir.
        """
        # If all values are videos, then move to self.tt directory instead, otherwise self.sort_dir
        dest_dir = self.tt_dir if ('.' + df['Extension']).isin(FileSorter.video_extensions).all() else self.sort_dir
        os.makedirs(dest_dir, exist_ok=True) # os.rename() cannot create the target folder itself
        item_number = 1
        for _, row in df.iterrows():
            # While the destination filename exists, skip over that file
            while os.path.exists(os.path.join(dest_dir, f"{row['Username']}-{item_number}.{row['Extension']}")):
                item_number += 1

            if os.path.exists(row['Source']): # Rows that were already moved elsewhere are skipped
                os.rename(row['Source'], os.path.join(dest_dir, f"{row['Username']}-{item_number}.{row['Extension']}"))
                time.sleep(FileSorter.item_move_delay)
                item_number += 1

    # Function to move file.extension from source to destination folder, adds V
    def move_file(self, source, destination, extension, is_video=False):
        """Move source into destination under the first free number.

        Images become 'destination/N.extension'; videos become
        'destination/Videos/VN.extension' (the Videos folder is created on demand).

        Raises:
            FileNotFoundError: If source or destination does not exist.
        """
        target_dir = os.path.join(destination, 'Videos') if is_video else destination
        prefix = 'V' if is_video else ''
        if is_video and not os.path.isdir(target_dir):
            os.mkdir(target_dir) # Create /Videos/ subfolder if it didn't exist

        # Probe the name that will actually be written, so an existing file is never overwritten
        item_number = 1
        while os.path.exists(os.path.join(target_dir, f'{prefix}{item_number}.{extension}')):
            item_number += 1

        os.rename(source, os.path.join(target_dir, f'{prefix}{item_number}.{extension}'))

    # Find the destination folders for all socials specified within the passed DataFrame (based on matching usernames)
    def __find_destinations(self, df):
        """Fill df['Destination'] with the folder matching each username, or 'No Destination Found'."""
        text_files = self.get_text_files(self.named_dir)
        text_usernames = [self.get_usernames(text_file) for text_file in text_files]

        # Go through all the unique socials in the sorting group
        for username in df['Username'].unique():
            destination = 'No Destination Found'
            lowered = username.lower()

            if lowered == 'ungrouped':
                destination = self.ungrouped_dir
            elif lowered != 'grouped': # 'grouped' files never receive a destination
                for file, username_list in zip(text_files, text_usernames):
                    if lowered in username_list:
                        destination = os.path.dirname(file) # Folder that holds the matching text file
                        break

            df.loc[df['Username'] == username, 'Destination'] = destination

        return df

    # Get the usernames listed within a given text file
    def get_usernames(self, text_file):
        """Return the lower-cased usernames listed (comma separated) inside text_file.

        Blank entries, such as a trailing empty line, are skipped. Any other entry that
        does not match username_regex makes the whole file invalid, in which case a
        message is printed and ['---invalid_username---'] is returned.
        """
        usernames = []
        with open(text_file, 'r') as file:
            for line in file:
                for word in line.split(','):
                    if not word.strip():
                        continue # An empty entry is not an invalid username
                    match = FileSorter.username_regex.search(word)
                    if match is None:
                        print (f'Invalid username entry within {text_file}')
                        return ['---invalid_username---']
                    usernames.append(match.group()[1:].lower()) # Drop the leading '@' or '/'
        return usernames

    # Perform OCR on an image specified at path - crops the image based on where the identified social tag is
    def username_ocr(self, path):
        """Return the username read from the label box of the image at path.

        Returns 'No Username Found' when the image is too small, when the label box is
        not found in exactly one corner, when the computed crop is invalid, or when OCR
        produces no text.
        """
        with Image.open(path) as opened_image: # Close the file handle as soon as the pixels are loaded
            image = ImageOps.invert(opened_image.convert('L')) # Open, invert, convert to black and white
        image_arr = np.array(image)
        image_width, image_height = image.size
        if image_width < 92 or image_height < 92:
            return "No Username Found" # If the image is too small for a valid username tag, return

        # Determine which corner of the image the label is in by looking at sections of 92 pixels in each corner
        is_bottom_left = np.average(image_arr[-92:, 1:4]) > FileSorter.box_threshold and np.average(image_arr[-5:-2, :92]) > FileSorter.box_threshold
        is_top_left = np.average(image_arr[:92, 1:4]) > FileSorter.box_threshold and np.average(image_arr[1:4, :92]) > FileSorter.box_threshold
        is_bottom_right = np.average(image_arr[-92:, -5:-2]) > FileSorter.box_threshold and np.average(image_arr[-5:-2, -92:]) > FileSorter.box_threshold
        is_top_right = np.average(image_arr[:92, -5:-2]) > FileSorter.box_threshold and np.average(image_arr[1:4, -92:]) > FileSorter.box_threshold
        if np.sum([is_bottom_left, is_top_left, is_bottom_right, is_top_right]) != 1:
            return 'No Username Found' # If the number of boxes found != 1, return

        # Find width of the label field by counting the number of black (white) pixels
        row = 3 if is_top_left or is_top_right else -4         # Which pixel row of the image to count along
        direction = 1 if is_bottom_left or is_top_left else -1 # Which direction to iterate through the image in
        column = 3 if is_bottom_left or is_top_left else -4    # Which pixel column of the image to count along
        box_width = 0
        moving_window = [image_arr[row, column], image_arr[row, column + direction], image_arr[row, column + 2 * direction]]
        while np.average(moving_window) > FileSorter.box_threshold and box_width < image_width - 5:
            moving_window = [image_arr[row, column], image_arr[row, column + direction], image_arr[row, column + 2 * direction]]
            column += direction
            box_width += 1

        # Crop the image, perform OCR and return the result
        crop_x1 = 92 if is_bottom_left or is_top_left else image_width - box_width + 92
        crop_y1 = 0 if is_top_left or is_top_right else image_height - 92
        crop_x2 = box_width if is_bottom_left or is_top_left else image_width
        crop_y2 = 92 if is_top_left or is_top_right else image_height
        if crop_x1 > image_width or crop_x2 > image_width or crop_y1 > image_height or crop_y2 > image_height or crop_x2 < crop_x1 or crop_y2 < crop_y1:
            return 'No Username Found' # If the computed crop positions are out of bounds, return

        # Provided no errors occurred, parse the cropped image for a username
        text = pytesseract.image_to_string(image.crop((crop_x1, crop_y1, crop_x2, crop_y2)), lang='eng', config=FileSorter.ocr_tesseract_config)
        username = text.strip() # Surrounding whitespace would break label matching and end up in filenames
        return username if username else 'No Username Found'

    # Function to check if the contents of a given path are sorted or not
    def check_if_sorted(self, path, is_videos=False):
        """Return True when the files directly in path are numbered 1..N without gaps.

        Images are expected as 'N.ext'; with is_videos=True videos are expected as
        'VN.ext' (the first character of the name is skipped). When the images are
        sorted and path contains a 'Videos' entry, that folder is checked as well.
        """
        contents = self.get_videos(path, False) if is_videos else self.get_images(path, False)
        # basename/splitext keep this correct even when path ends with a separator
        stems = [os.path.splitext(os.path.basename(file))[0] for file in contents]
        try:
            # Try and cast pre-extension filenames as integers, sort them, and compare to proper names
            numbers = sorted(int(stem[int(is_videos):]) for stem in stems)
        except ValueError: # A name that is not a number means the folder is unsorted
            return False

        if numbers != list(range(1, len(numbers) + 1)):
            return False

        videos_dir = os.path.join(path, 'Videos')
        if os.path.exists(videos_dir): # If /Videos/ exists, check it is sorted as well
            return self.check_if_sorted(videos_dir, True)
        return True

    # Function to rename all files of a given path - /path/1, /path/2, etc. for images, /path/Videos/V1, ... for Videos
    def rename_directory(self, path):
        """Renumber the images of path as 1..N and its videos as Videos/V1..VN.

        Files are first moved to a temporary 'temp' subfolder under their new names so
        that no rename can collide with a file that still carries its old name.
        """
        temp_dir = os.path.join(path, 'temp')
        videos_dir = os.path.join(path, 'Videos')

        # Make the /temp/ subfolder if it doesn't exist
        os.makedirs(temp_dir, exist_ok=True)

        # Move all the images to the /temp/ subfolder
        for count, image in enumerate(self.get_images(path), start=1):
            os.rename(image, os.path.join(temp_dir, f"{count}.{image.split('.')[-1]}"))
            time.sleep(FileSorter.item_move_delay)

        # Move all the images back from /path/temp/ to /path/
        for image in self.get_images(temp_dir):
            os.rename(image, os.path.join(path, os.path.basename(image)))
            time.sleep(FileSorter.item_move_delay)

        ## Move Videos
        # Make the /Videos/ subfolder if it doesn't exist
        os.makedirs(videos_dir, exist_ok=True)

        # NOTE(review): for '-Famous' folders only loose videos are collected, so their new
        # V1..VN names can collide with files already inside /Videos/ - confirm this is intended.
        do_recurse_videos = not path.endswith('-Famous')
        video_list = self.get_videos(path, recursion=do_recurse_videos)
        if video_list:
            # Move all the videos to the /temp/ subfolder
            for count, video in enumerate(video_list, start=1):
                os.rename(video, os.path.join(temp_dir, f"V{count}.{video.split('.')[-1]}"))
                time.sleep(FileSorter.item_move_delay)

            # Move videos back from /path/temp/ to /path/Videos/
            for video in self.get_videos(temp_dir):
                os.rename(video, os.path.join(videos_dir, os.path.basename(video)))
                time.sleep(FileSorter.item_move_delay)

        # If there were no videos, delete the /Videos/ subfolder
        if not self.get_videos(videos_dir):
            os.rmdir(videos_dir)

        # Delete the /temp/ directory
        os.rmdir(temp_dir)

    def get_folders(self, path, recursion=True, ignore_list=None):
        """Return the sub-folders of path, skipping names in ignore_list (default ['Videos']).

        With recursion=True the sub-folders of every returned folder are included too,
        and ignore_list is applied at every level.
        """
        if ignore_list is None:
            ignore_list = ['Videos']

        all_folders = []
        for entry in os.listdir(path):
            full_path = os.path.join(path, entry)
            if os.path.isdir(full_path) and entry not in ignore_list:
                all_folders.append(full_path)
                if recursion:
                    all_folders.extend(self.get_folders(full_path, recursion, ignore_list))

        return all_folders

    # Function to get all text files within a given path and return a list of their directories
    def get_text_files(self, path, recursion=True):
        """Return the paths of all '.txt' files in path (and its sub-folders when recursion is True)."""
        all_files = []
        for entry in os.listdir(path):
            full_path = os.path.join(path, entry)
            if os.path.isdir(full_path): # Folders are never reported as text files
                if recursion:
                    all_files.extend(self.get_text_files(full_path, recursion))
            elif full_path.lower().endswith('.txt'):
                all_files.append(full_path)

        return all_files

    def get_videos(self, path, recursion=False, ignore_list=None):
        """Return the paths of all files in path that end with one of video_extensions."""
        return self.__get_file_list(path, move=True, videos=True, recursion=recursion, ignore_list=ignore_list)

    def get_images(self, path, recursion=False, ignore_list=None):
        """Return the paths of all files in path that end with one of image_extensions."""
        return self.__get_file_list(path, move=True, videos=False, recursion=recursion, ignore_list=ignore_list)

    def get_dupe_images(self, path, recursion=True, ignore_list=None):
        """Return the paths of all files in path that end with one of dupe_extensions."""
        return self.__get_file_list(path, move=False, recursion=recursion, ignore_list=ignore_list)

    # Function to get all images at a given path (returns different files based on move, videos, recursion)
    def __get_file_list(self, path, move=True, videos=False, recursion=True, ignore_list=None):
        """Return the files of path whose extension matches the requested kind.

        Args:
            path: Folder to list.
            move: False selects dupe_extensions; True selects by the videos flag.
            videos: With move=True, selects video_extensions (True) or image_extensions (False).
            recursion: Also descend into sub-folders that are not in ignore_list.
            ignore_list: Folder names to skip while recursing (default ['Duplicates']).
        """
        if ignore_list is None:
            ignore_list = ['Duplicates']

        if not move:
            extensions = FileSorter.dupe_extensions
        elif videos:
            extensions = FileSorter.video_extensions
        else:
            extensions = FileSorter.image_extensions

        all_files = []
        for entry in os.listdir(path): # Go through all entries of the current directory
            full_path = os.path.join(path, entry)
            if os.path.isdir(full_path):
                # Folders are only ever descended into, never returned as files
                if recursion and entry not in ignore_list:
                    all_files.extend(self.__get_file_list(full_path, move, videos, recursion, ignore_list))
            elif entry.lower().endswith(extensions):
                all_files.append(full_path)

        return all_files

In [11]:
# Build a sorter rooted at the Google Drive folder; videos without a named folder go to 'TikTok' instead of the default 'TT'
fs = FileSorter('/Volumes/Seagate Backup Plus Drive/Miscellaneous/Google Drive', tt='TikTok')
# Sort the contents of the Sort folder, then renumber the named folders; sort_df holds the per-file summary
sort_df = fs.sort(sort_named_after=True)

HBox(children=(FloatProgress(value=0.0, description='Recognizing Usernames', layout=Layout(flex='2'), max=379.…




HBox(children=(FloatProgress(value=0.0, description='Moving Socials', layout=Layout(flex='2'), max=97.0, style…




HBox(children=(FloatProgress(value=0.0, description='Renaming Identified Files', layout=Layout(flex='2'), max=…




HBox(children=(FloatProgress(value=0.0, description='Sorting Named Folders', layout=Layout(flex='2'), max=386.…




In [None]:
def identify_duplicates(path, ignore_list=None, verbose=True):
    """Find visually similar images below path and create aliases to them.

    Every image returned by get_image_list() is hashed with an average hash. Two images
    count as duplicates when the difference of their hashes is below the module-level
    hash_threshold. A file that has already been listed as a duplicate of an earlier
    file gets no duplicate list of its own, so each match is reported once.

    Args:
        path: Folder to search; also where the /Duplicates/ folder is created.
        ignore_list: Passed through to get_image_list(); defaults to [None].
        verbose: Print the number of comparisons, a time estimate and the elapsed time.

    Returns:
        A DataFrame with the columns 'File' and 'Duplicates', restricted to the files
        that have at least one duplicate.
    """
    if ignore_list is None:
        ignore_list = [None] # Same value the former mutable default supplied
    image_list = get_image_list(path, ignore_list=ignore_list, move=False)
    start_time = time.time() # Mark the start time for user-notification
    if verbose:
        print ("{} images, requiring {} comparisons".format(len(image_list), num_comparisons(image_list)))
        print ("This should take about {} minutes".format(int(time_per_compare * num_comparisons(image_list) / 60.0)))

    # Loop through all images, calculate the hash and store it and filename in two lists
    name_list, hash_list = [], []
    time.sleep(0.25)
    bar_format_string = '{desc}: {percentage:05.2f}%{bar}{n_fmt}/{total_fmt} [{elapsed}, {rate:05.2f} {unit}/s]'
    for image in tqdm(image_list, unit='hashes', desc='Image Hashing', ncols='100%', bar_format=bar_format_string):
        try:
            with Image.open(image) as opened_image:
                hash_val = imagehash.average_hash(opened_image)
        except OSError:
            # An unreadable image has no hash to compare, so report it and leave it out
            print ("Delete {}".format(image))
            continue

        name_list.append(image)
        hash_list.append(hash_val)

    # Create a DataFrame for managing the hash comparisons and results
    df = pd.DataFrame(np.transpose([name_list, hash_list]), columns=["File", "Hash"])

    # Loop through all rows of the DataFrame, computing the difference in hashes between all
    duplicates = np.empty(len(df), dtype=object) # One array of duplicate file names per row
    already_listed = set() # Files that an earlier row already reported as its duplicate
    for row_num, row in tqdm(df.iterrows(), total=len(df), unit='comparisons', desc='Hash Comparison', ncols='100%', bar_format=bar_format_string):
        if row['File'] in already_listed:
            matches = df["File"].values[:0] # Already reported, so keep this row empty
        else:
            # Store all file names whose hash difference is below the threshold
            is_close = abs(df["Hash"] - row["Hash"]) < hash_threshold
            matches = df[is_close & (df["File"] != row["File"])]["File"].values
            already_listed.update(matches)
        duplicates[row_num] = matches
    df["Duplicates"] = duplicates

    # Create the duplicated entries folder that has aliases to each duplicated file
    create_duplicate_aliases(path, df)

    # If applicable, notify the user of how long the operations took
    if verbose:
        print ("Took {:.2f} minutes".format((time.time() - start_time) / 60.0))

    # Return the DataFrame just for display purposes
    return df[df['Duplicates'].astype(str) != '[]'][['File', 'Duplicates']].reset_index(drop=True)

In [2]:
# Module-level settings read by the functions of this notebook
time_per_compare = 1e-4 # Estimated seconds per hash comparison; only used for the printed time estimate
hash_threshold = 5 # Hash differences below this value are counted as duplicates by identify_duplicates()
progress_report_frequency = 50 # NOTE(review): not referenced by any code visible here - confirm it is still needed
# Extension groups - presumably read by get_image_list(), which is not visible here; verify
video_extensions = ('.mov', '.mp4', '.gif', '.webm', '.mkv')
image_extensions = ('.jpg', '.jpeg', '.png', '.heic')
dupe_extensions = (".jpg", ".jpeg", ".png")

## Function to get the image list
This function takes a given directory and returns all files that fit either the move or duplicate extension criteria, as set by the value of `move`.

## Function to check all images in a given path for duplicates
This is the coup de grâce; all images in the provided path are compared, duplicates are identified and a convenient `DataFrame` is returned.

In [5]:
def identify_duplicates(path, ignore_list=None, verbose=True):
    """Find visually similar images below path and create aliases to them.

    Every image returned by get_image_list() is hashed with an average hash. Two images
    count as duplicates when the difference of their hashes is below the module-level
    hash_threshold. A file that has already been listed as a duplicate of an earlier
    file gets no duplicate list of its own, so each match is reported once.

    Args:
        path: Folder to search; also where the /Duplicates/ folder is created.
        ignore_list: Passed through to get_image_list(); defaults to [None].
        verbose: Print the number of comparisons, a time estimate and the elapsed time.

    Returns:
        A DataFrame with the columns 'File' and 'Duplicates', restricted to the files
        that have at least one duplicate.
    """
    if ignore_list is None:
        ignore_list = [None] # Same value the former mutable default supplied
    image_list = get_image_list(path, ignore_list=ignore_list, move=False)
    start_time = time.time() # Mark the start time for user-notification
    if verbose:
        print ("{} images, requiring {} comparisons".format(len(image_list), num_comparisons(image_list)))
        print ("This should take about {} minutes".format(int(time_per_compare * num_comparisons(image_list) / 60.0)))

    # Loop through all images, calculate the hash and store it and filename in two lists
    name_list, hash_list = [], []
    time.sleep(0.25)
    bar_format_string = '{desc}: {percentage:05.2f}%{bar}{n_fmt}/{total_fmt} [{elapsed}, {rate:05.2f} {unit}/s]'
    for image in tqdm(image_list, unit='hashes', desc='Image Hashing', ncols='100%', bar_format=bar_format_string):
        try:
            with Image.open(image) as opened_image:
                hash_val = imagehash.average_hash(opened_image)
        except OSError:
            # An unreadable image has no hash to compare, so report it and leave it out
            print ("Delete {}".format(image))
            continue

        name_list.append(image)
        hash_list.append(hash_val)

    # Create a DataFrame for managing the hash comparisons and results
    df = pd.DataFrame(np.transpose([name_list, hash_list]), columns=["File", "Hash"])

    # Loop through all rows of the DataFrame, computing the difference in hashes between all
    duplicates = np.empty(len(df), dtype=object) # One array of duplicate file names per row
    already_listed = set() # Files that an earlier row already reported as its duplicate
    for row_num, row in tqdm(df.iterrows(), total=len(df), unit='comparisons', desc='Hash Comparison', ncols='100%', bar_format=bar_format_string):
        if row['File'] in already_listed:
            matches = df["File"].values[:0] # Already reported, so keep this row empty
        else:
            # Store all file names whose hash difference is below the threshold
            is_close = abs(df["Hash"] - row["Hash"]) < hash_threshold
            matches = df[is_close & (df["File"] != row["File"])]["File"].values
            already_listed.update(matches)
        duplicates[row_num] = matches
    df["Duplicates"] = duplicates

    # Create the duplicated entries folder that has aliases to each duplicated file
    create_duplicate_aliases(path, df)

    # If applicable, notify the user of how long the operations took
    if verbose:
        print ("Took {:.2f} minutes".format((time.time() - start_time) / 60.0))

    # Return the DataFrame just for display purposes
    return df[df['Duplicates'].astype(str) != '[]'][['File', 'Duplicates']].reset_index(drop=True)

## Create a `/Duplicates/` folder that contains aliases to each duplicated picture

In [6]:
def create_duplicate_aliases(path, df):
    """Create 'path/Duplicates/<n>/' folders holding symlinks to each group of duplicates.

    Every row of df that has a non-empty 'Duplicates' entry gets its own numbered
    folder. Inside it, alias 0 points at the row's 'File' and aliases 1..k point at
    its duplicates. Aliases are named '<position> - <parent folder>, <file name>'.
    Calling the function again reuses existing folders and replaces existing aliases.

    Args:
        path: Folder in which the /Duplicates/ folder is created.
        df: DataFrame with a 'File' column and a 'Duplicates' column of path sequences.
    """
    def alias_name(position, target):
        # Parent folder name plus the full file name keeps aliases from different folders apart
        parent = os.path.basename(os.path.dirname(target))
        stem, extension = os.path.splitext(os.path.basename(target))
        return '{} - {}, {}{}'.format(position, parent, stem, extension)

    # Filter the DataFrame to ignore all entries without duplicates
    df = df[df["Duplicates"].astype(str) != '[]'][["File", "Duplicates"]].reset_index(drop=True)

    # Return if no duplicates are found
    if len(df) == 0:
        return

    # Create the folder where duplicates will be placed
    duplicates_dir = os.path.join(path, "Duplicates")
    if os.path.isdir(duplicates_dir):
        print ("/Duplicates/ folder already exists.")
    else:
        os.mkdir(duplicates_dir) # Any real failure (e.g. permissions) is raised, not hidden

    # Loop through all rows in the duplicate entries - each row gets one numbered folder
    for row_num, row in df.iterrows():
        group_dir = os.path.join(duplicates_dir, str(row_num))
        os.makedirs(group_dir, exist_ok=True)
        # Position 0 is the source file, positions 1..k are its duplicates
        for position, target in enumerate([row['File'], *row['Duplicates']]):
            link = os.path.join(group_dir, alias_name(position, target))
            if os.path.islink(link):
                os.remove(link) # Replace an alias left over from an earlier run
            os.symlink(target, link)

## Specific function to rename the `/grouped/` subfolders

In [7]:
# Rename the /Grouped/ subfolder
def rename_grouped(path):
    """Renumber the grouped files of path as '<group>-<member><extension>'.

    Files are first moved into 'path + "new/"' under their new names and are moved
    back into path once every group has been handled; the 'new' folder is then removed.

    path is concatenated directly with file names, so it is expected to end with a
    path separator. get_image_list() is defined outside this block; it presumably
    returns file names relative to path - verify against its definition.
    """
    image_list = get_image_list(path)
    originalLen = len(image_list)
    if not os.path.isdir(path + "new/"): # Make the /new/ subfolder if it doesn't exist
        os.makedirs(path + "new/")
        print ("Created /new/ folder")
    adjust_val = 0 # Number of loop iterations that found their group already moved
    # enumerate() keeps iterating the list bound here, even though image_list is rebound below
    for count, image in enumerate(image_list):
        group_count = image.split("-")[0] # Grab the group count of this image
        # group_list is the list of all image in the same group (taken from the refreshed image_list)
        group_list = [img for img in image_list if img.split("-")[0] == group_count]
        # If the subgroup of this image has already been renamed, adjust the count accordingly
        if len(group_list) == 0:
            adjust_val += 1
        # Loop through all images of this same subgroup
        for sub_count, sub_image in enumerate(group_list):
            # NOTE(review): the extension is taken from `image`, not `sub_image`, so every
            # member of the group receives the extension of the first image - confirm intended
            if image.lower().endswith(".jpeg"): # New name for longer file names
                new_name = "{}new/{}-{}{}".format(path, count+1-adjust_val, sub_count+1, image[-5:])
            else:
                new_name = "{}new/{}-{}{}".format(path, count+1-adjust_val, sub_count+1, image[-4:])
            os.rename(path + sub_image, new_name) # Move to the /new/ folder (with the new name)
            time.sleep(0.08) # Sleep between each command to avoid losing files
            
        image_list = get_image_list(path) # Reset the image list now that some have been moved
        
    # Move the images back from /Grouped/new/ to /Grouped/
    for image in os.listdir(path + "new/"):
        os.rename(path + "new/" + image, path + image)
        time.sleep(0.08)
        
    os.rmdir(path + "new") # Delete /path/new/ subfolder
    print ("{} images renamed.".format(originalLen))