# Duplicate Picture Identifier
##### by Collin Heist

In [99]:
from pathlib import Path
from re import search, IGNORECASE

import numpy as np
from PIL import Image, ImageOps

from pytesseract import image_to_string
from tqdm.notebook import tqdm



import imagehash               # Hashing images for duplicate identification
import time                    # Time delays between file movements
import pandas as pd            # Dataframe (how all data is parsed)

In [103]:
class PersonExistsError(Exception):
    """Raised when a person being added is already present in the social map."""


class InvalidPersonDetailsError(Exception):
    """
    Error for invalid person details.

    NOTE(review): not raised anywhere in the visible code - confirm the intended use.
    """


class PersonDoesNotExistError(Exception):
    """Raised when a requested person is not present in the social map."""

class FileDoesNotBelongError(Exception):
    """Raised when a file is not valid for the grouped directory it is being placed in."""


class FileDoesNotExistError(Exception):
    """Raised when a file that should be moved does not exist on disk."""


class UnidentifiedFiletypeError(Exception):
    """Raised when a file's extension is neither a known image nor a known video type."""

In [129]:
class Person:
    """
    A person, the directories holding their media, and their social-media usernames.

    Images live directly in ``path`` as ``<n><ext>`` and videos live in ``path / 'Videos'`` as
    ``V<n><ext>``, where ``<n>`` is a positive integer.
    """

    # Lowercase file suffixes (leading dot included) treated as images and as videos
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.heic')
    VIDEO_EXTENSIONS = ('.mov', '.mp4', '.gif', '.webm', '.mkv')

    # Matches numbered media filenames such as ``12.jpg`` or ``V3.mp4`` and captures the number.
    # The suffixes above already start with a dot, so that dot is dropped before they are joined
    # behind the single escaped dot; otherwise ordinary names like ``1.jpg`` can never match.
    __NUMBERED_FILE_REGEX = (
        r'^V?(\d+)\.(?:'
        + '|'.join(ext[1:] for ext in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS)
        + r')$'
    )

    def __init__(self, name: str, path: Path):
        """
        Create a person and compute the next free image/video filenames.

        :param name: This person's name.
        :param path: Directory holding this person's images (``str`` or ``Path``). Videos are
                     expected in its ``Videos`` sub-directory.
        """

        # This person's name
        self.name: str = name

        # The path this person's files are found at
        self.path: Path = path if isinstance(path, Path) else Path(path)

        # The path this person's video files are found at
        self.video_path: Path = self.path / 'Videos'

        # This person's social media to usernames map
        self.__socials: dict = {}

        # Case-folded usernames across all social media, used for membership tests
        self.__usernames: set = set()

        # The (numeric) names new files should be added under
        self.__next_filenames: dict = {'image': None, 'video': None}
        self.__update_next_valid_filenames()

    def __str__(self) -> str:
        """Get a less detailed string representation of the object."""

        return f'<Person "{self.name}" at "{self.path}" with socials {", ".join(self.__socials.keys())}>'

    def __repr__(self) -> str:
        """Get a more detailed string representation of the object."""

        return f'<Person "{self.name}" at "{self.path.resolve()}" with socials {self.__socials}>'

    def __contains__(self, username: str) -> bool:
        """
        Check whether ``username`` is one of this person's usernames on any social media.

        The comparison is case-insensitive and matches whole usernames literally. Empty strings
        and non-string values are never contained.

        :param username: The username to look for.
        :returns: True if the username belongs to this person, False otherwise.
        """

        if not isinstance(username, str) or not username:
            return False

        return username.casefold() in self.__usernames

    def add_socials(self, **socials: dict) -> None:
        """
        Register usernames for one or more social media.

        :param socials: Keyword arguments mapping a social media name to either a single
                        username or a list of usernames. Repeated calls extend the usernames
                        already stored for that media.
        """

        for media, usernames in socials.items():
            # Ensure the value is a list; copy so the caller's list is never aliased or mutated
            names = list(usernames) if isinstance(usernames, list) else [usernames]

            self.__usernames.update(str(name).casefold() for name in names)
            self.__socials.setdefault(media, []).extend(names)

    def add_file(self, file: Path) -> None:
        """
        Move ``file`` into this person's image or video directory under the next free number.

        :param file: The file to move (``str`` or ``Path``).
        :raises FileDoesNotExistError: If ``file`` does not exist.
        :raises UnidentifiedFiletypeError: If the suffix is not a known image or video suffix.
        """

        file = file if isinstance(file, Path) else Path(file)

        # If the file does not exist, error
        if not file.exists():
            raise FileDoesNotExistError(f'File ({file.name}) does not exist.')

        # Depending on if the file is a image/video, pick the destination accordingly
        suffix = file.suffix.lower()
        if suffix in self.IMAGE_EXTENSIONS:
            destination = self.path / f"{self.__next_filenames['image']}{file.suffix}"
        elif suffix in self.VIDEO_EXTENSIONS:
            destination = self.video_path / f"V{self.__next_filenames['video']}{file.suffix}"
        else:
            raise UnidentifiedFiletypeError(f'File ({file.name}) does not have a known filetype.')

        # The target directory may not exist yet (e.g. the first video for this person)
        destination.parent.mkdir(parents=True, exist_ok=True)
        file.rename(destination)

        # Get next valid filenames
        self.__update_next_valid_filenames()

    def sort(self) -> None:
        """Sort this person's image and videos into sequential order."""

        self.__sort(False)  # Sort images
        self.__sort(True)   # Sort videos

    def __sort(self, is_video: bool) -> None:
        """
        Report the renames needed so all items in the image or video path are sequential. For
        example, the image directory (``is_video`` is False) below would rename as:

            /``self.path``/  ->  /``self.path``/
            1.jpeg           ->  [no rename]
            3.jpg            ->  [no rename]
            4.png            ->  2.png

        NOTE: this is currently a dry run - the planned renames are only printed.

        :param is_video: True to process the video directory, False for the image directory.
        """

        # Get the extensions/paths/prefix for this sort type (video or image)
        extensions = self.VIDEO_EXTENSIONS if is_video else self.IMAGE_EXTENSIONS
        search_path = self.video_path if is_video else self.path
        prefix = 'V' if is_video else ''

        # Exit if the person's path/video path doesn't exist
        if not search_path.exists():
            return

        # All images/videos in the search path, in a stable order so the plan is reproducible
        files = sorted(entry for entry in search_path.iterdir()
                       if entry.suffix.lower() in extensions)

        # The names (without extension) a fully sorted directory would contain
        wanted = [f'{prefix}{number}' for number in range(1, len(files) + 1)]
        wanted_lookup = set(wanted)
        present = {entry.stem for entry in files}

        # Files occupying a name outside the wanted range, and the wanted names still unused
        extras = [entry for entry in files if entry.stem not in wanted_lookup]
        missing = [stem for stem in wanted if stem not in present]

        # Go through all extra filenames and rename to missing filename
        for start_path, new_name in zip(extras, missing):
            new_name_path = search_path / f'{new_name}{start_path.suffix}'
            print(f'Renaming {start_path.name} -> {new_name_path}')
            # NOTE: dry run - the actual rename is intentionally disabled:
            # start_path.rename(new_name_path)

    def __update_next_valid_filenames(self) -> None:
        """
        Recompute the lowest unused file number for the image and the video directory.

        A directory that does not exist yet is treated as empty, so its next number is ``'1'``.
        """

        for label, directory in (('image', self.path), ('video', self.video_path)):
            taken = set()
            if directory.exists():
                for entry in directory.iterdir():
                    match = search(self.__NUMBERED_FILE_REGEX, entry.name, IGNORECASE)
                    if match:
                        taken.add(int(match.group(1)))

            file_number = 1
            while file_number in taken:
                file_number += 1

            self.__next_filenames[label] = str(file_number)

In [326]:
class SocialMap:
    """
    A collection of ``Person`` objects keyed by name, all rooted under one base directory.
    """

    def __init__(self, base_directory: Path) -> None:
        """
        Create an empty social map.

        :param base_directory: Directory (``str`` or ``Path``) that every person's path is
                               joined onto.
        """

        # Person name -> Person object
        self._map: dict = {}

        # Create Path object if a non-Path directory was given
        self._base_directory: Path = (
            base_directory if isinstance(base_directory, Path) else Path(base_directory)
        )

    def __str__(self) -> str:
        """Get a less detailed string representation of the object."""

        return f'<Social Map of {len(self._map)} Person objects>'

    def __repr__(self) -> str:
        """Get a more detailed string representation of the object."""

        return f'<Social Map of {len(self._map)} Person objects ({", ".join(self._map)})>'

    def __contains__(self, person) -> bool:
        """
        Check whether a person is in the map.

        :param person: Either a person's name or a ``Person`` object (matched by its name).
        :returns: True if present; False otherwise, including for any other type.
        """

        if isinstance(person, str):
            return person in self._map
        if isinstance(person, Person):
            return person.name in self._map

        return False

    def __getitem__(self, person: str) -> 'Person':
        """
        Get the ``Person`` stored for the given name (or ``Person`` object).

        :raises PersonDoesNotExistError: If the person is not in the map.
        """

        if person not in self:
            raise PersonDoesNotExistError(f'Person {person} does not exist in social map.')

        return self._map[self._name_of(person)]

    def __iadd__(self, person: tuple) -> 'SocialMap':
        """Wrapper for ``add_person()`` method; expects a ``(name, path)`` tuple."""

        if not isinstance(person, tuple) or len(person) != 2:
            raise ValueError('Person creation requires two arguments, "name" and "path".')

        self.add_person(*person)

        return self

    def __iter__(self) -> iter:
        """Iterate over the names of all people in the map."""

        yield from self._map

    @staticmethod
    def _name_of(person) -> str:
        """Return the map key for ``person``, given either a name or a ``Person`` object."""

        return person if isinstance(person, str) else person.name

    def add_person(self, name: str, path: Path) -> None:
        """
        Create a ``Person`` and add it to the map under its name.

        :param name: The person's name.
        :param path: The person's directory, relative to this map's base directory.
        :raises PersonExistsError: If a person with this name is already in the map.
        """

        # Create Person object from given arguments and base directory
        person = Person(name, self._base_directory / path)

        # See if the person is already in the map
        if person in self:
            raise PersonExistsError(f'{person} already exists in social map.')

        # Add person to map under their name
        self._map[person.name] = person

    def add_socials(self, name: str, **socials: dict) -> None:
        """
        Add social media usernames to an existing person.

        :param name: Name of the person to update.
        :param socials: Passed through to ``Person.add_socials()``.
        :raises PersonDoesNotExistError: If the person is not in the map.
        """

        # Check the given name has an associated Person object
        if name not in self:
            raise PersonDoesNotExistError(f'Person {name} does not exist in social map.')

        self._map[self._name_of(name)].add_socials(**socials)

    def sort(self, name: str) -> None:
        """
        Sort the files of a single person.

        :param name: Name of the person whose files should be sorted.
        :raises PersonDoesNotExistError: If the person is not in the map.
        """

        if name not in self:
            raise PersonDoesNotExistError(f'Person {name} does not exist in social map.')

        self._map[self._name_of(name)].sort()

    def sort_all(self) -> None:
        """Sort the files of every person in the map."""

        for person in self._map.values():
            person.sort()

    def add_file_to_person(self, person: str, file: Path) -> None:
        """
        Give a file to a person in the map.

        :param person: The person's name or a ``Person`` object.
        :param file: The file to hand to ``Person.add_file()``.
        :raises PersonDoesNotExistError: If the person is not in the map.
        """

        if person not in self:
            raise PersonDoesNotExistError(f'Person {person} does not exist in social map.')

        self._map[self._name_of(person)].add_file(file)

In [38]:
class UnsortedDirectory:
    """
    A directory of not-yet-sorted media files whose owner can be identified either from a
    ``<label>-<number>.<ext>`` filename or by OCR of a username box in an image corner.
    """

    # Extensions that are valid files (and to be iterated through) for this directory.
    VALID_EXTENSIONS = ('jpg', 'jpeg', 'png', 'heic',
                        'mov', 'mp4', 'gif', 'webm', 'mkv')

    # Regex to determine if a file is labeled; group 1 is the label.
    IS_LABELED_REGEX = rf'^(.*)-\d+\.(?:{"|".join(VALID_EXTENSIONS)})$'

    # String to use when a person cannot be identified using OCR.
    UNIDENTIFIED = 'UNIDENTIFIED PERSON'

    # Config parameters to use for Pytesseract OCR
    OCR_CONFIG = '--psm 6 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._'

    # Threshold between black and white used to detect edge of username box
    _BOX_THRESHOLD = 250 # 0 (black) - 255 (white) for average of the 3x92 pixel block being looked at

    def __init__(self, path: Path) -> None:
        """
        :param path: The unsorted directory (``str`` or ``Path``).
        """

        self.path = path if isinstance(path, Path) else Path(path)

    def __iter__(self) -> iter:
        """Iterate over every file in the directory whose extension is a valid extension."""

        yield from (file for file in self.path.iterdir()
                    if file.suffix.lstrip('.').lower() in self.VALID_EXTENSIONS)

    def identify_person(self, file: Path) -> str:
        """
        Identify who a file belongs to.

        A labeled filename (``<label>-<number>.<ext>``) returns its label without touching the
        file. Otherwise the image is inspected for a bright box in exactly one corner and the
        text next to it is read with OCR.

        :param file: The file to identify.
        :returns: The label or OCR'd text, or ``UNIDENTIFIED`` when the file is missing, cannot
                  be opened as an image, is too small, or has no single username box.
        """

        file = file if isinstance(file, Path) else Path(file)

        # If the file is labeled - just return that label
        is_labeled = search(self.IS_LABELED_REGEX, file.name, IGNORECASE)
        if is_labeled:
            return is_labeled.group(1)

        if not file.exists():
            return self.UNIDENTIFIED

        ##  If the file is not labeled - perform OCR
        # Open the image, invert, and convert to black and white. Videos and unreadable images
        # are valid files for this directory but cannot be opened by PIL.
        try:
            with Image.open(file) as opened:
                image = ImageOps.invert(opened.convert('L'))
        except OSError:
            return self.UNIDENTIFIED

        image_arr = np.array(image)
        width, height = image.size
        if width < 92 or height < 92:
            return self.UNIDENTIFIED # If the image is too small for a valid username tag, leave

        # Determine which corner of the image the label is in by looking at sections of 92 pixels in each corner
        threshold = self._BOX_THRESHOLD
        is_bottom_left = (np.average(image_arr[-92:, 1:4]) > threshold
                          and np.average(image_arr[-5:-2, :92]) > threshold)
        is_top_left = (np.average(image_arr[:92, 1:4]) > threshold
                       and np.average(image_arr[1:4, :92]) > threshold)
        is_bottom_right = (np.average(image_arr[-92:, -5:-2]) > threshold
                           and np.average(image_arr[-5:-2, -92:]) > threshold)
        is_top_right = (np.average(image_arr[:92, -5:-2]) > threshold
                        and np.average(image_arr[1:4, -92:]) > threshold)
        if np.sum([is_bottom_left, is_top_left, is_bottom_right, is_top_right]) != 1:
            return self.UNIDENTIFIED # If the number of boxes found != 1, leave

        is_top = is_top_left or is_top_right
        is_left = is_bottom_left or is_top_left

        # Find width of the label field by counting the number of black (white) pixels
        row = 3 if is_top else -4         # Which pixel row of the image to count along
        direction = 1 if is_left else -1  # Which direction to iterate through the image in
        column = 3 if is_left else -4     # Which pixel column of the image to count along
        box_width = 0
        moving_window = [image_arr[row, column], image_arr[row, column + direction], image_arr[row, column + 2 * direction]]
        while np.average(moving_window) > threshold and box_width < width - 5:
            moving_window = [image_arr[row, column], image_arr[row, column + direction], image_arr[row, column + 2 * direction]]
            column += direction
            box_width += 1

        # Crop the image, perform OCR and return the result
        crop_x1 = 92 if is_left else width - box_width + 92
        crop_y1 = 0 if is_top else height - 92
        crop_x2 = box_width if is_left else width
        crop_y2 = 92 if is_top else height
        if crop_x1 > width or crop_x2 > width or crop_y1 > height or crop_y2 > height or crop_x2 < crop_x1 or crop_y2 < crop_y1:
            return self.UNIDENTIFIED # If the computed crop positions are out of bounds, leave

        # Provided no errors occurred, parse the cropped image for a username and return the
        # result. Surrounding whitespace (Tesseract's trailing newline / form feed) is not part
        # of a username, so it is stripped.
        text = image_to_string(image.crop((crop_x1, crop_y1, crop_x2, crop_y2)), lang='eng', config=self.OCR_CONFIG)
        return text.strip()

In [14]:
from abc import ABC, abstractmethod
class GroupedDirectory(ABC):
    """
    Abstract base for a named directory that collects files of particular types.

    Subclasses are expected to define a ``VALID_EXTENSIONS`` attribute (read by
    ``should_contain()``) and to implement every abstract method below.
    """

    def __init__(self, path: Path, name: str) -> None:
        """
        :param path: Location of this directory; anything that is not a ``Path`` is wrapped
                     in one.
        :param name: Name stored on the instance as ``self.name``.
        """

        self.path = path if isinstance(path, Path) else Path(path)
        self.name = name

    def should_contain(self, path: Path) -> bool:
        """
        Decide from the extension alone whether ``path`` fits this directory.

        :param path: The file to check.
        :returns: True if the lowercased suffix, with its dots removed, is one of
                  ``self.VALID_EXTENSIONS``.
        """

        extension = path.suffix.replace('.', '').lower()

        return extension in self.VALID_EXTENSIONS

    @abstractmethod
    def sort(self) -> None:
        """Sort the contents of this directory. Must be implemented by subclasses."""

    @abstractmethod
    def add_file(self, path: Path, force: bool=False) -> None:
        """Add the file at ``path`` to this directory. Must be implemented by subclasses."""

    @abstractmethod
    def _update_next_valid_filenames(self) -> None:
        """Refresh the next valid filename(s). Must be implemented by subclasses."""

In [15]:
class Videos(GroupedDirectory):
    """
    Grouped directory of video files named sequentially as ``V1``, ``V2``, ... ``V[n]``.
    """

    # All valid extensions for this GroupedDirectory.
    VALID_EXTENSIONS = ('mov', 'mp4', 'gif', 'webm', 'mkv')

    # Regex to identify which files are numbered; group 1 is the ``V<number>`` stem
    __NUMBERED_FILE_REGEX = rf'^(V\d+)\.(?:{"|".join(VALID_EXTENSIONS)})$'

    def __init__(self, path: Path) -> None:
        """
        :param path: Location of the videos directory (``str`` or ``Path``).
        """

        super().__init__(path, name='Videos')

        # Stem (no extension) the next added file will receive
        self._next_filenames: str = 'V1'
        self._update_next_valid_filenames()

    def sort(self) -> None:
        """
        Report the renames needed to make the videos sequential (``V1`` ... ``V[n]``).

        NOTE: this is currently a dry run - the planned renames are only printed. Does
        nothing if the directory does not exist.
        """

        if not self.path.exists():
            return

        # All videos in this grouped directory, in a stable order so the plan is reproducible
        files = sorted(entry for entry in self.path.iterdir()
                       if entry.suffix.lstrip('.').lower() in self.VALID_EXTENSIONS)

        # The names (without extension) a fully sorted directory would contain
        wanted = [f'V{number}' for number in range(1, len(files) + 1)]
        wanted_lookup = set(wanted)
        present = {entry.stem for entry in files}

        # Files occupying a name outside the wanted range, and the wanted names still unused
        extras = [entry for entry in files if entry.stem not in wanted_lookup]
        missing = [stem for stem in wanted if stem not in present]

        # Go through all extra filenames and rename to missing filename
        for start_path, new_name in zip(extras, missing):
            new_name_path = self.path / f'{new_name}{start_path.suffix}'
            print(f'Renaming {start_path.name} -> {new_name_path.name}')
            # NOTE: dry run - the actual rename is intentionally disabled:
            # start_path.rename(new_name_path)

        self._update_next_valid_filenames()

    def add_file(self, path: Path, force: bool=False) -> None:
        """
        Report the rename that would add ``path`` under the next free ``V[n]`` name.

        NOTE: this is currently a dry run - the planned rename is only printed.

        :param path: The file to add (``str`` or ``Path``).
        :param force: Add the file even if its extension is not valid for this directory.
        :raises FileDoesNotExistError: If ``path`` does not exist.
        :raises FileDoesNotBelongError: If the file does not belong here and ``force`` is False.
        """

        # Make into Path object if not given
        if isinstance(path, str):
            path = Path(path)

        # If the path doesn't exist, error
        if not path.exists():
            raise FileDoesNotExistError(f'File ({path.name}) does not exist.')

        # If the file shouldn't belong to this grouped directory, error unless it's forced
        if not force and not self.should_contain(path):
            raise FileDoesNotBelongError(f'File ({path.name}) does not belong to "'
                                         f'{self.name}" GroupedDirectory.')

        # Rename the file to the next valid filename(s)
        destination = self.path / (self._next_filenames + path.suffix)
        print(f'Renaming {path} -> {destination}')
        # NOTE: dry run - the actual rename is intentionally disabled:
        # path.rename(destination)

        # Update next valid filename(s)
        self._update_next_valid_filenames()

    def _update_next_valid_filenames(self) -> None:
        """
        Set ``self._next_filenames`` to the lowest ``V[n]`` not used by a numbered file.

        Leaves the current value untouched if the directory does not exist.
        """

        # If this GroupedDirectory's path doesn't exist, exit
        if not self.path.exists():
            return

        # Numbers of all numbered files in this directory. The regex is case-insensitive, so
        # the number is compared rather than the raw stem (``v3.mp4`` occupies slot 3 too).
        taken = set()
        for entry in self.path.iterdir():
            match = search(self.__NUMBERED_FILE_REGEX, entry.name, IGNORECASE)
            if match:
                taken.add(int(match.group(1)[1:]))

        # Next valid filename starts at V1...V[n]
        file_number = 1
        while file_number in taken:
            file_number += 1

        self._next_filenames = f'V{file_number}'
        
        

In [71]:
class Grouped(GroupedDirectory):
    """
    Grouped directory of files named ``<grouping>-<number>.<ext>`` (e.g. ``4-2.jpg``), where
    files sharing a grouping are numbered sequentially within it.
    """

    # All valid extensions for this GroupedDirectory.
    VALID_EXTENSIONS = ('jpg', 'jpeg', 'png', 'heic',
                        'mov', 'mp4', 'gif', 'webm', 'mkv')

    # Regex to identify which files are numbered; group 1 is the grouping, group 2 is the
    # remainder of the name (``<number>.<ext>``)
    __NUMBERED_FILE_REGEX = rf'^(\d+)-(\d+\.(?:{"|".join(VALID_EXTENSIONS)}))$'

    def __init__(self, path: Path) -> None:
        """
        :param path: Location of the grouped directory (``str`` or ``Path``).
        """

        super().__init__(path, name='Grouped')

        # Grouping -> next free stem in that grouping, plus 'new' -> first stem of the next
        # unused grouping. 'new' is pre-seeded so it exists even if the directory does not.
        self._next_filenames: dict = {'new': '1-1'}
        self._update_next_valid_filenames()

    def should_contain(self, path: Path) -> bool:
        """
        Check whether ``path`` has a valid extension and a ``<grouping>-<number>`` name.

        :param path: The file to check.
        :returns: True only if both conditions hold.
        """

        valid_ext = path.suffix.replace('.', '').lower() in self.VALID_EXTENSIONS

        return valid_ext and self.__find_groupings(path) is not None

    def add_file(self, file: Path, force: bool=False) -> None:
        """
        Report the rename that would add ``file`` to its grouping (or to a new grouping).

        NOTE: this is currently a dry run - the planned rename is only printed.

        :param file: The file to add (``str`` or ``Path``).
        :param force: Accepted for interface compatibility but ignored; a file without a
                      grouping cannot be forced into this directory.
        :raises FileDoesNotExistError: If ``file`` does not exist.
        :raises FileDoesNotBelongError: If ``file`` is not valid for this directory.
        """

        # Make into Path object if not given
        if isinstance(file, str):
            file = Path(file)

        # If the path doesn't exist, error
        if not file.exists():
            raise FileDoesNotExistError(f'File ({file.name}) does not exist.')

        # If the file shouldn't belong to this grouped directory, error
        if not self.should_contain(file):
            raise FileDoesNotBelongError(f'File ({file.name}) does not belong to "'
                                         f'{self.name}" GroupedDirectory - cannot '
                                         f'force.')

        # Get this file's grouping, e.g. '4-12313.jpg' -> '4'
        grouping = self.__find_groupings(file, 1)

        # Known groupings continue their own numbering; unknown ones start the 'new' grouping
        stem = self._next_filenames.get(grouping, self._next_filenames['new'])
        new_name = self.path / (stem + file.suffix)
        print(f'Renaming {file.name} -> {new_name.name}')
        # NOTE: dry run - the actual rename is intentionally disabled:
        # file.rename(new_name)

        # Update next valid filename(s)
        self._update_next_valid_filenames()

    def sort(self) -> None:
        """
        Report the renames needed so every grouping is numbered ``<g>-1`` ... ``<g>-n``.

        NOTE: this is currently a dry run - the planned renames are only printed. Does
        nothing if the directory does not exist.

        :raises FileDoesNotBelongError: If a file with a valid extension has no grouping.
        """

        if not self.path.exists():
            return

        # All files in this grouped directory, in a stable order so the plan is reproducible
        files = sorted(entry for entry in self.path.iterdir()
                       if entry.suffix.lstrip('.').lower() in self.VALID_EXTENSIONS)

        # If not all files are numbered, the directory cannot be sorted
        for file in files:
            if not self.__find_groupings(file):
                raise FileDoesNotBelongError(f'File ({file}) is not valid for this GroupedDirectory.')

        # Collect the filenames of each grouping (e.g. 1-1, 1-3, 2-1 -> {'1': [...], '2': [...]})
        by_grouping = {}
        for file in files:
            by_grouping.setdefault(self.__find_groupings(file, 1), []).append(Path(file.name))

        # Go through each grouping and compare the present names to the desired ones
        for grouping in sorted(by_grouping, key=int):
            members = by_grouping[grouping]

            # The desired stems for this grouping - e.g ['1-1', '1-2', ..., '1-n']
            wanted = [f'{grouping}-{number}' for number in range(1, len(members) + 1)]
            wanted_lookup = set(wanted)
            present = {member.stem for member in members}

            extras = [member for member in members if member.stem not in wanted_lookup]
            missing = [stem for stem in wanted if stem not in present]

            for start_name, end_name in zip(extras, missing):
                print(f'Renaming {start_name} -> {end_name + start_name.suffix}')
                # NOTE: dry run - the actual rename is intentionally disabled:
                # (self.path / start_name).rename(self.path / (end_name + start_name.suffix))

        # Update next valid filename(s)
        self._update_next_valid_filenames()

    def _update_next_valid_filenames(self) -> None:
        """
        Recompute the next free stem of every grouping and of the next new grouping.

        Leaves the current values untouched if the directory does not exist.
        """

        # If this GroupedDirectory's path doesn't exist, return
        if not self.path.exists():
            return

        # Names of all numbered files in this directory
        names = [entry.name for entry in self.path.iterdir() if self.__find_groupings(entry.name)]
        stems = {Path(name).stem for name in names}

        # Get a set of all the unique groupings (e.g. 1-1, 2-1, 3-1 -> {1, 2, 3})
        unique_groupings = {self.__find_groupings(name, 1) for name in names}

        # Built from scratch so groupings that no longer exist do not linger
        next_filenames = {}

        # Next valid filenames are determined per-grouping
        for grouping in unique_groupings:
            file_number = 1
            while f'{grouping}-{file_number}' in stems:
                file_number += 1

            next_filenames[grouping] = f'{grouping}-{file_number}'

        # Next new-grouping is the next sequential grouping not yet present
        # e.g. 1-1, 2-1, 4-1 -> 3-1
        present_groupings = {int(grouping) for grouping in unique_groupings}
        new_grouping = 1
        while new_grouping in present_groupings:
            new_grouping += 1
        next_filenames['new'] = f'{new_grouping}-1'

        self._next_filenames = next_filenames

    def __find_groupings(self, filename: str, group_index: int=None) -> str:
        """
        Match a filename against the ``<grouping>-<number>.<ext>`` pattern.

        :param filename: A filename string, or a ``Path`` whose ``name`` is used.
        :param group_index: Regex group to return (1 for the grouping, 2 for the rest of the
                            name). If omitted, the match object itself (or None) is returned.
        :returns: The match object / None when ``group_index`` is None, otherwise the text of
                  the requested group. Requesting a group for a non-matching name raises
                  AttributeError, so callers check for a match first.
        """

        name = filename.name if isinstance(filename, Path) else filename
        match = search(self.__NUMBERED_FILE_REGEX, name, IGNORECASE)

        if group_index is None:
            return match

        return match.group(group_index)
        

In [323]:
class Ungrouped(GroupedDirectory):
    """
    Grouped directory of image files named sequentially as ``1``, ``2``, ... ``[n]``.
    """

    # All valid extensions for this GroupedDirectory.
    VALID_EXTENSIONS = ('jpg', 'jpeg', 'png', 'heic')

    # Regex to identify which files are numbered; group 1 is the number
    __NUMBERED_FILE_REGEX = rf'^(\d+)\.(?:{"|".join(VALID_EXTENSIONS)})$'

    def __init__(self, path: Path) -> None:
        """
        :param path: Location of the ungrouped directory (``str`` or ``Path``).
        """

        super().__init__(path, name='Ungrouped')

        # Stem (no extension) the next added file will receive
        self._next_filenames: str = '1'
        self._update_next_valid_filenames()

    def sort(self) -> None:
        """
        Report the renames needed to make the images sequential (``1`` ... ``[n]``).

        NOTE: this is currently a dry run - the planned renames are only printed. Does
        nothing if the directory does not exist.
        """

        if not self.path.exists():
            return

        # All images in this grouped directory, in a stable order so the plan is reproducible
        paths = sorted(entry for entry in self.path.iterdir()
                       if entry.suffix.lstrip('.').lower() in self.VALID_EXTENSIONS)

        # The names (without extension) a fully sorted directory would contain
        wanted = [f'{number}' for number in range(1, len(paths) + 1)]
        wanted_lookup = set(wanted)
        present = {entry.stem for entry in paths}

        # Files occupying a name outside the wanted range, and the wanted names still unused
        extras = [entry for entry in paths if entry.stem not in wanted_lookup]
        missing = [stem for stem in wanted if stem not in present]

        # Go through all extra filenames and rename to missing filename
        for start_path, new_name in zip(extras, missing):
            new_name_path = self.path / f'{new_name}{start_path.suffix}'
            print(f'Renaming {start_path.name} -> {new_name_path.name}')
            # NOTE: dry run - the actual rename is intentionally disabled:
            # start_path.rename(new_name_path)

        self._update_next_valid_filenames()

    def add_file(self, path: Path, force: bool=False) -> None:
        """
        Report the rename that would add ``path`` under the next free number.

        This abstract method of ``GroupedDirectory`` must be implemented for the class to be
        instantiable; it mirrors ``Videos.add_file()``.

        NOTE: this is currently a dry run - the planned rename is only printed.

        :param path: The file to add (``str`` or ``Path``).
        :param force: Add the file even if its extension is not valid for this directory.
        :raises FileDoesNotExistError: If ``path`` does not exist.
        :raises FileDoesNotBelongError: If the file does not belong here and ``force`` is False.
        """

        # Make into Path object if not given
        if isinstance(path, str):
            path = Path(path)

        # If the path doesn't exist, error
        if not path.exists():
            raise FileDoesNotExistError(f'File ({path.name}) does not exist.')

        # If the file shouldn't belong to this grouped directory, error unless it's forced
        if not force and not self.should_contain(path):
            raise FileDoesNotBelongError(f'File ({path.name}) does not belong to "'
                                         f'{self.name}" GroupedDirectory.')

        # Rename the file to the next valid filename
        destination = self.path / (self._next_filenames + path.suffix)
        print(f'Renaming {path} -> {destination}')
        # NOTE: dry run - the actual rename is intentionally disabled:
        # path.rename(destination)

        # Update next valid filename
        self._update_next_valid_filenames()

    def _update_next_valid_filenames(self) -> None:
        """
        Set ``self._next_filenames`` to the lowest number not used by a numbered file.

        Leaves the current value untouched if the directory does not exist.
        """

        # If this GroupedDirectory's path doesn't exist, exit
        if not self.path.exists():
            return

        # Numbers of all numbered files in this directory
        taken = set()
        for entry in self.path.iterdir():
            match = search(self.__NUMBERED_FILE_REGEX, entry.name, IGNORECASE)
            if match:
                taken.add(int(match.group(1)))

        # Next valid filename starts at 1...[n]
        file_number = 1
        while file_number in taken:
            file_number += 1

        self._next_filenames = str(file_number)
        
        

In [None]:
class FileSorter:
    """Hands files from an unsorted directory to people in a social map or to grouped directories."""

    def __init__(self, social_map: 'SocialMap') -> None:
        """
        :param social_map: The social map whose people may receive files.
        """

        self.social_map = social_map

    def sort_unsorted(self, unsorted: 'UnsortedDirectory', *grouped_directories: 'GroupedDirectory') -> None:
        """
        Place every file of ``unsorted`` with its person or in a grouped directory.

        For each file, the identified person receives it if they are in the social map.
        Otherwise the file goes to the first grouped directory whose ``should_contain()``
        accepts it. Files matching neither are skipped.

        :param unsorted: The directory to iterate; also used to identify each file's person.
        :param grouped_directories: Candidate directories, tried in the order given.
        """

        for file in unsorted:
            person = unsorted.identify_person(file)

            # If the person is in the social map - give the file to that person
            if person is not None and person in self.social_map:
                self.social_map.add_file_to_person(person, file)
                continue

            # If the person is NOT in the social map, try and place in a grouped directory.
            # Stop at the first directory that takes the file so it is only placed once.
            for grouped_directory in grouped_directories:
                if grouped_directory.should_contain(file):
                    grouped_directory.add_file(file)
                    break

In [None]:
def identify_duplicates(path, ignore_list=None, verbose=True):
    """
    Hash every image under ``path`` and report images whose hashes are nearly identical.

    Relies on names defined elsewhere in this file/notebook: ``get_image_list``,
    ``num_comparisons``, ``time_per_compare``, ``hash_threshold`` and
    ``create_duplicate_aliases``.

    :param path: Passed to ``get_image_list`` and ``create_duplicate_aliases``.
    :param ignore_list: Passed to ``get_image_list``; defaults to ``[None]``.
    :param verbose: If True, print the expected and the actual duration.
    :returns: DataFrame with the columns ``File`` and ``Duplicates``, restricted to files that
              have at least one duplicate. A file already listed as a duplicate of an earlier
              row does not get its own list. Images that cannot be opened are reported with
              a ``Delete ...`` message and left out of the comparison.
    """

    # A list default would be shared between calls, so it is created per call instead
    if ignore_list is None:
        ignore_list = [None]

    image_list = get_image_list(path, ignore_list=ignore_list, move=False)
    start_time = time.time() # Mark the start time for user-notification
    if verbose:
        comparisons = num_comparisons(image_list)
        print(f"{len(image_list)} images, requiring {comparisons} comparisons")
        print(f"This should take about {int(time_per_compare * comparisons / 60.0)} minutes")

    # Loop through all images, calculate the hash and store it and filename in two lists
    name_list, hash_list = [], []
    time.sleep(0.25) # NOTE(review): presumably lets the prints flush before the progress bar - confirm
    bar_format_string = '{desc}: {percentage:05.2f}%{bar}{n_fmt}/{total_fmt} [{elapsed}, {rate:05.2f} {unit}/s]'
    for image in tqdm(image_list, unit='hashes', desc='Image Hashing', ncols='100%', bar_format=bar_format_string):
        try:
            with Image.open(image) as opened:
                hash_val = imagehash.average_hash(opened)
        except OSError:
            # An unreadable image has no hash; a placeholder such as 0 cannot be subtracted
            # from real hashes in the comparison below, so the file is skipped instead.
            print(f"Delete {image}")
            continue

        name_list.append(image)
        hash_list.append(hash_val)

    # Create a DataFrame for managing the hash comparisons and results
    df = pd.DataFrame(np.transpose([name_list, hash_list]), columns=["File", "Hash"])
    df["Duplicates"] = "" # Create an empty (for now) DataFrame column
    df['DupeStr'] = '' # Create an empty column that will house the string-converted array of duplicates

    # Files already reported as a duplicate of an earlier row
    already_listed = set()

    # Loop through all rows of the DataFrame, computing the difference in hashes between all
    for row_num, row in tqdm(df.iterrows(), total=len(df), unit='comparisons', desc='Hash Comparison', ncols='100%', bar_format=bar_format_string):
        if row['File'] in already_listed:
            # Already covered by an earlier row - keep an empty array so it is filtered below
            duplicates = df["File"].values[:0]
        else:
            # Store all file names whose hash is below the threshold in that row's Duplicate column
            is_close = (abs(df["Hash"] - row["Hash"]) < hash_threshold) & (df["File"] != row["File"])
            duplicates = df[is_close]["File"].values

        # ``row`` is a copy made by iterrows(), so the fresh value is used directly rather
        # than read back through ``row['Duplicates']`` (which would still be empty)
        df.at[row_num, "Duplicates"] = duplicates
        df.at[row_num, 'DupeStr'] = ' '.join(map(str, duplicates))
        already_listed.update(duplicates)

    # Create the duplicated entries folder that has aliases to each duplicated file
    create_duplicate_aliases(path, df)

    # If applicable, notify the user of how long the operations took
    if verbose:
        print(f"Took {(time.time() - start_time) / 60.0:.2} minutes")

    # Return the DataFrame just for display purposes
    return df[df['Duplicates'].astype(str) != '[]'][['File', 'Duplicates']].reset_index(drop=True)

In [5]:
def identify_duplicates(path, ignore_list=None, verbose=True):
    """Find near-identical images under ``path`` and alias them in /Duplicates/.

    Every image returned by ``get_image_list`` is reduced to an average hash.
    Two images count as duplicates when the absolute difference between their
    hashes is below the module-level ``hash_threshold``. A file that is
    already listed as a duplicate of an earlier file does not get a duplicate
    list of its own, so each pair is only reported once. The matches are
    passed to ``create_duplicate_aliases`` and returned for display.

    Args:
        path: Folder handed to ``get_image_list`` and
            ``create_duplicate_aliases``.
        ignore_list: Forwarded to ``get_image_list``. Defaults to ``[None]``.
        verbose: When true, print the estimated and the measured run time.

    Returns:
        A DataFrame with the columns ``File`` and ``Duplicates`` that holds
        only the rows for which at least one duplicate was found.
    """
    # A list literal as default would be shared between calls; build it here
    if ignore_list is None:
        ignore_list = [None]

    image_list = get_image_list(path, ignore_list=ignore_list, move=False)
    if verbose:
        comparisons = num_comparisons(image_list)
        print("{} images, requiring {} comparisons".format(len(image_list), comparisons))
        print("This should take about {} minutes".format(int(time_per_compare * comparisons / 60.0)))
        start_time = time.time()  # Mark the start time for user-notification

    # Loop through all images, calculate the hash and store it and filename in two lists
    name_list, hash_list = [], []
    time.sleep(0.25)
    bar_format_string = '{desc}: {percentage:05.2f}%{bar}{n_fmt}/{total_fmt} [{elapsed}, {rate:05.2f} {unit}/s]'
    for image in tqdm(image_list, unit='hashes', desc='Image Hashing', ncols='100%', bar_format=bar_format_string):
        try:
            # The context manager closes the underlying file once it is hashed
            with Image.open(image) as opened_image:
                hash_val = imagehash.average_hash(opened_image)
        except OSError:
            # An unreadable file has no hash to compare (a placeholder such as
            # 0 cannot be subtracted from a real hash), so report and skip it
            print("Delete {}".format(image))
            continue

        name_list.append(image)
        hash_list.append(hash_val)

    # Create a DataFrame for managing the hash comparisons and results
    df = pd.DataFrame({"File": name_list, "Hash": hash_list}, dtype=object)
    df["Duplicates"] = ""  # Create an empty (for now) DataFrame column
    df['DupeStr'] = ''  # Houses the string-converted array of duplicates

    # Loop through all rows of the DataFrame, computing the difference in hashes between all
    for row_num, row in tqdm(df.iterrows(), total=len(df), unit='comparisons', desc='Hash Comparison', ncols='100%', bar_format=bar_format_string):
        # regex=False: file paths may contain regex metacharacters
        already_listed = df['DupeStr'].str.contains(row['File'], regex=False).any()
        if already_listed:
            # Reported under an earlier file; keep this row's list empty
            duplicates = df["File"].values[:0]
        else:
            # Store all other files whose hash difference is below the threshold
            is_close = abs(df["Hash"] - row["Hash"]) < hash_threshold
            is_other_file = df["File"] != row["File"]
            duplicates = df[is_close & is_other_file]["File"].values

        df.at[row_num, "Duplicates"] = duplicates
        # Use the freshly computed value: ``row`` is a snapshot produced by
        # iterrows and is not guaranteed to reflect writes made to ``df``
        df.at[row_num, 'DupeStr'] = ' '.join(map(str, duplicates))

    # Create the duplicated entries folder that has aliases to each duplicated file
    create_duplicate_aliases(path, df)

    # If applicable, notify the user of how long the operations took
    if verbose:
        print("Took {:.2f} minutes".format((time.time() - start_time) / 60.0))

    # Return the DataFrame just for display purposes
    return df[df['Duplicates'].astype(str) != '[]'][['File', 'Duplicates']].reset_index(drop=True)

## Create a `/Duplicates/` folder that contains aliases to each duplicated picture

In [6]:
def _duplicate_alias_name(position, file_path):
    """Build the alias name '<position> - <parent folder>, <file name>'."""
    parent_folder = os.path.basename(os.path.dirname(file_path))
    # splitext keeps dotted stems intact and copes with a missing extension
    stem, extension = os.path.splitext(os.path.basename(file_path))
    return "{} - {}, {}{}".format(position, parent_folder, stem, extension)


def create_duplicate_aliases(path, df):
    """Create ``<path>/Duplicates/<n>/`` folders of symlinks to duplicated files.

    One numbered folder is created for every row of ``df`` that has
    duplicates. Inside it, alias ``0`` points at the row's own file and the
    aliases ``1..k`` point at the files listed in its ``Duplicates`` cell.
    Folders and aliases left behind by an earlier run are reused/replaced.

    Args:
        path: Folder in which the ``Duplicates`` folder is created.
        df: DataFrame with a ``File`` column (file path) and a ``Duplicates``
            column (sequence of file paths). Rows whose ``Duplicates`` cell
            renders as ``'[]'`` are ignored.

    Returns:
        None. Nothing is created when no row has duplicates.
    """
    # Filter the DataFrame to ignore all entries without duplicates
    df = df[df["Duplicates"].astype(str) != '[]'][["File", "Duplicates"]].reset_index(drop=True)

    # Return if no duplicates are found
    if df.empty:
        return

    # Create the folder where duplicates will be placed; only an existing
    # folder is tolerated, any other failure is allowed to surface
    duplicates_root = os.path.join(path, "Duplicates")
    try:
        os.mkdir(duplicates_root)
    except FileExistsError:
        print("/Duplicates/ folder already exists.")

    # For each row, create a numbered folder holding aliases to the source
    # file (position 0) and to every matched duplicate (positions 1..k)
    for row_num, row in df.iterrows():
        group_folder = os.path.join(duplicates_root, str(row_num))
        os.makedirs(group_folder, exist_ok=True)

        group_members = [row['File'], *row['Duplicates']]
        for position, member in enumerate(group_members):
            alias_path = os.path.join(group_folder, _duplicate_alias_name(position, member))
            # Replace an alias from a previous run instead of failing on it
            if os.path.islink(alias_path):
                os.remove(alias_path)
            os.symlink(member, alias_path)

## Specific function to rename the `/grouped/` subfolders

In [7]:
# Rename the /Grouped/ subfolder
def rename_grouped(path):
    """Renumber the grouped images in ``path`` so group numbers are consecutive.

    File names are expected to look like ``<group>-<anything>``: the text in
    front of the first ``-`` identifies the group. Every group is moved into
    a temporary ``new/`` sub-folder as ``<group number>-<member number><ext>``
    with both numbers counted from 1, then everything is moved back into
    ``path`` and ``new/`` is removed.

    Args:
        path: Folder that holds the images. It is concatenated directly with
            file names, so it must end with a path separator.
            NOTE(review): assumes ``get_image_list(path)`` returns file names
            relative to ``path`` - confirm against that helper.

    Returns:
        None. Prints how many images were found at the start.
    """
    image_list = get_image_list(path)
    original_len = len(image_list)
    new_folder = path + "new/"
    if not os.path.isdir(new_folder):  # Make the /new/ subfolder if it doesn't exist
        os.makedirs(new_folder)
        print("Created /new/ folder")

    adjust_val = 0
    # The loop walks the initial listing while group membership is looked up
    # in ``remaining``, which is refreshed after every iteration
    remaining = image_list
    for count, image in enumerate(image_list):
        group_count = image.split("-")[0]  # Grab the group count of this image
        # group_list is the list of all images in the same group
        group_list = [img for img in remaining if img.split("-")[0] == group_count]
        # If the subgroup of this image has already been renamed, adjust the count accordingly
        if not group_list:
            adjust_val += 1
        # Loop through all images of this same subgroup
        for sub_count, sub_image in enumerate(group_list):
            # Take the extension from the file being renamed (not from the
            # first file of the group), whatever its length
            extension = os.path.splitext(sub_image)[1]
            new_name = "{}new/{}-{}{}".format(path, count + 1 - adjust_val, sub_count + 1, extension)
            os.rename(path + sub_image, new_name)  # Move to the /new/ folder (with the new name)
            time.sleep(0.08)  # Sleep between each command to avoid losing files

        remaining = get_image_list(path)  # Reset the image list now that some have been moved

    # Move the images back from /Grouped/new/ to /Grouped/
    for image in os.listdir(new_folder):
        os.rename(new_folder + image, path + image)
        time.sleep(0.08)

    os.rmdir(path + "new")  # Delete /path/new/ subfolder
    print("{} images renamed.".format(original_len))