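# Duplicate file finder by content hash

This code finds duplicate files by comparing the SHA-256 hash of each file's contents, then moves the extra copies into a `!duplication` folder.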

In [14]:
import os
import hashlib
from tqdm import tqdm

def _sha256_of_file(file_path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of a file, reading it in fixed-size chunks."""
    digest = hashlib.sha256()
    with open(file_path, "rb") as file:
        # Feed the hash incrementally so a multi-gigabyte file never has to
        # fit in memory at once.
        for chunk in iter(lambda: file.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicate_files(folder_path, scan_subfolders=False):
    """
    Groups the files of a folder by the SHA-256 hash of their contents.

    Files that cannot be read are reported on stdout and left out of the
    result, but they are still included in the scanned-file count.

    Args:
        folder_path (str): The path to the folder to scan.
        scan_subfolders (bool, optional): Whether to scan subfolders. Defaults to False.

    Returns:
        tuple: ``(file_hashes, scanned_files_count)``. ``file_hashes`` is a dictionary
               where keys are SHA-256 hex digests and values are lists of file paths
               with that digest (a list longer than one is a group of duplicates).
               ``scanned_files_count`` is the number of files the scan attempted to hash.
    """
    file_hashes = {}
    scanned_files_count = 0
    file_list = []

    # Stage 1: collect the paths to hash.
    print("1/2")
    if scan_subfolders:
        # One progress bar for the whole walk instead of a new bar per directory.
        for root, _, files in tqdm(os.walk(folder_path), desc="Scanning folders", unit="folder"):
            file_list.extend(os.path.join(root, name) for name in files)
    else:
        for entry in tqdm(os.listdir(folder_path), desc="Scanning files"):
            file_path = os.path.join(folder_path, entry)
            if os.path.isfile(file_path):
                file_list.append(file_path)

    # Stage 2: hash every collected file. The stage marker is printed before
    # the bar is created so it does not land in the middle of the bar output.
    print("2/2")
    for file_path in tqdm(file_list, desc="Scanning files", unit="file"):
        scanned_files_count += 1
        try:
            file_hash = _sha256_of_file(file_path)
        except OSError as e:
            # Best effort: an unreadable file is reported and skipped.
            print(f"Error reading file: {file_path}. Skipping. Error: {e}")
            continue
        file_hashes.setdefault(file_hash, []).append(file_path)

    return file_hashes, scanned_files_count

def _unique_destination(folder, filename):
    """Return a path for *filename* inside *folder* that does not exist yet."""
    candidate = os.path.join(folder, filename)
    if not os.path.lexists(candidate):
        return candidate
    stem, extension = os.path.splitext(filename)
    suffix = 1
    while True:
        candidate = os.path.join(folder, f"{stem}_{suffix}{extension}")
        if not os.path.lexists(candidate):
            return candidate
        suffix += 1


def move_duplicate_files(duplicate_groups, folder_path):
    """
    Moves duplicate files to a '!duplication' folder within the specified folder.

    For every group, the first file located outside the '!duplication' folder is
    kept in place and the remaining outside copies are moved. Copies that already
    sit in the '!duplication' folder (e.g. from an earlier run) are left alone and
    are never chosen as the file to keep. When a file with the same name already
    exists in the '!duplication' folder, the moved file gets a numeric suffix
    ("name_1.ext", "name_2.ext", ...) instead of overwriting it.

    Args:
        duplicate_groups (dict): A dictionary of duplicate file groups (output from find_duplicate_files).
        folder_path (str): The base folder path where the '!duplication' folder will be created.

    Returns:
        int: The number of duplicate files moved.
    """
    duplication_folder = os.path.join(folder_path, "!duplication")
    os.makedirs(duplication_folder, exist_ok=True)
    duplication_root = os.path.normcase(os.path.abspath(duplication_folder))

    def already_moved(path):
        parent = os.path.normcase(os.path.abspath(os.path.dirname(path)))
        return parent == duplication_root

    moved_files_count = 0
    for file_paths in duplicate_groups.values():
        candidates = [path for path in file_paths if not already_moved(path)]
        # candidates[0] stays where it is; a group with a single outside copy
        # yields an empty slice, so nothing is moved.
        for duplicate_file in candidates[1:]:
            try:
                # os.rename silently replaces an existing destination on POSIX,
                # so pick a name that is not taken yet.
                destination_path = _unique_destination(
                    duplication_folder, os.path.basename(duplicate_file)
                )
                os.rename(duplicate_file, destination_path)
            except OSError as e:
                print(f"Error moving file: {duplicate_file} to {duplication_folder}. Skipping. Error: {e}")
            else:
                moved_files_count += 1

    return moved_files_count



In [15]:
if __name__ == "__main__":
    # Interactive entry point: ask for a folder, hash its files, move the
    # extra copies into '!duplication', and print a summary.
    # Strip the answers so a trailing space or newline from a paste does not
    # turn "yes" into "no" or break the folder path.
    folder_path = input("Put the folder directory here: ").strip()
    scan_subfolders_choice = input("Scan subfolders as well? (yes/no): ").strip().lower()
    scan_subfolders = scan_subfolders_choice in ("yes", "y")

    if not os.path.isdir(folder_path):
        # Report a mistyped path instead of failing with a traceback.
        print(f"'{folder_path}' is not an existing folder. Nothing was scanned.")
    else:
        duplicate_hashes, scanned_count = find_duplicate_files(folder_path, scan_subfolders)

        # In each group one file is kept as the original, so only the
        # remaining len - 1 files count as duplicates.
        duplicates_found = sum(
            len(paths) - 1 for paths in duplicate_hashes.values() if len(paths) > 1
        )

        moved_count = move_duplicate_files(duplicate_hashes, folder_path)

        print("\n--- Scan Summary ---")
        print(f"Total files scanned: {scanned_count}")
        print(f"Duplicate files found: {duplicates_found}")
        print(f"Duplicate files moved to '!duplication' folder: {moved_count}")
        print("--- Script finished ---")

Scanning files: 100%|██████████| 55974/55974 [04:47<00:00, 194.97it/s] 
Scanning files:   0%|          | 0/55973 [00:00<?, ?file/s]

2/2


Scanning files: 100%|██████████| 55973/55973 [14:34<00:00, 64.01file/s] 



--- Scan Summary ---
Total files scanned: 55973
Duplicate files found: 291
Duplicate files moved to '!duplication' folder: 291
--- Script finished ---
