In [None]:
# --- Step 0: Install Pillow for image processing ---
# This command runs in the shell to install the library
!pip install Pillow

print("\n--- Pillow installation complete ---\n")

import os
import hashlib
import glob
import time
import io
try:
    from PIL import Image, UnidentifiedImageError
except ImportError:
    print("="*60)
    print("ERROR: Pillow library not found. ")
    print("Please RE-RUN this cell. If it fails again, restart the kernel and re-run.")
    print("="*60)
    # Stop the script if Pillow isn't available
    raise

# --- Configuration ---
# Folder holding the images to process. The relative path assumes the
# notebook runs from a subfolder (such as /src) that sits beside /public.
IMAGE_DIR = "../public/images"

# Name of the hashlib algorithm used to derive the new file names.
HASH_ALGO = "md5"

# File extensions that are searched for inside the image folder.
IMAGE_EXTENSIONS = [
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
]
# ---------------------

def get_image_as_png_hash(filepath, algo):
    """
    Re-encodes the image at `filepath` as PNG entirely in memory and
    returns the hex digest of the resulting PNG bytes.

    `algo` is the algorithm name handed to hashlib.new(). When the file
    cannot be identified as an image, or cannot be read, a message is
    printed and None is returned instead of a digest.
    """
    digest = hashlib.new(algo)
    try:
        with Image.open(filepath) as source_image, io.BytesIO() as png_buffer:
            # Encode as PNG into the buffer; nothing is written to disk.
            source_image.save(png_buffer, "PNG")
            # The whole encoded image is already in memory, so hash it at once.
            digest.update(png_buffer.getvalue())
            return digest.hexdigest()
    except UnidentifiedImageError:
        # Must be handled before OSError: it is the more specific case.
        print(f"  [SKIPPING] {os.path.basename(filepath)}: Cannot identify image file.")
        return None
    except OSError as err:
        print(f"  [ERROR] Could not read {filepath}: {err}")
        return None

def _path_key(path):
    """Return a normalized form of `path` for equality and set membership tests."""
    return os.path.normcase(os.path.abspath(path))


def _is_same_file(path_a, path_b):
    """Return True when both paths refer to the same file on disk."""
    try:
        return os.path.samefile(path_a, path_b)
    except OSError:
        # At least one path does not exist (or cannot be stat'ed); fall back
        # to comparing the normalized path strings.
        return _path_key(path_a) == _path_key(path_b)


def _write_png_atomically(source_path, target_path):
    """Save the image at `source_path` as a PNG file at `target_path`.

    The PNG is written to a temporary sibling file and only moved onto
    `target_path` once it is complete, so a failed save never leaves a
    truncated file under the final name. Errors propagate to the caller.
    """
    # The ".tmp" suffix keeps a leftover temp file from matching the image
    # extension patterns on a later run.
    temp_path = f"{target_path}.tmp"
    try:
        with Image.open(source_path) as img:
            # The format is given explicitly, so the ".tmp" suffix is harmless.
            img.save(temp_path, "PNG")
        os.replace(temp_path, target_path)
    finally:
        # After a successful os.replace the temp file no longer exists;
        # anything still there is a partial result from a failure.
        if os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                pass


def process_images_to_png(directory):
    """
    Converts all images to PNG, renames them to their content hash,
    and removes duplicates.

    Works in three passes over the files in `directory` whose extension is
    listed in IMAGE_EXTENSIONS:

    1. Hash every image (via get_image_as_png_hash with HASH_ALGO) and group
       files with equal hashes. One file per hash is kept; a file that is
       already stored as "<hash>.png" is always preferred as the keeper.
    2. Write every keeper as "<hash>.png" and remove its original.
    3. Delete the remaining duplicates.

    The function prints its progress and a summary, and returns None. It
    waits 5 seconds before touching anything so the user can interrupt.
    """
    print("="*60)
    print("WARNING: This script will CONVERT, RENAME, and DELETE files.")
    print("1. All images will be converted to PNG.")
    print("2. Originals (JPG, GIF, etc.) will be DELETED.")
    print("3. Duplicates (based on PNG content) will be DELETED.")
    print(f"Target directory: {directory}")
    print("Processing will begin in 5 seconds. Press (i, i) or Interrupt to cancel.")
    print("="*60)
    try:
        time.sleep(5)
    except KeyboardInterrupt:
        print("\nOperation cancelled by user.")
        return

    print("--- Starting Process ---")

    # --- Pass 1: Find all files, get their PNG hash, and detect duplicates ---
    print("Pass 1: Scanning files and finding duplicates (by PNG content)...")

    # hashes_seen maps: {png_hash: path of the file kept for that hash}
    hashes_seen = {}
    files_to_delete = []
    skipped_count = 0

    found_paths = set()
    for ext in IMAGE_EXTENSIONS:
        found_paths.update(glob.glob(os.path.join(directory, f"*{ext}")))
    # Sorting makes the choice of keeper reproducible between runs.
    image_paths = sorted(found_paths)

    if not image_paths:
        print(f"No images found in '{directory}'.")
        print(f"(Current working directory: {os.getcwd()})")
        return

    for filepath in image_paths:
        # Get the hash of the image *as if it were a PNG*
        png_hash = get_image_as_png_hash(filepath, HASH_ALGO)
        if png_hash is None:
            skipped_count += 1
            continue

        keeper = hashes_seen.get(png_hash)
        if keeper is None:
            # First time seeing this hash
            hashes_seen[png_hash] = filepath
            continue

        canonical_path = os.path.join(directory, f"{png_hash}.png")
        if _is_same_file(filepath, canonical_path):
            # This file already has its final name: keep it and drop the
            # earlier file instead. Otherwise Pass 2 would overwrite this
            # file and Pass 3 would then delete it, losing the image.
            hashes_seen[png_hash] = filepath
            duplicate, kept = keeper, filepath
        else:
            duplicate, kept = filepath, keeper

        print(f"  [DUPLICATE] {os.path.basename(duplicate)} is a copy of {os.path.basename(kept)}.")
        files_to_delete.append(duplicate)

    # --- Pass 2: Convert and rename all unique "keeper" files ---
    print(f"\nPass 2: Converting and renaming {len(hashes_seen)} unique files...")
    renamed_count = 0
    ok_count = 0
    rename_errors = 0

    # Keepers that still live under their original name; their paths must
    # not be overwritten by another keeper's output.
    occupied = {_path_key(path) for path in hashes_seen.values()}
    # Final PNG paths written during this pass; Pass 3 must not delete them.
    written_targets = set()

    for png_hash, original_path in hashes_seen.items():
        basename = os.path.basename(original_path)

        # The new file extension is ALWAYS .png
        new_name = f"{png_hash}.png"
        new_filepath = os.path.join(directory, new_name)
        target_key = _path_key(new_filepath)

        if _is_same_file(original_path, new_filepath):
            print(f"  [OK]      {basename} is already correct.")
            ok_count += 1
            continue

        if target_key in occupied:
            # A different image currently uses this name and has not been
            # renamed (yet); overwriting it would destroy that image.
            print(f"  [ERROR] Failed to convert/rename {basename}: "
                  f"{new_name} is still in use by another image. Re-run to finish.")
            rename_errors += 1
            continue

        try:
            _write_png_atomically(original_path, new_filepath)
        except OSError as e:
            print(f"  [ERROR] Failed to convert/rename {basename}: {e}")
            rename_errors += 1
            continue

        written_targets.add(target_key)
        renamed_count += 1

        # After successful save, remove the original file
        try:
            os.remove(original_path)
        except OSError as e:
            print(f"  [ERROR] Converted {basename} -> {new_name}, "
                  f"but could not remove the original: {e}")
            rename_errors += 1
            continue

        occupied.discard(_path_key(original_path))
        print(f"  [CONVERTED] {basename} -> {new_name}")

    # --- Pass 3: Delete all duplicates found in Pass 1 ---
    print(f"\nPass 3: Deleting {len(files_to_delete)} duplicate files...")
    deleted_count = 0
    delete_errors = 0
    for filepath in files_to_delete:
        basename = os.path.basename(filepath)

        if _path_key(filepath) in written_targets:
            # Pass 2 wrote a keeper under this name, so the duplicate's
            # content is already gone; deleting now would remove the keeper.
            print(f"  [REPLACED] {basename} was overwritten by a converted image.")
            deleted_count += 1
            continue

        try:
            os.remove(filepath)
        except OSError as e:
            print(f"  [ERROR] Failed to delete {basename}: {e}")
            delete_errors += 1
        else:
            print(f"  [DELETED] {basename}")
            deleted_count += 1

    # --- Summary ---
    print("\n--- Processing Complete ---")
    print(f"Files kept (already correct): {ok_count}")
    print(f"Files converted/renamed:      {renamed_count}")
    print(f"Duplicate files deleted:      {deleted_count}")
    print(f"Files skipped (unreadable):   {skipped_count}")
    print(f"Errors:                       {rename_errors + delete_errors}")
    print("-------------------------------")
    print(f"Your '{directory}' folder now contains {ok_count + renamed_count} unique PNG images.")
    if rename_errors or delete_errors or skipped_count:
        print("Some files could not be processed and were left in place; see the messages above.")

# --- Run the script ---
# Only start processing when the configured folder really exists;
# otherwise explain where the script looked.
if os.path.isdir(IMAGE_DIR):
    process_images_to_png(IMAGE_DIR)
else:
    print(f"Error: Directory not found: {IMAGE_DIR}")
    print("Please check the 'IMAGE_DIR' variable in the script.")
    print(f"(The current working directory is: {os.getcwd()})")

Scanning directory: ../public/images
Using hash algorithm: md5

[MISMATCH] 004c6586ff65e90635ccaa3d2482a05a.jpg
  - Expected: f09a11f2f818b171e3cdda0dc46faba0.jpg
  - Found:    004c6586ff65e90635ccaa3d2482a05a.jpg
[MISMATCH] 0112fa478a0c09f1db0d1b3341555f1a.jpg
  - Expected: bf36982985f472b7fee712ceb0e10528.jpg
  - Found:    0112fa478a0c09f1db0d1b3341555f1a.jpg
[MISMATCH] 022b03c958e5adda9be1bfcdf740bb8c.jpg
  - Expected: e5b3731fa96819b0f3d9d681af350c67.jpg
  - Found:    022b03c958e5adda9be1bfcdf740bb8c.jpg
[MISMATCH] 029aefb6089a314d7b152616ba57d1b9.jpg
  - Expected: ff1c1bedac8f9471f01a5e4de4082b00.jpg
  - Found:    029aefb6089a314d7b152616ba57d1b9.jpg
[MISMATCH] 038418a9d4e676b9bee0cca72370d1c9.jpg
  - Expected: dd9ee87e8d190f956f546b785cc8fd29.jpg
  - Found:    038418a9d4e676b9bee0cca72370d1c9.jpg
[MISMATCH] 04cc26f69186666c66afb39badd54148.jpg
  - Expected: e9c33364dc7a6a31c811fc91183efab3.jpg
  - Found:    04cc26f69186666c66afb39badd54148.jpg
[MISMATCH] 05023a17f9075e21d1242e2eb