In [1]:
import fitz  # PyMuPDF
import os
from PIL import Image, ImageStat
import io

pdf_path = input("Enter the full path of the PDF file: ").strip().strip('"')  # remove extra quotes

# Replace backslashes with forward slashes (safe for Windows)
pdf_path = pdf_path.replace("\\", "/")

# Create output folder based on PDF name
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_folder = f"{pdf_name}_images"
os.makedirs(output_folder, exist_ok=True)

def is_image_blank(image_bytes, brightness_threshold=5):
    """
    Check if an image is blank or nearly black based on average brightness.
    Returns True if the image should be skipped.
    """
    image = Image.open(io.BytesIO(image_bytes)).convert("L")  # convert to grayscale
    stat = ImageStat.Stat(image)
    brightness = stat.mean[0]  # average brightness (0–255)
    return brightness < brightness_threshold

# Open PDF
try:
    doc = fitz.open(pdf_path)
except Exception as e:
    print(f"❌ Error opening PDF: {e}")
    exit()

# Extract images
for page_num, page in enumerate(doc):
    image_list = page.get_images(full=True)
    print(f"Page {page_num+1}: found {len(image_list)} images")

    for img_index, img in enumerate(image_list):
        xref = img[0]
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        # Skip nearly black / blank images
        if is_image_blank(image_bytes):
            print(f"  Skipped dark/blank image on page {page_num+1}")
            continue

        output_path = os.path.join(output_folder, f"page{page_num+1}_img{img_index}.{image_ext}")
        with open(output_path, "wb") as f:
            f.write(image_bytes)
        print(f"  ✅ Saved: {output_path}")

print(f"\n✅ Done! All images saved in: {output_folder}")


Page 1: found 0 images
Page 2: found 0 images
Page 3: found 1 images
  ✅ Saved: 1706.03762v7_images\page3_img0.png
Page 4: found 2 images
  ✅ Saved: 1706.03762v7_images\page4_img0.png
  ✅ Saved: 1706.03762v7_images\page4_img1.png
Page 5: found 0 images
Page 6: found 0 images
Page 7: found 0 images
Page 8: found 0 images
Page 9: found 0 images
Page 10: found 0 images
Page 11: found 0 images
Page 12: found 0 images
Page 13: found 0 images
Page 14: found 0 images
Page 15: found 0 images

✅ Done! All images saved in: 1706.03762v7_images


In [2]:
import os
from PIL import Image
import imagehash
import numpy as np

# ===== CONFIG =====
authors_folder = r"extracted_images"  # Main dataset folder (with Author1/, Author2/, etc.)
input_folder = output_path        # Folder with new images to compare

# ===== Step 1: Compute pHash for each author’s images =====
def compute_hashes_for_folder(folder_path):
    hashes = []
    for f in os.listdir(folder_path):
        if f.lower().endswith(('.png', '.jpg', '.jpeg')):
            try:
                img_path = os.path.join(folder_path, f)
                img = Image.open(img_path).convert("RGB")
                hash_val = imagehash.phash(img)
                hashes.append(hash_val)
            except Exception as e:
                print(f"⚠️ Error processing {f}: {e}")
    return hashes

author_hashes = {}
for author in os.listdir(authors_folder):
    author_path = os.path.join(authors_folder, author)
    if os.path.isdir(author_path):
        author_hashes[author] = compute_hashes_for_folder(author_path)

# ===== Step 2: Compute pHash for input images =====
input_hashes = compute_hashes_for_folder(input_folder)
if not input_hashes:
    raise ValueError("❌ No valid images found in input folder.")

# ===== Step 3: Compute similarity (lower = more similar) =====
similarity_scores = {}

for author, hashes in author_hashes.items():
    if not hashes:
        continue
    distances = []
    for ih in input_hashes:
        for ah in hashes:
            distances.append(ih - ah)  # Hamming distance between pHashes
    avg_distance = np.mean(distances) if distances else np.inf
    similarity_scores[author] = avg_distance

# ===== Step 4: Rank authors by similarity =====
sorted_authors = sorted(similarity_scores.items(), key=lambda x: x[1])
top_author, top_score = sorted_authors[0]

# ===== Step 5: Output =====
print("\n🔍 Visual Similarity Results (using pHash):")
for author, score in sorted_authors:
    print(f"   {author}: average distance = {score:.2f}")

print(f"\n✅ Top similar author: **{top_author}** (avg distance = {top_score:.2f})")


NotADirectoryError: [WinError 267] The directory name is invalid: '1706.03762v7_images\\page4_img1.png'

In [1]:
import fitz  # PyMuPDF
import os
from PIL import Image, ImageStat
import io
import imagehash
import numpy as np

# ===== STEP 0: INPUT PDF =====
pdf_path = input("Enter the full path of the PDF file: ").strip().strip('"')  # remove extra quotes

# Replace backslashes with forward slashes (safe for Windows)
pdf_path = pdf_path.replace("\\", "/")

if not os.path.isfile(pdf_path):
    print(f"❌ File does not exist: {pdf_path}")
    exit()

# Create output folder based on PDF name
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_folder = os.path.join(os.getcwd(), f"{pdf_name}_images")
os.makedirs(output_folder, exist_ok=True)


# ===== STEP 1: FUNCTION TO CHECK BLANK IMAGE =====
def is_image_blank(image_bytes, brightness_threshold=5):
    """Check if an image is blank or nearly black based on average brightness."""
    image = Image.open(io.BytesIO(image_bytes)).convert("L")  # convert to grayscale
    stat = ImageStat.Stat(image)
    brightness = stat.mean[0]  # average brightness (0–255)
    return brightness < brightness_threshold

# ===== STEP 2: EXTRACT IMAGES FROM PDF =====
try:
    doc = fitz.open(pdf_path)
except Exception as e:
    print(f"❌ Error opening PDF: {e}")
    exit()

for page_num, page in enumerate(doc):
    image_list = page.get_images(full=True)
    print(f"Page {page_num+1}: found {len(image_list)} images")

    for img_index, img in enumerate(image_list):
        xref = img[0]
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]

        if is_image_blank(image_bytes):
            print(f"  Skipped dark/blank image on page {page_num+1}")
            continue

        output_path = os.path.join(output_folder, f"page{page_num+1}_img{img_index}.{image_ext}")
        with open(output_path, "wb") as f:
            f.write(image_bytes)
        print(f"  ✅ Saved: {output_path}")

print(f"\n✅ Done! All images saved in: {output_folder}")

# ===== STEP 3: CONFIG FOR VISUAL SIMILARITY =====
authors_folder = r"extracted_images"  # Main dataset folder (Author1/, Author2/, etc.)
input_folder = output_folder           # Use newly extracted images

# ===== STEP 4: FUNCTION TO COMPUTE pHash =====
def compute_hashes_for_folder(folder_path):
    hashes = []
    for f in os.listdir(folder_path):
        if f.lower().endswith(('.png', '.jpg', '.jpeg')):
            try:
                img_path = os.path.join(folder_path, f)
                img = Image.open(img_path).convert("RGB")
                hash_val = imagehash.phash(img)
                hashes.append(hash_val)
            except Exception as e:
                print(f"⚠️ Error processing {f}: {e}")
    return hashes

# Compute hashes for each author
author_hashes = {}
for author in os.listdir(authors_folder):
    author_path = os.path.join(authors_folder, author)
    if os.path.isdir(author_path):
        author_hashes[author] = compute_hashes_for_folder(author_path)

# Compute hashes for input images
input_hashes = compute_hashes_for_folder(input_folder)
if not input_hashes:
    raise ValueError("❌ No valid images found in input folder.")

# ===== STEP 5: COMPUTE SIMILARITY =====
similarity_scores = {}
for author, hashes in author_hashes.items():
    if not hashes:
        continue
    distances = []
    for ih in input_hashes:
        for ah in hashes:
            distances.append(ih - ah)  # Hamming distance
    avg_distance = np.mean(distances) if distances else np.inf
    similarity_scores[author] = avg_distance

# Rank authors by similarity
sorted_authors = sorted(similarity_scores.items(), key=lambda x: x[1])
top_author, top_score = sorted_authors[0]

# ===== STEP 6: OUTPUT RESULTS =====
print("\n🔍 Visual Similarity Results (using pHash):")
for author, score in sorted_authors:
    print(f"   {author}: average distance = {score:.2f}")

print(f"\n✅ Top similar author: **{top_author}** (avg distance = {top_score:.2f})")


Page 1: found 0 images
Page 2: found 0 images
Page 3: found 1 images
  ✅ Saved: c:\Users\BHUVANA VIJAYA\OneDrive\Documents\assignment_2\1706.03762v7_images\page3_img0.png
Page 4: found 2 images
  ✅ Saved: c:\Users\BHUVANA VIJAYA\OneDrive\Documents\assignment_2\1706.03762v7_images\page4_img0.png
  ✅ Saved: c:\Users\BHUVANA VIJAYA\OneDrive\Documents\assignment_2\1706.03762v7_images\page4_img1.png
Page 5: found 0 images
Page 6: found 0 images
Page 7: found 0 images
Page 8: found 0 images
Page 9: found 0 images
Page 10: found 0 images
Page 11: found 0 images
Page 12: found 0 images
Page 13: found 0 images
Page 14: found 0 images
Page 15: found 0 images

✅ Done! All images saved in: c:\Users\BHUVANA VIJAYA\OneDrive\Documents\assignment_2\1706.03762v7_images

🔍 Visual Similarity Results (using pHash):
   K.V. Sambasivarao: average distance = 29.87
   Dr.Rohit Beniwal: average distance = 29.95
   Jayasri D: average distance = 30.13
   Arun Chauhan: average distance = 30.28
   Amita Jain: aver