In [None]:
import fitz  # PyMuPDF
import os
from PIL import Image, ImageStat
import io

# Input PDF and destination folder. Both are relative paths, so they resolve
# against the notebook's current working directory.
pdf_path = "1706.03762v7.pdf"
output_folder = "images_1"
# Create the destination folder up front; exist_ok=True keeps re-runs of this
# cell from failing when the folder is already there.
os.makedirs(output_folder, exist_ok=True)


def is_image_blank(image_bytes, brightness_threshold=5):
    """
    Decide whether an encoded image is too dark to be worth keeping.

    The bytes are decoded with Pillow and converted to grayscale ("L" mode);
    the mean pixel level of that grayscale image (0 = black, 255 = white) is
    then compared with ``brightness_threshold``.

    Args:
        image_bytes: Encoded image data (e.g. the PNG/JPEG payload of a PDF image).
        brightness_threshold: Mean gray level below which the image is rejected.

    Returns:
        True when the mean level is strictly below the threshold, i.e. the
        caller should skip the image. Note that only darkness is tested: a
        uniformly white image is NOT reported as blank.
    """
    grayscale = Image.open(io.BytesIO(image_bytes)).convert("L")
    mean_level = ImageStat.Stat(grayscale).mean[0]
    return mean_level < brightness_threshold


# Open the PDF as a context manager so the document handle is released even
# when extraction fails part-way through (the original never closed it).
with fitz.open(pdf_path) as doc:
    for page_num, page in enumerate(doc):
        image_list = page.get_images(full=True)
        print(f"Page {page_num+1}: found {len(image_list)} images")

        for img_index, img in enumerate(image_list):
            # The first entry of each image tuple is the xref that
            # extract_image() needs to pull the embedded image out of the PDF.
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Skip nearly black / blank images
            if is_image_blank(image_bytes):
                print(f"  Skipped dark/blank image on page {page_num+1}")
                continue

            # File name encodes the 1-based page number and the 0-based index
            # of the image within that page.
            output_path = f"{output_folder}/page{page_num+1}_img{img_index}.{image_ext}"
            with open(output_path, "wb") as f:
                f.write(image_bytes)
            print(f"  ✅ Saved: {output_path}")


FileNotFoundError: no such file: '1706.03762v7.pdf'

In [1]:
import fitz  # PyMuPDF
import os
from PIL import Image, ImageStat
import io
import re

# Paths
# dataset_path is the root that is scanned below: each immediate subfolder is
# treated as one author, and the PDFs directly inside it are processed.
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it
# before running the notebook elsewhere.
dataset_path = r"C:\Dataset\Dataset"  # Main dataset folder
# output_root is relative to the current working directory; one subfolder per
# author is created beneath it.
output_root = "extracted_images"
os.makedirs(output_root, exist_ok=True)

# Function to sanitize filenames for Windows
def sanitize_filename(filename, max_length=50):
    """
    Turn an arbitrary string into a name that is safe to use on Windows.

    Steps, in order:
      1. Remove characters Windows forbids in file names: < > : " / \\ | ? *
         and the ASCII control characters (0x00-0x1F).
      2. Truncate to at most ``max_length`` characters.
      3. Strip trailing spaces and dots. Windows does not allow a name to end
         in either, and truncation can expose one in the middle of a title.
      4. Fall back to ``"unnamed"`` if nothing is left, so callers never get
         an empty path component.

    Args:
        filename: Raw name (without directory part) to clean up.
        max_length: Maximum number of characters to keep.

    Returns:
        The sanitized, non-empty name.

    Note:
        Distinct inputs that share their first ``max_length`` characters map
        to the same result; callers that need uniqueness must add their own
        disambiguator.
    """
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', filename)  # remove illegal characters
    cleaned = cleaned[:max_length].rstrip(' .')  # truncate, then drop trailing space/dot
    return cleaned or "unnamed"

# Function to check if image is blank/dark
def is_image_blank(image_bytes, brightness_threshold=5):
    """Return True when the image's average gray level is under the threshold.

    ``image_bytes`` is decoded with Pillow and converted to grayscale; the
    mean of that single band (0-255 scale) is compared, strictly, against
    ``brightness_threshold``. Only dark images are flagged; a bright or
    all-white image returns False.
    """
    buffer = io.BytesIO(image_bytes)
    gray = Image.open(buffer).convert("L")  # single-band grayscale
    return ImageStat.Stat(gray).mean[0] < brightness_threshold

# Iterate over author folders
# Walk the dataset: one subfolder per author, PDFs directly inside each one.
for author_folder in os.listdir(dataset_path):
    author_path = os.path.join(dataset_path, author_folder)
    if not os.path.isdir(author_path):
        continue  # ignore stray files next to the author folders

    print(f"Processing author: {author_folder}")
    output_author_folder = os.path.join(output_root, sanitize_filename(author_folder))
    os.makedirs(output_author_folder, exist_ok=True)

    # Iterate over PDFs in author's folder
    for pdf_file in os.listdir(author_path):
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(author_path, pdf_file)
        pdf_name = sanitize_filename(os.path.splitext(pdf_file)[0])

        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            print(f"  ⚠️ Failed to open PDF: {pdf_file}, skipping. Error: {e}")
            continue

        # try/finally guarantees the document is closed even if a page or
        # image raises unexpectedly while it is being processed.
        try:
            print(f"  Processing PDF: {pdf_file}, {len(doc)} pages")

            for page_num, page in enumerate(doc):
                image_list = page.get_images(full=True)
                for img_index, img in enumerate(image_list):
                    xref = img[0]  # first tuple entry is the image's xref

                    # A single image that cannot be extracted or decoded must
                    # not abort the whole batch: report it and move on, the
                    # same best-effort policy used for opening and saving.
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]
                        blank = is_image_blank(image_bytes)
                    except Exception as e:
                        print(f"    ⚠️ Failed to read image: page {page_num+1} img {img_index}. Error: {e}")
                        continue

                    if blank:
                        print(f"    Skipped blank/dark image: page {page_num+1} img {img_index}")
                        continue

                    # <pdf name>_page<1-based page>_img<0-based index>.<ext>
                    output_path = os.path.join(
                        output_author_folder,
                        f"{pdf_name}_page{page_num+1}_img{img_index}.{image_ext}"
                    )

                    try:
                        with open(output_path, "wb") as f:
                            f.write(image_bytes)
                        print(f"    ✅ Saved: {output_path}")
                    except Exception as e:
                        print(f"    ⚠️ Failed to save image: {output_path}. Error: {e}")
        finally:
            doc.close()


Processing author: Amit Saxena
  Processing PDF: A Review of Clustering Techniques.pdf, 30 pages
    Skipped blank/dark image: page 2 img 0
    Skipped blank/dark image: page 3 img 0
    ✅ Saved: extracted_images\Amit Saxena\A Review of Clustering Techniques_page4_img0.png
    ✅ Saved: extracted_images\Amit Saxena\A Review of Clustering Techniques_page4_img1.png
    ✅ Saved: extracted_images\Amit Saxena\A Review of Clustering Techniques_page5_img0.png
    ✅ Saved: extracted_images\Amit Saxena\A Review of Clustering Techniques_page5_img1.png
    ✅ Saved: extracted_images\Amit Saxena\A Review of Clustering Techniques_page7_img0.png
    Skipped blank/dark image: page 7 img 1
    Skipped blank/dark image: page 8 img 0
    Skipped blank/dark image: page 8 img 1
    ✅ Saved: extracted_images\Amit Saxena\A Review of Clustering Techniques_page11_img0.png
    Skipped blank/dark image: page 13 img 0
    Skipped blank/dark image: page 14 img 0
    Skipped blank/dark image: page 14 img 1
    Skipp