In [8]:
import base64
import json
import os

import fitz  # pip install PyMuPDF

# Root folder containing one sub-folder per author, each holding that
# author's PDF papers.
dataset_path = r"C:\Dataset"
# Destination file for the combined extraction result.
output_json = "author_papers_pymupdf.json"
# Maps author folder name -> list of per-PDF extraction records.
all_authors_data = {}


def _extract_spans(page, page_number):
    """Return one record per text span on *page*.

    Each record holds the 1-based page number, the span text, its bounding
    box corners (x0, y0, x1, y1), and the font size and font name.
    """
    spans = []
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # 0 == text block; other blocks are skipped
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                x0, y0, x1, y1 = span["bbox"][:4]
                spans.append({
                    "page": page_number,
                    "text": span["text"],
                    "x0": x0,
                    "y0": y0,
                    "x1": x1,
                    "y1": y1,
                    "size": span["size"],
                    "font": span["font"],
                })
    return spans


def _extract_images(doc, page, page_number):
    """Return one JSON-serializable record per image referenced by *page*.

    The raw image payload is base64-encoded into an ASCII string because
    ``json.dump`` cannot serialize ``bytes``; decode it with
    ``base64.b64decode`` to recover the original image file content.
    """
    records = []
    for img in page.get_images(full=True):
        xref = img[0]
        base_image = doc.extract_image(xref)
        if not base_image:
            # Nothing extractable came back for this xref; skip it rather
            # than fail on the missing "image" key.
            continue
        records.append({
            "page": page_number,
            # base64 text, not raw bytes (see docstring)
            "image_bytes": base64.b64encode(base_image["image"]).decode("ascii"),
            "ext": base_image["ext"],
        })
    return records


def _extract_pdf(pdf_path):
    """Open the PDF at *pdf_path* and return its metadata, text spans and images.

    The document is opened in a ``with`` block so its file handle is
    released as soon as extraction finishes, even if an error occurs.
    """
    structure = []
    images = []
    with fitz.open(pdf_path) as doc:
        metadata = doc.metadata
        # Single pass over the pages collects both text and images; each
        # list keeps page order, as two separate passes would.
        for page_number, page in enumerate(doc, start=1):
            structure.extend(_extract_spans(page, page_number))
            images.extend(_extract_images(doc, page, page_number))
    return {
        "metadata": metadata,
        "structure": structure,
        "images": images,
    }


for author in os.listdir(dataset_path):
    author_folder = os.path.join(dataset_path, author)
    if not os.path.isdir(author_folder):
        continue  # stray files next to the author folders are ignored

    all_authors_data[author] = []
    for pdf_file in os.listdir(author_folder):
        # Case-insensitive match so "paper.PDF" is not silently skipped.
        if not pdf_file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(author_folder, pdf_file)
        all_authors_data[author].append({
            "file": pdf_file,
            **_extract_pdf(pdf_path),
        })
        print(f"Processed {pdf_file} for {author}")

with open(output_json, "w", encoding="utf-8") as f:
    json.dump(all_authors_data, f, indent=4, ensure_ascii=False)
