In [1]:
import fitz  # PyMuPDF

def get_page_count(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Get the number of pages
    num_pages = pdf_document.page_count
    
    # Close the document
    pdf_document.close()
    
    return num_pages

# Path to your PDF file
pdf_path = 'example.pdf'

# Get and print the page count
page_count = get_page_count(pdf_path)
print(f'The PDF has {page_count} pages.')


The PDF has 361 pages.


In [2]:
import fitz  # PyMuPDF

def get_page_metadata(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Create a list to store metadata for each page
    page_metadata = []
    
    # Iterate over each page
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        
        # Get metadata for the current page
        metadata = {
            'page_number': page_num + 1,
            'width': page.rect.width,
            'height': page.rect.height,
            'rotation': page.rotation,
            'text': page.get_text()  # Extracting text (optional)
        }
        
        # Add metadata to the list
        page_metadata.append(metadata)
    
    # Close the document
    pdf_document.close()
    
    return page_metadata

# Path to your PDF file
pdf_path = 'example.pdf'

# Get and print the metadata for each page
metadata_list = get_page_metadata(pdf_path)
for metadata in metadata_list:
    print(f"Page {metadata['page_number']}:")
    print(f"  Width: {metadata['width']}")
    print(f"  Height: {metadata['height']}")
    print(f"  Rotation: {metadata['rotation']}")
    print(f"  Text (first 100 chars): {metadata['text'][:100]}")
    print('-' * 40)


Page 1:
  Width: 540.0
  Height: 666.0
  Rotation: 0
  Text (first 100 chars): 
----------------------------------------
Page 2:
  Width: 540.0
  Height: 666.0
  Rotation: 0
  Text (first 100 chars): Generative AI with LangChain
Build large language model (LLM) apps with Python, 
ChatGPT, and other 
----------------------------------------
Page 3:
  Width: 540.0
  Height: 666.0
  Rotation: 0
  Text (first 100 chars): Generative AI with LangChain
Copyright © 2023 Packt Publishing
All rights reserved. No part of this 
----------------------------------------
Page 4:
  Width: 540.0
  Height: 666.0
  Rotation: 0
  Text (first 100 chars): To Diane and Nico
– Ben Auffarth

----------------------------------------
Page 5:
  Width: 540.0
  Height: 666.0
  Rotation: 0
  Text (first 100 chars): Contributors
About the author
Ben Auffarth is a seasoned data science leader with a background and P
----------------------------------------
Page 6:
  Width: 540.0
  Height: 666.0
  Rotation: 0
  Text (f

In [4]:
import fitz  # PyMuPDF
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_text_from_page(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Extract text from each page
    page_texts = {}
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        page_texts[page_num] = page.get_text()
    
    # Close the document
    pdf_document.close()
    
    return page_texts

def identify_topics(text):
    # Process text with spaCy
    doc = nlp(text)
    
    # Basic topic identification (you may need more sophisticated methods)
    topics = set()
    subtopics = set()
    
    for sent in doc.sents:
        if len(sent.ents) > 0:
            topics.add(sent.text)
        else:
            subtopics.add(sent.text)
    
    return list(topics), list(subtopics)

def main(pdf_path):
    # Extract text from the PDF
    page_texts = extract_text_from_page(pdf_path)
    
    for page_num, text in page_texts.items():
        print(f"Page {page_num + 1}:")
        topics, subtopics = identify_topics(text)
        print("Topics:")
        for topic in topics:
            print(f"  {topic}")
        print("Subtopics:")
        for subtopic in subtopics:
            print(f"  {subtopic}")
        print('-' * 40)

# Path to your PDF file
pdf_path = 'example.pdf'

# Run the main function
main(pdf_path)


Page 1:
Topics:
Subtopics:
----------------------------------------
Page 2:
Topics:
  Generative AI with LangChain
Build large language model (LLM) apps with Python, 
ChatGPT, and other LLMs
Ben Auffarth
BIRMINGHAM—MUMBAI

Subtopics:
----------------------------------------
Page 3:
Topics:
  Packt Publishing has endeavored to provide trademark information about all of the companies and products 
mentioned in this book by the appropriate use of capitals.
  Manju Arasan
Presentation Designer:
  Tanya D’cruz and Elliot Dallow
Copy Editor: Safis Editing
Technical Editor: Kushal Sharma
Proofreader: Safis Editing
Indexer:
  Neither the author, nor Packt Publishing or its dealers and distributors, will be held liable for any 
damages caused or alleged to have been caused directly or indirectly by this book.

  Ajay Patule
Developer Relations Marketing Executive: Monika Sangwan
First published: December 2023
Production reference: 1141223
Published by Packt Publishing Ltd.
Grosvenor House
11 St

In [5]:
import fitz  # PyMuPDF

def extract_metadata_from_pdf(file_path):
    doc = fitz.open(file_path)
    metadata = {
        "title": doc.metadata.get("title", "Unknown Title"),
        "author": doc.metadata.get("author", "Unknown Author"),
        "subject": doc.metadata.get("subject", "Unknown Subject"),
        "keywords": doc.metadata.get("keywords", "Unknown Keywords"),
        "page_count": len(doc)
    }
    return metadata

import re

def extract_sections_from_text(text):
    sections = re.findall(r'(Chapter \d+): (.+)', text)
    return sections

book_metadata = extract_metadata_from_pdf("example.pdf")
# sections = extract_sections_from_text(book_text)

# Combine metadata and sections
book_info = {
    "metadata": book_metadata,
    # "sections": sections
}

print(book_info)


{'metadata': {'title': '', 'author': '', 'subject': '', 'keywords': '', 'page_count': 361}}


In [6]:
book_metadata

{'title': '', 'author': '', 'subject': '', 'keywords': '', 'page_count': 361}