In [1]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

In [22]:
def extract_chapter_titles(pdf_path, max_pages=20):
    chapters = []
    
    # Get all pages but limit to first max_pages
    page_layouts = list(extract_pages(pdf_path))[:max_pages]
    
    # Process only the first max_pages in the PDF to find chapter headings
    for i, page_layout in enumerate(page_layouts):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                text = element.get_text().strip()
                # Look for text that matches a chapter heading pattern
                if text.startswith("Chapter ") or re.match(r"^\d+\.?\s+[A-Z]", text):
                    # Clean up the chapter title
                    chapter_title = text.strip()
                    chapters.append({
                        "title": chapter_title,
                        "page": i + 1  # Use the page number where the chapter title was found
                    })
    
    return chapters

path = "../../data/Natural_Image_Statistics.pdf"

# Usage
chapters = extract_chapter_titles(path, max_pages=20)
for i, chapter in enumerate(chapters):
    print(f"{i+1}. {chapter['title']} (Page {chapter['page']})")





1. 1
Part I Background
2
Linear ﬁlters and frequency analysis . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 25
3 Outline of the visual system . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 51
4 Multivariate probability and statistics . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69
Part II Statistics of linear features
5
6
7
8
Part III Nonlinear features & dependency of linear features
9
Energy correlation of linear features & normalization . . . . . . . . . . . . . . 209
10 Energy detectors and complex cells . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 223
11 Energy correlations and topographic organization . . . . . . . . . . . . . . . . . 249
12 Dependencies of energy detectors: Beyond V1 . . . . . . . . . . . . . . . . . . . . . 273
13 Overcomplete and non-negative models . . . . . . . . . . . . . . . . . . . . . . . . . . . 289
14 Lateral interactions and feedback . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTText, LTTextBox, LTTextLine, LTChar

def extract_toc_pages(pdf_path):
    toc_pages = []
    toc_started = False
    toc_format_ended = False
    
    # Process all pages to find and extract ToC pages
    for i, page_layout in enumerate(extract_pages(pdf_path)):
        page_text = []
        
        # Extract all text from the current page using various text element types
        for element in page_layout:
            # Handle different types of text elements
            if hasattr(element, 'get_text'):
                # This covers LTTextContainer, LTTextBox, LTTextLine, etc.
                text = element.get_text().strip()
                if text:
                    page_text.append(text)
            # Recursively process container elements that might contain text
            elif hasattr(element, '_objs'):
                for obj in element._objs:
                    if hasattr(obj, 'get_text'):
                        text = obj.get_text().strip()
                        if text:
                            page_text.append(text)
        
        # Join all text for analysis
        full_page_text = "\n".join(page_text).lower()
        
        # Check if this page contains "contents" to start capturing
        if not toc_started and ("contents" in full_page_text or "table of contents" in full_page_text):
            toc_started = True
            toc_pages.append(i)
            # Store some characteristics to detect format changes
            toc_format = {
                "avg_line_length": sum(len(line) for line in page_text) / max(1, len(page_text)),
                "line_count": len(page_text),
                "contains_dots": any("..." in line for line in page_text),
                "contains_numbers": any(any(c.isdigit() for c in line) for line in page_text)
            }
            continue
        
        # If we've started the ToC and haven't ended it yet, check if this page continues the ToC format
        if toc_started and not toc_format_ended:
            # Skip if the page is empty
            if not page_text:
                continue
                
            # Calculate current page format characteristics
            current_format = {
                "avg_line_length": sum(len(line) for line in page_text) / max(1, len(page_text)),
                "line_count": len(page_text),
                "contains_dots": any("..." in line for line in page_text),
                "contains_numbers": any(any(c.isdigit() for c in line) for line in page_text)
            }
            
            # Check if format has significantly changed
            format_changed = (
                abs(current_format["avg_line_length"] - toc_format["avg_line_length"]) > 20 or
                abs(current_format["line_count"] - toc_format["line_count"]) > 5 or
                toc_format["contains_dots"] != current_format["contains_dots"] or
                toc_format["contains_numbers"] != current_format["contains_numbers"]
            )
            
            # If format has changed significantly or we see chapter/introduction text, end ToC
            if format_changed or any(keyword in full_page_text for keyword in ["chapter", "introduction", "preface"]):
                toc_format_ended = True
            else:
                toc_pages.append(i)
                
            # Safety check - don't capture more than 10 ToC pages to avoid including the entire document
            if len(toc_pages) >= 15:
                toc_format_ended = True
    
    return toc_pages

def extract_toc_content(pdf_path, toc_pages):
    toc_content = []
    
    for page_num in toc_pages:
        page_content = []
        page = list(extract_pages(pdf_path, [page_num]))[0]
        
        # Recursive function to extract text from all types of elements
        def extract_text_from_element(element):
            texts = []
            # If element has get_text method, use it
            if hasattr(element, 'get_text'):
                text = element.get_text().strip()
                if text:
                    texts.append(text)
            # If element has child objects, process them
            if hasattr(element, '_objs'):
                for obj in element._objs:
                    texts.extend(extract_text_from_element(obj))
            return texts
        
        # Extract text from all elements on the page
        for element in page:
            page_content.extend(extract_text_from_element(element))
        
        toc_content.append("\n".join(page_content))
    
    return toc_content

path = "../../data/Natural_Image_Statistics.pdf"

# Usage
toc_page_numbers = extract_toc_pages(path)
print(f"Table of Contents found on pages: {toc_page_numbers}")

# Get and display the content from those pages
toc_content = extract_toc_content(path, toc_page_numbers)
for i, content in enumerate(toc_content):
    print(f"=== TOC PAGE {toc_page_numbers[i]+1} ===")
    print(content)
    print("\n")

In [28]:
for i, content in enumerate(toc_content):
    #print(f"=== TOC PAGE {toc_page_numbers[i]+1} ===")
    print(content)
    #print("\n")

Aapo Hyv¨arinen
Jarmo Hurri
Patrik O. Hoyer
Aapo Hyv¨arinen
A
a
p
o
H
y
v
¨
a
r
i
n
e
n
Jarmo Hurri
J
a
r
m
o
H
u
r
r
i
Patrik O. Hoyer
P
a
t
r
i
k
O
.
H
o
y
e
r
Natural Image Statistics
Natural Image Statistics
N
a
t
u
r
a
l
I
m
a
g
e
S
t
a
t
i
s
t
i
c
s
A probabilistic approach to early
computational vision
A probabilistic approach to early
A
p
r
o
b
a
b
i
l
i
s
t
i
c
a
p
p
r
o
a
c
h
t
o
e
a
r
l
y
computational vision
c
o
m
p
u
t
a
t
i
o
n
a
l
v
i
s
i
o
n
February 27, 2009
February 27, 2009
F
e
b
r
u
a
r
y
2
7
,
2
0
0
9
Springer
Springer
S
p
r
i
n
g
e
r


In [10]:
toc

[]

In [None]:
from pdfminer.high_level import extract_pages, extract_text_to_fp
from pdfminer.layout import LAParams
from io import StringIO

def find_toc_pages(pdf_path):
    toc_pages = []
    toc_started = False
    toc_format_ended = False
    
    # Process all pages to find ToC pages
    for i, page_layout in enumerate(extract_pages(pdf_path)):
        page_text = []
        
        # Extract all text from the current page using various text element types
        for element in page_layout:
            # Handle different types of text elements
            if hasattr(element, 'get_text'):
                text = element.get_text().strip()
                if text:
                    page_text.append(text)
            # Recursively process container elements that might contain text
            elif hasattr(element, '_objs'):
                for obj in element._objs:
                    if hasattr(obj, 'get_text'):
                        text = obj.get_text().strip()
                        if text:
                            page_text.append(text)
        
        # Join all text for analysis
        full_page_text = "\n".join(page_text).lower()
        
        # Check if this page contains "contents" to start capturing
        if not toc_started and ("contents" in full_page_text or "table of contents" in full_page_text):
            toc_started = True
            toc_pages.append(i)
            # Store some characteristics to detect format changes
            toc_format = {
                "avg_line_length": sum(len(line) for line in page_text) / max(1, len(page_text)),
                "line_count": len(page_text),
                "contains_dots": any("..." in line for line in page_text),
                "contains_numbers": any(any(c.isdigit() for c in line) for line in page_text)
            }
            continue
        
        # If we've started the ToC and haven't ended it yet, check if this page continues the ToC format
        if toc_started and not toc_format_ended:
            # Skip if the page is empty
            if not page_text:
                continue
                
            # Calculate current page format characteristics
            current_format = {
                "avg_line_length": sum(len(line) for line in page_text) / max(1, len(page_text)),
                "line_count": len(page_text),
                "contains_dots": any("..." in line for line in page_text),
                "contains_numbers": any(any(c.isdigit() for c in line) for line in page_text)
            }
            
            # Check if format has significantly changed
            format_changed = (
                abs(current_format["avg_line_length"] - toc_format["avg_line_length"]) > 20 or
                abs(current_format["line_count"] - toc_format["line_count"]) > 5 or
                toc_format["contains_dots"] != current_format["contains_dots"] or
                toc_format["contains_numbers"] != current_format["contains_numbers"]
            )
            
            # If format has changed significantly or we see chapter/introduction text, end ToC
            if format_changed or any(keyword in full_page_text for keyword in ["chapter", "introduction", "preface"]):
                toc_format_ended = True
            else:
                toc_pages.append(i)
                
            # Safety check - don't capture more than 10 ToC pages
            if len(toc_pages) >= 10:
                toc_format_ended = True
    
    return toc_pages

def extract_text_with_pymupdf(pdf_path, page_numbers):
    """
    Extract text from specific pages using PyMuPDF (fitz).
    
    Args:
        pdf_path: Path to the PDF file
        page_numbers: List of page numbers to extract text from (0-based)
    
    Returns:
        A list of strings, where each string contains the text from one page
    """
    all_text = []
    
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # For each selected page, extract text
    for page_num in page_numbers:
        if 0 <= page_num < len(doc):
            page = doc[page_num]
            
            # Extract text with different methods and see which works best
            text = page.get_text()
            
            # Add to results
            all_text.append(text)
    
    # Close the document
    doc.close()
    
    return all_text



# Usage
pdf_path = "../../data/Natural_Image_Statistics.pdf"

# Find the table of contents pages
toc_page_numbers = find_toc_pages(pdf_path)
print(f"Table of Contents found on pages: {[p+1 for p in toc_page_numbers]} (1-based)")

# Extract text from those pages using PyMuPDF
if toc_page_numbers:
    page_texts = extract_text_with_pymupdf(pdf_path, toc_page_numbers)
    
    # Print text from each page
    for i, text in enumerate(page_texts):
        print(f"\n\n=== Page {toc_page_numbers[i] + 1} ===")
        print(text)
        
    # Save to a text file
    with open("toc_text.txt", "w", encoding="utf-8") as f:
        for i, text in enumerate(page_texts):
            f.write(f"\n\n=== Page {toc_page_numbers[i] + 1} ===\n")
            f.write(text)
    
    print(f"\nExtracted text saved to toc_text.txt")
else:
    print("No Table of Contents pages found.")





Table of Contents found on pages: [3] (1-based)


=== Page 3 ===
Contents overview
1
Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
1
Part I Background
2
Linear ﬁlters and frequency analysis . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 25
3
Outline of the visual system . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 51
4
Multivariate probability and statistics . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69
Part II Statistics of linear features
5
Principal components and whitening . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 97
6
Sparse coding and simple cells . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 137
7
Independent component analysis. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 159
8
Information-theoretic interpretations. . . . . . . . . . . . . . . . . . . . . . . . . . . . . 185
Part III Nonlinear featur

In [30]:
toc_page_numbers

[2]

In [38]:
import fitz  # PyMuPDF
import logging
import warnings
import re

# Suppress warnings
logging.getLogger('pdfminer').setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

def find_and_extract_toc(pdf_path, max_pages_to_check=30, max_toc_pages=10):
    """
    Improved function to find and extract TOC text in a single pass.
    """
    toc_pages = []
    toc_texts = []
    
    # Open the PDF just once
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    
    # Only check the first specified number of pages
    pages_to_check = min(max_pages_to_check, total_pages)
    
    # First pass: Find the first TOC page
    first_toc_page = -1
    for i in range(pages_to_check):
        page = doc[i]
        text = page.get_text().lower()
        
        # Check if this page contains TOC indicators
        if (("contents" in text and not "table of figures" in text) or 
            "table of contents" in text):
            first_toc_page = i
            break
    
    # If we found a TOC page, capture it and subsequent pages with similar format
    if first_toc_page >= 0:
        # Get the first TOC page
        page = doc[first_toc_page]
        text = page.get_text()
        toc_pages.append(first_toc_page)
        toc_texts.append(text)
        
        # Create pattern for chapter lines (common in academic books)
        chapter_pattern = re.compile(r'chapter\s+\d+', re.IGNORECASE)
        
        # Check for chapter entries in the first TOC page
        has_chapter_entries = bool(chapter_pattern.search(text.lower()))
        
        # Check subsequent pages
        for i in range(first_toc_page + 1, min(first_toc_page + max_toc_pages, total_pages)):
            page = doc[i]
            text = page.get_text()
            lower_text = text.lower()
            
            # Skip if this page is empty
            if not text.strip():
                continue
            
            # Strong indicators that this is NOT a TOC page
            if any(x in lower_text for x in [
                "chapter 1.", 
                "introduction.", 
                "preface.", 
                "chapter one.", 
                "acknowledgements."
            ]):
                break
            
            # If first TOC page had chapter entries, check if this page also has them
            if has_chapter_entries:
                current_has_chapters = bool(chapter_pattern.search(lower_text))
                # For academic books, chapters are a strong indicator of TOC pages
                if current_has_chapters:
                    toc_pages.append(i)
                    toc_texts.append(text)
                    continue
            
            # Check for content patterns common in TOC pages
            is_toc_page = False
            
            # Check if page contains numbered entries (e.g., "1.1", "2.3", etc.)
            if re.search(r'\d+\.\d+', lower_text):
                is_toc_page = True
                
            # Check if page contains page numbers (patterns like "... 123" or "... 45")
            if re.search(r'\.+\s*\d+', lower_text):
                is_toc_page = True
                
            # Check for sequences of short entries followed by numbers (typical TOC format)
            lines = [line.strip() for line in text.split('\n') if line.strip()]
            if any(re.search(r'.{5,50}\s+\d+$', line) for line in lines):
                is_toc_page = True
            
            # If nothing indicates this is a TOC page, stop
            if not is_toc_page:
                break
                
            # Add this page to the TOC
            toc_pages.append(i)
            toc_texts.append(text)
    
    # Close the document
    doc.close()
    
    return toc_pages, toc_texts

# Usage
pdf_path = "../../data/mcelreath_2020_statistical-rethinking.pdf"

# Find and extract ToC with more robust detection
toc_page_numbers, toc_texts = find_and_extract_toc(pdf_path, max_pages_to_check=30, max_toc_pages=10)
print(f"Table of Contents found on pages: {[p+1 for p in toc_page_numbers]} (1-based)")

# Process the extracted text
if toc_page_numbers:
    print(f"Found {len(toc_page_numbers)} TOC pages")
    
    # Save to file
    with open("toc_text.txt", "w", encoding="utf-8") as f:
        for i, text in enumerate(toc_texts):
            f.write(f"\n\n=== Page {toc_page_numbers[i] + 1} ===\n")
            f.write(text)
    
    print(f"\nExtracted text saved to toc_text.txt")
else:
    print("No Table of Contents pages found.")
    

Table of Contents found on pages: [6, 7, 8, 10, 11] (1-based)
Found 5 TOC pages

Extracted text saved to toc_text.txt
