In [6]:
import fitz
import warnings
from pdfminer.high_level import extract_text, extract_pages

In [None]:
# Example PDFs available to this notebook, keyed by example id
examples = dict(
    pdf_path1="../data/mcelreath_2020_statistical-rethinking.pdf",
    pdf_path2="../data/Theory of Statistic.pdf",
    pdf_path3="../data/Deep Learning with Python.pdf",
    pdf_path4="../data/Natural_Image_Statistics.pdf",
    pdf_path5="../data/mml-book.pdf",
)

# For every example id, the page indices whose content should be extracted
content_page_ranges = dict(
    pdf_path1=range(5, 8),
    pdf_path2=range(10, 17),
    pdf_path3=range(7, 13),
    pdf_path4=range(4, 13),
    pdf_path5=range(2, 5),
)

# Pick the example to work on; `key` is the id used to look up both dicts above
n_example = 4
key = "pdf_path" + str(n_example)

# Load the selected example PDF with PyMuPDF
doc = fitz.open(examples[key])

# Gather the plain text of each page in the configured range, in page order
chapters_content_list = [
    doc[page_index].get_text("text")
    for page_index in content_page_ranges[key]
]

# Merge the per-page texts into one newline-separated string
chapters_content = "\n".join(chapters_content_list)

print(chapters_content)  # or pass it to your model

In [None]:
def extract_font_info(pdf_path, page_numbers=None):
    """
    Collect the text spans of selected pages of a PDF together with their position.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        page_numbers: Iterable of page indices (as accepted by
            ``doc.load_page``) to read. When omitted, the notebook-level
            ``content_page_ranges[key]`` is used, so existing one-argument
            calls keep reading the pages of the currently selected example.

    Returns:
        A list with one dict per text span, in the order PyMuPDF yields them:
            - "text": the span's text
            - "page": the page index plus one
            - "coordinates": ``(x, y)`` taken from the span's origin
    """
    if page_numbers is None:
        # Backward-compatible default: the page range of the selected example.
        page_numbers = content_page_ranges[key]

    font_data = []

    # The context manager closes the document even if extraction fails,
    # so repeated calls do not accumulate open file handles.
    with fitz.open(pdf_path) as doc:
        for page_num in page_numbers:
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]  # Extract text blocks

            for block in blocks:
                # Blocks without a "lines" entry have no spans to read.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        # span also carries "font", "size" and "color";
                        # those attributes are not collected here.
                        font_data.append({
                            "text": span["text"],
                            "page": page_num + 1,
                            "coordinates": (span["origin"][0], span["origin"][1]),
                        })

    return font_data

# Usage: collect the text spans for the currently selected example PDF.
# Relies on `examples` and `key` already being defined in the notebook session.
font_info = extract_font_info(examples[key])


def extract_lines_from_font_info(font_info, y_tolerance=0.0):
    """
    Group consecutive text spans into lines based on their y-coordinate.

    Consecutive elements are merged into one line (joined by a single space)
    while they stay on the same page and share the same y-coordinate. A change
    of page always starts a new line, so the last span of one page is never
    glued to the first span of the next page just because their y-coordinates
    happen to match.

    Args:
        font_info: Sequence of dicts with a "text" entry and a "coordinates"
            entry whose second item is the y-coordinate. An optional "page"
            entry is used to detect page changes; elements without it are
            treated as belonging to the same page.
        y_tolerance: Maximum absolute difference between the y-coordinates of
            two consecutive elements for them to count as the same line. The
            default of 0.0 requires an exact match.

    Returns:
        A list of stripped, non-empty line strings in input order. An empty
        or missing input yields an empty list.
    """
    if not font_info:
        return []

    lines = []
    parts = []        # texts of the spans that make up the line being built
    prev_y = None
    prev_page = None

    def flush():
        # Emit the pending line unless it consists of whitespace only.
        merged = " ".join(parts).strip()
        if merged:
            lines.append(merged)

    for element in font_info:
        cur_y = element['coordinates'][1]
        cur_page = element.get('page')

        if prev_y is None:
            same_line = True
        else:
            # Exact equality is checked first so the default tolerance behaves
            # exactly like a plain `==` comparison.
            y_matches = cur_y == prev_y or (
                y_tolerance > 0 and abs(cur_y - prev_y) <= y_tolerance
            )
            same_line = y_matches and cur_page == prev_page

        if not same_line:
            flush()
            parts = []

        parts.append(element['text'])
        prev_y = cur_y
        prev_page = cur_page

    # Don't forget the last line
    flush()

    return lines

In [None]:
# Extract the spans for the selected example, group them into lines by
# y-coordinate, and print one line per row.
font_info = extract_font_info(examples[key])
lines = extract_lines_from_font_info(font_info)
for line in lines:
    print(line)

'1.1 What this book is all about . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1'