In [4]:
import fitz  # PyMuPDF

# Open the book PDF. The handle is not closed here; `doc` is reused further down.
pdf_path = "../../data/mcelreath_2020_statistical-rethinking.pdf"
doc = fitz.open(pdf_path)

# Zero-based indices of the pages to read; adjust the range as needed.
content_page_range = range(5, 7)

# Plain text of every selected page, in page order.
chapters_content = [
    doc[page_num].get_text("text") for page_num in content_page_range
]


In [5]:
# Display the raw text extracted from the selected pages (one string per page).
chapters_content

['Contents\nPreface to the Second Edition\nix\nPreface\nxi\nAudience\nxi\nTeaching strategy\nxii\nHow to use this book\nxii\nInstalling the rethinking R package\nxvi\nAcknowledgments\nxvi\nChapter 1.\nThe Golem of Prague\n1\n1.1.\nStatistical golems\n1\n1.2.\nStatistical rethinking\n4\n1.3.\nTools for golem engineering\n10\n1.4.\nSummary\n17\nChapter 2.\nSmall Worlds and Large Worlds\n19\n2.1.\nThe garden of forking data\n20\n2.2.\nBuilding a model\n28\n2.3.\nComponents of the model\n32\n2.4.\nMaking the model go\n36\n2.5.\nSummary\n46\n2.6.\nPractice\n46\nChapter 3.\nSampling the Imaginary\n49\n3.1.\nSampling from a grid-approximate posterior\n52\n3.2.\nSampling to summarize\n53\n3.3.\nSampling to simulate prediction\n61\n3.4.\nSummary\n68\n3.5.\nPractice\n68\nChapter 4.\nGeocentric Models\n71\n4.1.\nWhy normal distributions are normal\n72\n4.2.\nA language for describing models\n77\n4.3.\nGaussian model of height\n78\n4.4.\nLinear prediction\n91\n4.5.\nCurves from lines\n110\n4.6.\nS

In [6]:
import fitz  # PyMuPDF

# Maps a 0-based page index of `doc` (opened in an earlier cell) to the page
# number found in that page's footer band. Pages without a match get no entry.
page_number_map = {}

for i, page in enumerate(doc):
    # Spans whose top edge lies below this y-coordinate are in the bottom 15%
    # of the page.
    footer_top = page.rect.height * 0.85

    for block in page.get_text("dict")["blocks"]:
        # Blocks without a "lines" entry have no text spans to inspect.
        if "lines" not in block:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                # Heuristic: a page number is an all-digit span in the footer band.
                if span["bbox"][1] <= footer_top:
                    continue
                clean_text = span["text"].strip()
                # isdecimal() rather than isdigit(): isdigit() also accepts
                # characters such as superscript footnote markers ("¹"), which
                # int() cannot parse and would raise ValueError on.
                if clean_text.isdecimal():
                    # If several numeric spans match on one page, the last wins.
                    page_number_map[i] = int(clean_text)

In [None]:
import fitz  # PyMuPDF

def extract_page_data_fitz(pdf_path, margin_ratio=0.15):
    """Read every page of a PDF and guess its printed page number.

    For each page, the whitespace-separated words inside the top band and the
    bottom band are scanned in that order (top band first). The first word
    consisting only of decimal digits is taken as the page number.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        margin_ratio: Fraction of the page height used for the top band and
            for the bottom band. Defaults to 0.15.

    Returns:
        A list with one dict per page, in document order, with the keys
        ``"index"`` (0-based page index), ``"number"`` (the detected page
        number as ``int``, or ``None`` if no numeric word was found) and
        ``"content"`` (the full text of the page).
    """
    pages_data = []
    doc = fitz.open(pdf_path)
    try:
        for i, page in enumerate(doc):
            height = page.rect.height
            width = page.rect.width

            top_rect = fitz.Rect(0, 0, width, height * margin_ratio)
            bottom_rect = fitz.Rect(0, height * (1 - margin_ratio), width, height)

            top_text = page.get_text("text", clip=top_rect).split()
            bottom_text = page.get_text("text", clip=bottom_rect).split()

            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript footnote markers ("¹"), which
            # int() cannot parse and would raise ValueError on.
            found_number = next(
                (int(word) for word in top_text + bottom_text if word.isdecimal()),
                None,
            )

            pages_data.append({
                "index": i,
                "number": found_number,
                "content": page.get_text("text"),
            })
    finally:
        # Release the document even when reading a page fails.
        doc.close()

    return pages_data


def correct_page_numbers(pages_data, sequence_length=10):
    """Renumber all pages from a trusted run of detected page numbers.

    The first run of ``sequence_length`` adjacent pages whose detected numbers
    increase by exactly 1 from page to page is used as the anchor. Every page
    is then numbered by its distance from the anchor, and any resulting number
    below 1 is replaced by ``None``.

    Args:
        pages_data: List of dicts, each with a ``"number"`` key holding an
            ``int`` or a non-int value such as ``None``. The dicts are updated
            in place.
        sequence_length: Number of adjacent, consecutively numbered pages
            required to accept an anchor. Values below 1 are treated as 1.

    Returns:
        The same ``pages_data`` list, with every ``"number"`` rewritten.

    Raises:
        ValueError: If no qualifying run exists. Nothing has been modified
            when this is raised.
    """
    # Clamping keeps the previous result for non-empty input and turns the
    # IndexError on empty input into the ValueError below.
    run_length = max(1, sequence_length)

    # (list position, detected number) for every page with an integer number.
    seen = [
        (i, d["number"])
        for i, d in enumerate(pages_data)
        if isinstance(d["number"], int)
    ]

    anchor = None
    for start in range(len(seen) - run_length + 1):
        first_pos, first_num = seen[start]
        # Positions and numbers must advance together: the fill below assumes
        # the number grows by 1 per page, so a run that skips over a page
        # with no detected number would contradict it.
        if all(
            seen[start + j] == (first_pos + j, first_num + j)
            for j in range(run_length)
        ):
            anchor = (first_pos, first_num)
            break

    if anchor is None:
        raise ValueError("No valid sequence of page numbers found.")

    base_index, base_number = anchor

    # Number every page by its offset from the anchor; front-matter pages
    # that would fall below 1 have no page number.
    for position, page in enumerate(pages_data):
        number = base_number + (position - base_index)
        page["number"] = number if number >= 1 else None

    return pages_data

In [None]:
# Candidate input PDFs; only pdf_path1 is used in this cell.
pdf_path1 = "../../data/mcelreath_2020_statistical-rethinking.pdf"
pdf_path2 = "../../data/Theory of Statistic.pdf"
pdf_path3 = "../../data/Deep Learning with Python.pdf"
# NOTE(review): pdf_path4 points at the same file as pdf_path1 — confirm whether a different book was intended.
pdf_path4 = "../../data/mcelreath_2020_statistical-rethinking.pdf"
pdf_path5 = "../../data/mml-book.pdf"



# One dict per page: 0-based index, detected page number (or None), full text.
pages_data = extract_page_data_fitz(pdf_path1)

try:
    # Renumber every page from the first run of 10 consecutive detected numbers.
    corrected_pages = correct_page_numbers(pages_data, sequence_length=10)
except ValueError as e:
    # Raised when no such run exists; the raise happens before any page dict
    # is modified, so pages_data still holds the detected numbers.
    print(f"Warning: {e}. Using original page data without correction.")
    corrected_pages = pages_data  # fallback to original

# Print the index -> page number mapping for every page.
for p in corrected_pages:
    print(f"Page index: {p['index']}, Page number: {p['number']}")

[{'index': 0, 'number': None, 'content': ''},
 {'index': 1, 'number': None, 'content': 'Statistical Rethinking\n'},
 {'index': 2,
  'number': None,
  'content': 'CHAPMAN & HALL/CRC\nTexts in Statistical Science Series\nJoseph K. Blitzstein, Harvard University, USA  \nJulian J. Faraway, University of Bath, UK  \nMartin Tanner, Northwestern University, USA \nJim Zidek, University of British Columbia, Canada\nRecently Published Titles\nTheory of Spatial Statistics \nA Concise Introduction \nM.N.M van Lieshout\nBayesian Statistical Methods \nBrian J. Reich and Sujit K. Ghosh\nSampling \nDesign and Analysis, Second Edition \nSharon L. Lohr\nThe Analysis of Time Series \nAn Introduction with R, Seventh Edition \nChris Chatfield and Haipeng Xing\nTime Series \nA Data Analysis Approach Using R \nRobert H. Shumway and David S. Stoffer\nPractical Multivariate Analysis, Sixth Edition \nAbdelmonem Afifi, Susanne May, Robin A. Donatello, and Virginia A. Clark\nTime Series: A First Course with Boots