# Load PDF

In [None]:
import pymupdf

# Open the textbook and take a single page (0-based index 34) to inspect.
doc = pymupdf.open("../data/raw/World History Textbook-082817.pdf")
page = doc[34]

# "blocks" extraction yields one tuple per block; the first four items are
# the bounding box and the fifth is the block's text.
blocks = page.get_text("blocks")
print(len(blocks))

separator = "-" * 80
for block in blocks:
    bbox, text = block[:4], block[4]
    # Show at most the first 200 characters, flattened onto one line.
    preview = text[:200].replace("\n", " ")
    print("BLOCK:", bbox)
    print(preview, "...")
    print(separator)

Get blocks with PyMuPDF

In [54]:
import pymupdf

doc = pymupdf.open("../data/raw/World History Textbook-082817.pdf")
page = doc[25]  # 0-based page index under test; change to inspect another page

blocks = page.get_text("blocks")
print(len(blocks))

# Dump every block with its bounding box and its unprocessed text.
for i, b in enumerate(blocks):
    x0, y0, x1, y1 = b[:4]
    text = b[4]
    print(f"\n===== BLOCK {i} =====")
    print(f"bbox = ({x0:.2f}, {y0:.2f}, {x1:.2f}, {y1:.2f})")
    print("raw text:")
    # repr() makes newlines and other control characters visible.
    print(repr(text))


11

===== BLOCK 0 =====
bbox = (286.52, 740.81, 327.88, 754.59)
raw text:
'Page | 17 \n'

===== BLOCK 1 =====
bbox = (415.12, 45.30, 540.00, 57.03)
raw text:
'CHAPTER 1: PREHISTORY\n'

===== BLOCK 2 =====
bbox = (72.00, 87.47, 542.61, 462.66)
raw text:
'During the Paleolithic Era, and until recently in fact, a child would be breastfed until he or she was three or four years old, a necessity preventing mothers from joining long-distance hunting expeditions without their toddlers. However, a breastfeeding woman could complete tasks that "don\'t require rapt concentration, are relatively dull and repetitive; they are easily interrupted, don\'t place the child in danger, and don\'t require the participant to stray far from home." \nDuri ng the  Paleolit hic  Era, and until recently in fact, a child would be breastfed until he or she was three or four years old, a necessity preventing mothers from joining long-distance hunting expeditions without their toddlers. However, a breastfeeding wom

## Fix duplicates within blocks

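Test the de-duplication (note: the cell below runs it on every page of the PDF, not just one, and writes the result to JSON)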

In [None]:
import json
import re
import pymupdf

def clean_page(page):
    """Return the de-duplicated text of one PDF page as a single string.

    The page is read with ``page.get_text("blocks")``; the text of each block
    is taken from index 4 of the block tuple. Processing steps:

    1. Skip empty blocks, short blocks (< 40 chars) containing ``"page |"``,
       and blocks whose text starts with ``figure``, ``map``, ``license:``,
       ``source:`` or ``author:`` (case-insensitive).
    2. Split the remaining blocks into lines and drop a line when
       - its key (lowercased, only ``a-z0-9`` kept, first 80 chars) was
         already seen on this page -- this catches copies whose only
         difference is stray whitespace/punctuation; or
       - its whitespace-normalized form is contained in a line kept earlier.
    3. If a line *contains* an earlier kept line, the earlier line is
       replaced by the longer one, so the fuller text is what gets emitted.

    Parameters
    ----------
    page
        A page object exposing ``get_text("blocks")`` (a PyMuPDF page here).

    Returns
    -------
    str
        Kept lines joined by single spaces, with whitespace runs collapsed.
    """
    kept_lines = []    # lines to emit, in reading order (original casing)
    kept_norms = []    # normalized form of each kept line, parallel to kept_lines
    seen_keys = set()  # alphanumeric-only keys of the lines already kept

    for b in page.get_text("blocks"):
        text = b[4].strip()
        if not text:
            continue

        lowered = text.lower()
        if ("page |" in lowered and len(text) < 40) or \
           lowered.startswith(("figure ", "map ", "license:", "source:", "author:")):
            continue

        for raw_line in text.split("\n"):
            p = raw_line.strip()
            if not p:
                continue

            p_norm = re.sub(r"\s+", " ", p.lower()).strip()
            k = re.sub(r"[^a-z0-9]+", "", p_norm)[:80]

            if not k or k in seen_keys:
                continue

            # Substring check against the lines already kept on this page.
            skip = False
            for idx, ex_norm in enumerate(kept_norms):
                if p_norm in ex_norm:
                    # Already covered by an earlier, longer line.
                    skip = True
                    break
                if ex_norm in p_norm:
                    # The new line extends an earlier fragment: swap it in at
                    # the fragment's position. The page text is built from
                    # kept_lines, so the longer version is what is emitted
                    # (previously the replacement was recorded but the output
                    # still used the shorter fragment and lost the new line).
                    kept_lines[idx] = p
                    kept_norms[idx] = p_norm
                    seen_keys.add(k)
                    skip = True
                    break

            if skip:
                continue

            seen_keys.add(k)
            kept_lines.append(p)
            kept_norms.append(p_norm)

    # finalize page string
    return re.sub(r"\s+", " ", " ".join(kept_lines)).strip()


doc = pymupdf.open("../data/raw/World History Textbook-082817.pdf")
final_data = {}

# loop over pages
for i, page in enumerate(doc):
    print(f"processing page {i+1}...")

    full_text = clean_page(page)

    final_data[f"page_{i+1}"] = {
        "page": i + 1,
        "text": full_text,
        "char_count": len(full_text),
        "word_count": len(full_text.split())
    }

with open("../data/clean/cleaned_book_metadata.json", "w", encoding="utf8") as f:
    json.dump(final_data, f, ensure_ascii=False, indent=2)

print("Done.")

Done.


In [None]:
# Process entire PDF
# Relies on `doc`, `json` and `clean_page` already being defined in the kernel.
output = {}

for i, page in enumerate(doc):
    cleaned = clean_page(page)
    n_chars = len(cleaned)
    n_words = len(cleaned.split())

    # One record per page, keyed with a 1-based page number.
    output[f"page_{i+1}"] = {
        "page": i + 1,
        "text": cleaned,
        "char_count": n_chars,
        "word_count": n_words,
    }

# Serialize first, then write the finished string to disk.
serialized = json.dumps(output, ensure_ascii=False, indent=2)
with open("../data/clean/cleaned_book_metadata.json", "w", encoding="utf8") as f:
    f.write(serialized)

print(f"Processed {len(doc)} pages.")
print("Saved to: ../data/clean/cleaned_book_metadata.json")