In [3]:
import re
import json
from collections import Counter
from pypdf import PdfReader

# -----------------------------
# CONFIGURATION
# -----------------------------
# Input PDF and the JSON file the cleaned pages are written to.
PDF_PATH = "../resources/miller_anes_test.pdf"
OUTPUT_JSON = "../resources/cleaned_miller_chapter.json"

# -----------------------------
# STEP 1: LOAD PDF
# -----------------------------
reader = PdfReader(PDF_PATH)

# Extract the text of each page and keep only pages that yield text with
# more than 20 characters once surrounding whitespace is stripped.
# NOTE(review): dropped pages are not tracked, so positions in raw_pages no
# longer line up with the PDF's own page order once a page is skipped.
raw_pages = [
    extracted
    for extracted in (pdf_page.extract_text() for pdf_page in reader.pages)
    if extracted and len(extracted.strip()) > 20
]
print(f"Loaded {len(raw_pages)} pages from {PDF_PATH}")

# -----------------------------
# STEP 2: BASIC CLEANING
# -----------------------------
def clean_page_text(text):
    """
    Clean the extracted text of one PDF page.

    Steps, in order:
    1. Strip trailing blanks from every line.
    2. Remove lines that contain nothing but a page number ("123", "- 123 -").
    3. Join words hyphenated across a line break.
    4. Turn single line breaks inside a paragraph into spaces.
    5. Collapse runs of blank lines into one paragraph break ("\\n\\n").
    6. Collapse runs of spaces/tabs into a single space.

    Args:
        text: Raw text of a single page.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Trailing blanks would hide hyphenated line ends from step 3 and make
    # whitespace-only lines look like two unrelated single line breaks.
    text = re.sub(r"[ \t]+\n", "\n", text)

    # Only whole lines made of a number (optionally wrapped in dashes) are
    # page numbers. The pattern is anchored to line boundaries so numbers
    # inside sentences (doses, years, table references) are left alone.
    # The line's own newline is consumed so no blank line is left behind.
    text = re.sub(
        r"^[ \t]*-?[ \t]*\d+[ \t]*-?[ \t]*(?:\n|$)",
        "",
        text,
        flags=re.MULTILINE,
    )

    # Join hyphenated words across line breaks
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)

    # Replace single line breaks inside paragraphs with a space
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Collapse multiple newlines into paragraph breaks
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    # Collapse multiple spaces
    text = re.sub(r"[ \t]+", " ", text)

    return text.strip()

# Run the basic cleaning over every loaded page.
clean_pages = list(map(clean_page_text, raw_pages))

# -----------------------------
# STEP 3: REMOVE REPEATED HEADERS/FOOTERS
# -----------------------------
from collections import Counter

# Collect the first and last line of every non-empty cleaned page; lines that
# repeat across pages are treated as running headers/footers.
first_lines = [pg.partition("\n")[0].strip() for pg in clean_pages if pg]
last_lines = [pg.rpartition("\n")[2].strip() for pg in clean_pages if pg]

# A line counts as a header/footer when it is non-empty and occurs on more
# than 3 pages.
common_headers = [
    line for line, count in Counter(first_lines).items() if count > 3 and line
]
common_footers = [
    line for line, count in Counter(last_lines).items() if count > 3 and line
]

def remove_headers_footers(text, headers=None, footers=None):
    """
    Drop a repeated header from the top and a repeated footer from the bottom
    of one page.

    Only the first and last line of the page are inspected, mirroring how the
    header/footer candidates are detected. Occurrences of the same string
    inside the body are left untouched; a blanket ``str.replace`` would delete
    them as well.

    Args:
        text: Cleaned text of a single page.
        headers: Header lines to strip; defaults to the module-level
            ``common_headers``.
        footers: Footer lines to strip; defaults to the module-level
            ``common_footers``.

    Returns:
        The page text without its header/footer line, stripped of
        leading/trailing whitespace.
    """
    if headers is None:
        headers = common_headers
    if footers is None:
        footers = common_footers

    lines = text.split("\n")
    if lines and lines[0].strip() in headers:
        lines = lines[1:]
    # Re-check emptiness: a one-line page may have just lost its only line.
    if lines and lines[-1].strip() in footers:
        lines = lines[:-1]
    return "\n".join(lines).strip()

# Strip the detected running headers/footers from every cleaned page.
clean_pages = list(map(remove_headers_footers, clean_pages))

# -----------------------------
# STEP 5: REMOVE REFERENCES SECTION
# -----------------------------
def remove_references(text):
    """
    Remove everything from the word 'References' to the end of the text.

    The match is case-insensitive and must start at a word boundary, so words
    that merely contain the letters (e.g. "preferences") do not truncate the
    page.

    NOTE(review): any standalone use of the word "references" in running text
    still cuts the page at that point; the cleaned text gives no reliable way
    to tell a heading from an inline mention.

    Args:
        text: Text of a page (or a whole document).

    Returns:
        The text up to the first whole-word "references", stripped of
        leading/trailing whitespace. Unchanged (apart from stripping) when the
        word does not occur.
    """
    # DOTALL lets ".*" run across newlines to the very end of the text.
    pattern = r"\bREFERENCES.*"
    cleaned_text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.DOTALL)
    return cleaned_text.strip()

# Cut each page at its references section.
clean_pages = list(map(remove_references, clean_pages))

# Save all pages as one plain-text file, with a blank line between pages.
full_text = "\n\n".join(clean_pages)
with open("cleaned_test_chapter.txt", mode="w", encoding="utf-8") as f:
    f.write(full_text)

# -----------------------------
# STEP 6: DETECT CHAPTER TITLES
# -----------------------------
# Either an explicit "CHAPTER <n>" marker or a run of whole uppercase words
# separated by spaces/tabs. The word boundaries keep the match from running
# into the capital letter of the next ordinary word ("ANESTHETICS The" must
# not yield "ANESTHETICS T") or across a paragraph break.
chapter_pattern = re.compile(r"CHAPTER\s+\d+|\b[A-Z]+(?:[ \t]+[A-Z]+)*\b")

# Shorter uppercase runs (abbreviations, initials) are not treated as headings.
_MIN_HEADING_LENGTH = 6


def detect_chapter(page):
    """
    Look for a chapter heading near the top of a page.

    Only the first 200 characters are searched. The first candidate that is
    at least ``_MIN_HEADING_LENGTH`` characters long wins.

    Args:
        page: Cleaned text of a single page.

    Returns:
        The heading text, or None when no heading is found.
    """
    for match in chapter_pattern.finditer(page[:200]):
        heading = match.group(0).strip()
        if len(heading) >= _MIN_HEADING_LENGTH:
            return heading
    return None

# One record per cleaned page: its text plus page/chapter/source metadata.
documents = []
current_chapter = "Unknown Chapter"

for page_number, page in enumerate(clean_pages, start=1):
    # A page that introduces its own heading starts a new chapter; otherwise
    # it inherits the most recently seen one. The detector is called once
    # per page.
    current_chapter = detect_chapter(page) or current_chapter

    documents.append({
        "content": page,
        "metadata": {
            # Position within clean_pages (1-based).
            "page_number": page_number,
            "chapter": current_chapter,
            "source": PDF_PATH,
        },
    })

# -----------------------------
# STEP 7: SAVE CLEANED OUTPUT
# -----------------------------
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(documents, f, ensure_ascii=False, indent=2)

print(f"✅ Cleaned text saved to {OUTPUT_JSON} ({len(documents)} pages)")


Loaded 9 pages from ../resources/miller_anes_test.pdf
✅ Cleaned text saved to ../resources/cleaned_miller_chapter.json (9 pages)
