# Load data

In [100]:
import json

# Read the whole file, then parse it. `text` ends up as the decoded JSON
# object; the following cells iterate it with `.items()` and read a 'page'
# entry from each value.
with open("../data/clean/cleaned_book_metadata.json", "r", encoding="utf8") as source_file:
    text = json.loads(source_file.read())

# Clean data

## Remove front and back boilerplate pages

In [101]:
# Inclusive page range that is kept; pages numbered outside it are dropped
# (the section heading describes them as top/bottom boilerplate).
FIRST_KEPT_PAGE = 10
LAST_KEPT_PAGE = 486

# Same mapping as `text`, restricted to entries whose 'page' is in range.
filtered = {
    key: value
    for key, value in text.items()
    if FIRST_KEPT_PAGE <= value['page'] <= LAST_KEPT_PAGE
}

## Add chapter details metadata

In [102]:
import re

# Matches a chapter header such as "CHAPTER 1: PREHISTORY".
#   group(1) -> the chapter number (one or more digits)
#   group(2) -> the title: one uppercase letter followed by one or more
#               uppercase letters, spaces, or hyphens
# The title character class excludes digits, lowercase letters and other
# punctuation, so the capture stops at the first such character (for example
# before "1.3" in "CHAPTER 1: PREHISTORY 1.3 QUESTIONS ...").
# NOTE(review): for the same reason a title that itself contains a digit or
# other punctuation would be cut short at that character; confirm against the
# book's actual chapter titles.
CHAPTER_REGEX = re.compile(
    r"CHAPTER\s+(\d+)\s*:\s*([A-Z][A-Z \-]+)"
)

def normalize_chapter_title(title):
    """Return a cleaned, upper-cased chapter title.

    The title is stripped and upper-cased, every character that is neither
    alphanumeric nor whitespace is removed, runs of whitespace collapse to a
    single space, and trailing one-character words are dropped (treated as
    OCR residue).

    Returns "UNTITLED" when `title` is falsy or nothing survives the cleanup.
    """
    fallback = "UNTITLED"
    if not title:
        return fallback

    # Drop punctuation and symbols; keep letters, digits and whitespace.
    kept_chars = [ch for ch in title.strip().upper() if ch.isalnum() or ch.isspace()]
    words = "".join(kept_chars).split()

    # Walk back from the end past every one-character word.
    end = len(words)
    while end > 0 and len(words[end - 1]) <= 1:
        end -= 1

    return " ".join(words[:end]) or fallback

def extract_chapter_details(text, head_chars=250):
    """Find a "CHAPTER <n>: <TITLE>" header near the start of a page.

    Args:
        text: Page text. ``None`` or an empty string means "no header".
        head_chars: How many leading characters are searched for the header.
            Defaults to 250, the value that used to be hard-coded.

    Returns:
        ``(chapter_number, chapter_title)`` where the number is an ``int`` and
        the title has been passed through ``normalize_chapter_title``, or
        ``(None, None)`` when no header is found in the searched window.
    """
    # Guard first: slicing None would raise TypeError.
    if not text:
        return None, None

    # Only the head of the page is searched, so a "CHAPTER n:" mention deeper
    # in the body text is not picked up as a header.
    match = CHAPTER_REGEX.search(text[:head_chars])
    if match is None:
        return None, None

    chapter_number = int(match.group(1))
    chapter_title = normalize_chapter_title(match.group(2))

    return chapter_number, chapter_title

pages_with_metadata = {}

for key, value in filtered.items():
    # Use a dedicated name: rebinding `text` here would overwrite the dict
    # loaded from JSON, and re-running the page-filter cell would then fail on
    # `text.items()`. `or ''` also covers an explicit null stored under 'text'.
    page_text = value.get('text') or ''

    # (None, None) when the page does not start with a chapter header.
    chapter_number, chapter_title = extract_chapter_details(page_text)

    pages_with_metadata[key] = {
        "page": value['page'],
        "text": page_text,
        "char_count": len(page_text),
        "word_count": len(page_text.split()),
        "chapter_number": chapter_number,
        "chapter_title": chapter_title
    }

In [103]:
def backfill_chapter_details(pages):
    """Carry chapter metadata forward and collapse it into one label per page.

    Pages are visited in the mapping's iteration order. A page that has a
    ``chapter_number`` updates the remembered chapter; a page without one
    inherits the most recently remembered chapter. Each page then receives a
    ``chapter_details`` string of the form
    ``"CHAPTER: <number> - <title> | pg-<page>"`` and the intermediate keys
    (``page``, ``char_count``, ``word_count``, ``chapter_number``,
    ``chapter_title``) are removed; every other key, including ``text``, stays.

    Pages that come before the first detected header are labelled
    ``"CHAPTER: None - None | ..."``.
    NOTE(review): the values are only ever carried forward, so a page without
    a header is attributed to the previous chapter; confirm that chapter
    opening pages actually carry a "CHAPTER n:" header.

    The page dicts are modified in place. Entries that already hold a
    ``chapter_details`` label and no ``page`` key are treated as processed and
    left untouched, so calling the function again on its own output is a
    no-op rather than rewriting every label to ``None``.

    Args:
        pages: Mapping of page key to a per-page dict.

    Returns:
        The same ``pages`` mapping that was passed in.
    """
    intermediate_keys = ("page", "char_count", "word_count", "chapter_number", "chapter_title")

    last_chapter_number = None
    last_chapter_title = None

    for value in pages.values():
        # Already collapsed by an earlier call: the inputs needed to rebuild
        # the label are gone, so keep the existing one.
        if "chapter_details" in value and "page" not in value:
            continue

        chapter_num = value.get("chapter_number")

        # Update memory when a chapter header appears
        if chapter_num is not None:
            last_chapter_number = chapter_num
            last_chapter_title = value.get("chapter_title")

        # After the update above, the remembered values are the right ones for
        # this page whether or not it carried its own header.
        page_number = value.get("page")
        value["chapter_details"] = (
            f"CHAPTER: {last_chapter_number} - {last_chapter_title} | pg-{page_number}"
        )

        # Remove only unwanted keys (text must stay)
        for unwanted in intermediate_keys:
            value.pop(unwanted, None)

    return pages
# Fill in chapter info for pages without a header and attach the combined
# 'chapter_details' label. The function edits the page dicts in place and
# returns the same mapping, so the name keeps pointing at the same object.
pages_with_metadata = backfill_chapter_details(pages_with_metadata)

In [104]:
# Spot-check: print each page's chapter label next to the first 100
# characters of its text. Only the values are needed, so iterate them directly.
for value in pages_with_metadata.values():
    print(f"chapter details: {value['chapter_details']}")
    print(f"text snippet: {value['text'][:100]!r}")

chapter details: CHAPTER: None - None | pg-10
text snippet: '1 Eugene Berger. 1.1 Chronology. 8 – 6 million years ago Bi-pedal hominids in Africa 2.6 million yea'
chapter details: CHAPTER: None - None | pg-11
text snippet: 'not content with simply reaching remote places; they were curious about their earliest human inhabit'
chapter details: CHAPTER: 1 - PREHISTORY | pg-12
text snippet: 'CHAPTER 1: PREHISTORY 1.3 QUESTIONS TO GUIDE YOUR READING 1. What were some factors that led to homi'
chapter details: CHAPTER: 1 - PREHISTORY | pg-13
text snippet: 'some time. While some bipedal hominids may have stayed in the forest, climate changes did drive othe'
chapter details: CHAPTER: 1 - PREHISTORY | pg-14
text snippet: 'CHAPTER 1: PREHISTORY major effects on hominid development. First, with sea levels dropping due to g'
chapter details: CHAPTER: 1 - PREHISTORY | pg-15
text snippet: 'Americas. Then from 10,000 years ago to 1,500 CE, humans arrived in the Arctic, the Indian Ocean, th'
chapter de

# Export data

In [105]:
import json

# Write the processed pages as pretty-printed JSON; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \u escapes.
with open("../data/clean/cleaned_book.json", "w", encoding="utf8") as out_file:
    out_file.write(json.dumps(pages_with_metadata, indent=2, ensure_ascii=False))