In [8]:
import json

# Load the cleaned book: a dict of page records, each of which carries at
# least the 'chapter_details' and 'text' entries read below.
# The encoding is passed explicitly so the result does not depend on the
# platform's locale default.
with open('../data/clean/cleaned_book.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preview the first five pages.
for value in list(data.values())[:5]:
    print(f"chapter details: {value['chapter_details']}")
    print(f"text snippet: {value['text'][:100]!r}")

# Create chapter_texts dictionary: {chapter label: text}
# 'chapter_details' values look like "CHAPTER: 1 - PREHISTORY | pg-12": they
# end in a per-page suffix, so grouping on the raw value would produce one
# entry per page. Drop the " | pg-N" suffix so pages of the same chapter are
# grouped together; values without that suffix are used unchanged.
pages_by_chapter = {}
for value in data.values():
    details = value['chapter_details']
    if isinstance(details, str):
        chapter_label = details.split(" | pg-", 1)[0]
    else:
        chapter_label = details
    pages_by_chapter.setdefault(chapter_label, []).append(value['text'])

# Join all page texts for each chapter
chapter_texts = {ch: " ".join(texts) for ch, texts in pages_by_chapter.items()}

chapter details: CHAPTER: None - None | pg-10
text snippet: '1 Eugene Berger. 1.1 Chronology. 8 – 6 million years ago Bi-pedal hominids in Africa 2.6 million yea'
chapter details: CHAPTER: None - None | pg-11
text snippet: 'not content with simply reaching remote places; they were curious about their earliest human inhabit'
chapter details: CHAPTER: 1 - PREHISTORY | pg-12
text snippet: 'CHAPTER 1: PREHISTORY 1.3 QUESTIONS TO GUIDE YOUR READING 1. What were some factors that led to homi'
chapter details: CHAPTER: 1 - PREHISTORY | pg-13
text snippet: 'some time. While some bipedal hominids may have stayed in the forest, climate changes did drive othe'
chapter details: CHAPTER: 1 - PREHISTORY | pg-14
text snippet: 'CHAPTER 1: PREHISTORY major effects on hominid development. First, with sea levels dropping due to g'


# NLTK tokenizer

In [14]:
import nltk
from nltk.tokenize import sent_tokenize

# Ensure NLTK models exist: fetch the 'punkt' package before sent_tokenize is
# used below. When the package is already present, NLTK only reports that it
# is up to date.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download("punkt")

def split_sentences(pages):
    """Add a 'sentences' list to each page.

    Args:
        pages: mapping of page key -> page dict. Each page's "text" entry is
            stripped and passed to ``sent_tokenize``. A missing, ``None`` or
            whitespace-only text yields an empty list.

    Returns:
        The same ``pages`` object that was passed in; every page dict is
        updated in place with a "sentences" key.
    """
    for page in pages.values():
        # `or ""` also covers an explicit None value, which the default
        # argument of .get() does not (it only applies to a missing key).
        text = (page.get("text") or "").strip()
        page["sentences"] = sent_tokenize(text) if text else []

    return pages
# split_sentences returns the dict it was given, so chapter_sentences is the
# same object as `data`, now with a 'sentences' list on every page.
chapter_sentences = split_sentences(data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Create semantic chunks with MiniLM

In [23]:
from sentence_transformers import SentenceTransformer, util
import uuid

def build_semantic_chunks(pages,
                          model_name="all-MiniLM-L6-v2",
                          similarity_threshold=0.55,
                          max_sentences_per_chunk=10,
                          model=None):
    """Group each page's sentences into chunks of consecutive, similar sentences.

    Sentences are processed page by page, so a chunk never spans two pages.
    A new chunk is started when the cosine similarity between a sentence and
    the one before it falls below ``similarity_threshold``, or when the
    current chunk already holds ``max_sentences_per_chunk`` sentences.

    Args:
        pages: mapping of page key -> page dict with a "sentences" list and,
            optionally, a "chapter_details" value.
        model_name: name handed to ``SentenceTransformer`` when ``model`` is
            not supplied.
        similarity_threshold: minimum similarity to the previous sentence for
            a sentence to stay in the current chunk.
        max_sentences_per_chunk: size limit that forces a new chunk.
        model: optional, already-loaded embedding model exposing
            ``encode(sentences, convert_to_tensor=True)``. Passing one avoids
            loading the model again inside this function.

    Returns:
        A list of dicts with keys "chunk_id" (random UUID string),
        "chapter_metadata" (the page's "chapter_details", or None) and "text"
        (the chunk's sentences joined by single spaces).
    """
    if model is None:
        print("Loading embedding model...")
        model = SentenceTransformer(model_name)

    all_chunks = []

    def flush_chunk(chunk_sentences, chapter_metadata):
        # Emit the chunk unless it is empty or whitespace-only.
        chunk_text = " ".join(chunk_sentences).strip()
        if not chunk_text:
            return
        all_chunks.append({
            "chunk_id": str(uuid.uuid4()),
            "chapter_metadata": chapter_metadata,
            "text": chunk_text,
        })

    for page in pages.values():
        sentences = page.get("sentences", [])
        if not sentences:
            continue

        chapter_m = page.get("chapter_details")

        # Embed page sentences
        embeddings = model.encode(sentences, convert_to_tensor=True)

        # The first sentence always opens a chunk. Only the previous
        # sentence's embedding is needed for the comparison below.
        current_chunk_sentences = [sentences[0]]
        prev_emb = embeddings[0]

        for i in range(1, len(sentences)):
            sentence = sentences[i]
            sent_emb = embeddings[i]

            similarity = util.pytorch_cos_sim(prev_emb, sent_emb).item()

            split_by_size = len(current_chunk_sentences) >= max_sentences_per_chunk
            split_by_similarity = similarity < similarity_threshold

            if split_by_size or split_by_similarity:
                flush_chunk(current_chunk_sentences, chapter_m)
                current_chunk_sentences = [sentence]
            else:
                current_chunk_sentences.append(sentence)

            prev_emb = sent_emb

        # Flush any remaining chunk
        flush_chunk(current_chunk_sentences, chapter_m)

    return all_chunks

# Chunk every page using the function's default model, similarity threshold
# and chunk-size limit, then report how many chunks were produced.
chunks = build_semantic_chunks(chapter_sentences)
print("Chunks:", len(chunks))

Loading embedding model...
Chunks: 7353


# Build the ChromaDB vector store

In [17]:
from sentence_transformers import SentenceTransformer

def get_embedder(model_name="all-MiniLM-L6-v2"):
    """Instantiate a SentenceTransformer for ``model_name`` and return it."""
    embedder_model = SentenceTransformer(model_name)
    return embedder_model


In [27]:
import chromadb

def create_chroma_collection(collection_name="history_book",
                             persist_path="../data/world_history_store"):
    """Open (or create) a collection in a persistent Chroma store.

    Args:
        collection_name: name of the collection to fetch or create.
        persist_path: directory handed to ``chromadb.PersistentClient``. It
            defaults to the location this notebook has always used.

    Returns:
        The collection returned by ``client.get_or_create_collection``.
    """
    client = chromadb.PersistentClient(path=persist_path)

    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}   # use cosine distance for the index
    )
    return collection

def store_chunks_in_chroma(chunks, collection, embedder, batch_size=100):
    """Embed every chunk's text and add the chunks to ``collection`` in batches.

    Args:
        chunks: sequence of dicts with "chunk_id", "text" and
            "chapter_metadata" keys.
        collection: object exposing ``add(ids=, documents=, metadatas=,
            embeddings=)``.
        embedder: object whose ``encode(texts)`` result has ``.tolist()``.
        batch_size: number of chunks sent per ``collection.add`` call.

    Returns:
        None. Progress is printed after each batch and once at the end.
    """
    # Collect id / text / metadata in one pass; a falsy chapter value is
    # stored as "UNKNOWN".
    rows = [
        (
            chunk["chunk_id"],
            chunk["text"],
            {"chapter_metadata": chunk["chapter_metadata"] or "UNKNOWN"},
        )
        for chunk in chunks
    ]
    ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]
    metadatas = [row[2] for row in rows]

    # All texts are embedded in a single call before anything is stored.
    embeddings = embedder.encode(texts).tolist()

    # Add in batches to avoid ChromaDB batch size limit
    total_chunks = len(chunks)
    for start in range(0, total_chunks, batch_size):
        stop = min(start + batch_size, total_chunks)
        window = slice(start, stop)

        collection.add(
            ids=ids[window],
            documents=texts[window],
            metadatas=metadatas[window],
            embeddings=embeddings[window],
        )

        print(f"Stored batch {start//batch_size + 1}: {stop}/{total_chunks} chunks")

    print(f"✓ All {total_chunks} chunks stored in Chroma.")

# One embedder instance is created here and reused for both indexing and the
# query cell below.
embedder = get_embedder()
# "world_history" is passed explicitly, overriding the function's default
# collection name "history_book".
collection = create_chroma_collection("world_history")

# NOTE(review): chunk ids are random UUIDs and the store is persistent, so
# re-running this after rebuilding `chunks` presumably adds a second copy of
# every chunk — confirm, and clear the collection first if so.
store_chunks_in_chroma(chunks, collection, embedder)

Stored batch 1: 100/7353 chunks
Stored batch 2: 200/7353 chunks
Stored batch 3: 300/7353 chunks
Stored batch 4: 400/7353 chunks
Stored batch 5: 500/7353 chunks
Stored batch 6: 600/7353 chunks
Stored batch 7: 700/7353 chunks
Stored batch 5: 500/7353 chunks
Stored batch 6: 600/7353 chunks
Stored batch 7: 700/7353 chunks
Stored batch 8: 800/7353 chunks
Stored batch 9: 900/7353 chunks
Stored batch 10: 1000/7353 chunks
Stored batch 8: 800/7353 chunks
Stored batch 9: 900/7353 chunks
Stored batch 10: 1000/7353 chunks
Stored batch 11: 1100/7353 chunks
Stored batch 12: 1200/7353 chunks
Stored batch 13: 1300/7353 chunks
Stored batch 14: 1400/7353 chunks
Stored batch 11: 1100/7353 chunks
Stored batch 12: 1200/7353 chunks
Stored batch 13: 1300/7353 chunks
Stored batch 14: 1400/7353 chunks
Stored batch 15: 1500/7353 chunks
Stored batch 16: 1600/7353 chunks
Stored batch 17: 1700/7353 chunks
Stored batch 15: 1500/7353 chunks
Stored batch 16: 1600/7353 chunks
Stored batch 17: 1700/7353 chunks
Stored b

In [35]:
def search(query, collection, embedder, k=10):
    """Embed ``query`` and return the collection's ``k`` nearest results.

    Args:
        query: query string; it is encoded as a one-element batch.
        collection: object exposing ``query(query_embeddings=, n_results=)``.
        embedder: object whose ``encode([...])`` result has ``.tolist()``.
        k: number of results requested.

    Returns:
        Whatever ``collection.query`` returns, unmodified.
    """
    encoded_query = embedder.encode([query])
    return collection.query(
        query_embeddings=encoded_query.tolist(),
        n_results=k,
    )

# Smoke-test retrieval with a sample question (default k=10 results).
r = search("How did the Neolithic Revolution change human societies?", collection, embedder)

# A single query was sent, so index 0 selects its documents and metadatas;
# print each hit's chapter label followed by the chunk text.
for doc, meta in zip(r["documents"][0], r["metadatas"][0]):
    print("—")
    print(meta["chapter_metadata"])
    print(doc)

—
CHAPTER: 1 - PREHISTORY | pg-21
1.6 Agriculture and the "Neolithic Revolution".
—
CHAPTER: 1 - PREHISTORY | pg-25
Family life also changed significantly during the Neolithic.
—
CHAPTER: 1 - PREHISTORY | pg-24
For many scholars, the abandonment of hunting represents the "real" Neolithic Revolution. As communities completely abandoned hunting and
—
CHAPTER: 1 - PREHISTORY | pg-25
1.6.3 Leaving Paleolithic Culture Behind While the Neolithic Era is described in greater detail elsewhere, it is important to understand Paleolithic and Neolithic differences in order to convey a sense of just how revolutionary the shift to agriculture was for humanity.
—
CHAPTER: 1 - PREHISTORY | pg-21
12.She and hundreds of other scholars from Hobbes to Marx have pointed to the Neolithic Revolution, that is, the move from a hunter-gatherer world to an agricultural one, as the root of what we today refer to as civilization.
—
CHAPTER: 1 - PREHISTORY | pg-28
About 10,000 years ago, the Neolithic Era began.
—
C