In [3]:
!pip install pypdf2 pdfplumber langchain google-generativeai crewai crewai-tools langchain-google-genai langchain pydantic agno transformers -U

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting crewai
  Downloading crewai-0.105.0-py3-none-any.whl.metadata (28 kB)
Collecting crewai-tools
  Downloading crewai_tools-0.37.0-py3-none-any.whl.metadata (5.4 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)
Collecting agno
  Downloading agno-1.1.9-py3-none-any.whl.metadata (39 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collectin

# Fixed-Sized Chunk

Why This Approach?

- **Consistent Segmentation:** Splits text into uniform parts based on a fixed size.
- **Efficient Processing**: Simple to implement and fast to compute.
- **Predictable Output:** Ideal when document structure is less critical.

In [2]:
import PyPDF2

def extract_pdf_text(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n\n"
    return full_text

def fixed_size_chunk(text, chunk_size=500):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

pdf_file = "Insurance_Handbook_20103.pdf"
pdf_text = extract_pdf_text(pdf_file)
chunks = fixed_size_chunk(pdf_text, chunk_size=500)
print("Fixed-Size Chunking:\n", chunks)

Fixed-Size Chunking:


# Sentence-Based Chunking

Why This Approach?

- Natural Boundaries: Uses complete sentences to preserve meaning.
- Context Preservation: Prevents mid-sentence breaks that could distort information.
- Enhanced Coherence: Ensures each chunk remains semantically intact.

In [3]:
import PyPDF2
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_pdf_text(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n\n"
    return full_text

def sentence_chunk(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

pdf_file = "Insurance_Handbook_20103.pdf"
pdf_text = extract_pdf_text(pdf_file)
chunks = sentence_chunk(pdf_text)
print("Sentence-Based Chunking:\n", chunks)

Sentence-Based Chunking:


# Semantic-Based Chunking

- Meaningful Grouping: Clusters sentences based on underlying semantics.
- Improved Relevance: Captures thematic content for better query matching.
- Context-Aware Retrieval: Increases the accuracy of downstream responses.

In [4]:
import PyPDF2
import spacy
from sentence_transformers import SentenceTransformer, util

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_pdf_text(pdf_path):
    """
    Extracts text from each page of a PDF file and returns the full text as a single string.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n\n"
    return full_text

def semantic_embedding_chunk(text, threshold=0.75):
    """
    Splits text into semantic chunks using sentence embeddings.
    Uses spaCy for sentence segmentation and SentenceTransformer for generating embeddings.

    :param text: The full text to chunk.
    :param threshold: Cosine similarity threshold for adding a sentence to the current chunk.
    :return: A list of semantic chunks (each as a string).
    """
    # Sentence segmentation
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    chunks = []
    current_chunk_sentences = []
    current_chunk_embedding = None

    for sentence in sentences:
        # Generate embedding for the current sentence
        sentence_embedding = model.encode(sentence, convert_to_tensor=True)

        # If starting a new chunk, initialize it with the current sentence
        if current_chunk_embedding is None:
            current_chunk_sentences = [sentence]
            current_chunk_embedding = sentence_embedding
        else:
            # Compute cosine similarity between current sentence and the chunk embedding
            sim_score = util.cos_sim(sentence_embedding, current_chunk_embedding)
            if sim_score.item() >= threshold:
                # Add sentence to the current chunk and update the chunk's average embedding
                current_chunk_sentences.append(sentence)
                num_sents = len(current_chunk_sentences)
                current_chunk_embedding = ((current_chunk_embedding * (num_sents - 1)) + sentence_embedding) / num_sents
            else:
                # Finalize the current chunk and start a new one
                chunks.append(" ".join(current_chunk_sentences))
                current_chunk_sentences = [sentence]
                current_chunk_embedding = sentence_embedding

    # Append the final chunk if it exists
    if current_chunk_sentences:
        chunks.append(" ".join(current_chunk_sentences))

    return chunks


pdf_file = "Insurance_Handbook_20103.pdf"
pdf_text = extract_pdf_text(pdf_file)
semantic_chunks = semantic_embedding_chunk(pdf_text, threshold=0.75)
for i, chunk in enumerate(semantic_chunks):
    print(f"Chunk {i+1}:\n{chunk}\n{'-'*60}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Chunk 1:
Insurance 
Handbook 
A guide to insurance:  
what it does and how it works


©2010 Insurance Information Institute.
------------------------------------------------------------
Chunk 2:
978-0-932387-47-9Insurance 
Handbook 
A guide to insurance:  
what it does and how it works


Insurance Information Institute
110 William StreetNew York, NY 10038Tel.
------------------------------------------------------------
Chunk 3:
212-346-5500.
------------------------------------------------------------
Chunk 4:
Fax. 212-732-1916.
------------------------------------------------------------
Chunk 5:
www.iii.org
President
	
–
	
Robert
	
P.
	
Hartwig,
	
Ph.D.,
	
CPCU
	
–
	
bobh@iii.org
Executive Vice President
	
–
	
Cary
	
Schneider
	
–
	
carys@iii.org
Senior Vice President
	
–
	
Public
	
Affairs
	
–
	
Jeanne
	
Salvatore
	
–
	
jeannes@iii.org
Senior Vice President and Chief Economist –
	
Steven
	
N.
	
Weisbart,
	
Ph.D.,
	
CLU
	
–
	
stevenw@iii.org
R
esearch
Vice President – Global Issues
	

# Recursive Chunking

Why This Approach?
- Iterative Refinement: Repeatedly splits text until chunks meet size constraints.
- Token Limit Compliance: Ensures all segments are within acceptable processing limits.
- Flexible Segmentation: Continuously refines chunks for optimal downstream processing.

In [5]:
import PyPDF2

def extract_pdf_text(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n\n"
    return full_text

def iterative_chunk(text, max_length=500):
    chunks = []
    while len(text) > max_length:
        separators = ["\n\n", "\n", " ", ""]
        found = False
        for sep in separators:
            if sep == "":
                # If no separator found, just cut the text at max_length.
                chunk = text[:max_length]
                chunks.append(chunk.strip())
                text = text[max_length:]
                found = True
                break
            idx = text.rfind(sep, 0, max_length)
            if idx != -1 and idx != 0:
                chunk = text[:idx]
                chunks.append(chunk.strip())
                text = text[idx:]
                found = True
                break
        if not found:
            # If no suitable separator is found, just break at max_length.
            chunk = text[:max_length]
            chunks.append(chunk.strip())
            text = text[max_length:]
    if text.strip():
        chunks.append(text.strip())
    return chunks

# Example usage:
pdf_file = "Insurance_Handbook_20103.pdf"
pdf_text = extract_pdf_text(pdf_file)
chunks = iterative_chunk(pdf_text, max_length=500)
print("Iterative Chunking:\n", chunks)


Iterative Chunking:


# Modality-Specific Chunking

- Specialized Processing: Separates text, images, and tables for tailored extraction.
- Optimized Multi-Modal Handling: Addresses diverse content types in complex documents.
- Enhanced Data Organization: Enables efficient retrieval across different modalities.

In [6]:
import PyPDF2
import pdfplumber

def extract_pdf_text(pdf_path):
    """
    Extracts text from each page of the PDF using PyPDF2 and returns it as one string.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n\n"  # Separate pages with double newlines
    return full_text

def extract_tables(pdf_path):
    """
    Extracts tables from the PDF using pdfplumber.
    Returns a list of tables (each table is a list of lists).
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_tables = page.extract_tables()
            for table in page_tables:
                tables.append(table)
    return tables

def extract_images(pdf_path):
    """
    Extracts images from the PDF using PyPDF2.
    Note: PyPDF2 has limited image extraction capabilities.
    Returns a list of dictionaries with image data and metadata.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    images = []
    for page_num, page in enumerate(reader.pages):
        resources = page.get("/Resources")
        if resources and "/XObject" in resources:
            xObject = resources["/XObject"].get_object()
            for obj in xObject:
                if xObject[obj].get("/Subtype") == "/Image":
                    data = xObject[obj].get_data()
                    # Determine image type from the filter
                    if "/Filter" in xObject[obj]:
                        if xObject[obj]["/Filter"] == "/DCTDecode":
                            ext = "jpg"
                        elif xObject[obj]["/Filter"] == "/FlateDecode":
                            ext = "png"
                        else:
                            ext = "bin"
                    else:
                        ext = "bin"
                    images.append({"page": page_num, "data": data, "ext": ext})
    return images

def modality_chunk(pdf_path):
    """
    Extracts text, tables, and images from a PDF file.
    - Text is split into paragraphs (using double newlines).
    - Tables are extracted using pdfplumber.
    - Images are extracted using PyPDF2.
    Returns a dictionary with keys: "text_chunks", "tables", and "images".
    """
    text = extract_pdf_text(pdf_path)
    text_chunks = [p.strip() for p in text.split("\n\n") if p.strip()]
    tables = extract_tables(pdf_path)
    images = extract_images(pdf_path)
    return {"text_chunks": text_chunks, "tables": tables, "images": images}

# Example usage:
pdf_file = "Insurance_Handbook_20103.pdf"  # Replace with your PDF file path
result = modality_chunk(pdf_file)

print("Modality-Specific Chunking:")
print("\nText Chunks:")
for idx, chunk in enumerate(result["text_chunks"]):
    print(f"Chunk {idx+1}:\n{chunk}\n{'-'*40}")
print("\nTables:", result["tables"])
print("\nImages:", result["images"])


Modality-Specific Chunking:

Text Chunks:
Chunk 1:
Insurance 
Handbook 
A guide to insurance:  
what it does and how it works
----------------------------------------
Chunk 2:
©2010 Insurance Information Institute. 978-0-932387-47-9Insurance 
Handbook 
A guide to insurance:  
what it does and how it works
----------------------------------------
Chunk 3:
Insurance Information Institute
110 William StreetNew York, NY 10038Tel. 212-346-5500. Fax. 212-732-1916. www.iii.org
President
	
–
	
Robert
	
P.
	
Hartwig,
	
Ph.D.,
	
CPCU
	
–
	
bobh@iii.org
Executive Vice President
	
–
	
Cary
	
Schneider
	
–
	
carys@iii.org
Senior Vice President
	
–
	
Public
	
Affairs
	
–
	
Jeanne
	
Salvatore
	
–
	
jeannes@iii.org
Senior Vice President and Chief Economist –
	
Steven
	
N.
	
Weisbart,
	
Ph.D.,
	
CLU
	
–
	
stevenw@iii.org
R
esearch
Vice President – Global Issues
	
–
	
Claire
	
Wilkinson
	
–
	
clairew@iii.org
Publications 
Vice President – Publications and Information Services – Madine
	
Singer
	
–
	
mad

# Sliding-Window Chunking

Why This Approach?
- Overlapping Context: Maintains shared content between adjacent chunks.
- Smooth Transitions: Reduces abrupt breaks in context.
- Continuous Retrieval: Supports coherent processing of lengthy documents.

In [7]:
import PyPDF2

def extract_pdf_text(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n\n"
    return full_text

def sliding_window_chunk(text, window_size=100, overlap=20):
    words = text.split()
    chunks = []
    step = window_size - overlap
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i+window_size])
        chunks.append(chunk)
    return chunks

pdf_file = "Insurance_Handbook_20103.pdf"
pdf_text = extract_pdf_text(pdf_file)
chunks = sliding_window_chunk(pdf_text, window_size=100, overlap=20)
print("Sliding-Window Chunking:\n", chunks)


Sliding-Window Chunking:


# Hierarchical Chunking

Why This Approach?
- Structural Awareness: Leverages natural document headings and sections.
- Multi-Level Context: Preserves the inherent organization of the text.
- Logical Navigation: Enhances retrieval by reflecting the document’s hierarchy.

In [8]:
# Manual Hierarchical Chunking
import PyPDF2

def extract_pdf_text(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n"
    return full_text

def hierarchical_chunk_manual(text, markers=["INTRODUCTION", "CONCLUSION"]):
    """
    Splits text into chunks based on manually provided markers.
    """
    lines = text.splitlines()
    chunks = []
    current_chunk = []
    for line in lines:
        # If any manual marker is found in the line and there is already accumulated content
        if any(marker in line for marker in markers) and current_chunk:
            chunks.append("\n".join(current_chunk).strip())
            current_chunk = [line]
        else:
            current_chunk.append(line)
    if current_chunk:
        chunks.append("\n".join(current_chunk).strip())
    return chunks

# Example usage for manual hierarchical chunking:
pdf_file = "Insurance_Handbook_20103.pdf"
pdf_text = extract_pdf_text(pdf_file)
manual_chunks = hierarchical_chunk_manual(pdf_text, markers=["INTRODUCTION", "CONCLUSION"])

print("Manual Hierarchical Chunking:")
for idx, chunk in enumerate(manual_chunks):
    print(f"Chunk {idx+1}:\n{chunk}\n{'-'*40}")



Manual Hierarchical Chunking:
Chunk 1:
Insurance 
Handbook 
A guide to insurance:  
what it does and how it works

©2010 Insurance Information Institute. 978-0-932387-47-9Insurance 
Handbook 
A guide to insurance:  
what it does and how it works

Insurance Information Institute
110 William StreetNew York, NY 10038Tel. 212-346-5500. Fax. 212-732-1916. www.iii.org
President
	
–
	
Robert
	
P.
	
Hartwig,
	
Ph.D.,
	
CPCU
	
–
	
bobh@iii.org
Executive Vice President
	
–
	
Cary
	
Schneider
	
–
	
carys@iii.org
Senior Vice President
	
–
	
Public
	
Affairs
	
–
	
Jeanne
	
Salvatore
	
–
	
jeannes@iii.org
Senior Vice President and Chief Economist –
	
Steven
	
N.
	
Weisbart,
	
Ph.D.,
	
CLU
	
–
	
stevenw@iii.org
R
esearch
Vice President – Global Issues
	
–
	
Claire
	
Wilkinson
	
–
	
clairew@iii.org
Publications 
Vice President – Publications and Information Services – Madine
	
Singer
	
–
	
madines@iii.org
Managing Editor – 
Neil
	
Liebman
	
–
	
neill@iii.org
Research and Production – Mary-Anne
	
Firne

In [9]:
# Automatically Hierarirchal Chunking

def detect_markers(text):
    """
    Automatically detects potential header markers.
    Heuristic: any short line (<= 10 words) that is either all uppercase or ends with a colon is treated as a marker.
    """
    lines = text.splitlines()
    markers = []
    for line in lines:
        words = line.split()
        if words and len(words) <= 10:
            if line.isupper() or line.endswith(":"):
                markers.append(line.strip())
    return list(set(markers))

def hierarchical_chunk_auto(text):
    """
    Splits text into chunks using automatically detected markers.
    Returns both the chunks and the detected markers.
    """
    auto_markers = detect_markers(text)
    # Sort markers in order of appearance
    lines = text.splitlines()
    detected = []
    for line in lines:
        for marker in auto_markers:
            if marker in line and marker not in detected:
                detected.append(marker)

    chunks = []
    current_chunk = []
    for line in lines:
        # If any detected marker is found and there is accumulated text, start a new chunk
        if any(marker in line for marker in detected) and current_chunk:
            chunks.append("\n".join(current_chunk).strip())
            current_chunk = [line]
        else:
            current_chunk.append(line)
    if current_chunk:
        chunks.append("\n".join(current_chunk).strip())
    return chunks, detected

# Example usage for automatic hierarchical chunking:
auto_chunks, auto_markers = hierarchical_chunk_auto(pdf_text)

print("\nAutomatically Detected Markers:")
print(auto_markers)
print("\nAutomatic Hierarchical Chunking:")
for idx, chunk in enumerate(auto_chunks):
    print(f"Chunk {idx+1}:\n{chunk}\n{'-'*40}")



Automatically Detected Markers:

Automatic Hierarchical Chunking:
Chunk 1:
Insurance
----------------------------------------
Chunk 2:
Handbook
----------------------------------------
Chunk 3:
A guide to insurance:  
what it does and how it works
----------------------------------------
Chunk 4:
©2010 Insurance Information Institute. 978-0-932387-47-9Insurance
----------------------------------------
Chunk 5:
Handbook
----------------------------------------
Chunk 6:
A guide to insurance:  
what it does and how it works
----------------------------------------
Chunk 7:
Insurance Information Institute
----------------------------------------
Chunk 8:
110 William StreetNew York, NY 10038Tel. 212-346-5500. Fax. 212-732-1916. www.iii.org
----------------------------------------
Chunk 9:
President
	
–
----------------------------------------
Chunk 10:
Robert
----------------------------------------
Chunk 11:
P.
----------------------------------------
Chunk 12:
Hartwig,
------------------

# Topic-Based Chunking

Why This Approach?
- Thematic Segmentation: Groups content according to distinct topics.
- Focused Retrieval: Enables more precise, subject-specific queries.
- Improved Relevance: Increases the likelihood of retrieving contextually related information.

In [10]:
import PyPDF2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def extract_pdf_text(pdf_path):
    reader = PyPDF2.PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text += page_text + "\n\n"
    return full_text

def topic_based_chunk(text, n_topics=2):
    # Split text into sentences (using period followed by space as a simple splitter)
    sentences = text.split('. ')
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)
    topic_distribution = lda.transform(X)
    topics = {}
    for i, sent in enumerate(sentences):
        topic = topic_distribution[i].argmax()
        topics.setdefault(topic, []).append(sent)
    return topics

pdf_file = "Insurance_Handbook_20103.pdf"
pdf_text = extract_pdf_text(pdf_file)
topics = topic_based_chunk(pdf_text, n_topics=2)
print("Topic-Based Chunking:\n", topics)


Topic-Based Chunking:


# Agentic Chunking

Why This Approach?
- Adaptive Segmentation: Uses intelligent agents to decide optimal breakpoints.
- Context-Sensitive Boundaries: Adjusts chunking based on content and task needs.
- Enhanced Downstream Quality: Maximizes relevance and context for improved responses.

In [11]:
import uuid
import google.generativeai as genAI
import time
from typing import List, Dict
import os

class AgenticChunker:
    def __init__(self):
        self.chunks = {}  # chunk information
        self.agent = genAI.GenerativeModel("gemini-2.0-flash")  # Updated model name
        self.chunk_id_length = 5  # For truncating the Chunk ID
        self.configure = genAI.configure(api_key="YOUR-API-KEY")

    def add_propositions(self, propositions: List[str]):
        """Add multiple propositions with rate limiting"""
        for idx, proposition in enumerate(propositions):
            print(f"Processing proposition {idx+1}/{len(propositions)}")
            self.add_proposition(proposition)
            time.sleep(4)  # Wait 4 seconds between propositions to respect 15 rpm limit

    def add_proposition(self, proposition: str):
        """Add a single proposition to the appropriate chunk"""
        print(f"Evaluating: {proposition[:50]}...")  # Show truncated proposition

        if not self.chunks:
            print("No existing chunks - creating first chunk")
            self.create_new_chunk(proposition)
            return

        relevant_chunk_id = self.find_relevant_chunk(proposition)

        if relevant_chunk_id:
            print(f"Adding to existing chunk: {self.chunks[relevant_chunk_id]['title']}")
            self.add_proposition_to_chunk(relevant_chunk_id, proposition)
        else:
            print("Creating new chunk for proposition")
            self.create_new_chunk(proposition)

    def add_proposition_to_chunk(self, chunk_id: str, proposition: str):
        """Add proposition to existing chunk and update metadata"""
        self.chunks[chunk_id]["propositions"].append(proposition)

        # Batch updates to reduce API calls: update when a multiple of 3 propositions are reached
        if len(self.chunks[chunk_id]["propositions"]) % 3 == 0:
            self.chunks[chunk_id]["summary"] = self.update_chunk_summary(self.chunks[chunk_id])
            self.chunks[chunk_id]["title"] = self.update_chunk_title(self.chunks[chunk_id])

    def _generate_content(self, prompt: str) -> str:
        """Wrapper for Gemini API calls with error handling and rate limiting"""
        try:
            response = self.agent.generate_content(prompt)
            # Wait 5 seconds after each API call to avoid exceeding rate limits
            time.sleep(5)
            return response.text.strip()
        except Exception as e:
            print(f"API Error: {str(e)}")
            return ""  # Return empty string to prevent pipeline failure

    def update_chunk_summary(self, chunk: Dict) -> str:
        """Generate updated chunk summary"""
        prompt = f"""
        You are the steward of a group of chunks representing sentences about a similar topic.
        A new proposition was just added. Generate a very brief 1-sentence summary that informs viewers what the chunk is about.
        Only respond with the new summary, nothing else.

        Chunk's propositions:
        {chr(10).join(chunk['propositions'][-3:])}

        Current summary: {chunk['summary'] if 'summary' in chunk else ''}
        """
        return self._generate_content(prompt)

    def update_chunk_title(self, chunk: Dict) -> str:
        """Generate updated chunk title"""
        prompt = f"""
        You are the steward of a group of chunks representing related propositions.
        A new proposition was just added. Generate a very brief updated chunk title (2-4 words) that summarizes the chunk's theme.
        Only respond with the new title, nothing else.

        Chunk's propositions:
        {chr(10).join(chunk['propositions'][-3:])}

        Chunk summary: {chunk['summary'] if 'summary' in chunk else ''}
        Current chunk title: {chunk['title'] if 'title' in chunk else ''}
        """
        return self._generate_content(prompt)

    def get_new_chunk_summary(self, proposition: str) -> str:
        """Generate initial summary for a new chunk"""
        prompt = f"""
        You are the steward of a group of chunks representing groups of sentences that talk about a similar topic.
        Generate a very brief 1 to 2-sentence summary that describes the topic of the new chunk based on the following proposition.
        Only respond with the new chunk summary, nothing else.

        Proposition:
        {proposition}
        """
        return self._generate_content(prompt)

    def get_new_chunk_title(self, summary: str) -> str:
        """Generate initial title for a new chunk based on its summary"""
        prompt = f"""
        You are the steward of a group of chunks. Based on the following summary, generate a concise title (2-4 words) that captures the essence of the chunk.
        Only respond with the new chunk title, nothing else.

        Chunk summary:
        {summary}
        """
        return self._generate_content(prompt)

    def create_new_chunk(self, proposition: str):
        """Create new chunk with initial proposition"""
        new_chunk_id = str(uuid.uuid4())[:self.chunk_id_length]  # Unique chunk id
        new_chunk_summary = self.get_new_chunk_summary(proposition)
        new_chunk_title = self.get_new_chunk_title(new_chunk_summary)
        self.chunks[new_chunk_id] = {
            'chunk_id': new_chunk_id,
            'propositions': [proposition],
            'title': new_chunk_title,
            'summary': new_chunk_summary,
            'chunk_index': len(self.chunks)
        }
        print(f"Created new chunk {new_chunk_id}: {new_chunk_title}")

    def find_relevant_chunk(self, proposition: str) -> str:
        """Find a matching chunk for the proposition"""
        prompt = f"""
        Determine whether the following proposition should belong to one of the existing chunks.
        If it should, return the chunk ID. If not, return 'NO_MATCH'.

        Existing chunks (ID: Title - Summary):
        {self._format_chunk_outline()}

        Proposition:
        {proposition}

        Respond ONLY with the matching chunk ID or 'NO_MATCH'.
        """
        response = self._generate_content(prompt)
        resp = response.strip()
        if resp == "NO_MATCH" or resp not in self.chunks:
            return None
        return resp

    def _format_chunk_outline(self) -> str:
        """Format chunk information for LLM input"""
        return "\n".join(
            f"{c['chunk_id']}: {c['title']} - {c['summary']}"
            for c in self.chunks.values()
        )

    def pretty_print_chunks(self):
        """Display all chunks with their metadata"""
        print("\n----- Chunks Created -----\n")
        for _, chunk in self.chunks.items():
            print(f"Chunk ID    : {chunk['chunk_id']}")
            print(f"Title       : {chunk['title'].strip()}")
            print(f"Summary     : {chunk['summary'].strip()}")
            print("Propositions:")
            for prop in chunk['propositions']:
                print(f"    - {prop}")
            print("\n")

    # New method to call pretty_print_chunks using print_chunks name.
    def print_chunks(self):
        self.pretty_print_chunks()


In [12]:
from PyPDF2 import PdfReader

def extract_text(pdf_path: str) -> List[str]:
    reader = PdfReader(pdf_path)
    return [page.extract_text() for page in reader.pages]

pdf_text = extract_text("Insurance_Handbook_20103.pdf")
chunker = AgenticChunker()
chunker.add_propositions(pdf_text)
chunker.print_chunks()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Total financial loss resulting from the death 
or disability of a wage earner, or from the destruction of property. Includes the loss of earnings, medical expenses, funeral expenses, the cost of restoring or replacing 
property and legal expenses. It does not 
include noneconomic losses, such as pain 
caused by an injury.
ELECTRONIC COMMERCE/
E-COMMERCE
The sale of products such as insurance over 
the Internet.
ELIMINATION PERIOD
A kind of deductible or waiting period 
usually found in disability policies. It is 
counted in days from the beginning of the 
illness or injury. 
EMPLOYEE DISHONESTY COVERAGE
Covers direct losses and damage to busi-
nesses resulting from the dishonest acts of 
employees. (See Fidelity bond)
EMPLOYEE RETIREMENT INCOME 
SECURITY ACT/ERISA
Federal legislation that protects employees 
by establishing minimum standards for 
private pension and welfare plans.
EMPLOYER’S LIABILITY
Part B of the worker