### This extracts page numbers (starting from the actual page 1) and bundles them with the page content

In [1]:
import fitz  # PyMuPDF
import warnings

def extract_page_data_fitz(pdf_path):
    """
    Extract a candidate printed page number and the full text of every page.

    For each page, the text inside the top 15% and the bottom 15% of the page
    rectangle is split on whitespace; the first token made only of decimal
    digits (top band first, then bottom band) is taken as the page number.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order, with the keys
        ``"index"`` (0-based position in the document), ``"number"`` (the
        detected number as ``int``, or ``None`` when no numeric token was
        found) and ``"content"`` (the full page text).
    """
    pages_data = []
    doc = fitz.open(pdf_path)

    # try/finally so the document is closed even if a page fails to parse.
    try:
        for i, page in enumerate(doc):
            height = page.rect.height
            width = page.rect.width

            # Header and footer bands where a printed page number is expected.
            top_rect = fitz.Rect(0, 0, width, height * 0.15)
            bottom_rect = fitz.Rect(0, height * 0.85, width, height)

            margin_tokens = (
                page.get_text("text", clip=top_rect).split()
                + page.get_text("text", clip=bottom_rect).split()
            )

            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits, which int() rejects
            # with a ValueError.
            found_number = next(
                (int(token) for token in margin_tokens if token.isdecimal()),
                None,
            )

            pages_data.append({
                "index": i,
                "number": found_number,
                "content": page.get_text("text"),
            })
    finally:
        doc.close()

    return pages_data


def correct_page_numbers(pages_data, sequence_length=10):
    """
    Rewrite the detected page numbers so they form one consistent sequence.

    The function looks for the first run of ``sequence_length`` detected
    numbers that increase by one on pages that are themselves consecutive
    in the document. That run is used as an anchor: every page's number is
    recomputed from its distance to the anchor page, so gaps and stray
    detections are overwritten. Computed numbers below 1 are set to ``None``.

    ``pages_data`` is modified in place.

    Args:
        pages_data: List of dicts carrying at least the keys ``"number"``
            and ``"index"``.
        sequence_length: Number of consecutive pages that must agree before
            a run is trusted as the anchor.

    Returns:
        The ``"index"`` value of the first page whose corrected number is 1,
        or ``None`` when no anchor run exists, no page ends up numbered 1,
        or the input does not have the expected structure.
    """
    try:
        # (position, number) for every page where a number was detected.
        seen = [
            (i, d["number"])
            for i, d in enumerate(pages_data)
            if isinstance(d["number"], int)
        ]

        anchor = None
        for start in range(len(seen) - sequence_length + 1):
            first_index, first_number = seen[start]
            # Both the number AND the page position must advance by one.
            # Checking only the numbers would accept runs that skip pages,
            # which contradicts the one-number-per-page fill done below.
            if all(
                seen[start + j] == (first_index + j, first_number + j)
                for j in range(sequence_length)
            ):
                anchor = (first_index, first_number)
                break

        if anchor is None:
            # No trustworthy run found; leave pages_data untouched.
            return None

        base_index, base_number = anchor

        # Single pass covering both the forward and the backward fill.
        for position, page in enumerate(pages_data):
            number = base_number + (position - base_index)
            page["number"] = number if number >= 1 else None

        # Index of the first page numbered 1.
        return next(
            (page["index"] for page in pages_data if page["number"] == 1),
            None,
        )

    except (KeyError, TypeError, IndexError):
        # Malformed input (missing keys, wrong types): best effort, no result.
        return None


def extract_text(pdf_path, start_chapter=None):
    """
    Return the text of a PDF from a given page index to the end.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        start_chapter: 0-based index of the first page to include. When it
            is ``None`` a ``UserWarning`` is emitted and the whole document
            is extracted.

    Returns:
        The extracted text as one string. With an explicit ``start_chapter``
        the page texts are joined with a newline; in the ``None`` fallback
        they are concatenated without a separator.
    """
    # Compare against None explicitly: 0 is a valid page index and must not
    # be mistaken for "no start page given".
    if start_chapter is None:
        warnings.warn(
            "start_chapter is None: extracting text from the entire PDF.",
            UserWarning
        )
        first_page, separator = 0, ""
    else:
        first_page, separator = start_chapter, "\n"

    doc = fitz.open(pdf_path)
    # try/finally so the document is closed even if a page fails to parse.
    try:
        whole_text = separator.join(
            doc[page_index].get_text("text")
            for page_index in range(first_page, len(doc))
        )
    finally:
        doc.close()

    return whole_text

In [6]:
# Candidate input PDFs (paths relative to this notebook, under ../../data).
pdf_path1 = "../../data/mcelreath_2020_statistical-rethinking.pdf"
pdf_path2 = "../../data/Theory of Statistic.pdf"
pdf_path3 = "../../data/Deep Learning with Python.pdf"
pdf_path4 = "../../data/Natural_Image_Statistics.pdf"
pdf_path5 = "../../data/mml-book.pdf"

# Book processed by the rest of this cell.
pdf_path = pdf_path2

# Per-page extraction -> page-number correction -> text starting at the page
# index returned by correct_page_numbers (None means the whole PDF is used).
pages_data = extract_page_data_fitz(pdf_path)
start_chapter = correct_page_numbers(pages_data, sequence_length=10)
text = extract_text(pdf_path, start_chapter)

print(text)

1
Probability Theory
Probability theory provides the basis for mathematical statistics.
Probability theory has two distinct elements. One is just a special case
of measure theory and can be approached in that way. For this aspect, the
presentation in this chapter assumes familiarity with the material in
Section 0.1 beginning on page 692. This aspect is “pure” mathematics. The
other aspect of probability theory is essentially built on a gedanken experiment
involving drawing balls from an urn that contains balls of diﬀerent colors, and
noting the colors of the balls drawn. In this aspect of probability theory, we
may treat “probability” as a primitive (that is, undeﬁned) concept. In this line
of development, we relate “probability” informally to some notion of long-term
frequency or to expectations or beliefs relating to the types of balls that will
be drawn. Following some additional axiomatic developments, however, this
aspect of probability theory is also essentially “pure” mathematic

### Set-up ChromaDB

In [None]:

from nltk.tokenize import sent_tokenize
import nltk

# Download the "punkt" tokenizer data only when nltk cannot locate it.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

def text_chunking(text, max_words=750, overlap_sentences=5, min_words=400, max_chunk_words=1000):
    """
    Split text into chunks of whole sentences of up to max_words words.

    Sentences are never split. A chunk is filled greedily until the next
    sentence would push it over ``max_words``; a single sentence longer than
    ``max_words`` still becomes a chunk on its own. Consecutive chunks share
    up to ``overlap_sentences`` sentences.

    A chunk shorter than ``min_words`` is merged with the following chunk
    when the combined length stays within ``max_words``. The two chunks are
    concatenated as they are, so sentences they share through the overlap
    appear twice in the merged chunk.

    Finally, chunks longer than the drop limit are discarded (likely data
    blocks or malformed content).

    Args:
        text: Text to split; sentence boundaries come from ``sent_tokenize``.
        max_words: Target maximum number of words per chunk.
        overlap_sentences: Maximum number of sentences repeated at the start
            of the next chunk. Negative values are treated as 0.
        min_words: Chunks below this size are candidates for merging.
        max_chunk_words: Drop limit for oversized chunks. The effective limit
            is never lower than ``max_words``, so raising ``max_words`` above
            this value does not discard regular chunks.

    Returns:
        A list of chunk strings.
    """
    sentences = sent_tokenize(text)
    word_counts = [len(sentence.split()) for sentence in sentences]

    chunks = []
    i = 0

    while i < len(sentences):
        chunk_start = i
        word_count = 0

        # Take sentences greedily; the first sentence is always accepted so
        # that an oversized sentence cannot stall the loop.
        while i < len(sentences):
            if i > chunk_start and word_count + word_counts[i] > max_words:
                break
            word_count += word_counts[i]
            i += 1

        chunks.append(" ".join(sentences[chunk_start:i]))

        # Step back to create the overlap, but always advance by at least one
        # sentence relative to the previous chunk start.
        if i < len(sentences):
            chunk_size = i - chunk_start
            overlap = max(0, min(overlap_sentences, chunk_size - 1))
            i = max(i - overlap, chunk_start + 1)

    # Merge small chunks with the next chunk
    merged_chunks = []
    i = 0
    while i < len(chunks):
        current_chunk = chunks[i]
        current_words = len(current_chunk.split())

        if current_words < min_words and i + 1 < len(chunks):
            next_chunk = chunks[i + 1]
            next_words = len(next_chunk.split())

            # Only merge if the combined size stays within max_words;
            # otherwise the small chunk is kept as-is.
            if current_words + next_words <= max_words:
                merged_chunks.append(current_chunk + " " + next_chunk)
                i += 2  # The next chunk has been consumed by the merge
                continue

        merged_chunks.append(current_chunk)
        i += 1

    # Remove chunks that are too long. The limit follows max_words upwards so
    # a large max_words does not cause every regular chunk to be dropped.
    drop_limit = max(max_chunk_words, max_words)
    return [chunk for chunk in merged_chunks if len(chunk.split()) <= drop_limit]

[nltk_data] Downloading package punkt to /Users/davide/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import matplotlib.pyplot as plt

# Chunk the extracted text and plot the distribution of chunk sizes (in words).
chunks = text_chunking(text, max_words=750, overlap_sentences=5, min_words=400)

# Word count of every chunk.
n_words = []
for c in chunks:
    words = c.split()
    n_words.append(len(words))

plt.hist(n_words, bins=50)
plt.show()

In [None]:
import os
import chromadb
from chromadb.utils import embedding_functions


def initialize_chromadb(EMBEDDING_MODEL):
    """
    Create the ChromaDB client and the embedding function used with it.

    Args:
        EMBEDDING_MODEL: Model name handed to
            ``SentenceTransformerEmbeddingFunction`` as ``model_name``.

    Returns:
        A ``(client, embedding_func)`` tuple.
    """
    # The client is built with chromadb.Client() and no arguments
    # (NOTE(review): presumably Chroma's non-persistent client — confirm).
    # The tuple is evaluated left to right: client first, then embedder.
    return (
        chromadb.Client(),
        embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=EMBEDDING_MODEL
        ),
    )


def initialize_collection(client, embedding_func, collection_name):
    """
    Fetch the named ChromaDB collection, creating it when it does not exist.

    Args:
        client: Object exposing ``get_or_create_collection``.
        embedding_func: Embedding function attached to the collection.
        collection_name: Name of the collection.

    Returns:
        Whatever ``client.get_or_create_collection`` returns.
    """
    # Collection metadata: use cosine as the HNSW space.
    collection_metadata = {"hnsw:space": "cosine"}
    return client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata=collection_metadata,
    )


def update_collection(
    collection,
    text,
    max_words=200,
    min_words=100,
    overlap_sentences=3,
):
    """
    Chunk text with text_chunking and add the chunks to a collection.

    Every chunk is stored with the id ``chunk_NNNN`` (zero-padded position)
    and the metadata ``{"chunk_index": position}``.

    Args:
        collection: Object exposing ``add(documents=, ids=, metadatas=)``.
        text: Text to chunk and store.
        max_words, min_words, overlap_sentences: Forwarded to
            ``text_chunking``.

    Returns:
        The same ``collection`` object that was passed in.
    """
    chunks = text_chunking(
        text,
        max_words=max_words,
        min_words=min_words,
        overlap_sentences=overlap_sentences,
    )

    positions = range(len(chunks))
    chunk_ids = [f"chunk_{j:04d}" for j in positions]
    chunk_metadata = [{"chunk_index": j} for j in positions]

    collection.add(documents=chunks, ids=chunk_ids, metadatas=chunk_metadata)
    return collection

### Test chapter splitting

In [24]:
import importlib
import sys
sys.path.append("../../src")  
from runpod_client import format_messages_as_prompt, run_prompt, clean_and_parse_json
import messages_templates
import toc_parser 

# Reload the local modules before names are imported from them below
# (NOTE(review): presumably to pick up edits without restarting the kernel).
importlib.reload(toc_parser)
importlib.reload(messages_templates)

from toc_parser import extract_chapters_from_toc
from messages_templates import get_toc_extraction_messages

In [22]:
# Build the TOC-extraction messages from the first 1000 characters of the
# extracted text, then pass them to format_messages_as_prompt.
# NOTE(review): the recorded output of this cell is
# "TypeError: string indices must be integers, not 'str'" — confirm which of
# the two calls raises and what message structure the formatter expects.
toc = get_toc_extraction_messages(text[:1000])
toc_formatted = format_messages_as_prompt(toc)

TypeError: string indices must be integers, not 'str'

In [23]:
# Display the value returned by get_toc_extraction_messages for inspection.
toc



In [25]:
# Run chapter extraction on the full extracted text (the recorded output
# shows a RunPod job being started and polled until COMPLETED).
chapters_json = extract_chapters_from_toc(text)

use prompt optimized for gemma3
[RunPod] Job started: 742a4b2c-734b-4e75-a0cc-a780652ce443-e1
[RunPod] Status: IN_QUEUE
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: COMPLETED


In [7]:
# Test: build the ChromaDB client and embedding function, then create the
# collections used below.

EMBEDDING_MODEL = "all-MiniLM-L6-v2"  
client, embedding_func = initialize_chromadb(EMBEDDING_MODEL)

# Create two collections with different purposes; both share the same client
# and embedding function and differ only by name.
whole_text_collection = initialize_collection(
    client, embedding_func, "whole_text_chunks"
)

chapter_collection = initialize_collection(
    client, embedding_func, "chapter_chunks"
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def get_relevant_text(collection, query='', nresults=2, sim_th=None):
    """
    Get relevant text from a collection for a given query.

    Args:
        collection: Object exposing ``query(query_texts=, n_results=)`` whose
            result has a ``"documents"`` entry and, when ``sim_th`` is used,
            a ``"distances"`` entry; the first element of each is read.
        query: Query text.
        nresults: Number of results requested from the collection.
        sim_th: Optional similarity threshold. When given, only documents
            with ``1 - distance >= sim_th`` are kept.

    Returns:
        The kept documents concatenated into one string without a separator.
        Documents that are ``None`` are skipped in both modes.
    """
    query_result = collection.query(query_texts=query, n_results=nresults)
    docs = query_result.get('documents')[0]

    if sim_th is None:
        return ''.join(doc for doc in docs if doc is not None)

    # Similarity is derived as 1 - distance.
    distances = query_result.get("distances")[0]
    # Skip None documents here as well; joining a None would raise TypeError.
    return ''.join(
        doc
        for doc, distance in zip(docs, distances)
        if doc is not None and 1 - distance >= sim_th
    )


def generate_answer(base_url, model, prompt, context=None, top_k=5, top_p=0.9, temp=0.5):
    """
    POST a generation request to ``base_url + "/generate"`` and return the answer.

    Args:
        base_url: Base URL of the service; ``"/generate"`` is appended.
        model: Model identifier sent in the payload.
        prompt: Prompt text sent in the payload.
        context: Context value sent in the payload; ``None`` (the default)
            is sent as an empty list.
        top_k, top_p, temp: Sampling options sent under ``"options"``
            (``temp`` is sent as ``"temperature"``).

    Returns:
        A ``(response, context)`` tuple read from the ``"response"`` and
        ``"context"`` keys of the JSON reply, defaulting to ``""`` and
        ``[]``. On a ``requests`` error the error is reported through
        ``st.error`` and ``("", [])`` is returned.

    NOTE(review): ``requests`` and ``st`` are not imported in the visible
    part of this file — confirm they are in scope where this function runs.
    """
    # A fresh list per call instead of a shared mutable default argument.
    request_context = [] if context is None else context

    url = base_url + "/generate"
    data = {
        "prompt": prompt,
        "model": model,
        "stream": False,
        "context": request_context,
        "options": {"temperature": temp, "top_p": top_p, "top_k": top_k},
    }
    try:
        response = requests.post(url, json=data)
        response.raise_for_status()
        response_dict = response.json()
        return response_dict.get('response', ''), response_dict.get('context', [])
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred: {e}")
        return "", []


def get_contextual_prompt(question, context):
    """
    Build the prompt that asks for an answer grounded in the given context.

    The prompt consists of a fixed instruction paragraph followed by a
    "### Context:" section, a "### Question:" section and a trailing
    "### Answer:" marker.

    Args:
        question: Question text inserted under "### Question:".
        context: Context text inserted under "### Context:".

    Returns:
        The assembled prompt string.
    """
    prompt_pieces = [
        "You are a helpful assistant. Use the information provided in the context below to answer the question. ",
        "Ensure your answer is accurate, concise, and directly addresses the question. ",
        "If the context does not provide enough information to answer the question, state that explicitly.\n\n",
        "### Context:\n",
        f"{context}\n\n",
        "### Question:\n",
        f"{question}\n\n",
        "### Answer:",
    ]
    return "".join(prompt_pieces)