### Chapter Content extraction

In [172]:
import fitz  # PyMuPDF
import re
import codecs

# Dictionary mapping example keys to PDF paths
examples = {
    "pdf_path1": "../../data/mcelreath_2020_statistical-rethinking.pdf",
    "pdf_path2": "../../data/Theory of Statistic.pdf",
    "pdf_path3": "../../data/Deep Learning with Python.pdf",
    "pdf_path4": "../../data/Natural_Image_Statistics.pdf",
    "pdf_path5": "../../data/mml-book.pdf"
}

# Dictionary mapping example keys to page ranges to extract content from
content_page_ranges = {
    "pdf_path1": range(5, 8),
    "pdf_path2": range(10, 17),
    "pdf_path3": range(7, 13),
    "pdf_path4": range(4, 13),
    "pdf_path5": range(2, 5),
}

# Select example number
n_example = 5
key = f"pdf_path{n_example}"

# Open the PDF
doc = fitz.open(examples[key])

# Extract text from the specified page range
chapters_content_list = []
for page_num in content_page_ranges[key]:
    page = doc[page_num]
    text = page.get_text("text")
    chapters_content_list.append(text)

# Join all text pages into a single string if needed
chapters_content = "\n".join(chapters_content_list)

print(chapters_content)  # or pass it to your model

Contents
Foreword
1
Part I
Mathematical Foundations
9
1
Introduction and Motivation
11
1.1
Finding Words for Intuitions
12
1.2
Two Ways to Read This Book
13
1.3
Exercises and Feedback
16
2
Linear Algebra
17
2.1
Systems of Linear Equations
19
2.2
Matrices
22
2.3
Solving Systems of Linear Equations
27
2.4
Vector Spaces
35
2.5
Linear Independence
40
2.6
Basis and Rank
44
2.7
Linear Mappings
48
2.8
Afﬁne Spaces
61
2.9
Further Reading
63
Exercises
64
3
Analytic Geometry
70
3.1
Norms
71
3.2
Inner Products
72
3.3
Lengths and Distances
75
3.4
Angles and Orthogonality
76
3.5
Orthonormal Basis
78
3.6
Orthogonal Complement
79
3.7
Inner Product of Functions
80
3.8
Orthogonal Projections
81
3.9
Rotations
91
3.10
Further Reading
94
Exercises
96
4
Matrix Decompositions
98
4.1
Determinant and Trace
99
i
This material will be published by Cambridge University Press as Mathematics for Machine Learn-
ing by Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong. This pre-publication version is
free to

### Check Gemma3 performance with chapter infos extraction


In [35]:
import requests
from dotenv import load_dotenv
import os
import re
import json
import time

load_dotenv()  # Loads .env file into environment

# Your endpoint ID and API key
api_key = os.getenv("RUNPOD_API_KEY")
endpoint2 = "https://api.runpod.ai/v2/4zyobam3zy2bci"
endpoint = "https://api.runpod.ai/v2/hmje50gz4lr97c"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}


In [173]:
def format_messages_as_prompt(messages):
    """Convert messages list to a single prompt string for Ollama generate endpoint"""
    prompt_parts = []
    
    for message in messages:
        role = message["role"]
        content = message["content"]
        
        if role == "system":
            prompt_parts.append(f"System: {content}")
        elif role == "user":
            prompt_parts.append(f"User: {content}")
        elif role == "assistant":
            prompt_parts.append(f"Assistant: {content}")
    
    # Add final prompt for the assistant to respond
    prompt_parts.append("Assistant:")
    
    return "\n\n".join(prompt_parts)


# Your messages array
messages = [
        {
            "role": "system",
            "content": "You are a precise document parser that extracts structured information from table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array. You do not guess chapter titles or page numbers."
        },
        {
            "role": "user",
            "content": "I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information."
        },
        {
            "role": "assistant",
            "content": "I understand. I will extract ONLY the main chapters that are explicitly shown in your table of contents. I will not invent, guess, or hallucinate any chapter titles or page numbers. I will only use the exact information present in the document."
        },
        {
            "role": "user",
            "content": f"""Here is the table of contents:

{chapters_content}

WARNING: DO NOT HALLUCINATE OR INVENT INFORMATION
- Do NOT make up chapter titles like "Probability", "Statistical Inference", "Linear Regression"
- Do NOT guess page numbers
- Do NOT create generic textbook chapters
- ONLY extract what you can clearly see in the provided text

CRITICAL RULES:
1. Extract ONLY main chapters that start with a number (1, 2, 3, etc.)
2. Do NOT extract subsections (like 1.1, 1.2, 2.1, etc.)
3. Use the EXACT chapter titles shown in the document
4. Use the EXACT page numbers shown in the document
5. Handle both roman numerals (i, ii, iii, v, x) and arabic numerals (1, 25, 100)
6. Calculate end pages as: next chapter's start page minus 1
7. Return ONLY valid JSON - no explanations, no markdown formatting
8. If you cannot clearly identify chapters, return empty array []

Look for patterns like:
- "1 Probability Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1"
- "2 Distribution Theory and Statistical Models . . . . . . . . . . . . . . . . 155"
- "3 Basic Statistical Theory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205"

DO NOT extract lines like:
- "1.1 Some Important Music Concepts . . . . . . . . . . . 3"
- "Preface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . v"

Use ONLY the exact titles from the document. Do not shorten or modify them.

Return JSON array: [{{"chapter_number": "X", "chapter_title": "...", "start_page": X, "end_page": X}}]

REMEMBER: Extract only what is explicitly visible in the text. Do not hallucinate. Be complete and extract all chapters that are clearly numbered.                  y chapters, return an empty array []."""
        },
        {
            "role": "assistant",
            "content": "I will carefully examine the table of contents and extract only the main chapters that are explicitly shown, using their exact titles and page numbers. I will not invent or hallucinate any information."
        }
    ]


# Convert to proper format
formatted_prompt = format_messages_as_prompt(messages)

# Build the payload for RunPod
payload = {
    "input": {
        "prompt": formatted_prompt  # Now it's a single string
    }
}

# What the formatted prompt will look like:
print("Formatted prompt:")
print(f'"""{formatted_prompt}"""')



Formatted prompt:
"""System: You are a precise document parser that extracts structured information from table of contents. You NEVER hallucinate, invent, or make up information. You ONLY extract what is explicitly present in the provided text. If you cannot find clear chapter information, you return an empty array. You do not guess chapter titles or page numbers.

User: I need to extract main chapter information from this table of contents. Only extract numbered chapters, ignore subsections. Do not make up any information.

Assistant: I understand. I will extract ONLY the main chapters that are explicitly shown in your table of contents. I will not invent, guess, or hallucinate any chapter titles or page numbers. I will only use the exact information present in the document.

User: Here is the table of contents:

Contents
Foreword
1
Part I
Mathematical Foundations
9
1
Introduction and Motivation
11
1.1
Finding Words for Intuitions
12
1.2
Two Ways to Read This Book
13
1.3
Exercises and

In [174]:
# 1. Start a job
start_response = requests.post(f"{endpoint}/run", json=payload, headers=headers)
job = start_response.json()
job_id = job["id"]

print(f"Job started with ID: {job_id}")

# 2. Poll until done
status = None
while status not in ("COMPLETED", "FAILED"):
    time.sleep(3)
    poll_response = requests.get(f"{endpoint}/status/{job_id}", headers=headers)
    poll_data = poll_response.json()
    status = poll_data["status"]
    print(f"Job status: {status}")

# 3. Get result if completed
if status == "COMPLETED":
    output_raw = poll_data['output']['response']
    print("Job Output:")
    print(output_raw)
else:
    print("Job failed.")

Job started with ID: 2925388e-9ade-424d-a75a-54e7811054d6-e1
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: COMPLETED
Job Output:
```json
[
  {
    "chapter_number": "1",
    "chapter_title": "Introduction and Motivation",
    "start_page": 11,
    "end_page": 15
  },
  {
    "chapter_number": "2",
    "chapter_title": "Linear Algebra",
    "start_page": 17,
    "end_page": 62
  },
  {
    "chapter_number": "3",
    "chapter_title": "Analytic Geometry",
    "start_page": 70,
    "end_page": 93
  },
  {
    "chapter_number": "4",
    "chapter_title": "Matrix Decompositions",
    "start_page": 98,
    "end_page": 136
  },
  {
    "chapter_number": "5",
    "chapter_title": "Vector Calculus",
    "start_page": 139,
    "end_page": 164
  },
  {
    "chapter_number": "6",
    "chapter_title": "Probability and Distributions",
    "start_

In [175]:
def clean_and_parse_json(raw_text):

    # Clean up triple quotes and markdown syntax
    cleaned = output_raw.strip("'").strip('```json').strip('```')

    # Unescape JSON string
    unescaped = codecs.decode(cleaned, 'unicode_escape')

    # Parse JSON
    chapters = json.loads(unescaped)
    
    return chapters


chapters = clean_and_parse_json(output_raw)
for chapter in chapters:
    print(f"{chapter['chapter_number']}: {chapter['chapter_title']}")

1: Introduction and Motivation
2: Linear Algebra
3: Analytic Geometry
4: Matrix Decompositions
5: Vector Calculus
6: Probability and Distributions
7: Continuous Optimization
8: When Models Meet Data
9: Linear Regression
10: Dimensionality Reduction with Principal Component Analysis
11: Density Estimation with Gaussian Mixture Models
12: Classiï¬cation with Support Vector Machines


### This extract pages number (starting from actual page 1) and bundles it with page content

In [176]:
import fitz  # PyMuPDF

def extract_page_data_fitz(pdf_path):
    doc = fitz.open(pdf_path)
    pages_data = []

    for i, page in enumerate(doc):
        height = page.rect.height
        width = page.rect.width

        top_rect = fitz.Rect(0, 0, width, height * 0.15)
        bottom_rect = fitz.Rect(0, height * 0.85, width, height)

        top_text = page.get_text("text", clip=top_rect).split()
        bottom_text = page.get_text("text", clip=bottom_rect).split()

        found_number = None
        for text in top_text + bottom_text:
            if text.isdigit():
                found_number = int(text)
                break

        full_text = page.get_text("text")

        pages_data.append({
            "index": i,
            "number": found_number,
            "content": full_text
        })

    doc.close()
    return pages_data


def correct_page_numbers(pages_data, sequence_length=10):
    # Find first sequence of 'sequence_length' consecutive page numbers
    seen = [(i, d["number"]) for i, d in enumerate(pages_data) if isinstance(d["number"], int)]

    for start in range(len(seen) - sequence_length + 1):
        valid = True
        for j in range(sequence_length):
            if seen[start + j][1] != seen[start][1] + j:
                valid = False
                break
        if valid:
            base_index, base_number = seen[start]
            break
    else:
        # No sequence found, return original data
        raise ValueError("No valid sequence of page numbers found.")

    # Forward fill from base_index
    for offset, page in enumerate(pages_data[base_index:], start=0):
        page["number"] = base_number + offset

    # Backward fill before base_index
    for offset in range(1, base_index + 1):
        page = pages_data[base_index - offset]
        page["number"] = base_number - offset
    
    # Set pages < 1 == None
    for page in pages_data:
        if page["number"] < 1:
            page["number"] = None

    return pages_data


def extract_whole_text(pdf_path):
    doc = fitz.open(pdf_path)
    all_pages_text = []
    for page in doc:
        page_text = page.get_text("text")
        all_pages_text.append(page_text)
    doc.close()
    whole_text = "\n".join(all_pages_text)
    return whole_text

In [177]:
pdf_path1 = "../../data/mcelreath_2020_statistical-rethinking.pdf"
pdf_path2 = "../../data/Theory of Statistic.pdf"
pdf_path3 = "../../data/Deep Learning with Python.pdf"
pdf_path4 = "../../data/mcelreath_2020_statistical-rethinking.pdf"
pdf_path5 = "../../data/mml-book.pdf"



pages_data = extract_page_data_fitz(pdf_path1)

try:
    corrected_pages = correct_page_numbers(pages_data, sequence_length=10)
except ValueError as e:
    print(f"Warning: {e}. Using original page data without correction.")
    corrected_pages = pages_data  # fallback to original

for p in corrected_pages:
    print(f"Page index: {p['index']}, Page number: {p['number']}")

Page index: 0, Page number: None
Page index: 1, Page number: None
Page index: 2, Page number: None
Page index: 3, Page number: None
Page index: 4, Page number: None
Page index: 5, Page number: None
Page index: 6, Page number: None
Page index: 7, Page number: None
Page index: 8, Page number: None
Page index: 9, Page number: None
Page index: 10, Page number: None
Page index: 11, Page number: None
Page index: 12, Page number: None
Page index: 13, Page number: None
Page index: 14, Page number: None
Page index: 15, Page number: None
Page index: 16, Page number: None
Page index: 17, Page number: None
Page index: 18, Page number: None
Page index: 19, Page number: 1
Page index: 20, Page number: 2
Page index: 21, Page number: 3
Page index: 22, Page number: 4
Page index: 23, Page number: 5
Page index: 24, Page number: 6
Page index: 25, Page number: 7
Page index: 26, Page number: 8
Page index: 27, Page number: 9
Page index: 28, Page number: 10
Page index: 29, Page number: 11
Page index: 30, Page 

In [152]:
os.path.join(os.getcwd(), os.pardir)

'/Users/davide/Documents/Projects/Text2Test/notebooks/dev/..'

In [148]:
text = extract_whole_text(pdf_path4)

In [149]:
print(text)


Statistical Rethinking

CHAPMAN & HALL/CRC
Texts in Statistical Science Series
Joseph K. Blitzstein, Harvard University, USA  
Julian J. Faraway, University of Bath, UK  
Martin Tanner, Northwestern University, USA 
Jim Zidek, University of British Columbia, Canada
Recently Published Titles
Theory of Spatial Statistics 
A Concise Introduction 
M.N.M van Lieshout
Bayesian Statistical Methods 
Brian J. Reich and Sujit K. Ghosh
Sampling 
Design and Analysis, Second Edition 
Sharon L. Lohr
The Analysis of Time Series 
An Introduction with R, Seventh Edition 
Chris Chatfield and Haipeng Xing
Time Series 
A Data Analysis Approach Using R 
Robert H. Shumway and David S. Stoffer
Practical Multivariate Analysis, Sixth Edition 
Abdelmonem Afifi, Susanne May, Robin A. Donatello, and Virginia A. Clark
Time Series: A First Course with Bootstrap Starter 
Tucker S. McElroy and Dimitris N. Politis
Probability and Bayesian Modeling 
Jim Albert and Jingchen Hu
Surrogates 
Gaussian Process Modeling, Des

### Building RAG

In [None]:
def get_relevant_text(collection, query='', nresults=2, sim_th=None):
    """Get relevant text from a collection for a given query"""

    query_result = collection.query(query_texts=query, n_results=nresults)
    docs = query_result.get('documents')[0]

    if sim_th is not None:
        similarities = [1 - d for d in query_result.get("distances")[0]]
        relevant_docs = [d for d, s in zip(docs, similarities) if s >= sim_th]
        return ''.join(relevant_docs)
    return ''.join([doc for doc in docs if doc is not None])


def generate_answer(base_url, model, prompt, context=[], top_k=5, top_p=0.9, temp=0.5):
    url = base_url + "/generate"
    data = {
        "prompt": prompt,
        "model": model,
        "stream": False,
        "context": context,
        "options": {"temperature": temp, "top_p": top_p, "top_k": top_k},
    }
    try:
        response = requests.post(url, json=data)
        response.raise_for_status()
        response_dict = response.json()
        return response_dict.get('response', ''), response_dict.get('context', [])
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred: {e}")
        return "", []


def get_contextual_prompt(question, context):
    contextual_prompt = (
        "You are a helpful assistant. Use the information provided in the context below to answer the question. "
        "Ensure your answer is accurate, concise, and directly addresses the question. "
        "If the context does not provide enough information to answer the question, state that explicitly.\n\n"
        "### Context:\n"
        f"{context}\n\n"
        "### Question:\n"
        f"{question}\n\n"
        "### Answer:"
    )
    return contextual_prompt