### Test splitting on chapters

In [229]:
import importlib
import sys
sys.path.append("../../src")  
import toc_parser
import raw_text_processing
import text_processing
import chromadb_utils
import chunks_processing

importlib.reload(toc_parser)
importlib.reload(raw_text_processing)
importlib.reload(text_processing)
importlib.reload(chromadb_utils)
importlib.reload(chunks_processing)

from raw_text_processing import process_pdf, extract_pages_range, extract_chapters
from toc_parser import extract_chapters_from_toc
from text_processing import text_chunking, chapters_chunking
from chromadb_utils import initialize_chromadb, initialize_collection, update_collection
from chunks_processing import query_collection, get_chapter_context


In [187]:
# Candidate PDFs for this experiment, keyed by an example identifier.
examples = dict(
    pdf_path1="../../data/mcelreath_2020_statistical-rethinking.pdf",
    pdf_path2="../../data/Theory of Statistic.pdf",
    pdf_path3="../../data/Deep Learning with Python.pdf",
    pdf_path4="../../data/Natural_Image_Statistics.pdf",
    pdf_path5="../../data/mml-book.pdf",
)

# Page range to extract content from for each example (same keys as `examples`).
content_page_ranges = dict(
    pdf_path1=range(5, 8),
    pdf_path2=range(10, 17),
    pdf_path3=range(7, 13),
    pdf_path4=range(4, 13),
    pdf_path5=range(2, 5),
)

# Pick which example the rest of the notebook works on.
n_example = 3
key = "pdf_path" + str(n_example)
path = examples[key]

In [188]:
# Parse the selected PDF into its full text, per-page data and a start index.
# NOTE(review): start_chapter is presumably the index of the first chapter page;
# confirm against raw_text_processing.process_pdf.
text, pages_data, start_chapter = process_pdf(path)
# Keep only the pages from start_chapter onwards for chapter extraction.
pages_data_corrected = pages_data[start_chapter:]
# Extract the page range configured for this example (the saved output shows
# the book's table of contents).
toc = extract_pages_range(path, content_page_ranges[key])
# Eyeball the beginning of both extracts.
print(text[:300])
print('\n\n')
print(toc[:100])

Part 1
Fundamentals
of deep learning
C hapters 1–4 of this book will give you a foundational understanding of
what deep learning is, what it can achieve, and how it works. It will also make you
familiar with the canonical workflow for solving data problems using deep learn-
ing. If you aren’t alread



vii
contents
preface
xiii
acknowledgments
xv
about this book
xvi
about the author
xx
about the cover


### Extract chapters & chunk them

In [189]:
# Turn the extracted table of contents into chapter metadata.
# NOTE(review): the saved output shows this call starting a RunPod job and
# polling its status until COMPLETED, so it depends on that remote service.
chapters_json = extract_chapters_from_toc(toc)

use prompt optimized for gemma3
[RunPod] Job started: 978bd049-a800-46a2-85d3-f948da0a44ca-e1
[RunPod] Status: IN_QUEUE
[RunPod] Status: IN_QUEUE
[RunPod] Status: IN_QUEUE
[RunPod] Status: IN_QUEUE
[RunPod] Status: IN_QUEUE
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: IN_PROGRESS
[RunPod] Status: COMPLETED


In [190]:
# Build the chapters from the TOC metadata and the chapter pages, then chunk them.
# `chapters` is rebound: after this cell it holds the output of chapters_chunking.
chapters = extract_chapters(chapters_json, pages_data_corrected)
chapters = chapters_chunking(chapters)

### Random sampling of chapter chunks

In [224]:
# Chapter to draw context from, and how many questions are planned for it.
chapter_number = 4
n_questions = 5

# NOTE(review): the section title says chunks are sampled randomly, but no seed
# is set anywhere in this notebook; confirm whether get_chapter_context is
# reproducible across runs.
chapter_context = get_chapter_context(chapters, chapter_number, n_questions)

### Set up Chroma and chunk text

In [200]:
EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # embedding model name handed to initialize_chromadb
client, embedding_func = initialize_chromadb(EMBEDDING_MODEL)

# Create a single collection for the whole book text and fill it with chunks
# (only this one collection is created in this notebook).
whole_text_collection = initialize_collection(client, embedding_func, "whole_text_chunks")
update_collection(whole_text_collection, text, max_words=200, min_words=100, overlap_sentences=3)


In [201]:
# NOTE(review): this repeats, with identical arguments, the update_collection
# call from the previous cell. If update_collection is not idempotent the
# collection will hold every chunk twice; confirm, and drop this cell if so.
update_collection(whole_text_collection, text, max_words=200, min_words=100, overlap_sentences=3)

In [231]:
# Topic used both to query the collection and, later, as the book_prompt topic.
query = 'data preparation and analysis'
# Retrieve contexts for the topic from the whole-text collection.
# NOTE(review): the meaning of context_multiplier is defined in
# chunks_processing.query_collection; confirm there.
query_context = query_collection(whole_text_collection, query=query, nresults=3, context_multiplier=2)

In [227]:
# Sanity check: how many contexts the query returned.
# NOTE(review): the saved execution counts show this cell ran before the query
# cell above was last executed; re-run top to bottom to refresh the output.
len(query_context)

3

In [228]:
# Sanity check: how many contexts were drawn for the selected chapter.
len(chapter_context)

5

### Create prompts for question generation

In [None]:
def chapter_prompt(contexts, num_questions, max_questions=5):
    """
    Build a question-generation prompt formatted for the Gemma 3 12B-IT model.

    The prompt asks for diverse question/answer pairs, returned as a JSON array,
    that can be answered from the supplied contexts.

    Args:
        contexts (list): Text contexts the questions must be based on; they are
            rendered with ``format_contexts``.
        num_questions (int): Number of questions requested.
        max_questions (int): Cap applied to ``num_questions``.
    Returns:
        str: Prompt wrapped in Gemma turn markers, ending with an open model turn.
    """
    # Cap the request first, then render the contexts.
    question_count = min(num_questions, max_questions)
    rendered_contexts = format_contexts(contexts)

    # Gemma turn markers delimit the user message; doubled braces become the
    # literal JSON braces once str.format() has run.
    template = """<start_of_turn>user
You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.

IMPORTANT REQUIREMENTS:
1. Output MUST be valid JSON format
2. Generate EXACTLY {num_questions} questions
3. Each question must have a complete answer from the contexts
4. Vary question types (what, why, how, when, explain, compare)
5. Do not generate yes/no questions
6. Answers should be 1-3 sentences long

CONTEXTS:
{contexts}

OUTPUT FORMAT - Return ONLY valid JSON array:
[
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
{{"question": "Another question?", "answer": "Another answer"}}
]

Generate the questions now:<end_of_turn>
<start_of_turn>model
"""

    return template.format(
        num_questions=question_count,
        contexts=rendered_contexts,
    )

def chapter_prompt_edgecase(grouped_chunks, num_questions, max_questions=5):
    """
    Create a prompt formatted for Gemma 3 12B-IT model.
    This prompt is designed to handle edge cases where contexts retrieved are less than the number of questions requested.
    The model may draw several questions from one context, combine contexts, or
    skip uninformative ones, and must report which context numbers it used.
    Args:
        grouped_chunks (list): Text contexts to build questions from; rendered
            with ``format_contexts`` as numbered "Context N:" sections.
        num_questions (int): Number of questions to generate.
        max_questions (int): Maximum number of questions allowed; the requested
            number is capped at this value.
    Returns:
        str: Formatted prompt string for Gemma 3 model.
    """
    # Cap the request first, then render the context groups.
    question_count = min(num_questions, max_questions)
    context_groups = format_contexts(grouped_chunks)

    # Doubled braces are literal JSON braces after str.format(); the numbers in
    # "context_used" refer to the 1-based labels produced by format_contexts.
    prompt = """<start_of_turn>user
Generate {num_questions} questions from the following contexts. You may:
- Generate one or more questions from each context
- Use multiple contexts for a single question
- Skip contexts if they don't contain meaningful information

REQUIREMENTS:
1. Output valid JSON array format
2. Generate EXACTLY {num_questions} questions
3. Each answer must be found in the provided contexts
4. Create diverse question types
5. Reference which context group(s) you used

CONTEXT GROUPS:
{context_groups}

OUTPUT FORMAT - Return ONLY this JSON structure:
[
{{"question": "Question text?", "answer": "Answer text", "context_used": [1, 2]}},
{{"question": "Question text?", "answer": "Answer text", "context_used": [1]}}
]

Generate the questions:<end_of_turn>
<start_of_turn>model
""".format(
        num_questions=question_count,
        context_groups=context_groups
    )

    return prompt

def format_contexts(contexts):
    """
    Render contexts as numbered "Context N:" sections separated by blank lines.

    Numbering starts at 1 and each context is stripped of surrounding whitespace.
    """
    sections = [
        f"Context {position}:\n{passage.strip()}"
        for position, passage in enumerate(contexts, 1)
    ]
    # The final strip() removes the newline left behind when the last context is blank.
    return "\n\n".join(sections).strip()


In [None]:
# Render both chapter-level prompt variants from the same retrieved contexts,
# asking for 5 questions each. `out2` is printed in a later cell.
out = chapter_prompt(query_context, 5)
out2 = chapter_prompt_edgecase(query_context, 5)

In [220]:
# Inspect the rendered edge-case prompt.
# NOTE(review): the saved output opens with wording that no longer matches the
# current chapter_prompt_edgecase template; re-run this cell to refresh it.
print(out2)

<start_of_turn>user
Generate 5 questions from the following grouped contexts. You may use one or multiple contexts per question.

REQUIREMENTS:
1. Output valid JSON array format
2. Generate EXACTLY 5 questions
3. Each answer must be found in the provided contexts
4. Create diverse question types
5. Reference which context group(s) you used

CONTEXT GROUPS:
Context 1:
The data you’ll manipulate will almost always fall into one of the fol-
lowing categories:
Vector data—2D tensors of shape (samples, features)
Timeseries data or sequence data—3D tensors of shape (samples, timesteps,
features)
Images—4D tensors of shape (samples, height, width, channels) or (samples,
channels, height, width)
Video—5D tensors of shape (samples, frames, height, width, channels) or
(samples, frames, channels, height, width)
2.2.9
Vector data
This is the most common case. In such a dataset, each single data point can be encoded
as a vector, and thus a batch of data will be encoded as a 2D tensor (that is, 

In [230]:
def book_prompt(contexts, num_questions, user_query=None, max_questions=5):
    """
    Build a topic-aware question-generation prompt for the Gemma 3 12B-IT model.

    When ``user_query`` is given, a "TOPIC FOCUS" section is added and the query
    is quoted in the contexts header; otherwise that section is left empty and
    the header falls back to "the provided content".

    Args:
        contexts (list): Text contexts retrieved based on the user query;
            rendered with ``format_contexts``.
        num_questions (int): Number of questions to generate.
        user_query (str): The original user query/topic, if any.
        max_questions (int): Cap applied to ``num_questions``.

    Returns:
        str: Prompt wrapped in Gemma turn markers, ending with an open model turn.
    """
    question_count = min(num_questions, max_questions)

    # Optional block steering the model towards the user's topic.
    if user_query:
        topic_section = f"""
TOPIC FOCUS: {user_query}
The following contexts were retrieved based on this topic. Generate questions that:
- Relate to the main topic: "{user_query}"
- Explore different aspects of this topic found in the contexts
- Connect the topic to broader concepts when relevant

"""
    else:
        topic_section = ""

    # Label shown in the contexts header when no topic was supplied.
    topic_label = user_query or "the provided content"
    rendered_contexts = format_contexts(contexts)

    # Doubled braces become literal JSON braces once str.format() has run.
    template = """<start_of_turn>user
You are a question generation expert. Generate exactly {num_questions} diverse questions based on the provided text contexts.
{topic_context}
IMPORTANT REQUIREMENTS:
1. Output MUST be valid JSON format
2. Generate EXACTLY {num_questions} questions
3. Each question must have a complete answer from the contexts
4. Vary question types (what, why, how, when, explain, compare)
5. Do not generate yes/no questions
6. Answers should be 1-3 sentences long
7. Questions should explore different aspects of the topic

CONTEXTS (Retrieved based on topic: "{query}"):
{contexts}

OUTPUT FORMAT - Return ONLY valid JSON array:
[
{{"question": "Your question here?", "answer": "Complete answer from the context"}},
{{"question": "Another question?", "answer": "Another answer"}}
]

Generate the questions now:<end_of_turn>
<start_of_turn>model
"""

    return template.format(
        num_questions=question_count,
        topic_context=topic_section,
        query=topic_label,
        contexts=rendered_contexts,
    )

In [235]:
# Render the topic-aware prompt, reusing the query that retrieved the contexts.
out3 = book_prompt(query_context, num_questions=3, user_query=query)

In [236]:
# Inspect the rendered topic-aware prompt.
print(out3)

<start_of_turn>user
You are a question generation expert. Generate exactly 3 diverse questions based on the provided text contexts.

TOPIC FOCUS: data preparation and analysis
The following contexts were retrieved based on this topic. Generate questions that:
- Relate to the main topic: "data preparation and analysis"
- Explore different aspects of this topic found in the contexts
- Connect the topic to broader concepts when relevant


IMPORTANT REQUIREMENTS:
1. Output MUST be valid JSON format
2. Generate EXACTLY 3 questions
3. Each question must have a complete answer from the contexts
4. Vary question types (what, why, how, when, explain, compare)
5. Do not generate yes/no questions
6. Answers should be 1-3 sentences long
7. Questions should explore different aspects of the topic

CONTEXTS (Retrieved based on topic: "data preparation and analysis"):
Context 1:
We’ve previously reviewed three common evaluation protocols:
Maintaining a hold-out validation set—The way to go when you h