## 1. Setup Asyncio

In [1]:
import nest_asyncio

# Apply the nest_asyncio patch once, before any other library is used.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and later calls use asyncio internally — confirm.
nest_asyncio.apply()

## 2. Setup the Qdrant vector database

In [2]:
import qdrant_client

# Connection settings for the Qdrant server this notebook talks to.
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Name of the collection that will hold the chunk vectors.
collection_name = "chat_with_docs_chonkie"

client = qdrant_client.QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)



## 3. Read the documents

In [3]:
from llama_index.core import SimpleDirectoryReader

input_dir_path = "./docs"

# Walk the input directory (sub-folders included) and keep PDF files only.
loader = SimpleDirectoryReader(input_dir=input_dir_path, recursive=True, required_exts=[".pdf"])
docs = loader.load_data()

In [6]:
# Inspect the loaded Document objects (id, metadata such as file_name /
# page_label / file_path, and the extracted text).
# NOTE(review): this dumps the whole list and the output contains absolute
# local file paths — consider showing only a slice before sharing.
docs

[Document(id_='9969b3fa-afca-4c1d-bbf9-dc95cb79c9d6', embedding=None, metadata={'page_label': '1', 'file_name': 'docling.pdf', 'file_path': '/Users/fc/experiments/rag-project/docs/docling.pdf', 'file_type': 'application/pdf', 'file_size': 5566575, 'creation_date': '2025-06-13', 'last_modified_date': '2025-06-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Docling Technical Report\nVersion 1.0\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos\nPanos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer\nKasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima\nValery Weber Lucas Mo

In [7]:
# Sanity check: container type and number of loaded Document objects.
type(docs), len(docs)

(list, 41)

## 4. Use Chonkie to chunk the documents

In [8]:
from chonkie import SemanticChunker
from llama_index.core.schema import Document
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


# Single source of truth for the embedding checkpoint: the chunker and the
# LlamaIndex embedder are configured with the same model name.
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"

# Chonkie semantic chunker (similarity threshold 0.5, chunk_size 512,
# at least one sentence per chunk).
semantic_chunker = SemanticChunker(
    embedding_model=EMBED_MODEL_NAME,
    threshold=0.5,
    chunk_size=512,
    min_sentences=1
)

# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally; it is likely unnecessary for this checkpoint — confirm and drop.
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME,
                                   trust_remote_code=True)

# Register the embedder globally so index/query code picks it up.
Settings.embed_model = embed_model

# 1) Chunk every loaded document, remembering which document each chunk came from.
chunk_records = [
    (doc.metadata, chunk.text)
    for doc in docs
    for chunk in semantic_chunker.chunk(doc.text)
]

# 2) Embed all chunk texts in one batched call instead of one call per chunk.
chunk_embeddings = Settings.embed_model.get_text_embedding_batch(
    [chunk_text for _, chunk_text in chunk_records]
)

# 3) Wrap each chunk in a LlamaIndex Document that carries the source
#    document's metadata (copied, so chunks never share one dict) and the
#    precomputed embedding.
all_chunks = [
    Document(
        text=chunk_text,
        metadata=dict(source_metadata),
        embedding=chunk_embedding
    )
    for (source_metadata, chunk_text), chunk_embedding in zip(chunk_records, chunk_embeddings)
]

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Number of chunk-level Document objects produced by the chunking step.
len(all_chunks)

107

In [10]:
# Peek at the first 10 chunk Documents. Each repr includes the full embedding
# vector, so the rendered output is very long.
all_chunks[:10]

[Document(id_='ac606e9d-bb5e-4fc1-a570-8931f763049f', embedding=[0.04111266881227493, -0.033255405724048615, 0.004889185540378094, 0.008478309027850628, 0.005850477144122124, -0.016381381079554558, 0.007339164149016142, -0.0190725177526474, -0.0217287614941597, 0.05471247062087059, -0.0175127312541008, -0.003294158261269331, 0.010624443180859089, -0.01871548220515251, -0.017820583656430244, 0.011222572065889835, 0.016856154426932335, -0.014007615856826305, -0.03984614834189415, 0.004058705642819405, -0.00884283147752285, 0.013128494843840599, -0.04635987430810928, -0.01796746626496315, -0.02819453552365303, 0.028740279376506805, 0.024831337854266167, -0.0031951090786606073, 0.10831087827682495, 0.03729763627052307, -0.020881539210677147, -0.04139766842126846, 0.023930752649903297, -0.006288998760282993, -0.010944293811917305, -0.02895534224808216, 0.014605705626308918, 0.003717657644301653, 0.006859759800136089, -0.040770698338747025, 0.014762122184038162, -0.03557861968874931, 0.01835

## 5. Index the chunks in Qdrant

In [11]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

def create_index(documents):
    """Build a ``VectorStoreIndex`` over ``documents`` backed by Qdrant.

    Uses the module-level ``client`` and ``collection_name`` to create the
    Qdrant vector store, wraps it in a default ``StorageContext`` and indexes
    the given documents into it.

    Args:
        documents: Documents to index.

    Returns:
        The index returned by ``VectorStoreIndex.from_documents``.
    """
    qdrant_store = QdrantVectorStore(collection_name=collection_name, client=client)
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=StorageContext.from_defaults(vector_store=qdrant_store),
    )

In [12]:
# NOTE: Settings is imported here but not used in this cell.
from llama_index.core import Settings

# Index the chunk-level Documents in the Qdrant collection.
# NOTE(review): each run appears to add the chunks again (Documents get fresh
# ids on creation) — confirm whether re-running duplicates points in Qdrant.
index = create_index(all_chunks)

In [13]:
# Confirm the type of the object returned by create_index.
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

## 6. Load the LLM

In [14]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings


# LLM served through Ollama; request_timeout is 120.0 (presumably seconds — confirm).
llm = Ollama(model="llama3.2:1b", request_timeout=120.0)

# Register it as the LLM on the global Settings object.
Settings.llm = llm

In [15]:
# Show the Settings object type together with the configured LLM and embedding model.
type(Settings), Settings.llm, Settings.embed_model

(llama_index.core.settings._Settings,
 Ollama(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x13ce6e0a0>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x12fe5dc10>, completion_to_prompt=<function default_completion_to_prompt at 0x138321040>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, base_url='http://localhost:11434', model='llama3.2:1b', temperature=None, context_window=-1, request_timeout=120.0, prompt_key='prompt', json_mode=False, additional_kwargs={}, is_function_calling_model=True, keep_alive=None, thinking=None),
 HuggingFaceEmbedding(model_name='BAAI/bge-large-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x13ce6e0a0>, num_workers=None, embeddings_cache=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False))

## 7. Define the prompt template

In [16]:
from llama_index.core import PromptTemplate

# Question-answering prompt. It is built from adjacent string literals so that
# no source-code indentation leaks into the text sent to the LLM.
# Placeholders: {context_str} (retrieved context) and {query_str} (user query).
template = (
    "Context information is below:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information above I want you to think\n"
    "step by step to answer the query in a crisp manner,\n"
    "in case you don't know the answer say 'I don't know!'\n"
    "\n"
    "Query: {query_str}\n"
    "\n"
    "Answer:"
)

# Wrap the raw string in a PromptTemplate; it is installed later as the query
# engine's "response_synthesizer:text_qa_template" prompt.
qa_prompt_tmpl = PromptTemplate(template)

## 8. Reranking

Here, we use a cross-encoder to re-rank the document chunks. Also, we limit the output to the top 3 most relevant chunks based on the model’s scoring.

In [19]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker: rescoring retrieved chunks and keeping the 3 best (top_n=3).
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", 
    top_n=3
)

In [20]:
# Display the reranker configuration.
rerank

SentenceTransformerRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x17bace070>, model='cross-encoder/ms-marco-MiniLM-L-2-v2', top_n=3, device='mps', keep_retrieval_score=False, trust_remote_code=False)

## 9. Query the index

In [21]:
# Other questions that can be tried against the same index:
#   "What exactly is DSPy?"
#   "What is the github repo for docling?"
question = "How is DSPy pronounced?"

# Retrieve the 10 most similar chunks, then let the reranker post-process them.
query_engine = index.as_query_engine(
    node_postprocessors=[rerank],
    similarity_top_k=10,
)

# Swap the default QA prompt for the custom template.
query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})

response = query_engine.query(question)

In [22]:
from IPython.display import Markdown, display

# Render the string form of the response as Markdown.
display(Markdown(str(response)))

DSPy is pronounced "dee-ess-pie". It's the second iteration of our earlier Demonstrate–Search–Predict framework (DSP; Khattab et al. 2022). This paper introduces the key concepts in DSPy. For more extensive and up-to-date documentation of the framework, we refer readers to https://github.com/stanfordnlp/dspy.

In [23]:
# Metadata (file name, page label, ...) of the sources behind the answer,
# keyed by what appear to be source node ids.
response.metadata

{'138089a6-425a-4d2d-885c-ceeef275d9c9': {'page_label': '8',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'},
 '0285a94e-afe3-4d0b-84b7-6541e2f2573c': {'page_label': '2',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'},
 '35d48c80-edd2-46eb-9282-2d1fba4542c6': {'page_label': '6',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'}}

In [24]:
# Raw answer string of the response.
response.response

'DSPy is pronounced "dee-ess-pie". It\'s the second iteration of our earlier Demonstrate–Search–Predict framework (DSP; Khattab et al. 2022). This paper introduces the key concepts in DSPy. For more extensive and up-to-date documentation of the framework, we refer readers to https://github.com/stanfordnlp/dspy.'

## Bonus: Visualize text in sources

In [26]:
from IPython.display import Markdown, display
import re

def highlight(text, query, color):
    """Wrap every case-insensitive occurrence of ``query`` in ``text`` in a ``<mark>`` tag.

    Args:
        text: The string to search.
        query: Literal substring to highlight; regex metacharacters are escaped.
        color: CSS colour used as the ``background-color`` of the ``<mark>`` tag.

    Returns:
        ``text`` with each match wrapped as
        ``<mark style='background-color:{color};'>match</mark>``. The matched
        text keeps the casing it has in ``text``. An empty ``query`` returns
        ``text`` unchanged.
    """
    # An empty pattern matches at every position and would flood the text with tags.
    if not query:
        return text

    pattern = re.compile(re.escape(query), re.IGNORECASE)

    def _wrap(match):
        # Re-emit the matched text itself (not `query`) so the source casing is
        # preserved; using a callable also keeps backslashes in the match from
        # being interpreted as regex replacement escapes.
        return f"<mark style='background-color:{color};'>{match.group(0)}</mark>"

    return pattern.sub(_wrap, text)

def display_sources_with_highlight(response, docs, query, highlight_color="#ffff00"):
    """
    Render the source pages referenced by ``response`` with ``query`` highlighted.

    For every metadata entry in ``response.metadata`` that has a ``page_label``,
    the first item of ``docs`` with the same ``file_name`` and ``page_label`` is
    looked up; its text is passed through ``highlight`` and displayed as Markdown.
    Entries without a page label, or without a matching document, are skipped.
    """
    for source_meta in response.metadata.values():
        page_label = source_meta.get("page_label")
        if not page_label:
            continue

        file_name = source_meta.get("file_name")

        # Match on both file name and page label: page labels alone repeat across files.
        matched_doc = None
        for candidate in docs:
            candidate_meta = candidate.metadata
            if candidate_meta.get("file_name") == file_name and candidate_meta.get("page_label") == page_label:
                matched_doc = candidate
                break

        if not matched_doc:
            continue

        marked_text = highlight(matched_doc.text, query, highlight_color)
        display(Markdown(f"### Source Document (page_label: {page_label})\n\n{marked_text}"))

# Example usage: show the source pages behind `response` and highlight the
# phrase below wherever it occurs in their text.
word = "dee-ess-pie"
display_sources_with_highlight(response, docs, word)

### Source Document (page_label: 8)

Preprint
Table 1: Results with in-context learning on GSM8K math word problems. Each row represents
a separate pipeline: the module in the Program column is compiled against the examples in the
Training set. The programs, compilers, and (small) training sets are defined in Section 6. Rows with
ensemblebuild on the immediately preceding row. Notably, all programs in this table are expressed
by composing two to four DSPy modules and teleprompters. Compiling the correctmodules, instead
of string prompts, improves different LMs from 4–20% accuracy to 49–88% accuracy.
GPT-3.5 Llama2-13b-chat
Program Compilation Training Dev Test Dev Test
vanilla
none n/a 24.0 25.2 7.0 9.4
fewshot trainset 33.1 – 4.3 –
bootstrap trainset 44.0 – 28.0 –
bootstrap×2 trainset 64.7 61.7 37.3 36.5
+ensemble trainset 62.7 61.9 39.0 34.6
CoT
none n/a 50.0 – 26.7 –
fewshot trainset 63.0 – 27.3 –
fewshot +humanCoT 78.6 72.4 34.3 33.7
bootstrap trainset 80.3 72.9 43.3 –
+ensemble trainset 88.3 81.6 43.7 –
reflection
none n/a 65.0 – 36.7 –
fewshot trainset 71.7 – 36.3 –
bootstrap trainset 83.0 76.0 44.3 40.2
+ensemble trainset 86.7 – 49.0 46.9
6 C ASE STUDY: M ATH WORD PROBLEMS
We evaluate on the popular GSM8K dataset with grade school math questions (Cobbe et al., 2021).
We sample 200 and 300 question–answer pairs from the official training set for training and develop-
ment, respectively. Our final evaluations use the 1.3k official test set examples. We report extensive
comparisons on the development set to avoid overfitting on test. Following prior work on GSM8K,
we evaluate the accuracy of the final numerical value that appears in the LM output.
Programs Considered For this task, we consider three simple DSPy programs: a one-step Pre-
dict module (vanilla), a two-step ChainOfThought module ( CoT), and finally a multi-stage Com-
parerOfThoughts module (ThoughtReflection). These are fully defined by the following code:
1 vanilla = dspy.Predict("question -> answer") # GSM8K Program ‘vanilla‘
2
3 CoT = dspy.ChainOfThought("question -> answer") # GSM8K Program ‘CoT‘
1 class ThoughtReflection(dspy.Module):
2 def __init__(self, num_attempts):
3 self.predict = dspy.ChainOfThought("question -> answer", n=num_attempts)
4 self.compare = dspy.MultiChainComparison(’question -> answer’, M=num_attempts)
5
6 def forward(self, question):
7 completions = self.predict(question=question).completions
8 return self.compare(question=question, completions=completions)
9
10 reflection = ThoughtReflection(num_attempts=5) # GSM8K Program ‘reflection‘
In reflection, five reasoning chains are sampled from the LM (alongside their answers) and they
are compared in parallel by a built-in MultiChainComparison module, which generalizes Yoran
et al. (2023). This generates a new answer taking into account the patterns from the five attempts.
Critically, the modules used are all generic, none is specific math problems or particular LM.
Compiling As we discussed in Section 4, DSPy programs can be compiled into new, optimized
programs. In our experiments, we evaluate the programs zero-shot (no compiling) as well as a
number of strategies for compiling. Our simplest compiler is LabeledFewShot:
1 fewshot = dspy.LabeledFewShot(k=8).compile(program, trainset=trainset)
Here, programcan be any DSPy module. This simply samplesk=8random demonstrations from the
trainsetfor the fields common to the training examples and the signature(s), in this case,question
and answer, but not the reasoning for instance. We report the average of 3–5 runs (depending on the
setting) when applying such random sampling.
8

### Source Document (page_label: 2)

Preprint
calls in existing LM pipelines and in popular developer frameworks are generally implemented using
hard-coded ‘prompt templates’, that is, long strings of instructions and demonstrations that are hand
crafted through manual trial and error. We argue that this approach, while pervasive, can be brittle
and unscalable—conceptually akin to hand-tuning the weights for a classifier. A given string prompt
might not generalize to different pipelines or across different LMs, data domains, or even inputs.
Toward a more systematic approach to designing AI pipelines, we introduce theDSPy programming
model.1 DSPy pushes building new LM pipelines away from manipulating free-form strings and
closer to programming (composing modular operators to build text transformation graphs) where a
compiler automatically generates optimized LM invocation strategies and prompts from a program.
We draw inspiration from the consensus that emerged around neural network abstractions (Bergstra
et al., 2013), where (1) many general-purpose layers can be modularly composed in any complex
architecture and (2) the model weights can be trained using optimizers instead of being hand-tuned.
To this end, we propose the DSPy programming model(Sec 3). We first translate string-based
prompting techniques, including complex and task-dependent ones like Chain of Thought (Wei et al.,
2022) and ReAct (Yao et al., 2022), into declarative modules that carrynatural-language typed sig-
natures. DSPy modules are task-adaptive components—akin to neural network layers—that abstract
any particular text transformation, like answering a question or summarizing a paper. We then pa-
rameterize each module so that it can learn its desired behavior by iteratively bootstrapping useful
demonstrations within the pipeline. Inspired directly by PyTorch abstractions (Paszke et al., 2019),
DSPy modules are used via expressive define-by-run computational graphs. Pipelines are expressed
by (1) declaring the modules needed and (2) using these modules in any logical control flow (e.g.,
ifstatements, for loops, exceptions, etc.) to logically connect the modules.
We then develop theDSPy compiler(Sec 4), which optimizes any DSPy program to improve quality
or cost. The compiler inputs are the program, a few training inputs with optional labels, and a valida-
tion metric. The compiler simulates versions of the program on the inputs and bootstraps example
traces of each module for self-improvement, using them to construct effective few-shot prompts
or finetuning small LMs for steps of the pipeline. Optimization in DSPy is highly modular: it is
conducted by teleprompters,2 which are general-purpose optimization strategies that determine how
the modules should learn from data. In this way, the compiler automatically maps the declarative
modules to high-quality compositions of prompting, finetuning, reasoning, and augmentation.
Programming models like DSPy could be assessed along many dimensions, but we focus on the role
of expert-crafted prompts in shaping system performance. We are seeking to reduce or even remove
their role through DSPy modules (e.g., versions of popular techniques like Chain of Thought) and
teleprompters. We report on two expansive case studies: math word problems (GMS8K; Cobbe et al.
2021) and multi-hop question answering (HotPotQA; Yang et al. 2018) with explorations of chain
of thought, multi-chain reflection, multi-hop retrieval, retrieval-augmented question answering, and
agent loops. Our evaluations use a number of different compiling strategies effectively and show
that straightforward DSPy programs outperform systems using hand-crafted prompts, while also
allowing our programs to use much smaller and hence more efficient LMs effectively.
Overall, this work proposes the first programming model that translates prompting techniques into
parameterized declarative modules and introduces an effective compiler with general optimiza-
tion strategies (teleprompters) to optimize arbitrary pipelines of these modules. Our main contri-
butions are empirical and algorithmic: with DSPy, we have found that we can implement very
short programs that can bootstrap self-improving multi-stage NLP systems using LMs as small as
llama2-13b-chat and T5-Large (770M parameters). Without hand-crafted prompts and within
minutes to tens of minutes of compiling, compositions of DSPy modules can raise the quality of
simple programs from 33% to 82% (Sec 6) and from 32% to 46% (Sec 7) for GPT-3.5 and, simi-
larly, from 9% to 47% (Sec 6) and from 22% to 41% (Sec 7) for llama2-13b-chat.
1DSPy is pronounced <mark style='background-color:#ffff00;'>dee-ess-pie</mark>. It’s the second iteration of our earlier Demonstrate–Search–Predict
framework (DSP; Khattab et al. 2022). This paper introduces the key concepts in DSPy. For more extensive and
up-to-date documentation of the framework, we refer readers to https://github.com/stanfordnlp/dspy.
2We derive the name tele-prompters from the notion of abstracting and automating the task of prompting,
in particular, such that it happens at a distance, without manual intervention.
2

### Source Document (page_label: 6)

Preprint
In DSPy, training sets may be small, potentially a handful of examples, though larger data enables
more powerful optimization. Training examples may be incomplete, i.e., only input values are nec-
essary. Labels for the pipeline steps are not required, unless they need to be used in the metric. In
practice, we typically assume labels only for (at most) the program’s final output, not the intermedi-
ate steps. This label-efficiency is critical for modularity: building a new pipeline in DSPy requires
simply recompiling the new pipeline’s code, not annotating data specific to the new pipeline.
Metrics can be simple notions like exact match (EM) or F1, but they can be entire DSPy programs
that balance multiple concerns. For example, we may compile the RAG module above against a
dataset of question–answer pairs qa trainset and the metric EM. The goal of optimization here is
to effectively bootstrap few-shot demonstrations. The following code achieves this:
1 # Small training set with only questions and final answers.
2 qa_trainset = [dspy.Example(question="What is the capital of France?", answer="Paris")]
3
4 # The teleprompter will bootstrap missing labels: reasoning chains and retrieval contexts.
5 teleprompter = dspy.BootstrapFewShot(metric=dspy.evaluate.answer_exact_match)
6 compiled_rag = teleprompter.compile(RAG(), trainset=qa_trainset)
In this example, the BootstrapFewShot teleprompter (Sec 4, Appendix E.1) simulates RAG on the
training example(s). It will collect demonstrations of each module (i.e., examples of its input–output
behavior) that collectively lead to valid output (i.e., respecting the signatures and the metric).
If one wanted to push the compiled program to be extractive given its retrieved contexts, one could
define a custom metric to use in place of dspy.evaluate.answer exact match:
1 def answer_and_context_match(example, pred, trace=None):
2 answer_match = dspy.evaluate.answer_exact_match(example, pred)
3
4 # Is the prediction a substring of some passage?
5 context_match = any((pred.answer.lower() in c) for c in pred.context)
6
7 return answer_match and context_match
Notice that behavior like this might be more accurately checked by another DSPy program that
checks for faithful grounding of answers. Such metrics are fully supported and encouraged in DSPy.
Teleprompters can be composed by specifying a teacher program. DSPy will sample demonstra-
tions from this program for prompt optimization. This composition can enable very rich pipelines,
where expensive programs (e.g., complex expensive ensembles using large LMs) supervise cheap
programs (e.g., simple pipelines using smaller LMs). One may start withcompiled ragfrom above
(say, compiled to use a large Llama2-13b-chat LM) but now fine-tune Flan-T5-large to create an
efficient program:
1 # Larger set of questions with *no labels*. Labels for all steps will be bootstrapped.
2 unlabeled_questions = [dspy.Example(question="What is the capital of Germany?"), ...]
3
4 # As we assumes no answer, we use ‘answer_passage_match‘ to filter ungrounded answers.
5 finetuning_teleprompter = BootstrapFinetune(metric=dspy.evaluate.answer_passage_match)
6
7 # We set ‘teacher=compiled_rag‘ to compose. Bootstrapping will now use ‘compiled_rag ‘.
8 compiled_rag_via_finetune = finetuning_teleprompter.compile(RAG(), teacher=compiled_rag,
trainset=unlabeled_questions, target=’google/flan-t5-large’)
4 T HE DSP Y COMPILER
A key source of DSPy’s expressive power is its ability to compile—or automatically optimize—any
program in this programming model. Compiling relies on a teleprompter, which is an optimizer for
DSPy programs that improves the quality (or cost) of modules via prompting or finetuning, which
are unified in DSPy. While DSPy does not enforce this when creating new teleprompters, typical
teleprompters go through three stages.
Stage 1: Candidate GenerationThe compiler first (recursively) finds all uniquePredictmodules
(predictors) in a program, including those nested under other modules. For each unique predictor
p, the teleprompter may generate candidate values for the parameters of p: the instructions, field
descriptions, or—most importantly—demonstrations (i.e., example input–output pairs). In this iter-
6