# 1. Setup Asyncio

In [3]:
# Jupyter already runs an asyncio event loop in the kernel.
# NOTE(review): nest_asyncio.apply() presumably patches asyncio so that loop can
# be re-entered by the async calls made later in this notebook — confirm.
import nest_asyncio

nest_asyncio.apply()

# 2. Setup the Qdrant vector database

Let's now connect to our Qdrant database to store the collection of documents we will use for RAG. 
We will use the `qdrant_client` library to interact with the Qdrant database.

In [4]:
import qdrant_client

# Name of the Qdrant collection used for this notebook; read by create_index().
collection_name = "chat_with_docs"

# Client for a Qdrant server at localhost:6333.
# NOTE(review): assumes the server was started separately before running this cell — confirm.
client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333,
)



# 3. Read the documents

We now read the documents with LlamaIndex's `SimpleDirectoryReader`. Every PDF in the `docs` folder (including its subfolders) is loaded into a list of `Document` objects that hold the extracted text together with file metadata.

In [5]:
from llama_index.core import SimpleDirectoryReader

# Folder scanned for source files, relative to the notebook's working directory.
input_dir_path = "./docs"

# Only files with a .pdf extension are read; recursive=True also descends into subfolders.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".pdf"],
    recursive=True
)

# Read the files; the result is the list of documents indexed later on.
docs = loader.load_data()

In [6]:
# Sanity check: type of the first loaded item and how many items were loaded.
type(docs[0]), len(docs)

(llama_index.core.schema.Document, 41)

# 4. A function to index data

In [7]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

def create_index(documents):
    """Build a VectorStoreIndex over `documents`, backed by the Qdrant collection.

    Uses the notebook-level `client` and `collection_name` to create the
    QdrantVectorStore the index is stored in.

    Args:
        documents: Documents to pass to `VectorStoreIndex.from_documents`.

    Returns:
        The index returned by `VectorStoreIndex.from_documents`.
    """
    # Vector store bound to the shared Qdrant client and collection name.
    qdrant_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
    )

    # Route the index's storage to the Qdrant-backed vector store.
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=StorageContext.from_defaults(vector_store=qdrant_store),
    )

# 5. Load the embedding model and index data

In [8]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Load the embedding model from the Hugging Face Hub.
# trust_remote_code is deliberately NOT enabled: BAAI/bge-large-en-v1.5 uses a
# standard architecture, so there is no reason to allow executing code shipped
# in the model repository.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# Register it as the default embedding model in LlamaIndex's global Settings.
Settings.embed_model = embed_model

# Build the index over the loaded documents.
# NOTE(review): create_index() writes into the Qdrant collection on the server;
# re-running this cell presumably inserts the same chunks again — confirm and
# clear the collection first if duplicates matter.
index = create_index(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Check the type of the object returned by create_index().
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

# 6. Load the LLM

Now, it's time to define the LLM model we will use for querying the index. We are using Ollama as the LLM provider, but you can replace it with any other LLM provider supported by LlamaIndex.

Please make sure the intended model is available locally. To download it, use Ollama's `pull` command. 

For this task, we will use a _small_ model. In a separate terminal, execute:
```bash
ollama pull llama3.2:1b
```
and wait for the model to download. Once ready, continue with the next cell!

In [10]:
from llama_index.llms.ollama import Ollama

# The model tag must match a model that has already been pulled into the local
# Ollama installation (see the `ollama pull` step above).
# NOTE(review): request_timeout is presumably in seconds — confirm.
llm = Ollama(model="llama3.2:1b", request_timeout=120.0)

# Register it as the default LLM in LlamaIndex's global Settings.
Settings.llm = llm

In [11]:
# Inspect the type of the global Settings object configured above.
type(Settings)

llama_index.core.settings._Settings

# 7. Define the prompt template

In [12]:
from llama_index.core import PromptTemplate

# Question-answering prompt with two placeholders: {context_str} and {query_str}.
# NOTE(review): the leading spaces on the continuation lines are inside the
# triple-quoted literal, so they are part of the prompt text as written.
template = """Context information is below:
              ---------------------
              {context_str}
              ---------------------
              Given the context information above I want you to think
              step by step to answer the query in a crisp manner,
              incase you don't know the answer say 'I don't know!'
            
              Query: {query_str}
        
              Answer:"""

# Wrap the raw string in a PromptTemplate so it can be handed to the query engine.
qa_prompt_tmpl = PromptTemplate(template)

# 8. Reranking

Here, we use a cross-encoder to re-rank the document chunks. Also, we limit the output to the top 3 most relevant chunks based on the model’s scoring.

In [13]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker; top_n=3 keeps only the three best-scoring chunks.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", 
    top_n=3
)

In [14]:
# Show the reranker's repr to check the settings it was created with.
rerank

SentenceTransformerRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x391c33ac0>, model='cross-encoder/ms-marco-MiniLM-L-2-v2', top_n=3, device='mps', keep_retrieval_score=False, trust_remote_code=False)

# 9. Query the document

In [15]:
# Query engine over the index: similarity_top_k=10 candidates are requested and
# then handed to the `rerank` postprocessor (configured with top_n=3).
query_engine = index.as_query_engine(similarity_top_k=10,
                                     node_postprocessors=[rerank])

# Use the custom template for the response synthesizer's text-QA prompt.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# Alternative questions are kept commented out; enable one at a time.
# response = query_engine.query("What exactly is DSPy?")
response = query_engine.query("How is DSPy pronounced?")
# response = query_engine.query("What is the github repo for docling?")

Answer:

In [16]:
from IPython.display import Markdown, display

# Render the response's string form as Markdown in the cell output.
display(Markdown(str(response)))

DSPy is pronounced "dee-ess-pie".

Interestingly, the `metadata` field of the response lists the source pages (file name and page label) from which the answer was extracted. This is useful for tracking the source of the information provided by the model.

In [17]:
# Per-source metadata of the response (file name, page label, path, dates).
response.metadata

{'ca39e45f-b95d-49ec-b226-37507d7c4b95': {'page_label': '4',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'},
 '30beb7a2-b5eb-490b-9ed7-a86219f294b0': {'page_label': '2',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'},
 '061b7cb5-52ef-44da-984f-10c3fa2aa94b': {'page_label': '27',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'}}

## Bonus: Visualize relevant text in sources

In [18]:
from IPython.display import Markdown, display
import re

def highlight(text, query, color):
    """Wrap every case-insensitive occurrence of `query` in `text` in a <mark> tag.

    Args:
        text: The string to search.
        query: Literal text to highlight. Regex metacharacters are escaped, so
            it is matched verbatim (ignoring case).
        color: CSS color used for the mark's background-color.

    Returns:
        `text` with each match wrapped as
        `<mark style='background-color:{color};'>...</mark>`. The matched text
        keeps the casing it has in `text`. An empty `query` returns `text`
        unchanged.
    """
    # An empty pattern matches at every position and would insert an empty
    # <mark> between all characters, so there is nothing sensible to highlight.
    if not query:
        return text

    # Case-insensitive, literal match of the query.
    pattern = re.compile(re.escape(query), re.IGNORECASE)

    def _wrap(match):
        # Re-emit the matched text itself rather than `query`: this preserves
        # the source's casing, and using a function (not a replacement
        # template) keeps backslashes in the text from being read as escapes.
        return f"<mark style='background-color:{color};'>{match.group(0)}</mark>"

    return pattern.sub(_wrap, text)

def display_sources_with_highlight(response, docs, query, highlight_color="#ffff00"):
    """
    Display source documents for the response, highlighting the query in the text.

    Args:
        response: Query response whose `metadata` maps ids to per-source dicts
            with `file_name`, `page_label` and, when available, `file_path`.
        docs: Loaded documents exposing `.metadata` (a dict) and `.text`.
        query: Literal text to highlight in each source page.
        highlight_color: CSS color used for the highlight background.

    Sources without a `page_label`, or without a matching entry in `docs`,
    are skipped.
    """
    for source in response.metadata.values():
        source_page = source.get("page_label")
        if not source_page:
            continue

        source_name = source.get("file_name")
        source_path = source.get("file_path")

        def _is_source_page(doc):
            # A page is identified by its file name and page label ...
            meta = doc.metadata
            if meta.get("file_name") != source_name:
                return False
            if meta.get("page_label") != source_page:
                return False
            # ... and, when both sides carry it, by the full path: files in
            # different folders may share a file name, and the name alone
            # would then select the wrong page.
            doc_path = meta.get("file_path")
            return not (source_path and doc_path) or doc_path == source_path

        page_doc = next(filter(_is_source_page, docs), None)
        if page_doc:
            highlighted = highlight(page_doc.text, query, highlight_color)
            display(Markdown(f"### Source Document (page_label: {source_page})\n\n{highlighted}"))


Usage:

In [19]:
# Highlight the pronunciation string in the pages listed as sources of the response.
word = "dee-ess-pie"
display_sources_with_highlight(response, docs, word)

### Source Document (page_label: 4)

Preprint
3.1 N ATURAL LANGUAGE SIGNATURES CAN ABSTRACT PROMPTING & FINETUNING
Instead of free-form string prompts, DSPy programs use natural language signatures to assign work
to the LM. A DSPy signature isnatural-language typed declaration of a function: a short declarative
spec that tells DSPy what a text transformation needs to do (e.g., “consume questions and return
answers”), rather than how a specific LM should be prompted to implement that behavior. More
formally, a DSPy signature is a tuple of input fields and output fields (and an optional instruction).
A field consists offield name and optional metadata.4 In typical usage, the roles of fields are inferred
by DSPy as a function of field names. For instance, the DSPy compiler will use in-context learning
to interpret questiondifferently from answer and will iteratively refine its usage of these fields.
Signatures offer two benefits over prompts: they can be compiled into self-improving and pipeline-
adaptive prompts or finetunes. This is primarily done by bootstrapping (Sec 4) useful demonstrating
examples for each signature. Additionally, they handle structured formatting and parsing logic to
reduce (or, ideally, avoid) brittle string manipulation in user programs.
In practice, DSPy signatures can be expressed with a shorthand notation likequestion -> answer,
so that line 1 in the following is a complete DSPy program for a basic question-answering system
(with line 2 illustrating usage and line 3 the response when GPT-3.5 is the LM):
1 qa = dspy.Predict("question -> answer")
2 qa(question="Where is Guaran ´ı spoken?")
3 # Out: Prediction(answer=’Guaran ´ı is spoken mainly in South America.’)
In the shorthand notation, each field’s name indicates the semantic role that the input (or output)
field plays in the transformation. DSPy will parse this notation and expand the field names into
meaningful instructions for the LM, so that english document -> french translation would
prompt for English to French translation. When needed, DSPy offers more advanced programming
interfaces for expressing more explicit constraints on signatures (Appendix A).
3.2 P ARAMETERIZED & TEMPLATED MODULES CAN ABSTRACT PROMPTING TECHNIQUES
Akin to type signatures in programming languages, DSPy signatures simply define an interface and
provide type-like hints on the expected behavior. To use a signature, we must declare amodule with
that signature, like we instantiated a Predict module above. A module declaration like this returns
a function having that signature.
The Predict Module The core module for working with signatures in DSPy isPredict(simplified
pseudocode in Appendix D.1). Internally, Predict stores the supplied signature, an optional LM to
use (initially None, but otherwise overrides the default LM for this module), and a list of demon-
strations for prompting (initially empty). Like layers in PyTorch, the instantiated module behaves as
a callable function: it takes in keyword arguments corresponding to the signature input fields (e.g.,
question), formats a prompt to implement the signature and includes the appropriate demonstra-
tions, calls the LM, and parses the output fields. When Predict detects it’s being used in compile
mode, it will also internally track input/output traces to assist the teleprompter at bootstrapping the
demonstrations.
Other Built-in ModulesDSPy modules translate prompting techniques into modular functions that
support any signature, contrasting with the standard approach of prompting LMs with task-specific
details (e.g., hand-written few-shot examples). To this end, DSPy includes a number of more sophis-
ticated modules like ChainOfThought, ProgramOfThought, MultiChainComparison, and ReAct.5
These can all be used interchangeably to implement a DSPy signature. For instance, simply chang-
4String descriptions of the task and the fields are also optional and usually omitted. Fields can carry optional
field prefix and description. By default, fields are assumed to hold free-form strings; we are actively exploring
optional data type as a way to specify constraints on valid values (e.g.,boolor int) and more gracefully handle
formatting and parsing logic, though this feature is not core to DSPy at the time of writing.
5These modules generalize prompting techniques from the literature, respectively, by Wei et al. (2022),
Chen et al. (2022), Yoran et al. (2023), and Yao et al. (2022) and, in doing so, generalize the ideas on zero-shot
prompting and rationale self-generation from Kojima et al. (2022), Zelikman et al. (2022), Zhang et al. (2022),
and Huang et al. (2022) to parameterized modules that can bootstrap arbitrary multi-stage pipelines.
4

### Source Document (page_label: 2)

Preprint
calls in existing LM pipelines and in popular developer frameworks are generally implemented using
hard-coded ‘prompt templates’, that is, long strings of instructions and demonstrations that are hand
crafted through manual trial and error. We argue that this approach, while pervasive, can be brittle
and unscalable—conceptually akin to hand-tuning the weights for a classifier. A given string prompt
might not generalize to different pipelines or across different LMs, data domains, or even inputs.
Toward a more systematic approach to designing AI pipelines, we introduce theDSPy programming
model.1 DSPy pushes building new LM pipelines away from manipulating free-form strings and
closer to programming (composing modular operators to build text transformation graphs) where a
compiler automatically generates optimized LM invocation strategies and prompts from a program.
We draw inspiration from the consensus that emerged around neural network abstractions (Bergstra
et al., 2013), where (1) many general-purpose layers can be modularly composed in any complex
architecture and (2) the model weights can be trained using optimizers instead of being hand-tuned.
To this end, we propose the DSPy programming model(Sec 3). We first translate string-based
prompting techniques, including complex and task-dependent ones like Chain of Thought (Wei et al.,
2022) and ReAct (Yao et al., 2022), into declarative modules that carrynatural-language typed sig-
natures. DSPy modules are task-adaptive components—akin to neural network layers—that abstract
any particular text transformation, like answering a question or summarizing a paper. We then pa-
rameterize each module so that it can learn its desired behavior by iteratively bootstrapping useful
demonstrations within the pipeline. Inspired directly by PyTorch abstractions (Paszke et al., 2019),
DSPy modules are used via expressive define-by-run computational graphs. Pipelines are expressed
by (1) declaring the modules needed and (2) using these modules in any logical control flow (e.g.,
ifstatements, for loops, exceptions, etc.) to logically connect the modules.
We then develop theDSPy compiler(Sec 4), which optimizes any DSPy program to improve quality
or cost. The compiler inputs are the program, a few training inputs with optional labels, and a valida-
tion metric. The compiler simulates versions of the program on the inputs and bootstraps example
traces of each module for self-improvement, using them to construct effective few-shot prompts
or finetuning small LMs for steps of the pipeline. Optimization in DSPy is highly modular: it is
conducted by teleprompters,2 which are general-purpose optimization strategies that determine how
the modules should learn from data. In this way, the compiler automatically maps the declarative
modules to high-quality compositions of prompting, finetuning, reasoning, and augmentation.
Programming models like DSPy could be assessed along many dimensions, but we focus on the role
of expert-crafted prompts in shaping system performance. We are seeking to reduce or even remove
their role through DSPy modules (e.g., versions of popular techniques like Chain of Thought) and
teleprompters. We report on two expansive case studies: math word problems (GMS8K; Cobbe et al.
2021) and multi-hop question answering (HotPotQA; Yang et al. 2018) with explorations of chain
of thought, multi-chain reflection, multi-hop retrieval, retrieval-augmented question answering, and
agent loops. Our evaluations use a number of different compiling strategies effectively and show
that straightforward DSPy programs outperform systems using hand-crafted prompts, while also
allowing our programs to use much smaller and hence more efficient LMs effectively.
Overall, this work proposes the first programming model that translates prompting techniques into
parameterized declarative modules and introduces an effective compiler with general optimiza-
tion strategies (teleprompters) to optimize arbitrary pipelines of these modules. Our main contri-
butions are empirical and algorithmic: with DSPy, we have found that we can implement very
short programs that can bootstrap self-improving multi-stage NLP systems using LMs as small as
llama2-13b-chat and T5-Large (770M parameters). Without hand-crafted prompts and within
minutes to tens of minutes of compiling, compositions of DSPy modules can raise the quality of
simple programs from 33% to 82% (Sec 6) and from 32% to 46% (Sec 7) for GPT-3.5 and, simi-
larly, from 9% to 47% (Sec 6) and from 22% to 41% (Sec 7) for llama2-13b-chat.
1DSPy is pronounced <mark style='background-color:#ffff00;'>dee-ess-pie</mark>. It’s the second iteration of our earlier Demonstrate–Search–Predict
framework (DSP; Khattab et al. 2022). This paper introduces the key concepts in DSPy. For more extensive and
up-to-date documentation of the framework, we refer readers to https://github.com/stanfordnlp/dspy.
2We derive the name tele-prompters from the notion of abstracting and automating the task of prompting,
in particular, such that it happens at a distance, without manual intervention.
2

### Source Document (page_label: 27)

Preprint
D M ODULES
D.1 P REDICT
1 class Predict(dspy.Module):
2 def __init__(self, signature, **config):
3 self.signature = dspy.Signature(signature)
4 self.config = config
5
6 # Module Parameters.
7 self.lm = dspy.ParameterLM(None) # use the default LM
8 self.demonstrations = dspy.ParameterDemonstrations([])
9
10 def forward(self, **kwargs):
11 lm = get_the_right_lm(self.lm, kwargs)
12 signature = get_the_right_signature(self.signature, kwargs)
13 demonstrations = get_the_right_demonstrations(self.demonstrations, kwargs)
14
15 prompt = signature(demos=self.demos, **kwargs)
16 completions = lm.generate(prompt, **self.config)
17 prediction = Prediction.from_completions(completions, signature=signature)
18
19 if dsp.settings.compiling is not None:
20 trace = dict(predictor=self, inputs=kwargs, outputs=prediction)
21 dspy.settings.traces.append(trace)
22
23 return prediction
D.2 C HAIN OF THOUGHT
1 class ChainOfThought(dspy.Module):
2 def __init__(self, signature):
3
4 # Modify signature from ‘*inputs -> *outputs‘ to ‘*inputs -> rationale, *outputs‘.
5 rationale_field = dspy.OutputField(prefix="Reasoning: Let’s think step by step.")
6 signature = dspy.Signature(signature).prepend_output_field(rationale_field)
7
8 # Declare a sub-module with the modified signature.
9 self.predict = dspy.Predict(self.signature)
10
11 def forward(self, **kwargs):
12 # Just forward the inputs to the sub-module.
13 return self.predict(**kwargs)
27