# 1. Setup Asyncio

In [1]:
import nest_asyncio

# Patch asyncio so that event loops can be nested; the notebook kernel already
# runs its own loop, and later async-backed calls would otherwise collide with it.
nest_asyncio.apply()

# 2. Setup the Qdrant vector database

In [None]:
import qdrant_client

# Name of the Qdrant collection that the vector store below reads from and writes to.
collection_name = "chat_with_docs_chonkie"

# Client for a Qdrant server addressed at localhost:6333.
# NOTE(review): host/port are hard-coded; a Qdrant instance must already be
# listening there for the indexing and query cells to work.
client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333,
)



# 3. Read the documents

In [3]:
from llama_index.core import SimpleDirectoryReader

# Folder (relative to the notebook's working directory) scanned for source files.
input_dir_path = "./docs"

# Only files with a .pdf extension are read; recursive=True also walks sub-folders.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".pdf"],
    recursive=True
)

# `docs` is the list of loaded Document objects used by every later cell.
docs = loader.load_data()

In [4]:
docs

[Document(id_='01595f54-f73e-4e17-a1a5-ad637c75da7e', embedding=None, metadata={'page_label': '1', 'file_name': 'dspy.pdf', 'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf', 'file_type': 'application/pdf', 'file_size': 460814, 'creation_date': '2025-06-11', 'last_modified_date': '2024-11-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Preprint\nDSP Y: C OMPILING DECLARATIVE LANGUAGE\nMODEL CALLS INTO SELF -IMPROVING PIPELINES\nOmar Khattab,1 Arnav Singhvi,2\nParidhi Maheshwari,4 Zhiyuan Zhang,1\nKeshav Santhanam,1 Sri Vardhamanan,6 Saiful Haq,6\nAshutosh Sharma,6 Thomas T. Joshi,7 Hanna Moazam,8\nHeather Miller,3,9 Mate

In [5]:
type(docs), len(docs)

(list, 32)

## 4. Use Chonkie to chunk the documents

In [6]:
from chonkie import SemanticChunker
from llama_index.core.schema import Document
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


# Chonkie semantic chunker; it uses the same embedding model name as the
# LlamaIndex embedder below so chunk boundaries and stored vectors agree.
semantic_chunker = SemanticChunker(
    embedding_model="BAAI/bge-large-en-v1.5",
    threshold=0.5,
    chunk_size=512,
    min_sentences=1
)

# NOTE(review): trust_remote_code=True lets the Hugging Face repo execute its own
# Python code on load. It is kept as-is here, but confirm this model needs it.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5",
                                   trust_remote_code=True)

# Register as the global default so query-time embedding uses the same model.
Settings.embed_model = embed_model

# Re-wrap every Chonkie chunk as a LlamaIndex Document carrying a pre-computed
# embedding, so indexing does not have to embed the text again.
all_chunks = []
for doc in docs:
    chunks = semantic_chunker.chunk(doc.text)
    for chunk in chunks:
        # Use LlamaIndex's embedding model to embed the chunk text
        chunk_embedding = Settings.embed_model.get_text_embedding(chunk.text)
        all_chunks.append(
            Document(
                text=chunk.text,
                # Copy so chunks from the same page never share one mutable dict.
                metadata=dict(doc.metadata),
                embedding=chunk_embedding,
                # Carry over the source document's exclusion lists. Without them
                # every metadata key (file size, dates, ...) is rendered into the
                # node text that is shown to the LLM and to embedding-side consumers.
                excluded_embed_metadata_keys=list(doc.excluded_embed_metadata_keys),
                excluded_llm_metadata_keys=list(doc.excluded_llm_metadata_keys),
            )
        )

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
len(all_chunks)

76

In [8]:
# Peek at the first ten chunk Documents. Each repr includes the full embedding
# vector attached during chunking, so this output is very long.
all_chunks[:10]

[Document(id_='7c2c64c2-b403-43ed-9bc2-785422e02483', embedding=[0.038891248404979706, 0.00458358833566308, -0.0032363533973693848, -0.011541174724698067, -0.023720134049654007, -0.015372456051409245, -0.01644018106162548, -0.017184259369969368, -0.017161991447210312, 0.06521951407194138, -0.003091790247708559, -0.01796267181634903, -0.006126547232270241, -0.03504977002739906, -0.02182835154235363, 0.002376331016421318, -0.02055387571454048, -0.03153911232948303, -0.07666144520044327, 0.0014957647072151303, 0.03511612117290497, -0.01860874705016613, -0.0852985605597496, 0.0051113395020365715, -0.03072628378868103, 0.023758085444569588, 0.02069648914039135, -0.0028157311026006937, 0.07397174835205078, 0.06731146574020386, 0.017167920246720314, 0.0034706720616668463, 0.015570398420095444, -0.03472590073943138, -0.03713826462626457, -0.07597390562295914, 0.007801176514476538, -0.008913443423807621, -0.02500114217400551, -0.010506954975426197, 0.03661350905895233, -0.0425884835422039, 0.05

## 5. Create Qdrant Collection

In [None]:
# NOTE(review): this entire cell is disabled (commented out) and kept only for
# reference. It is a manual alternative to `create_index`: it would build the
# collection and upsert the pre-computed chunk embeddings directly through the
# Qdrant client. Only the chunk text is stored in the payload, so page/file
# metadata would not be available from results on this path.

# import numpy as np
# from qdrant_client.models import VectorParams, Distance
# from qdrant_client.models import PointStruct


# # (Re)create the collection.
# # NOTE(review): `recreate_collection` presumably replaces an existing
# # collection rather than creating it only when missing - confirm before enabling.
# client.recreate_collection(
#     collection_name=collection_name,
#     vectors_config=VectorParams(
#         size=np.array(all_chunks[0].embedding).shape[0],  # dimension of your embedding
#         distance=Distance.COSINE                # or Distance.DOT, Distance.EUCLID
#     )
# )

# # One point per chunk that has an embedding; the point id is the chunk's list index.
# points = []
# for i, chunk in enumerate(all_chunks):
#     if chunk.embedding is not None:
#         points.append(
#             PointStruct(
#                 id=i,
#                 vector=chunk.embedding,
#                 payload={"text": chunk.text}
#             )
#         )

# client.upsert(
#     collection_name=collection_name,
#     points=points
# )

# 5. Load the embedding model and index data

In [9]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

def create_index(documents):
    """Build a ``VectorStoreIndex`` over ``documents`` stored in Qdrant.

    The notebook-level ``client`` and ``collection_name`` decide which Qdrant
    server and collection back the index.

    Args:
        documents: LlamaIndex documents to index.

    Returns:
        The ``VectorStoreIndex`` created by ``VectorStoreIndex.from_documents``.
    """
    qdrant_store = QdrantVectorStore(client=client, collection_name=collection_name)
    qdrant_context = StorageContext.from_defaults(vector_store=qdrant_store)
    return VectorStoreIndex.from_documents(documents, storage_context=qdrant_context)

In [10]:
from llama_index.core import Settings

# Index the Chonkie chunks (which already carry embeddings) into the Qdrant-backed store.
index = create_index(all_chunks)

In [11]:
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

## 6. Load the LLM

In [12]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings


# LLM served by Ollama, using model tag "llama3.2:1b".
# NOTE(review): request_timeout=120.0 is presumably in seconds - confirm.
llm = Ollama(model="llama3.2:1b", request_timeout=120.0)

# Register as the default LLM on LlamaIndex's global Settings object.
Settings.llm = llm

In [13]:
type(Settings), Settings.llm, Settings.embed_model

(llama_index.core.settings._Settings,
 Ollama(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x12a615100>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x11be15160>, completion_to_prompt=<function default_completion_to_prompt at 0x11c188550>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, base_url='http://localhost:11434', model='llama3.2:1b', temperature=None, context_window=-1, request_timeout=120.0, prompt_key='prompt', json_mode=False, additional_kwargs={}, is_function_calling_model=True, keep_alive=None, thinking=None),
 HuggingFaceEmbedding(model_name='BAAI/bge-large-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x12a615100>, num_workers=None, embeddings_cache=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False))

# 7. Define the prompt template

In [14]:
from llama_index.core import PromptTemplate

# The template text is sent to the LLM verbatim, so it is written flush-left:
# indenting it to match the surrounding code would put that whitespace into every prompt.
# {context_str} and {query_str} are the two placeholders filled in at query time.
template = """Context information is below:
---------------------
{context_str}
---------------------
Given the context information above I want you to think
step by step to answer the query in a crisp manner,
in case you don't know the answer say 'I don't know!'

Query: {query_str}

Answer:"""

qa_prompt_tmpl = PromptTemplate(template)

In [16]:
## 8. Query Qdrant directly with your own embedding


In [17]:
# NOTE(review): this entire cell is disabled (commented out) draft code for
# querying Qdrant directly instead of going through the LlamaIndex query engine.
# As written it would not work: `semantic_chunker.chunk(...)` produces chunks
# (objects with `.text`, as used in the chunking cell), not an embedding vector,
# so `query_embedding.tolist()` has nothing numeric to send. If revived, the
# query should be embedded with the same embedding model used for the chunks.

# query = "What exactly is DSPy?"

# # Use the same embedding model as for chunking
# query_embedding = semantic_chunker.chunk([query])[0]

# search_result = client.search(
#     collection_name=collection_name,
#     query_vector=query_embedding.tolist(),
#     limit=5
# )

# # Gather the top results' texts
# top_chunks = [hit.payload["text"] for hit in search_result]

# # Optionally, synthesize an answer using your LLM
# context_str = "\n\n".join(top_chunks)
# prompt = template.format(context_str=context_str, query_str=query)

# response = llm.complete(prompt)
# print(response)

# 8. Reranking

Here, we use a cross-encoder to re-rank the document chunks. Also, we limit the output to the top 3 most relevant chunks based on the model’s scoring.

In [18]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker applied to retrieved nodes; top_n=3 caps how many
# nodes are kept after scoring.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", 
    top_n=3
)

In [19]:
rerank

SentenceTransformerRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x396fab0d0>, model='cross-encoder/ms-marco-MiniLM-L-2-v2', top_n=3, device='mps', keep_retrieval_score=False, trust_remote_code=False)

# 9. Query the document

In [41]:
# Retrieve a wider candidate pool than the reranker keeps. With
# similarity_top_k equal to the reranker's top_n (3) the cross-encoder could only
# reorder the same three nodes and never replace a weak one with a better one.
query_engine = index.as_query_engine(similarity_top_k=10,
                                     node_postprocessors=[rerank])

# Swap the default question-answering prompt for the custom template defined above.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# response = query_engine.query("What exactly is DSPy?")
response = query_engine.query("How is DSPy pronounced?")

In [42]:
from IPython.display import Markdown, display

display(Markdown(str(response)))

DSPy is pronounced "dee-ess-pie".

In [43]:
response.metadata

{'4127d83f-54ce-4b8b-9f09-8f34f8968228': {'page_label': '2',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-11',
  'last_modified_date': '2024-11-02'},
 '4651e2cd-75f4-4f43-9dfb-80e2310b19fb': {'page_label': '4',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-11',
  'last_modified_date': '2024-11-02'},
 'a421a2a3-c3b6-42b4-920c-bd24222d6579': {'page_label': '11',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-11',
  'last_modified_date': '2024-11-02'}}

In [44]:
response.response

'DSPy is pronounced "dee-ess-pie".'

## Debug

In [71]:
from IPython.display import Markdown, display
import re

def highlight(text, query, color):
    """Wrap every case-insensitive occurrence of ``query`` in ``text`` in a ``<mark>`` tag.

    Args:
        text: The text to search.
        query: Literal string to look for (regex metacharacters are escaped).
        color: CSS colour used for the mark's ``background-color``.

    Returns:
        ``text`` with each match wrapped as
        ``<mark style='background-color:{color};'>match</mark>``. The matched
        text keeps its original casing. ``text`` is returned unchanged when
        ``query`` is empty or does not occur.
    """
    if not query:
        # An empty pattern matches between every character; there is nothing to mark.
        return text

    # Case-insensitive highlight
    pattern = re.compile(re.escape(query), re.IGNORECASE)

    def _mark(match):
        # Re-emit the matched text, not `query`, so the source casing is preserved.
        # A callable replacement also avoids regex-template parsing of backslashes.
        return f"<mark style='background-color:{color};'>{match.group(0)}</mark>"

    return pattern.sub(_mark, text)

def display_sources_with_highlight(response, docs, query, highlight_color="#ffff00"):
    """
    Display source documents for the response, highlighting the query in the text.

    Args:
        response: Query response whose ``metadata`` maps node ids to metadata
            dicts (with ``page_label`` and, when available, ``file_path``).
        docs: The loaded page-level documents to look the sources up in.
        query: Text to highlight inside each displayed source.
        highlight_color: CSS colour passed to ``highlight``.

    Each distinct (file, page) source is rendered once as Markdown. Sources
    without a ``page_label`` and sources with no matching document are skipped.
    """
    already_shown = set()

    # `metadata` may be missing on a response; treat that as "no sources".
    for source in (response.metadata or {}).values():
        source_page = source.get("page_label")
        if not source_page:
            continue

        # Page labels repeat across files, so match on the file as well whenever
        # the source metadata records one.
        source_file = source.get("file_path")
        source_key = (source_file, source_page)
        if source_key in already_shown:
            # Several retrieved chunks can come from the same page; show it once.
            continue
        already_shown.add(source_key)

        d = next(
            (
                doc for doc in docs
                if doc.metadata.get("page_label") == source_page
                and (source_file is None or doc.metadata.get("file_path") == source_file)
            ),
            None,
        )
        if d is None:
            continue

        highlighted = highlight(d.text, query, highlight_color)
        display(Markdown(f"### Source Document (page_label: {source_page})\n\n{highlighted}"))

# Example usage:
word = "dee-ess-pie"
display_sources_with_highlight(response, docs, word)

### Source Document (page_label: 2)

Preprint
calls in existing LM pipelines and in popular developer frameworks are generally implemented using
hard-coded ‘prompt templates’, that is, long strings of instructions and demonstrations that are hand
crafted through manual trial and error. We argue that this approach, while pervasive, can be brittle
and unscalable—conceptually akin to hand-tuning the weights for a classifier. A given string prompt
might not generalize to different pipelines or across different LMs, data domains, or even inputs.
Toward a more systematic approach to designing AI pipelines, we introduce theDSPy programming
model.1 DSPy pushes building new LM pipelines away from manipulating free-form strings and
closer to programming (composing modular operators to build text transformation graphs) where a
compiler automatically generates optimized LM invocation strategies and prompts from a program.
We draw inspiration from the consensus that emerged around neural network abstractions (Bergstra
et al., 2013), where (1) many general-purpose layers can be modularly composed in any complex
architecture and (2) the model weights can be trained using optimizers instead of being hand-tuned.
To this end, we propose the DSPy programming model(Sec 3). We first translate string-based
prompting techniques, including complex and task-dependent ones like Chain of Thought (Wei et al.,
2022) and ReAct (Yao et al., 2022), into declarative modules that carrynatural-language typed sig-
natures. DSPy modules are task-adaptive components—akin to neural network layers—that abstract
any particular text transformation, like answering a question or summarizing a paper. We then pa-
rameterize each module so that it can learn its desired behavior by iteratively bootstrapping useful
demonstrations within the pipeline. Inspired directly by PyTorch abstractions (Paszke et al., 2019),
DSPy modules are used via expressive define-by-run computational graphs. Pipelines are expressed
by (1) declaring the modules needed and (2) using these modules in any logical control flow (e.g.,
ifstatements, for loops, exceptions, etc.) to logically connect the modules.
We then develop theDSPy compiler(Sec 4), which optimizes any DSPy program to improve quality
or cost. The compiler inputs are the program, a few training inputs with optional labels, and a valida-
tion metric. The compiler simulates versions of the program on the inputs and bootstraps example
traces of each module for self-improvement, using them to construct effective few-shot prompts
or finetuning small LMs for steps of the pipeline. Optimization in DSPy is highly modular: it is
conducted by teleprompters,2 which are general-purpose optimization strategies that determine how
the modules should learn from data. In this way, the compiler automatically maps the declarative
modules to high-quality compositions of prompting, finetuning, reasoning, and augmentation.
Programming models like DSPy could be assessed along many dimensions, but we focus on the role
of expert-crafted prompts in shaping system performance. We are seeking to reduce or even remove
their role through DSPy modules (e.g., versions of popular techniques like Chain of Thought) and
teleprompters. We report on two expansive case studies: math word problems (GMS8K; Cobbe et al.
2021) and multi-hop question answering (HotPotQA; Yang et al. 2018) with explorations of chain
of thought, multi-chain reflection, multi-hop retrieval, retrieval-augmented question answering, and
agent loops. Our evaluations use a number of different compiling strategies effectively and show
that straightforward DSPy programs outperform systems using hand-crafted prompts, while also
allowing our programs to use much smaller and hence more efficient LMs effectively.
Overall, this work proposes the first programming model that translates prompting techniques into
parameterized declarative modules and introduces an effective compiler with general optimiza-
tion strategies (teleprompters) to optimize arbitrary pipelines of these modules. Our main contri-
butions are empirical and algorithmic: with DSPy, we have found that we can implement very
short programs that can bootstrap self-improving multi-stage NLP systems using LMs as small as
llama2-13b-chat and T5-Large (770M parameters). Without hand-crafted prompts and within
minutes to tens of minutes of compiling, compositions of DSPy modules can raise the quality of
simple programs from 33% to 82% (Sec 6) and from 32% to 46% (Sec 7) for GPT-3.5 and, simi-
larly, from 9% to 47% (Sec 6) and from 22% to 41% (Sec 7) for llama2-13b-chat.
1DSPy is pronounced <mark style='background-color:#ffff00;'>dee-ess-pie</mark>. It’s the second iteration of our earlier Demonstrate–Search–Predict
framework (DSP; Khattab et al. 2022). This paper introduces the key concepts in DSPy. For more extensive and
up-to-date documentation of the framework, we refer readers to https://github.com/stanfordnlp/dspy.
2We derive the name tele-prompters from the notion of abstracting and automating the task of prompting,
in particular, such that it happens at a distance, without manual intervention.
2

### Source Document (page_label: 4)

Preprint
3.1 N ATURAL LANGUAGE SIGNATURES CAN ABSTRACT PROMPTING & FINETUNING
Instead of free-form string prompts, DSPy programs use natural language signatures to assign work
to the LM. A DSPy signature isnatural-language typed declaration of a function: a short declarative
spec that tells DSPy what a text transformation needs to do (e.g., “consume questions and return
answers”), rather than how a specific LM should be prompted to implement that behavior. More
formally, a DSPy signature is a tuple of input fields and output fields (and an optional instruction).
A field consists offield name and optional metadata.4 In typical usage, the roles of fields are inferred
by DSPy as a function of field names. For instance, the DSPy compiler will use in-context learning
to interpret questiondifferently from answer and will iteratively refine its usage of these fields.
Signatures offer two benefits over prompts: they can be compiled into self-improving and pipeline-
adaptive prompts or finetunes. This is primarily done by bootstrapping (Sec 4) useful demonstrating
examples for each signature. Additionally, they handle structured formatting and parsing logic to
reduce (or, ideally, avoid) brittle string manipulation in user programs.
In practice, DSPy signatures can be expressed with a shorthand notation likequestion -> answer,
so that line 1 in the following is a complete DSPy program for a basic question-answering system
(with line 2 illustrating usage and line 3 the response when GPT-3.5 is the LM):
1 qa = dspy.Predict("question -> answer")
2 qa(question="Where is Guaran ´ı spoken?")
3 # Out: Prediction(answer=’Guaran ´ı is spoken mainly in South America.’)
In the shorthand notation, each field’s name indicates the semantic role that the input (or output)
field plays in the transformation. DSPy will parse this notation and expand the field names into
meaningful instructions for the LM, so that english document -> french translation would
prompt for English to French translation. When needed, DSPy offers more advanced programming
interfaces for expressing more explicit constraints on signatures (Appendix A).
3.2 P ARAMETERIZED & TEMPLATED MODULES CAN ABSTRACT PROMPTING TECHNIQUES
Akin to type signatures in programming languages, DSPy signatures simply define an interface and
provide type-like hints on the expected behavior. To use a signature, we must declare amodule with
that signature, like we instantiated a Predict module above. A module declaration like this returns
a function having that signature.
The Predict Module The core module for working with signatures in DSPy isPredict(simplified
pseudocode in Appendix D.1). Internally, Predict stores the supplied signature, an optional LM to
use (initially None, but otherwise overrides the default LM for this module), and a list of demon-
strations for prompting (initially empty). Like layers in PyTorch, the instantiated module behaves as
a callable function: it takes in keyword arguments corresponding to the signature input fields (e.g.,
question), formats a prompt to implement the signature and includes the appropriate demonstra-
tions, calls the LM, and parses the output fields. When Predict detects it’s being used in compile
mode, it will also internally track input/output traces to assist the teleprompter at bootstrapping the
demonstrations.
Other Built-in ModulesDSPy modules translate prompting techniques into modular functions that
support any signature, contrasting with the standard approach of prompting LMs with task-specific
details (e.g., hand-written few-shot examples). To this end, DSPy includes a number of more sophis-
ticated modules like ChainOfThought, ProgramOfThought, MultiChainComparison, and ReAct.5
These can all be used interchangeably to implement a DSPy signature. For instance, simply chang-
4String descriptions of the task and the fields are also optional and usually omitted. Fields can carry optional
field prefix and description. By default, fields are assumed to hold free-form strings; we are actively exploring
optional data type as a way to specify constraints on valid values (e.g.,boolor int) and more gracefully handle
formatting and parsing logic, though this feature is not core to DSPy at the time of writing.
5These modules generalize prompting techniques from the literature, respectively, by Wei et al. (2022),
Chen et al. (2022), Yoran et al. (2023), and Yao et al. (2022) and, in doing so, generalize the ideas on zero-shot
prompting and rationale self-generation from Kojima et al. (2022), Zelikman et al. (2022), Zhang et al. (2022),
and Huang et al. (2022) to parameterized modules that can bootstrap arbitrary multi-stage pipelines.
4

### Source Document (page_label: 11)

Preprint
Table 2: Results with in-context learning on HotPotQA multi-hop retrieval question answering. We
report answer exact match (Ans) and pair-retrieval accuracy (Psg). Each row represents a separate
pipeline: the module in the Program column is compiled against the examples in the Training set.
The programs, compilers, and (small) training sets are defined in the main text. For HotPotQA, we
use the training set (and not dev) directly for cross-validation. ∗The marked result is evaluated on
50% of our test set due to cost.
GPT-3.5 Llama2-13b-chat
Program Compiler Dev Test Dev Test
Ans Psg Ans Psg Ans Psg Ans Psg
vanilla fewshot 34.3 n/a 31.5 n/a 27.5 n/a 21.8 n/a
CoTRAG fewshot 36.4 36.0 29.8 34.4 34.5 36.0 28.0 34.4
bootstrap 42.3 36.0 – – 38.3 36.0 32.9 34.4
react
none 20.3 – – – 20.0 – – –
+humanr 33.0 – – – 28.3 – – –
bootstrap 31.0 – – – 24.7 – – –
bootstrap×2 39.0 – – – 40.0 – – –
multihop
fewshot 36.9 38.3 31.2 40.8 34.7 32.0 31.3 30.8
bootstrap 48.7 47.0 39.6 43.8 42.0 48.3 36.4 43.5
ensemble 54.7 – 45.6∗ – 50.0 – 41.0 –
questions. For compiling, we use a teacher program consisting of an ensemble (union) of two
multihop with llama2-13b-chat. Considering its extremely small size and local availability, this
compiled program with T5-Largewould impose orders of magnitude lower costs for inference than
a proprietary LM like GPT-3.5.
Our results may be pegged against the evaluation on HotPotQA in a number of recent papers, though
there is significant variation in evaluation methodology and test set samples across studies in this
space. Using CoT prompting, Si et al. (2022) achieve 25.2% EM. With a “recite-and-answer” tech-
nique that uses PaLM-62B (Chowdhery et al., 2022) to recite evidence passages, Sun et al. (2022)
achieve 26.5% EM. Wang et al. (2022a) achieve 33.8% EM and 44.6% F1 when applying self-
consistency for PaLM-540B. Yao et al. (2022) achieve 27.4% EM using ReAct with PaLM-540B
and 30.8 with text-davinci-002, with a tool giving it the ability for search using a Wikipedia
API. They push their PaLM results to 35.1% EM by applying an additional CoT step with self-
consistency, which may resemble our ensemble approach in the sense of aggregating multiple an-
swers. Trivedi et al. (2022) reports 49% using a pipeline with code-davinci-002 LM on a sample
of 500 HotPotQA questions.
8 C ONCLUSION
This paper introduced DSPy, a new programming model for designing AI systems using pipelines
of pretrained LMs and other tools. We presented three new concepts introduced in this abstraction
(DSPy signatures, modules, and teleprompters), and showed in two very different case studies that
it supports rapid development of highly effective systems that use relatively small LMs. We have
maintained open-source versions of this framework for close to a year. In this period, we have seen
and created a large number of programs that were compiled to high-quality systems by DSPy, span-
ning tasks from information extraction to low-resource synthetic data generation. In the interest of
space and to maintain reasonable scope in this paper, we leave reporting on such tasks under con-
trolled experimental conditions to future work. While in-context learning has proved transformative
over the past 2–3 years of LM research, we argue that the true expressive power in this emerging
paradigm is in building sophisticated text transformation graphs in which composable modules and
optimizers (teleprompters) come together to leverage LMs in more systematic and reliable ways.
ACKNOWLEDGMENTS
We thank Josh Purtell for suggesting the apt name “text transformation graph” for the computational
graph abstraction of DSPy. We thank Rick Battle, Igor Kotenkov, Lisa Li, David Hall, Ashwin
Paranjape, Chris Manning, Percy Liang, and many researchers, developers, and users for valuable
11

Let's try retrieving the docs containing the query instead

In [67]:
# Write an inline function to find the first document containing a specific text
def find_document_with_text(text, documents=None):
    """Find the first document whose text contains ``text`` (case-sensitive).

    Args:
        text: Substring to look for.
        documents: Documents to search, each exposing a ``.text`` attribute.
            Defaults to the notebook-level ``docs`` list when omitted.

    Returns:
        ``(index, document)`` for the first match, or ``(None, None)`` when no
        document contains ``text``.
    """
    if documents is None:
        # Fall back to the notebook's loaded documents so existing calls still work.
        documents = docs
    for i, doc in enumerate(documents):
        if text in doc.text:
            return i, doc
    return None, None

i, doc = find_document_with_text(word)

In [68]:
doc

Document(id_='8084eedf-d1f1-4f0b-bf47-c3612d196444', embedding=None, metadata={'page_label': '2', 'file_name': 'dspy.pdf', 'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf', 'file_type': 'application/pdf', 'file_size': 460814, 'creation_date': '2025-06-11', 'last_modified_date': '2024-11-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Preprint\ncalls in existing LM pipelines and in popular developer frameworks are generally implemented using\nhard-coded ‘prompt templates’, that is, long strings of instructions and demonstrations that are hand\ncrafted through manual trial and error. We argue that this approach, while per

In [75]:
# Render the page found by find_document_with_text above rather than a
# hard-coded docs[1], so this stays correct if the document set or order changes.
Markdown(highlight(doc.text, word, "#ffff00"))

Preprint
calls in existing LM pipelines and in popular developer frameworks are generally implemented using
hard-coded ‘prompt templates’, that is, long strings of instructions and demonstrations that are hand
crafted through manual trial and error. We argue that this approach, while pervasive, can be brittle
and unscalable—conceptually akin to hand-tuning the weights for a classifier. A given string prompt
might not generalize to different pipelines or across different LMs, data domains, or even inputs.
Toward a more systematic approach to designing AI pipelines, we introduce theDSPy programming
model.1 DSPy pushes building new LM pipelines away from manipulating free-form strings and
closer to programming (composing modular operators to build text transformation graphs) where a
compiler automatically generates optimized LM invocation strategies and prompts from a program.
We draw inspiration from the consensus that emerged around neural network abstractions (Bergstra
et al., 2013), where (1) many general-purpose layers can be modularly composed in any complex
architecture and (2) the model weights can be trained using optimizers instead of being hand-tuned.
To this end, we propose the DSPy programming model(Sec 3). We first translate string-based
prompting techniques, including complex and task-dependent ones like Chain of Thought (Wei et al.,
2022) and ReAct (Yao et al., 2022), into declarative modules that carrynatural-language typed sig-
natures. DSPy modules are task-adaptive components—akin to neural network layers—that abstract
any particular text transformation, like answering a question or summarizing a paper. We then pa-
rameterize each module so that it can learn its desired behavior by iteratively bootstrapping useful
demonstrations within the pipeline. Inspired directly by PyTorch abstractions (Paszke et al., 2019),
DSPy modules are used via expressive define-by-run computational graphs. Pipelines are expressed
by (1) declaring the modules needed and (2) using these modules in any logical control flow (e.g.,
ifstatements, for loops, exceptions, etc.) to logically connect the modules.
We then develop theDSPy compiler(Sec 4), which optimizes any DSPy program to improve quality
or cost. The compiler inputs are the program, a few training inputs with optional labels, and a valida-
tion metric. The compiler simulates versions of the program on the inputs and bootstraps example
traces of each module for self-improvement, using them to construct effective few-shot prompts
or finetuning small LMs for steps of the pipeline. Optimization in DSPy is highly modular: it is
conducted by teleprompters,2 which are general-purpose optimization strategies that determine how
the modules should learn from data. In this way, the compiler automatically maps the declarative
modules to high-quality compositions of prompting, finetuning, reasoning, and augmentation.
Programming models like DSPy could be assessed along many dimensions, but we focus on the role
of expert-crafted prompts in shaping system performance. We are seeking to reduce or even remove
their role through DSPy modules (e.g., versions of popular techniques like Chain of Thought) and
teleprompters. We report on two expansive case studies: math word problems (GMS8K; Cobbe et al.
2021) and multi-hop question answering (HotPotQA; Yang et al. 2018) with explorations of chain
of thought, multi-chain reflection, multi-hop retrieval, retrieval-augmented question answering, and
agent loops. Our evaluations use a number of different compiling strategies effectively and show
that straightforward DSPy programs outperform systems using hand-crafted prompts, while also
allowing our programs to use much smaller and hence more efficient LMs effectively.
Overall, this work proposes the first programming model that translates prompting techniques into
parameterized declarative modules and introduces an effective compiler with general optimiza-
tion strategies (teleprompters) to optimize arbitrary pipelines of these modules. Our main contri-
butions are empirical and algorithmic: with DSPy, we have found that we can implement very
short programs that can bootstrap self-improving multi-stage NLP systems using LMs as small as
llama2-13b-chat and T5-Large (770M parameters). Without hand-crafted prompts and within
minutes to tens of minutes of compiling, compositions of DSPy modules can raise the quality of
simple programs from 33% to 82% (Sec 6) and from 32% to 46% (Sec 7) for GPT-3.5 and, simi-
larly, from 9% to 47% (Sec 6) and from 22% to 41% (Sec 7) for llama2-13b-chat.
1DSPy is pronounced <mark style='background-color:#ffff00;'>dee-ess-pie</mark>. It’s the second iteration of our earlier Demonstrate–Search–Predict
framework (DSP; Khattab et al. 2022). This paper introduces the key concepts in DSPy. For more extensive and
up-to-date documentation of the framework, we refer readers to https://github.com/stanfordnlp/dspy.
2We derive the name tele-prompters from the notion of abstracting and automating the task of prompting,
in particular, such that it happens at a distance, without manual intervention.
2