# 1. Setup Asyncio

In [1]:
import nest_asyncio

# Patch asyncio so that an event loop which is already running (as it is
# inside a Jupyter kernel) can be re-entered. Run this before any cell that
# may start async work.
nest_asyncio.apply()

# 2. Setup the Qdrant vector database

In [2]:
import qdrant_client

# Qdrant collection used by create_index() to store the document vectors.
collection_name = "chat_with_docs"

# Client for a Qdrant server addressed at localhost:6333.
client = qdrant_client.QdrantClient(port=6333, host="localhost")



# 3. Read the documents

In [1]:
from llama_index.core import SimpleDirectoryReader

# Directory that is scanned (including sub-directories) for PDF files.
input_dir_path = "./docs"

# Only files with a .pdf extension are picked up by the reader.
loader = SimpleDirectoryReader(
    recursive=True,
    required_exts=[".pdf"],
    input_dir=input_dir_path,
)
docs = loader.load_data()



In [2]:
# Preview only the first loaded document; displaying the whole list floods
# the cell output with the full text of every entry.
docs[:1]

[Document(id_='f2784fbd-3a38-403c-98f4-fe387f6b523f', embedding=None, metadata={'page_label': '1', 'file_name': 'docling.pdf', 'file_path': '/Users/fc/experiments/rag-project/docs/docling.pdf', 'file_type': 'application/pdf', 'file_size': 5566575, 'creation_date': '2025-06-13', 'last_modified_date': '2025-06-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Docling Technical Report\nVersion 1.0\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos\nPanos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer\nKasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima\nValery Weber Lucas Mo

In [3]:
# Sanity check: container type and number of entries returned by the loader.
type(docs), len(docs)

(list, 41)

# 4. A function to index data

In [6]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

def create_index(documents):
    """Build a VectorStoreIndex over `documents` backed by Qdrant.

    The vector store is created from the notebook-level `client` and
    `collection_name`, wrapped in a StorageContext, and handed to
    `VectorStoreIndex.from_documents`.

    Args:
        documents: The documents to index (e.g. the output of
            `SimpleDirectoryReader.load_data()`).

    Returns:
        The index returned by `VectorStoreIndex.from_documents`.
    """
    qdrant_store = QdrantVectorStore(
        collection_name=collection_name,
        client=client,
    )
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=StorageContext.from_defaults(vector_store=qdrant_store),
    )

# 5. Load the embedding model and index data

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# BAAI/bge-large-en-v1.5 is loaded without `trust_remote_code`: that flag
# permits executing Python code shipped in the model repository and should
# only be enabled for models that actually require custom code.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# Register the embedding model globally so indexing and querying share it.
Settings.embed_model = embed_model

# NOTE(review): every run of this cell indexes all of `docs` into the same
# Qdrant collection again; re-running presumably adds duplicate entries.
# Confirm, and clear or recreate the collection first if that is a problem.
index = create_index(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Check which index class create_index() returned.
type(index)

llama_index.core.indices.vector_store.base.VectorStoreIndex

# 6. Load the LLM

In [9]:
from llama_index.llms.ollama import Ollama

# Ollama-served llama3.2:1b model; requests are given a 120-second timeout.
llm = Ollama(
    request_timeout=120.0,
    model="llama3.2:1b",
)

# Register the LLM on the global Settings object.
Settings.llm = llm

In [10]:
# Inspect the type of the global Settings object configured above.
type(Settings)

llama_index.core.settings._Settings

# 7. Define the prompt template

In [11]:
from llama_index.core import PromptTemplate

# QA prompt with two placeholders: {context_str} and {query_str}.
# The text is assembled from flush-left pieces so that source-code
# indentation does not leak into the prompt sent to the LLM.
template = (
    "Context information is below:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information above I want you to think\n"
    "step by step to answer the query in a crisp manner,\n"
    "in case you don't know the answer say 'I don't know!'\n"
    "\n"
    "Query: {query_str}\n"
    "\n"
    "Answer:"
)

# Wrap the raw prompt text in a PromptTemplate for use by the query engine.
qa_prompt_tmpl = PromptTemplate(template=template)

# 8. Reranking

Here, we use a cross-encoder to re-rank the retrieved document chunks and keep only the top 3 most relevant ones according to the model’s scores.

In [12]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker configured to keep the 3 best-scoring chunks.
rerank = SentenceTransformerRerank(
    top_n=3,
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",
)

In [13]:
# Show the reranker's repr to inspect its effective configuration.
rerank

SentenceTransformerRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x3bcd44640>, model='cross-encoder/ms-marco-MiniLM-L-2-v2', top_n=3, device='mps', keep_retrieval_score=False, trust_remote_code=False)

# 9. Query the document

In [20]:
# Query engine over the index: requests the 10 most similar chunks and
# passes them through the `rerank` post-processor.
query_engine = index.as_query_engine(
    node_postprocessors=[rerank],
    similarity_top_k=10,
)

# Replace the response synthesizer's text-QA prompt with the custom template.
query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})

# Other questions to try:
#   "What exactly is DSPy?"
#   "What is the github repo for docling?"
response = query_engine.query("How is DSPy pronounced?")

In [21]:
from IPython.display import Markdown, display

# Render the response's string form as Markdown in the cell output.
display(Markdown(str(response)))

DSPy is pronounced "dee-ess-pie".

In [22]:
# Metadata carried by the response; the output below maps IDs to the file
# and page details of the entries it refers to.
response.metadata

{'ef31af7c-fb16-4157-b76f-640b73834b9a': {'page_label': '4',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'},
 'e0952a48-0aec-4de4-b0ff-b8438e2c28e5': {'page_label': '2',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'},
 '247836cf-75bb-4108-89b6-9d57e180a749': {'page_label': '27',
  'file_name': 'dspy.pdf',
  'file_path': '/Users/fc/experiments/rag-project/docs/dspy.pdf',
  'file_type': 'application/pdf',
  'file_size': 460814,
  'creation_date': '2025-06-23',
  'last_modified_date': '2024-11-02'}}