```
using python3.11.0
In docker we need:

apt-get update && apt-get install -y curl
curl https://ollama.com/install.sh | sh
ollama serve &
ollama pull llama3.2:1b



```

In [1]:
import nest_asyncio
nest_asyncio.apply()

import qdrant_client
from qdrant_client.models import VectorParams, Distance
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, ServiceContext, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Name of the Qdrant collection that stores the document embeddings.
collection_name = "chat_with_docs"

# Vector size of BAAI/bge-large-en-v1.5 (the embedding model used by this notebook).
# Kept as a constant so the model does not have to be loaded just to create the
# collection.
embedding_dimension = 1024

# Connect to Qdrant and make sure the collection exists.
try:
    client = qdrant_client.QdrantClient(
        host="host.docker.internal",
        port=6333
    )
    # collection_exists() also serves as the connectivity check: a connection
    # failure raises here and is reported by the outer handler instead of being
    # mistaken for a missing collection.
    if client.collection_exists(collection_name):
        print(f"Successfully connected to Qdrant and found collection '{collection_name}'")
    else:
        print(f"Collection '{collection_name}' not found. Creating it now...")
        # Create the collection with proper format using VectorParams
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=embedding_dimension,
                distance=Distance.COSINE
            )
        )
        print(f"Created collection '{collection_name}'")
except Exception as e:
    # Print troubleshooting hints, then re-raise so the cell still fails visibly.
    print(f"Error connecting to Qdrant: {e}")
    print("\nPossible solutions:")
    print("1. Make sure Qdrant is running locally with: docker run -p 6333:6333 qdrant/qdrant")
    print("2. Check if port 6333 is not blocked by firewall")
    print("3. Verify no other application is using port 6333")
    raise

# Load documents: every PDF under ./docs, including sub-directories.
input_dir_path = './docs'
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".pdf"],
    recursive=True
)
docs = loader.load_data()
print(f"Loaded {len(docs)} documents")

# Set up embedding model.
# trust_remote_code is intentionally left at its default (off):
# BAAI/bge-large-en-v1.5 is a stock BERT architecture that needs no custom
# code, and enabling the flag would allow Python shipped in the model
# repository to run locally.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
# Register the model on the global llama_index Settings object.
Settings.embed_model = embed_model

# Create index function
def create_index(documents):
    """Build a ``VectorStoreIndex`` over ``documents`` backed by Qdrant.

    Uses the module-level ``client`` and ``collection_name`` to create the
    ``QdrantVectorStore`` that the index writes to, and prints a progress
    message before each stage.

    Args:
        documents: Documents to index, as returned by the directory reader.

    Returns:
        The index produced by ``VectorStoreIndex.from_documents``.
    """
    # NOTE(review): calling this again presumably inserts the same documents
    # into the collection a second time -- confirm before re-running the cell.
    print("Creating vector store...")
    store = QdrantVectorStore(collection_name=collection_name, client=client)

    print("Setting up storage context...")
    ctx = StorageContext.from_defaults(vector_store=store)

    print("Building index from documents (this may take a while)...")
    built_index = VectorStoreIndex.from_documents(documents, storage_context=ctx)

    print("Index creation complete!")
    return built_index

# Build the index from the loaded documents; any failure is reported and then
# re-raised so the cell does not silently continue.
try:
    index = create_index(docs)
except Exception as exc:
    print(f"Error creating index: {exc}")
    raise
else:
    print("Successfully created index!")

Collection 'chat_with_docs' not found. Creating it now...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Created collection 'chat_with_docs'
Loaded 32 documents
Creating vector store...
Setting up storage context...
Building index from documents (this may take a while)...
Index creation complete!
Successfully created index!


    ```
    client = qdrant_client.QdrantClient(
        host="host.docker.internal",
        port=6333
    )

    ```
    This connects to the Qdrant instance running on the Docker host (via host.docker.internal). TODO: run Qdrant locally inside the same Docker container instead.

In [2]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

# LLM served by Ollama; the request timeout is 120 seconds.
llm = Ollama(
    model="llama3.2:1b",
    request_timeout=120.0,
)

# Register the LLM on the global llama_index Settings object.
Settings.llm = llm

In [3]:
from llama_index.core import PromptTemplate

# Question-answering prompt. It has two placeholders, {context_str} and
# {query_str}. The leading whitespace on each line is part of the prompt
# text and is kept as-is.
template = """Context information is below:
              ---------------------
              {context_str}
              ---------------------
              Given the context information above I want you to think
              step by step to answer the query in a crisp manner,
              in case you don't know the answer say 'I don't know!'
            
              Query: {query_str}
        
              Answer:"""

# Wrap the raw prompt string in a llama_index PromptTemplate so it can be
# installed on a query engine.
qa_prompt_tmpl = PromptTemplate(template)

In [4]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker configured with top_n=3.
rerank = SentenceTransformerRerank(
    top_n=3,
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",
)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/62.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [20]:
import time
# Query engine: similarity_top_k=10 for retrieval, with the reranker as a
# node postprocessor.
query_engine = index.as_query_engine(similarity_top_k=10,
                                     node_postprocessors=[rerank])

# Use the custom QA prompt for response synthesis.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed time (time.time() can jump when the
# system clock is adjusted).
total_time_s = time.perf_counter()

response = query_engine.query("What exactly is DSPy?")

total_time_e = time.perf_counter()
total_time = total_time_e - total_time_s

print(f"time taken for full response is {total_time}")



time taken for fill response is 3.5352823734283447


In [21]:
from IPython.display import Markdown, display

# Render the response text as Markdown in the notebook output.
display(Markdown(str(response)))

DSPy stands for "Deep Speech Programming Yout". It appears to be a programming framework and model that allows developers to design, implement, and optimize large-scale natural language processing (NLP) systems using a declarative modeling approach.

In [26]:
import time
from llama_index.core import QueryBundle

# Run the query pipeline stage by stage (retrieve -> rerank -> synthesize) and
# time each stage separately. All intervals use time.perf_counter(), which is
# monotonic and high-resolution, instead of the wall clock.
query = "What exactly is DSPy?"
query_bundle = QueryBundle(query_str=query)

# Measure total query time
start_time = time.perf_counter()

# Measure similarity search time
sim_start_time = time.perf_counter()
retrieved_docs = query_engine.retriever.retrieve(query)
sim_time = time.perf_counter() - sim_start_time
print(f"Time taken for similarity search: {sim_time:.4f} seconds")

# Measure reranking time
rerank_start_time = time.perf_counter()
reranked_docs = rerank.postprocess_nodes(retrieved_docs, query_bundle=query_bundle)
rerank_time = time.perf_counter() - rerank_start_time
print(f"Time taken for reranking: {rerank_time:.4f} seconds")

# Measure LLM response time (synthesis over the reranked nodes)
llm_start_time = time.perf_counter()
response = query_engine.synthesize(query_bundle, nodes=reranked_docs)
llm_time = time.perf_counter() - llm_start_time
print(f"Time taken for LLM response: {llm_time:.4f} seconds")

# Total time, measured from before the first stage (includes the prints above)
total_time = time.perf_counter() - start_time
print(f"Total query time: {total_time:.4f} seconds")

# Print the final response
print("\nResponse from LLM:")
print(response)


Time taken for similarity search: 0.1507 seconds
Time taken for reranking: 0.1786 seconds
Time taken for LLM response: 3.3454 seconds
Total query time: 3.6758 seconds

Response from LLM:
DSPy stands for Deep Speech Processing and Prediction. It's a programming model developed by Stanford Natural Language Processing Group that aims to abstract prompting techniques into parameterized declarative modules, which can be used to implement natural language processing (NLP) pipelines.
