# How to access a persistent ChromaDB vector store

In [None]:
import openai
from dotenv import load_dotenv
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

In [6]:
load_dotenv()  # Load environment variables from .env file
openai.api_key = os.getenv("OPENAI_API_KEY")  # Set OpenAI API key from environment variable

model = "gpt-4o-mini"
persist_dir = "./vector_store"  # Directory to save the vector store

In [None]:
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",  # Use a smaller model for faster processing
    openai_api_key=openai.api_key,  # Set OpenAI API key for embeddings
)

vector_db = Chroma(
    collection_name="book-rag",  # Name of the collection in the vector store
    embedding_function=embeddings,
    persist_directory=persist_dir,  # Directory to save the vector store
)

## Summary and Additional Tips

### Key Querying Methods:

1. **LangChain Methods:**
   - `similarity_search(query, k=n)` - Basic similarity search
   - `similarity_search_with_score(query, k=n)` - With relevance scores
   - `similarity_search(query, k=n, filter={})` - With metadata filters

2. **Direct ChromaDB Methods:**
   - `collection.query(query_texts=[], n_results=n)` - Basic query
   - `collection.query(query_embeddings=[], n_results=n)` - With embeddings
   - `collection.query(where={}, n_results=n)` - With metadata filters

3. **Useful Parameters:**
   - `k` or `n_results`: Number of documents to return
   - `include`: What to include in results ['metadatas', 'documents', 'distances']
   - `where`: Metadata filtering conditions
   - `filter`: LangChain metadata filtering

4. **Performance Tips:**
   - Use smaller `k` values for faster queries
   - Use metadata filtering to narrow down search space
   - Consider using direct embeddings for repeated similar queries

# Var1: Persistent Client

In [7]:
import chromadb

persistent_client = chromadb.PersistentClient(path=persist_dir)
collection = persistent_client.get_or_create_collection(name="book-rag")
collection.peek()  # Check the contents of the collection

{'ids': ['980d8491-849a-4916-afd0-f6481315151d',
  'b7644c28-3f13-498a-9dcd-d0def691dd8f',
  '66a97b10-e31a-4206-ae0b-6d4b3e437aaa',
  'e7f848e6-dcbb-49a9-b34a-b61968a4e5ee',
  '0d18b382-9bc6-43b9-bc5b-bbb516b4fea3',
  'a8a2302d-b610-4000-9f25-9ef35b3a6363',
  '6e29481f-6352-4e9b-af92-ca09242382e3',
  '948f2917-95d2-47b7-a4b5-6d7271659d3a',
  'b3018478-be24-4f4d-b733-eb819b07f654',
  '21d360ac-9a8a-4648-801f-4ffc07f94f3b'],
 'embeddings': array([[-0.04114935,  0.01725916, -0.0259696 , ..., -0.0274252 ,
          0.01073788, -0.01724761],
        [-0.02594493,  0.05223745,  0.02822908, ..., -0.02153802,
          0.00872693,  0.00682761],
        [-0.04504454,  0.01727702,  0.02702175, ..., -0.02781724,
         -0.01397077,  0.01471654],
        ...,
        [-0.02168313,  0.01067083,  0.02736986, ..., -0.01000103,
         -0.0144598 ,  0.01296917],
        [ 0.03294064,  0.03395545,  0.02321879, ...,  0.0131722 ,
          0.00389179,  0.01176162],
        [-0.03101617,  0.03568044, 

In [None]:
# Example 1: Collection statistics and information
print("Collection Information:")
print(f"Collection name: {collection.name}")
print(f"Collection count: {collection.count()}")

Collection Information:
Collection name: book-rag
Collection count: 673



# Querying the Persistent ChromaDB Vector Store

Now let's explore different ways to query the persistent ChromaDB vector store. We'll cover:
1. Basic similarity search using LangChain
2. Direct ChromaDB client queries
3. Similarity search with metadata filtering
4. Retrieving documents with scores

## Method 1: Basic Similarity Search using LangChain

In [10]:
# Basic similarity search using LangChain
query = "What are large language models?"
docs = vector_db.similarity_search(query, k=3)  # Get top 3 most similar documents

print(f"Found {len(docs)} documents for query: '{query}'\n")
for i, doc in enumerate(docs, 1):
    print(f"Document {i}:")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Metadata: {doc.metadata}")
    print("-" * 50)

Found 3 documents for query: 'What are large language models?'

Document 1:
Content: Was sind Large Language Models? | 33
Wie LLMs funktionieren
Die Art und Weise, wie ein LLM vortrainiert und feingetunt wird, macht den Unter-
schied zwischen einem Modell mit akzeptabler Leistung und ...
Metadata: {'page_label': '33', 'source': "documents/O'Reilly_Praxiseinstieg Large Language Models Einsatz von ChatGPT und anderen LLMs.pdf", 'creationdate': '2024-04-15T11:07:40+02:00', 'title': 'Praxiseinstieg Larg Language Models', 'total_pages': 274, 'copyright': 'O’Reilly', 'page': 32, 'producer': 'Acrobat Distiller 11.0 (Windows)', 'author': 'Sinan Ozdemir', 'subject': 'Strategien und Best Practices für den Einsatz von ChatGPT und anderen LLMs', 'moddate': '2024-11-22T17:23:37+01:00', 'keywords': '', 'creator': 'FrameMaker 12.0.4'}
--------------------------------------------------
Document 2:
Content: Was sind Large Language Models? | 29
Autoregressive Sprachmodelle werden so traini ert, dass sie

## Method 2: Similarity Search with Scores

In [11]:
# Similarity search with relevance scores
docs_with_scores = vector_db.similarity_search_with_score(query, k=3)

print(f"Found {len(docs_with_scores)} documents with scores for query: '{query}'\n")
for i, (doc, score) in enumerate(docs_with_scores, 1):
    print(f"Document {i} (Score: {score:.4f}):")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Metadata: {doc.metadata}")
    print("-" * 50)

Found 3 documents with scores for query: 'What are large language models?'

Document 1 (Score: 0.6233):
Content: Was sind Large Language Models? | 33
Wie LLMs funktionieren
Die Art und Weise, wie ein LLM vortrainiert und feingetunt wird, macht den Unter-
schied zwischen einem Modell mit akzeptabler Leistung und ...
Metadata: {'page': 32, 'producer': 'Acrobat Distiller 11.0 (Windows)', 'subject': 'Strategien und Best Practices für den Einsatz von ChatGPT und anderen LLMs', 'page_label': '33', 'title': 'Praxiseinstieg Larg Language Models', 'creator': 'FrameMaker 12.0.4', 'keywords': '', 'total_pages': 274, 'author': 'Sinan Ozdemir', 'copyright': 'O’Reilly', 'moddate': '2024-11-22T17:23:37+01:00', 'creationdate': '2024-04-15T11:07:40+02:00', 'source': "documents/O'Reilly_Praxiseinstieg Large Language Models Einsatz von ChatGPT und anderen LLMs.pdf"}
--------------------------------------------------
Document 2 (Score: 0.6407):
Content: Was sind Large Language Models? | 29
Autoregressive 

## Method 3: Direct ChromaDB Client Queries

In [17]:
# Direct ChromaDB client query
# Note: When using direct ChromaDB client queries with query_texts, 
# ChromaDB will use its default embedding function if no embedding function is set.
# To avoid dimension mismatches, we can use the LangChain wrapper instead:

print("Using LangChain wrapper for consistent embeddings:")
docs = vector_db.similarity_search(query, k=3)
print(f"Found {len(docs)} documents using LangChain wrapper:\n")
for i, doc in enumerate(docs, 1):
    print(f"Document {i}:")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Metadata: {doc.metadata}")
    print("-" * 50)

# Alternative: Use pre-computed embeddings with direct ChromaDB
print("\nUsing pre-computed embeddings with direct ChromaDB:")
query_embedding = embeddings.embed_query(query)
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3,
    include=['metadatas', 'documents', 'distances']
)

print(f"Found {len(results['documents'][0])} documents using direct ChromaDB with embeddings:\n")
for i in range(len(results['documents'][0])):
    print(f"Document {i+1} (Distance: {results['distances'][0][i]:.4f}):")
    print(f"Content: {results['documents'][0][i][:200]}...")
    print(f"Metadata: {results['metadatas'][0][i]}")
    print("-" * 50)

Using LangChain wrapper for consistent embeddings:
Found 3 documents using LangChain wrapper:

Document 1:
Content: tung
Fehlinterpretation von 
Anweisungen, Überanpas-
sung an das Trainingsset, 
spärliches Belohnungssig-
nal beim Reinforcement 
Learning
Nutzung verschiedener 
Trainingsdatensets, um die 
Vielfalt d...
Metadata: {'page_label': '264', 'total_pages': 274, 'title': 'Praxiseinstieg Larg Language Models', 'moddate': '2024-11-22T17:23:37+01:00', 'creationdate': '2024-04-15T11:07:40+02:00', 'creator': 'FrameMaker 12.0.4', 'keywords': '', 'producer': 'Acrobat Distiller 11.0 (Windows)', 'copyright': 'O’Reilly', 'source': "documents/O'Reilly_Praxiseinstieg Large Language Models Einsatz von ChatGPT und anderen LLMs.pdf", 'subject': 'Strategien und Best Practices für den Einsatz von ChatGPT und anderen LLMs', 'author': 'Sinan Ozdemir', 'page': 263}
--------------------------------------------------
Document 2:
Content: SAWYER: Sinans Versuch, kluge und dennoch fesselnde Antworten z

## Method 4: Metadata Filtering

You can filter documents based on metadata when querying:

In [13]:
# Example 1: Filter by source using LangChain
# This assumes your documents have a 'source' metadata field

# First, let's see what metadata keys are available
sample_docs = vector_db.similarity_search("test", k=1)
if sample_docs:
    print("Available metadata keys:", list(sample_docs[0].metadata.keys()))
    print("Example metadata:", sample_docs[0].metadata)
else:
    print("No documents found in the vector store")

Available metadata keys: ['total_pages', 'creator', 'keywords', 'creationdate', 'author', 'title', 'page_label', 'subject', 'producer', 'moddate', 'copyright', 'page', 'source']
Example metadata: {'total_pages': 274, 'creator': 'FrameMaker 12.0.4', 'keywords': '', 'creationdate': '2024-04-15T11:07:40+02:00', 'author': 'Sinan Ozdemir', 'title': 'Praxiseinstieg Larg Language Models', 'page_label': '109', 'subject': 'Strategien und Best Practices für den Einsatz von ChatGPT und anderen LLMs', 'producer': 'Acrobat Distiller 11.0 (Windows)', 'moddate': '2024-11-22T17:23:37+01:00', 'copyright': 'O’Reilly', 'page': 108, 'source': "documents/O'Reilly_Praxiseinstieg Large Language Models Einsatz von ChatGPT und anderen LLMs.pdf"}


In [19]:
# Example 2: Filter by specific metadata using LangChain
# Uncomment and modify based on your metadata structure
docs_filtered = vector_db.similarity_search(
     query,
     k=3,
     filter={"author": "Sinan Ozdemir"}  # Filter by source
 )

print(f"Found {len(docs_filtered)} documents with metadata filter for query: '{query}'\n")
for i, doc in enumerate(docs_filtered, 1):
    print(f"Document {i}:")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Metadata: {doc.metadata}")
    print("-" * 50)

# Example 3: Using direct ChromaDB client with metadata filters
# You can use where clause with ChromaDB for more complex filtering
# filtered_results = collection.query(
#     query_texts=[query],
#     n_results=3,
#     # where={"source": {"$eq": "specific_document.pdf"}},  # Uncomment to filter by source
#     include=['metadatas', 'documents', 'distances']
# )

Found 3 documents with metadata filter for query: 'What are large language models?'

Document 1:
Content: tung
Fehlinterpretation von 
Anweisungen, Überanpas-
sung an das Trainingsset, 
spärliches Belohnungssig-
nal beim Reinforcement 
Learning
Nutzung verschiedener 
Trainingsdatensets, um die 
Vielfalt d...
Metadata: {'title': 'Praxiseinstieg Larg Language Models', 'copyright': 'O’Reilly', 'creationdate': '2024-04-15T11:07:40+02:00', 'keywords': '', 'author': 'Sinan Ozdemir', 'page_label': '264', 'producer': 'Acrobat Distiller 11.0 (Windows)', 'creator': 'FrameMaker 12.0.4', 'subject': 'Strategien und Best Practices für den Einsatz von ChatGPT und anderen LLMs', 'page': 263, 'total_pages': 274, 'source': "documents/O'Reilly_Praxiseinstieg Large Language Models Einsatz von ChatGPT und anderen LLMs.pdf", 'moddate': '2024-11-22T17:23:37+01:00'}
--------------------------------------------------
Document 2:
Content: SAWYER: Sinans Versuch, kluge und dennoch fesselnde Antworten zu geben | 

## Method 5: Vector Search with Custom Parameters

Let's explore more advanced query parameters and options:

In [21]:
# Example 1: Query with embeddings directly
# Get embeddings for your query
query_embedding = embeddings.embed_query(query)

# Use ChromaDB to query with the embedding directly
embedding_results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    include=['metadatas', 'documents', 'distances']
)

print("Query using direct embeddings:")
for i in range(len(embedding_results['documents'][0])):
    print(f"Document {i+1} (Distance: {embedding_results['distances'][0][i]:.4f}):")
    print(f"Content: {embedding_results['documents'][0][i][:150]}...")
    print("-" * 40)

Query using direct embeddings:
Document 1 (Distance: 0.6233):
Content: Was sind Large Language Models? | 33
Wie LLMs funktionieren
Die Art und Weise, wie ein LLM vortrainiert und feingetunt wird, macht den Unter-
schied z...
----------------------------------------
Document 2 (Distance: 0.6407):
Content: Was sind Large Language Models? | 29
Autoregressive Sprachmodelle werden so traini ert, dass sie das nächste Token in
einem Satz vorhersagen, und zwar...
----------------------------------------
Document 3 (Distance: 0.6444):
Content: | 23
TEIL I
Einführung in Large Language Models...
----------------------------------------
Document 4 (Distance: 0.6526):
Content: Was sind Large Language Models? | 31
Wie schon erwähnt, lassen sich LLMs im Allgemeinen drei Hauptkategorien zuord-
nen:
• Autoregressive Modelle wie ...
----------------------------------------
Document 5 (Distance: 0.6567):
Content: Was sind Large Language Models? | 39
Abbildung 1-11: LLMs können alles Mögliche über die Wel

## Method 6: Multiple Query Types and Collection Operations

Let's explore other useful operations with your persistent vector store:

In [None]:
# Example 2: Multiple queries at once
multiple_queries = [
    "What are transformers in NLP?",
    "How do language models work?",
    "What is attention mechanism?"
]

print("Multiple queries:")
for i, query_text in enumerate(multiple_queries, 1):
    print(f"\nQuery {i}: {query_text}")
    results = collection.query(
        query_texts=[query_text],
        n_results=2,
        include=['documents', 'distances']
    )
    for j in range(len(results['documents'][0])):
        print(f"  Result {j+1} (Distance: {results['distances'][0][j]:.4f}): {results['documents'][0][j][:100]}...")

Collection Information:
Collection name: book-rag
Collection count: 673

Multiple queries:

Query 1: What are transformers in NLP?


InvalidArgumentError: Collection expecting embedding with dimension of 1536, got 384