# **Chroma Basics**

In [None]:
# First step ist to install the chromadb package
# pip install chromadb
# Then, we can import the chromadb package and create a client

import chromadb 

# This is a Ephemeral client that will not persist data to disk
# client= chromadb.Client()    

# This will disable anonymized telemetry which means that Chroma will not collect any data about your usage
from chromadb.config import Settings
client= chromadb.Client(Settings(anonymized_telemetry=False))

In [2]:
import chromadb
from chromadb.config import Settings

settings = Settings(allow_reset=True,anonymized_telemetry=False) # this will allow the client to be reset otherwise it will raise an error
# This is a Persistent client that will persist data to disk
client= chromadb.PersistentClient(path="./chroma_db",settings=settings) 

In [3]:
# Now, we can create a collection
collection = client.create_collection(name="my_collection")

In [4]:
# This will delete the collection
client.delete_collection(name="my_collection")

In [5]:
collection = client.get_or_create_collection(name="my_collection")

In [6]:
# We can add documents to the collection
collection.add(
    documents=["Hello world", "Goodbye world"],
    metadatas=[{"source": "doc1"}, {"source": "doc2"}],
    ids=["id1", "id2"]
)

In [7]:
# # We can query the collection
results = collection.query(
    query_texts=["Hello world"], # Chroma will embed this for you
    n_results=2 # how many results to return
)

results
# The results will be a list of dictionaries, each containing the document, metadata, and ID
# The distance is the similarity score between the query and the document
# The lower the distance, the more similar the document is to the query

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['Hello world', 'Goodbye world']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'doc1'}, {'source': 'doc2'}]],
 'distances': [[0.0, 0.9318176781641861]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [7]:
# we can also delete documents from the collection
collection.delete(ids=["id1"])

In [8]:
collection.get() # this will return the remaining documents in the collection

{'ids': ['id2'],
 'embeddings': None,
 'documents': ['Goodbye world'],
 'uris': None,
 'data': None,
 'metadatas': [{'source': 'doc2'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [15]:
# we can also update documents in the collection
collection.update(
    ids=["id2"],
    documents=["Hello world updated"],
    metadatas=[{"source": "doc2 updated"}]
)

collection.get()

{'ids': ['id2', 'id1'],
 'embeddings': None,
 'documents': ['Hello world updated', 'Welcome to cromaDB'],
 'uris': None,
 'data': None,
 'metadatas': [{'source': 'doc2 updated'}, {'source': 'doc1 Inserted'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [14]:
# we can also upsert documents in the collection which will insert the documents if they do not exist or update them if they do
# this is useful for updating documents in the collection without having to check if they exist first
collection.upsert(
    ids=["id1","id2"],
    documents=["Welcome to cromaDB","Hello world Again"],
    metadatas=[{"source": "doc1 Inserted"}, {"source": "doc2 Updated"}],
)

collection.get()

{'ids': ['id2', 'id1'],
 'embeddings': None,
 'documents': ['Hello world Again', 'Welcome to cromaDB'],
 'uris': None,
 'data': None,
 'metadatas': [{'source': 'doc2 Updated'}, {'source': 'doc1 Inserted'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [13]:
collection.delete(where={"source": "doc1 Inserted"}) # this will delete the document with the source doc1 Inserted rather than the id
collection.get()

{'ids': ['id2'],
 'embeddings': None,
 'documents': ['Hello world Again'],
 'uris': None,
 'data': None,
 'metadatas': [{'source': 'doc2 Updated'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [17]:
collection.peek() # this will return the first 10 documents in the collection

{'ids': ['id2', 'id1'],
 'embeddings': array([[-4.12920006e-02, -1.48042347e-02,  3.10580619e-02,
         -1.22511219e-02,  5.31616993e-03, -1.74742728e-01,
          2.38755830e-02, -3.81815284e-02, -8.56200531e-02,
          4.50995751e-02,  6.89512715e-02,  4.01390828e-02,
         -5.61843403e-02,  3.37167550e-03,  2.93540061e-02,
         -2.42322516e-02,  2.14292631e-02, -1.44890845e-02,
         -1.74461514e-01, -1.29503570e-02, -7.85445943e-02,
          3.68521251e-02,  2.92963274e-02,  2.84595247e-02,
         -6.25015199e-02, -1.18167726e-02, -2.03392319e-02,
          2.84620337e-02, -9.42580029e-03, -5.91433756e-02,
         -1.61376353e-02,  5.42549752e-02,  1.45207688e-01,
         -3.45489830e-02, -4.36641835e-03, -7.64705380e-03,
          1.66993029e-02, -4.59347889e-02, -5.50863706e-02,
          6.20748624e-02,  2.47155204e-02, -6.74521849e-02,
          6.93234103e-03,  4.93285917e-02, -7.18474388e-02,
          2.73446962e-02, -3.99208181e-02,  3.24272439e-02,
  

In [18]:
collection.count() # this will return the number of documents in the collection

2

In [19]:
collection.modify(name="new_collection_name") # this will rename the collection 

In [20]:
client.list_collections() 

['new_collection_name']

### 🔧 **Chroma Collection Configuration (HNSW Indexing)**

When you create a collection in Chroma, you can configure how it stores and searches your vector embeddings using **metadata settings**. These settings are especially useful for tuning the performance, accuracy, and speed of similarity searches.


#### 🧭 1. **Distance Function – hnsw:space**

This controls **how similarity is calculated** between vectors (documents/embeddings). Choose based on your data and what "similar" means for your use case.

| Value    | Meaning                   | Best For                         |
|----------|---------------------------|----------------------------------|
| l2     | Squared L2 norm (default) | Geometric distance (Euclidean)   |
| cosine | Cosine similarity         | Textual embeddings (e.g., from OpenAI, BERT) |
| ip     | Inner product             | Dot-product-based models         |

🔹 **Example**: If you're storing sentence embeddings from an LLM, use cosine for better semantic matching.


#### ⚙️ 2. **Index Construction Quality – hnsw:construction_ef**

Controls **how many candidates Chroma looks at** when building the index. Think of it like planning a city map—more time spent planning = better connections.

- **Higher value** → better index quality, slower to build, uses more memory  
- **Lower value** → builds faster, but the index may be less accurate  
- **Default**: 100

🔹 **Use case**: Increase this if you want higher precision in results, especially if you have a lot of data.


#### 🔍 3. **Search Accuracy – hnsw:search_ef**

Controls how many nodes the algorithm explores **during a query**. Think of it like how far the system is willing to look to find the best matches.

- **Higher value** → better recall and accuracy, slower search  
- **Lower value** → faster search, but might miss some good results  
- **Default**: 100

🔹 **Tip**: Use a higher value if you want more accurate search results (e.g., top-5 most relevant documents).


#### 🕸️ 4. **Graph Connectivity – hnsw:M**

Controls the **maximum number of connections (neighbors)** each point (embedding) has in the search graph.

- **Higher value** → denser graph, better search results, more memory usage  
- **Lower value** → sparser graph, less memory, possibly lower accuracy  
- **Default**: 16

🔹 **When to tweak**: Increase if you need high precision and have enough memory.

#### 🧵 5. **Thread Usage – hnsw:num_threads**

Sets how many CPU threads to use for **building the index** or **performing search operations**.

- **Default**: All available CPU cores (multiprocessing.cpu_count())

🔹 **Good to know**: Useful when running on high-performance machines or servers; allows parallelism.


In [28]:
# Create a custom collection with specific HNSW parameters

collection = client.get_or_create_collection(
    name="collectionTWO",
    metadata={
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 200,
        "hnsw:search_ef": 150,
        "hnsw:M": 32,
        "hnsw:num_threads": 4
    }
)

In [29]:
collection.add(
    documents=["Hello world", "Goodbye world"],
    metadatas=[{"source": "doc1"}, {"source": "doc2"}],
    ids=["id1", "id2"]
)
# Now, we can query the collection
results = collection.query(
    query_texts=["Hello world"], # Chroma will embed this for you
    n_results=2 # how many results to return
)
results

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['Hello world', 'Goodbye world']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'doc1'}, {'source': 'doc2'}]],
 'distances': [[-5.472506892090223e-10, 0.4659088386998965]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [30]:
collection.get_model() # this will return the model used to embed the documents

Collection(id=UUID('0516922c-a262-481b-8c9b-e756a7e3e58c'), name='collectionTWO', configuration_json={'hnsw_configuration': {'space': 'l2', 'ef_construction': 100, 'ef_search': 100, 'num_threads': 48, 'M': 16, 'resize_factor': 1.2, 'batch_size': 100, 'sync_threshold': 1000, '_type': 'HNSWConfigurationInternal'}, '_type': 'CollectionConfigurationInternal'}, metadata={'hnsw:M': 32, 'hnsw:construction_ef': 200, 'hnsw:num_threads': 4, 'hnsw:search_ef': 150, 'hnsw:space': 'cosine'}, dimension=None, tenant='default_tenant', database='default_database', version=0, log_position=0)

In [31]:
client.heartbeat() # this will return the current status of the client it returns a nanosecond heartbeat.

1744877389323695100

In [32]:
client.reset() # this will reset the client and delete all collections and documents 
# it will raise error if in setting allow_reset is set to False

True

In [33]:
client.list_collections() # As we can see the collection is deleted

[]

# **LangChain Integration**

In [None]:
## This is a simple example of how to use ChromaDB with LangChain

from langchain.embeddings import HuggingFaceEmbeddings
from chromadb import Client,PersistentClient  
from chromadb.config import Settings

# Step 1: Setup ChromaDB
chroma_client = Client(Settings(anonymized_telemetry=False)) # for demo we are using ephemeral client you can use PersistentClient for production
collection = chroma_client.get_or_create_collection(name="mycollection")

# Step 2: Setup LangChain Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-base", model_kwargs={"device": "cuda"})

# Step 3: Prepare documents
texts = ["LangChain is awesome", "ChromaDB is powerful"]
metadatas = [{"source": "lc1"}, {"source": "lc2"}]
ids = ["doc1", "doc2"]

# Step 4: Get embeddings using LangChain
embeddings = embedding_model.embed_documents(texts)

# Step 5: Use Chroma's low-level add() method
collection.add(
    embeddings=embeddings,
    documents=texts,
    metadatas=metadatas,
    ids=ids
)

In [None]:
collection.get() 

{'ids': ['doc1', 'doc2'],
 'embeddings': None,
 'documents': ['LangChain is awesome', 'ChromaDB is powerful'],
 'uris': None,
 'data': None,
 'metadatas': [{'source': 'lc1'}, {'source': 'lc2'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [None]:
# This is a LangChain example that uses ChromaDB as a vector store and HighLevel Wrapper

from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document  

text = """LangChain is a framework for developing applications powered by language models. 
It provides a standard interface for LLMs and tools to help you build applications. 
ChromaDB is a vector database that can be used to store and query embeddings."""

# 1. Split the text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
chunks = text_splitter.split_text(text)

# 2. Convert chunks to LangChain Document objects
docs = [Document(page_content=chunk) for chunk in chunks]

# 3. Create the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-base", model_kwargs={"device": "cuda"})

# 4. Store in ChromaDB
chroma_dir = "./chroma_db"
vectorstore = Chroma.from_documents(docs, embedding_model, persist_directory=chroma_dir) # This is a higher level method that handles embedding and storage
# We cannot use low level methods such as add() , delete() etc. with this method
# we can use add_documents() , delete_documents() etc. with this method

print("✅ Stored documents in ChromaDB!")

✅ Stored documents in ChromaDB!


In [None]:
vectorstore

{'ids': ['9458db96-64a2-426d-be33-f20c0fea7e3e',
  'f13a8ef5-bfb0-48db-8888-8b3b485f4f57',
  '899d8134-5172-4327-952e-a596c0916167',
  '9e48e1c4-1cd2-40f3-9305-684d34fc0669',
  '4f4e73cf-3a42-4c42-8fb2-5454cc4433aa',
  '73610753-02c8-48ca-a355-c3e0e97e103a'],
 'embeddings': None,
 'documents': ['LangChain is a framework for developing applications powered by language models.',
  'It provides a standard interface for LLMs and tools to help you build applications.',
  'ChromaDB is a vector database that can be used to store and query embeddings.',
  'LangChain is a framework for developing applications powered by language models.',
  'It provides a standard interface for LLMs and tools to help you build applications.',
  'ChromaDB is a vector database that can be used to store and query embeddings.'],
 'uris': None,
 'data': None,
 'metadatas': [None, None, None, None, None, None],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

# **Chroma Clients**


### 🔄 **ChromaDB supports Sync and Async Client but What does They mean?**

### 🧠 **Think of it like this:**

#### ⏳ **Synchronous (regular code):**
Imagine you're cooking pasta:

1. Boil water 🫧  
2. Wait... ⌛  
3. Add pasta 🍝  
4. Wait... ⌛  
5. Drain and serve ✅

You're just **waiting** between steps, doing nothing else.

---

#### ⚡ **Asynchronous (async code):**
Now imagine you're cooking pasta *and* making a sandwich at the same time:

1. Start boiling water 🫧
2. While it boils, make a sandwich 🥪  
3. Hear the water boil, add pasta 🍝  
4. While pasta cooks, clean the kitchen 🧼  
5. Pasta done! Drain and serve ✅

You **don't wait around** doing nothing. You use the waiting time to do other useful things.


In [21]:
# Running Chroma on Client-Server Mode 

# To start the server, we need to run the following command in the terminal
! chroma run --path ./chroma_db

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1m

                [38;5;069m(((((((((    [38;5;203m((((([38;5;220m####
             [38;5;069m((((((((((((([38;5;203m((((((((([38;5;220m#########
           [38;5;069m((((((((((((([38;5;203m((((((((((([38;5;220m###########
         [38;5;069m(((((((((((((([38;5;203m(((((((((((([38;5;220m############
        [38;5;069m((((((((((((([38;5;203m(((((((((((((([38;5;220m#############
        [38;5;069m((((((((((((([38;5;203m(((((((((((((([38;5;220m#############
         [38;5;069m(((((((((((([38;5;203m((((((((((((([38;5;220m##############
         [38;5;069m(((((((((((([38;5;203m(((((((((((([38;5;220m##############
           [38;5;069m(((((((((([38;5;203m((((((((((([38;5;220m#############
             [38;5;069m(((((((([38;5;203m(((((((([38;5;220m##############
                [38;5;069m((((([38;5;203m((((    [38;5;220m#########[0m

    
[1m
Running Chroma
[0m
[1mSaving data to:[0m [32m.[0m[32m/[0m[32mchroma_db[0m
[1mConnect to chroma at:[0m 

In [46]:
import chromadb
from chromadb.config import Settings
# We can connect to the server using the HttpClient , The server will be running on localhost:8000

settings = Settings(allow_reset=True,anonymized_telemetry=False) 
client = chromadb.HttpClient(host='localhost', port=8000,settings=settings) # This is a sync client

In [47]:
collection = client.get_or_create_collection("my_collection")
collection.add(
    documents=["Hello", "World"],
    metadatas=[{"source": "doc1"}, {"source": "doc2"}],
    ids=["1", "2"]
)

In [None]:
collection.get()

{'ids': ['id1', 'id2', '1', '2'],
 'embeddings': None,
 'metadatas': [{'source': 'doc1'},
  {'source': 'doc2'},
  {'source': 'doc1'},
  {'source': 'doc2'}],
 'documents': ['hello world', 'Welcom to Chroma', 'Hello', 'World'],
 'data': None,
 'uris': None,
 'included': ['documents', 'metadatas']}

In [None]:
# The methods we use for chroma sever are the same as the ones we use for the persistent client but we need to use asyncio method 
import asyncio

async def main():
    client = await chromadb.AsyncHttpClient(host="localhost", port=8000) # This is a async client

    collection = await client.get_or_create_collection(name="my_collection") # This will create a collection if it does not exist

    await collection.add(
        documents=["hello world","Welcom to Chroma"],
        metadatas=[{"source": "doc1"}, {"source": "doc2"}],
        ids=["id1", "id2"]
    )
    results = await collection.query(
        query_texts=["hello world"],
        n_results=2
    )
    return results

asyncio.run(main()) # The asyncio code needs to be executed in python not jupyter notebook
# await main() # this will run the main function and return the results

{'ids': [['id1', 'id2']],
 'distances': [[0.0, 1.8720082823642281]],
 'embeddings': None,
 'metadatas': [[{'source': 'doc1'}, {'source': 'doc2'}]],
 'documents': [['hello world', 'Welcom to Chroma']],
 'uris': None,
 'data': None,
 'included': ['distances', 'documents', 'metadatas']}

In [None]:
# ! pip install nest_asyncio # This will allow to run asyncio code in jupyter notebook

In [None]:
import nest_asyncio
nest_asyncio.apply() # this will apply the nest_asyncio to the event loop
# Now run the asyncio code in jupyter notebook