In [1]:
%pip install -Uq chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m98.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00

## Quick Start

Initialize your client and create a collection. Feel free to give the collection any name you want; this name is the identifier you will use to retrieve the collection afterwards.

In [2]:
import chromadb

# Default client (presumably in-memory; the notebook switches to
# `PersistentClient` in a later section — confirm against the Chroma docs).
client = chromadb.Client()

# Fetch the collection called "my_collection", creating it on first use.
collection = client.get_or_create_collection(
    name="my_collection",
)


In this example, we're adding a couple of documents to our collection. As you can see, we are passing the documents as simple text.

Because a vector database has to embed text before storing it, Chroma processes these documents locally with a [sentence transformers](https://www.sbert.net/index.html) model. All of this happens behind the scenes, so you don't have to worry about it.

In [3]:
# Insert two plain-text documents; each document is paired with the id at the
# same position in `ids`.
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges",
    ],
    ids=["id1", "id2"],
)


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 104MiB/s]


To get the documents from your collection, use the query method and provide the query text to find similar documents. You can also specify the number of results to return.

The text you pass in `query_texts` is embedded automatically by Chroma using the same embedding model that was used for the documents ingested earlier: the [sentence transformer](https://www.sbert.net/index.html) model.

In [4]:
# Chroma embeds the query text for us; `n_results` caps how many matches
# are returned.
results = collection.query(
    n_results=2,
    query_texts=["This is a query document about hawaii"],
)
print(results)


{'ids': [['id1', 'id2']], 'embeddings': None, 'documents': [['This is a document about pineapple', 'This is a document about oranges']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[1.0404009819030762, 1.2430799007415771]]}


Here, we can clearly see the results from Chroma. They are sorted by how close they are to our query (smallest distance first). You can also see how far each document is from your query under the `distances` key.

In [None]:
# Bare last expression: the notebook displays the pretty-printed form of the
# `results` dict produced by the query cell above.
results

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['This is a document about pineapple',
   'This is a document about oranges']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[None, None]],
 'distances': [[1.0404009819030762, 1.2430799007415771]]}

## CRUD on Data Points

Let's get the collection first.

In [5]:
# Re-acquire the handle to "my_collection", passing collection-level metadata
# (the description is a placeholder).
collection = client.get_or_create_collection(
    metadata={"description": "..."},
    name="my_collection",
)

### Add Data Points

In [6]:
# Add five facts. `ids`, `documents` and `metadatas` are parallel lists:
# position i in each list describes the same data point.
collection.add(
    ids=["1", "2", "3", "4", "5"],
    metadatas=[
        {"source": "architecture", "location": "Paris", "year_built": 1889},
        {"source": "wildlife", "animal": "penguin", "habitat": "Antarctica"},
        {"source": "biology", "topic": "human anatomy", "fact_type": "cellular"},
        {"source": "geology", "mountain": "Everest", "fact_type": "growth"},
        {"source": "technology", "topic": "communication", "inventor": "Ray Tomlinson"},
    ],
    documents=[
        "The Eiffel Tower in Paris stands at 324 meters tall.",
        "Penguins can swim at speeds up to 22 miles per hour.",
        "The human body contains approximately 37.2 trillion cells.",
        "Mount Everest grows about 4 millimeters higher every year.",
        "The first email was sent in 1971 by Ray Tomlinson.",
    ],
)


### Update Data Points

If an id is not found in the collection, an error will be logged and the update will be ignored. If documents are supplied without corresponding embeddings, the embeddings will be recomputed with the collection's embedding function.

In [7]:
# Replace the document text of data point "1" and patch its metadata.
# Explicit vectors could be supplied instead via
# `embeddings=[[1.1, 2.3, 3.2], [2.1, 2.1, 2.4], ...]`.
collection.update(
    ids=["1"],
    documents=["This Colosseum is the largest ancient amphitheatre ever built"],
    # Only the keys listed here change; other metadata keys keep their values.
    metadatas=[{"location": "Rome"}],
)


There is also the possibility of using the `upsert` method, which updates a data point if it already exists; if it doesn't exist, it creates it.


In [8]:
# Upsert two data points: id "1" already exists and is updated, id "6" is new
# and is created. Explicit vectors could be supplied instead via
# `embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], ...]`.
collection.upsert(
    ids=["1", "6"],
    documents=[
        "The Colosseum is a Roman amphitheatre in the centre of the city of Rome, Italy.",
        "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.",
    ],
    metadatas=[
        {"location": "Rome"},
        {"location": "Paris"},
    ],
)


### Get Data Points

In [10]:
# Look up three data points by id. Embeddings are not part of the default
# `include` (["metadatas", "documents"]), so they are requested explicitly.
collection.get(
    include=["embeddings", "metadatas", "documents"],
    ids=["1", "2", "3"],
)


{'ids': ['1', '2', '3'],
 'embeddings': array([[-0.00102553,  0.01578833,  0.00681701, ...,  0.03840652,
         -0.04598437,  0.05072193],
        [ 0.05948927,  0.05309642,  0.03503862, ..., -0.01154802,
          0.02469297,  0.05622638],
        [ 0.0467286 ,  0.02748974, -0.05327778, ...,  0.05028534,
          0.02743343, -0.02720457]]),
 'documents': ['The Colosseum is a Roman amphitheatre in the centre of the city of Rome, Italy.',
  'Penguins can swim at speeds up to 22 miles per hour.',
  'The human body contains approximately 37.2 trillion cells.'],
 'uris': None,
 'included': ['embeddings', 'metadatas', 'documents'],
 'data': None,
 'metadatas': [{'source': 'architecture',
   'location': 'Rome',
   'year_built': 1889},
  {'animal': 'penguin', 'source': 'wildlife', 'habitat': 'Antarctica'},
  {'topic': 'human anatomy', 'source': 'biology', 'fact_type': 'cellular'}]}

In [11]:
# Similarity search over the collection: return the two closest documents.
collection.query(
    n_results=2,
    query_texts=["Information about the capital of France"],
)

{'ids': [['6', '1']],
 'embeddings': None,
 'documents': [['The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.',
   'The Colosseum is a Roman amphitheatre in the centre of the city of Rome, Italy.']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'location': 'Paris'},
   {'source': 'architecture', 'year_built': 1889, 'location': 'Rome'}]],
 'distances': [[1.259374976158142, 1.6846143007278442]]}

## CRUD on Collections

### Create Collection

You can use the `get_or_create_collection` method to get a collection by name. If the collection doesn't exist, it is created and the newly created collection is returned.

Also notice that we can add custom metadata, which are arbitrary key-value pairs with information about your collection.


In [12]:
# Get "my_collection" (or create it) and attach arbitrary key-value metadata;
# the description here is a placeholder.
collection = client.get_or_create_collection(
    metadata={"description": "..."},
    name="my_collection",
)


### Get Collection

In [13]:
# Ask the client for its collections and print the returned list.
collections = client.list_collections()
print(collections)

[Collection(name=my_collection)]


By default, `list_collections` returns up to 100 collections. To go through the entire list of collections, page through it with `limit` and `offset`, like this:

In [14]:
# Page through every collection, `batch_size` at a time.
batch_size = 100
offset = 0
all_collections = []

# Keep requesting pages until one comes back empty, advancing the offset by a
# full page after each non-empty result.
while collections_batch := client.list_collections(limit=batch_size, offset=offset):
    all_collections.extend(collections_batch)
    offset += batch_size

print(all_collections)

[Collection(name=my_collection)]


### Patch Collection

You can update a collection's information using the `modify` method.

In [None]:
# Rename the collection and replace its description in a single call.
collection.modify(
    metadata={"description": "this is a great collection of data points"},
    name="my_newer_collection",
)


### Delete Collection

In [None]:
# Delete the collection by name; "my_newer_collection" is the name assigned by
# the `modify` call in the previous cell.
client.delete_collection(name="my_newer_collection")

### Convenience Methods

In [None]:
# Show the first two items stored in the collection.
# NOTE(review): this cell sits after the `delete_collection` cell, yet its saved
# output lists ids '1' and '2', so it was apparently executed before the
# deletion. On a fresh top-to-bottom run `collection` presumably no longer
# exists at this point — confirm, and move this cell above the delete section.
collection.peek(limit=2)

{'ids': ['1', '2'],
 'embeddings': array([[-1.02552655e-03,  1.57883298e-02,  6.81701116e-03,
          2.64323987e-02, -6.91068843e-02, -7.25080520e-02,
         -1.02297470e-01,  5.13165258e-02, -2.47250940e-03,
         -1.82425659e-02, -7.09465891e-02, -1.19897954e-01,
         -3.62669565e-02,  3.69712128e-03, -6.02033138e-02,
         -6.68522194e-02,  4.56865691e-02,  5.00661097e-02,
          1.63283367e-02,  2.89263809e-03,  1.01354003e-01,
         -6.68971017e-02,  2.71175578e-02, -7.45674521e-02,
          1.97686968e-04,  9.00884792e-02, -4.95964773e-02,
          5.50812073e-02, -1.78458053e-03,  5.24679385e-02,
          3.25026289e-02, -1.44009767e-02,  2.98162214e-02,
         -4.03733104e-02,  4.70872670e-02,  2.86959447e-02,
         -7.73608685e-02, -5.85268661e-02,  1.42845765e-01,
          4.13834006e-02, -1.97399296e-02,  1.41063463e-02,
          6.89955950e-02,  1.89172383e-02,  3.20235044e-02,
         -1.50266266e-03,  3.47387865e-02,  6.88607991e-02,
      

## Persistent Database

In [None]:
import chromadb

# Client constructed with a filesystem path ("./chroma"); presumably the data
# is stored on disk under that directory — TODO confirm against the Chroma docs.
# NOTE: this rebinds `client` and `collection` used by the earlier sections.
client = chromadb.PersistentClient(path="./chroma")
collection = client.get_or_create_collection(name="rag_documents")

In [None]:
# Remote instance: talk to a Chroma server at localhost:8000.
chroma_client = chromadb.HttpClient(host="localhost", port=8000)

# Managed cloud instance. The three string values are placeholders: substitute
# your own tenant, database and API key (or supply them through environment
# variables rather than hardcoding a real key in the notebook).
client = chromadb.CloudClient(
    api_key="Chroma Cloud API key",
    database="Database name",
    tenant="Tenant ID",
)

If you set the `CHROMA_API_KEY`, `CHROMA_TENANT`, and `CHROMA_DATABASE` environment variables, you can simply instantiate a `CloudClient` with no arguments:


In [None]:
client = chromadb.CloudClient()