In [2]:
import chromadb
from chromadb.utils import embedding_functions

In [3]:
# Define Embedding Function
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name='all-MiniLM-L6-v2'
)

In [25]:
# create a new instance of ChromaClient
client = chromadb.Client()

In [26]:
collection_name= 'grocery'  

In [27]:
# Define main function to interact with ChromaDB
def main():
    try:
        collection = client.create_collection(
            name = collection_name,
            metadata={'description':'A collection for stroing grocery data'},
            configuration={
                'hnsw':{'space':'cosine'},
                'embedding_function':ef
            }
        )
        print(f"Collection created: {collection.name}")
        # Array of grocery-related text items with professional humor
        texts = [
            'fresh red apples',
            'organic bananas',
            'ripe mangoes',
            'whole wheat bread',
            'farm-fresh eggs',
            'natural yogurt',
            'frozen vegetables',
            'grass-fed beef',
            'free-range chicken',
            'fresh salmon fillet',
            'aromatic coffee beans',
            'pure honey',
            'golden apple',
            'red fruit']
        
        # Create a list of IDs for each item in the 'texts' array
        ids = [f'food_{index + 1}' for index,_ in enumerate(texts)]

        collection.add(
            documents=texts,
            metadatas= [{'source': 'grocery_store','category':'food'} for _ in texts ],
            ids = ids
        )

        # Retrieve all items stored in the collection
        all_items = collection.get()

        print('Collection Contents:')
        print(f"Numeber of Documents: {len(all_items['documents'])}")

        def perform_similarity_search(collection, all_items):
            try:
                # Define Query term you want to search
                query_term = ['red','fresh','grass']
                if isinstance(query_term,str):
                    query_term = [query_term]
                

                # Perform query search for the most similar documents to the 'query_term'
                results = collection.query(
                    query_texts = query_term,
                    n_results = 3
                )
                print(f"Query Results for '{query_term}':")
                print(results)

                # Check if now results are returned or the results array is empty
                if not results or not results['ids'] or len(results['ids'][0])==0:
                    print(f"Now document is found similar to '{query_term}'")
                    return
                
                for q in range(len(query_term)):
                    print(f"Top 3 Similar Documents to '{query_term}':")
                    # Access the nested arrays in 'results['ids']' 'resuts
                    for i in range(min(3,len(results['ids'][q]))):
                        doc_id= results['ids'][q][i]
                        score = results['distances'][q][i]
                        # Retrieve Text Data from results 
                        text = results['documents'][q][i]
                        if not text:
                            print(f'-ID {doc_id}, Text:"text not available", Score: {score:.4f}')
                        else:
                            print(f'-ID : {doc_id} , Text: "{text}", Score: {score:.4f}')

            except Exception as error:
                print(f'Error in Similarity search: {error}')

    except Exception as error:
        print(f"Error : {error}")
        



In [28]:
if __name__ == "__main__":
    main()

Collection created: grocery
Collection Contents:
Numeber of Documents: 14


In [29]:
for i in range(min(3,10)):
    print(i)

0
1
2


# Cross Checking (Practicing)

In [30]:
client = chromadb.Client()
collection = client.create_collection(
name='dummy',
metadata={"description": "A collection for storing grocery data"},
configuration={
    "hnsw": {"space": "cosine"},
    "embedding_function": ef
})
print(f"Collection created: {collection.name}")
        # Array of grocery-related text items with professional humor
texts = [
            'fresh red apples',
            'organic bananas',
            'ripe mangoes',
            'whole wheat bread',
            'farm-fresh eggs',
            'natural yogurt',
            'frozen vegetables',
            'grass-fed beef',
            'free-range chicken',
            'fresh salmon fillet',
            'aromatic coffee beans',
            'pure honey',
            'golden apple',
            'red fruit'
        ]
        # Create a list of unique IDs for each text item in the 'texts' array
        # Each ID follows the format 'food_<index>', where <index> starts from 1
ids = [f"food_{index + 1}" for index, _ in enumerate(texts)]
        # Add documents and their corresponding IDs to the collection
        # The `add` method inserts the data into the collection
        # The documents are the actual text items, and the IDs are unique identifiers
                # ChromaDB will automatically generate embeddings using the configured embedding function
collection.add(
            documents=texts,
            metadatas=[{"source": "grocery_store", "category": "food"} for _ in texts],
            ids=ids
        )
        # Retrieve all the items (documents) stored in the collection
        # The `get` method fetches all data from the collection
all_items = collection.get()
        # Log the retrieved items to the console for inspection
        # This will print out all the documents, IDs, and metadata stored in the collection
print("Collection contents:")
print(f"Number of documents: {len(all_items['documents'])}")

InternalError: Collection [dummy] already exists

In [None]:
query_term = ["red", "fresh", 'beef']
if isinstance(query_term, str):
    query_term = [query_term]
                # Perform a query to search for the most similar documents to the 'query_term'
results = collection.query(
                    query_texts=query_term,
                    n_results=3  # Retrieve top 3 results
                )
print(f"Query results for '{query_term}':")
print(results)


Query results for '['red', 'fresh', 'beef']':
{'ids': [['food_14', 'food_1', 'food_13'], ['food_1', 'food_5', 'food_12'], ['food_8', 'food_14', 'food_9']], 'embeddings': None, 'documents': [['red fruit', 'fresh red apples', 'golden apple'], ['fresh red apples', 'farm-fresh eggs', 'pure honey'], ['grass-fed beef', 'red fruit', 'free-range chicken']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}], [{'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}], [{'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}, {'source': 'grocery_store', 'category': 'food'}]], 'distances': [[0.3132774233818054, 0.4539964199066162, 0.7393019795417786], [0.47737598419189453, 0.48541

In [None]:
results['ids']

[['food_14', 'food_1', 'food_13'],
 ['food_1', 'food_5', 'food_12'],
 ['food_8', 'food_14', 'food_9']]

In [None]:
results['ids'][0]

['food_14', 'food_1', 'food_13']

In [None]:
results['documents'][0]

['red fruit', 'fresh red apples', 'golden apple']

In [None]:
results['included']

['metadatas', 'documents', 'distances']

In [None]:
results['distances']

[[0.3132774233818054, 0.4539964199066162, 0.7393019795417786],
 [0.47737598419189453, 0.4854104518890381, 0.6252564191818237],
 [0.28510165214538574, 0.6224480271339417, 0.6528536677360535]]

In [None]:
results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas', 'distances'])

In [None]:
results.values()

dict_values([[['food_14', 'food_1', 'food_13'], ['food_1', 'food_5', 'food_12'], ['food_8', 'food_14', 'food_9']], None, [['red fruit', 'fresh red apples', 'golden apple'], ['fresh red apples', 'farm-fresh eggs', 'pure honey'], ['grass-fed beef', 'red fruit', 'free-range chicken']], None, ['metadatas', 'documents', 'distances'], None, [[{'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}], [{'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}], [{'category': 'food', 'source': 'grocery_store'}, {'category': 'food', 'source': 'grocery_store'}, {'source': 'grocery_store', 'category': 'food'}]], [[0.3132774233818054, 0.4539964199066162, 0.7393019795417786], [0.47737598419189453, 0.4854104518890381, 0.6252564191818237], [0.28510165214538574, 0.6224480271339417, 0.6528536677360535]]])