In [2]:
%%capture
# @title Step 1: Install Required Libraries
!pip install chromadb sentence-transformers

print("Libraries installed successfully. Proceed to the next step for imports.")


In [3]:
# @title Step 2: Import Necessary Modules

import chromadb
from chromadb.utils import embedding_functions
import os

print("Modules imported successfully. Now, initialize the ChromaDB client and embedding function.")


Modules imported successfully. Now, initialize the ChromaDB client and embedding function.


In [4]:
# @title Step 3: Initialize ChromaDB Client and Embedding Function
client = chromadb.Client() # In-memory client initialized.

print("ChromaDB client initialized (in-memory).")

embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

print(f"Embedding function loaded using model: {embedding_function.model_name}")
print("ChromaDB client and embedding function are ready. Proceed to create a collection.")


ChromaDB client initialized (in-memory).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding function loaded using model: all-MiniLM-L6-v2
ChromaDB client and embedding function are ready. Proceed to create a collection.


In [5]:
# @title Step 4: Create or Get a ChromaDB Collection

collection_name = "rag_milestone_collection"


try:
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_function
    )
    print(f"Collection '{collection_name}' created or retrieved successfully.")
except Exception as e:
    print(f"Error creating/getting collection: {e}")
    # In Colab, sometimes a previous run might leave a corrupted state.
    # This block attempts to delete and re-create for a clean slate.
    try:
        print(f"Attempting to delete and re-create collection '{collection_name}'...")
        client.delete_collection(name=collection_name)
        collection = client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedding_function
        )
        print(f"Collection '{collection_name}' re-created successfully.")
    except Exception as e_recreate:
        print(f"Failed to delete and re-create collection: {e_recreate}")
        print("Please consider restarting your Colab runtime (Runtime -> Restart runtime) and running all cells from the beginning.")

print("Collection is ready for adding documents.")


Collection 'rag_milestone_collection' created or retrieved successfully.
Collection is ready for adding documents.


In [7]:
# @title Step 5: Add Documents and Verify Collection Contents

# Array of grocery-related text items
texts = [
    'fresh red apples',
    'organic bananas',
    'ripe mangoes',
    'whole wheat bread',
    'farm-fresh eggs',
    'natural yogurt',
    'frozen vegetables',
    'grass-fed beef',
    'free-range chicken',
    'fresh salmon fillet',
    'aromatic coffee beans',
    'pure honey',
    'golden apple',
    'red fruit'
]

# Create a list of unique IDs for each text item in the 'texts' array
# Each ID follows the format 'food_<index>', where <index> starts from 1
ids = [f"food_{index + 1}" for index, _ in enumerate(texts)]


try:
    collection.add(
        documents=texts,
        metadatas=[{"source": "grocery_store", "category": "food"} for _ in texts],
        ids=ids
    )
    print(f"\nSuccessfully added {len(texts)} grocery items to the collection.")
except Exception as e:
    print(f"Error adding documents: {e}")
    print("Please check your input data and the collection's status.")


# Retrieve all the items (documents) stored in the collection
# The `get` method fetches all data from the collection
all_items = collection.get()

# Log the retrieved items to the console for inspection
# This will print out all the documents, IDs, and metadata stored in the collection
print("\n--- Collection contents (retrieved using .get()) ---")
print(f"Number of documents: {len(all_items['documents'])}")
# Optionally, you can print a few retrieved items to verify
print("Sample retrieved documents:")
for i in range(min(3, len(all_items['documents']))):
    print(f"  ID: {all_items['ids'][i]}, Document: '{all_items['documents'][i]}', Metadata: {all_items['metadatas'][i]}")

print("\nDocuments added and collection contents verified. Proceed to querying.")



Successfully added 14 grocery items to the collection.

--- Collection contents (retrieved using .get()) ---
Number of documents: 14
Sample retrieved documents:
  ID: food_1, Document: 'fresh red apples', Metadata: None
  ID: food_2, Document: 'organic bananas', Metadata: None
  ID: food_3, Document: 'ripe mangoes', Metadata: None

Documents added and collection contents verified. Proceed to querying.


In [8]:
# @title Step 6: Perform a Similarity Search Function

# Function to perform a similarity search in the collection
def perform_similarity_search(collection):
    try:
        # Define the query term you want to search for in the collection
        query_term = "red fruit" # You can change this query term

        # Perform a query to search for the most similar documents to the 'query_term'
        # n_results specifies the number of top results to retrieve.
        results = collection.query(
            query_texts=[query_term],
            n_results=3,  # Retrieve top 3 results
            include=['documents', 'metadatas', 'distances'] # Ensure full results are included
        )

        print(f"\n--- Similarity Search Results for '{query_term}' ---")
        if results and results['documents'] and results['documents'][0]:
            for i in range(len(results['documents'][0])):
                doc_content = results['documents'][0][i]
                metadata = results['metadatas'][0][i] if results['metadatas'] and results['metadatas'][0] else "N/A"
                distance = results['distances'][0][i] if results['distances'] and results['distances'][0] else "N/A"
                print(f"Result {i+1}:")
                print(f"  Content: '{doc_content}'")
                print(f"  Metadata: {metadata}")
                print(f"  Distance: {distance:.4f}") # Lower distance means higher similarity
        else:
            print("No results found for the query.")

    except Exception as error:
        print(f"Error in similarity search: {error}")

# Call the function to perform the similarity search
perform_similarity_search(collection)

print("\nSimilarity search function defined and executed.")



--- Similarity Search Results for 'red fruit' ---
Result 1:
  Content: 'red fruit'
  Metadata: None
  Distance: 0.0000
Result 2:
  Content: 'fresh red apples'
  Metadata: None
  Distance: 0.5082
Result 3:
  Content: 'organic bananas'
  Metadata: None
  Distance: 0.9178

Similarity search function defined and executed.
