### DBMS Init

In [1]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pydantic>=1.9 (from chromadb)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-win_amd64.whl.metadata (9.0 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-win_amd64.whl.metadata (5.1 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading op

Path where the database will be stored.

In [4]:
from pathlib import Path
import chromadb

# Location of the Chroma database, given relative to the current working directory.
Relative_Database_path = "./chroma_Data" # Relative
# Resolve to an absolute path so the printed location is unambiguous.
Absolute_Database_path = Path(Relative_Database_path).resolve()
# Client bound to the database stored at that path; later cells use `client`.
client = chromadb.PersistentClient(path=Absolute_Database_path)
print("[INFO] Absolute Path to database: ", Absolute_Database_path)

[INFO] Absolute Path to database:  C:\Users\micro\Desktop\Abhinav college\Resources\Sem 7\Advanced NLP\RAG_for_research_papers\VectorDB\chroma_Data


In [5]:
# NOTE(review): this constructs the client again from the same
# Absolute_Database_path used when `client` was first created in the cell
# that defines the path; presumably redundant - confirm and drop one of them.
client = chromadb.PersistentClient(path=Absolute_Database_path)

# Show the client object's repr as a quick check that it was constructed.
print("[INFO] Client: ", client, "initialized")

[INFO] Client:  <chromadb.api.client.Client object at 0x0000017755CFD350> initialized


In [6]:
# List the collections known to this client and print them.
# NOTE(review): the name `collections` is also the name of a standard-library
# module; it would be shadowed if that module were imported in this notebook.
collections = client.list_collections()
print("Collections of this client: ", collections)

Collections of this client:  [Collection(name=my_collection)]


### Creating first collection. 
A collection is analogous to a table in a relational DBMS, e.g., a collection of knowledge.

You can only query one collection at a time, so we will use a single collection for the whole RAG pipeline.

In [17]:
!pip install sentence_transformers  

Collecting sentence_transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
    --------------------------------------- 0.3/11.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.6 MB 1.5 MB/s eta 0:00:08
   -- ------------------------------------- 0.8/11.6 MB 1.3 MB/s eta 0:00:09
   --- ------------------------------------ 1.0/11.6 MB 1.4 MB/s eta 0:00:08
   ---- ----------------------------------- 1.3/11.6 MB 1.4 MB/s eta 0:00:08
   ----- -----------

In [7]:
# Create (or reopen) the single collection used by this notebook.
from datetime import datetime

from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Embed documents with the sentence-transformers "all-MiniLM-L6-v2" model.
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

collection = client.get_or_create_collection(
    name="my_collection",
    embedding_function=embedding_function,
    metadata={
        "owner": "Abhinav",
        "description": "my first Chroma collection",
        "created": str(datetime.now())
    },
    # Index settings offer more control; tune them to improve performance.
    # `configuration` expects the HNSW options nested under an "hnsw" key,
    # using the names below. The flat "hnsw:space" / "hnsw:construction_ef" /
    # "hnsw:M" / "hnsw:search_ef" spelling is the legacy *metadata* form and
    # is not the schema of this argument.
    # NOTE(review): these options presumably only take effect when the
    # collection is first created; an already existing "my_collection" would
    # keep whatever settings it was created with - verify.
    configuration={
        "hnsw": {
            "space": "cosine",
            "ef_construction": 200,
            "max_neighbors": 16,
            "ef_search": 50,
        }
    }
)

print("Collection Generated with embedding function:", embedding_function)

Collection Generated with embedding function: <chromadb.utils.embedding_functions.sentence_transformer_embedding_function.SentenceTransformerEmbeddingFunction object at 0x0000017755CA5D50>


In [8]:
# Small toy corpus: one text per record, with matching metadata and ids.
docs = [
    "The Eiffel Tower, a famous landmark, is located in Paris, France.",
    "The Colosseum is in Rome.",
    "The Great Wall of China is visible from space (not really)."
]

metas = [
    {"city": "Paris", "country": "France"},
    {"city": "Rome", "country": "Italy"},
    {"city": "Beijing", "country": "China"}
]

ids = ["doc1", "doc2", "doc3"]

# The three lists are matched by position, so they must have equal length.
assert len(docs) == len(metas) == len(ids), "docs, metas and ids must align"

# upsert() inserts ids that are not stored yet and overwrites the ones that
# are. update() alone only touches existing ids, so on a fresh database it
# would store nothing; upsert() keeps this cell correct on a first run and
# safe to re-run.
collection.upsert(
    documents=docs,
    metadatas=metas,
    ids=ids
)


In [9]:
# Quick sanity check of what the collection currently holds.
# Note: the peek() output below includes the full embedding arrays.
print(collection.count())      # number of items
print(collection.peek())       # first few items


3
{'ids': ['doc1', 'doc2', 'doc3'], 'embeddings': array([[ 0.06705675,  0.05626705, -0.00802136, ...,  0.05057557,
         0.07656233,  0.0231804 ],
       [ 0.01886094,  0.00550705,  0.03100901, ..., -0.01101136,
        -0.03171892,  0.05067538],
       [ 0.05961616,  0.05668674,  0.04932906, ...,  0.01538757,
        -0.07548427,  0.07908341]], shape=(3, 384)), 'documents': ['The Eiffel Tower, a famous landmark, is located in Paris, France.', 'The Colosseum is in Rome.', 'The Great Wall of China is visible from space (not really).'], 'uris': None, 'included': ['metadatas', 'documents', 'embeddings'], 'data': None, 'metadatas': [{'country': 'France', 'city': 'Paris', 'source': 'test'}, {'country': 'Italy', 'city': 'Rome'}, {'city': 'Beijing', 'country': 'China'}]}


### Querying the Database

In [23]:
!pip install rich




In [10]:
from rich.console import Console
from rich.table import Table

# Shared rich console; print_results_table() prints its table through it.
console = Console()

def print_results_table(results):
    """Print the hits of the first query in ``results`` as a rich table.

    Args:
        results: Mapping with the keys "ids", "documents", "metadatas" and
            "distances". Each value is expected to be a list holding one
            list of values per query; only the entry at index 0 (the first
            query) is displayed. A key may be absent, ``None`` or empty,
            e.g. when the field was not requested from the query.

    Returns:
        None. The table is printed through the module-level ``console``.

    One row is printed per hit, ranked from 1 in the order given. Fields that
    are missing are shown as "-" instead of raising or dropping the rows.
    """
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Rank", style="dim", width=6)
    table.add_column("ID", style="cyan")
    table.add_column("Document", style="green")
    table.add_column("Metadata", style="yellow")
    table.add_column("Distance", justify="right")

    placeholder = "-"

    def first_query(field):
        # A field can be missing, None or empty; treat all of these as
        # "no values" rather than indexing into None.
        per_query = results.get(field)
        if per_query is None or len(per_query) == 0:
            return []
        values = per_query[0]
        return [] if values is None else list(values)

    ids = first_query("ids")
    docs = first_query("documents")
    metas = first_query("metadatas")
    dists = first_query("distances")

    def cell(values, row):
        # Pad columns that are shorter than the longest one.
        return str(values[row]) if row < len(values) else placeholder

    # Drive the row count by the longest column so that one absent field
    # does not silently suppress every row (which zip() would do).
    row_count = max(len(ids), len(docs), len(metas), len(dists))

    for row in range(row_count):
        dist = dists[row] if row < len(dists) else None
        table.add_row(
            str(row + 1),
            cell(ids, row),
            cell(docs, row),
            cell(metas, row),
            placeholder if dist is None else f"{dist:.4f}"
        )

    console.print(table)

In [11]:
# Query the collection with a single question, asking for 2 results and for
# the documents, metadatas and distances to be included in the response.
results = collection.query(
    query_texts=["Where is the Eiffel Tower?"],
    n_results=2,
    include=["documents", "metadatas", "distances"]
)

# Render the hits of the (only) query as a table.
print_results_table(results)