1. Install Necessary Libraries

In [1]:
# Install the LangChain / Gemini packages at exact pinned versions.
!pip install langchain==0.3.7 langchain-core==0.3.15 langchain-google-genai==2.0.4 google-generativeai==0.8.3 protobuf==5.27.0



In [7]:
# NOTE(review): chromadb is installed without a version pin, unlike the packages
# in the first install cell — consider pinning it so re-runs are reproducible.
!pip install chromadb
import os
import json
import numpy as np
from chromadb.utils import embedding_functions

# LangChain for Gemini Embeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# ChromaDB Client
import chromadb
from google.colab import userdata



In [8]:
# --- API Key Setup ---
# The Gemini API key is read from Colab Secrets. The secret is stored under the
# name below, so that is the name reported to the user if the lookup fails.
GEMINI_SECRET_NAME = "DAY1"

try:
    gemini_api_key = userdata.get(GEMINI_SECRET_NAME)
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as secret_error:
    # userdata.get raises (rather than returning None) when the secret does not
    # exist or the notebook has not been granted access to it. Fall back to None
    # so the check below prints the warning instead of the cell crashing.
    print(f"Could not read Colab secret '{GEMINI_SECRET_NAME}': {secret_error}")
    gemini_api_key = None


if not gemini_api_key:
    print(f"Warning: Colab secret '{GEMINI_SECRET_NAME}' not found or empty. Gemini embedding will not run.")
else:
    print("Setup complete. Environment configured.")



Setup complete. Environment configured.


2. Text Embedding: Translating Text to Vectors

In [9]:
# 2.1. Defining Sample Data: four short sample documents. Doc 2 and Doc 3 are
# conceptually similar (both are about riding a bike); Doc 1 and Doc 4 are
# unrelated university-administration texts.
documents = [
    "The official university policy states that all faculty must submit expense reports by the 15th of every month.", # Doc 1: Finance
    "Riding a bicycle provides excellent low-impact cardiovascular exercise and is a great way to commute.",           # Doc 2: Cycling
    "I enjoy going cycling on the weekends, especially when the weather is clear and the trails are dry.",           # Doc 3: Biking
    "Please consult the academic handbook regarding grading policies and attendance requirements for final year students." # Doc 4: Academics
]

# The user's query that every document is compared against in the later cells
user_query = "What is the best form of exercise using wheels?"

2.2. Gemini Embedding Model (Google):

In [10]:

# LangChain wrapper around the Gemini embedding model, authenticated with the
# key loaded in the API-key setup cell.
gemini_embedder = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=gemini_api_key
)

# Generate embeddings for the documents and the query
try:
    doc_embeddings_gemini = gemini_embedder.embed_documents(documents)
    query_embedding_gemini = gemini_embedder.embed_query(user_query)

    print(f"Gemini Embedding Dimension: {len(query_embedding_gemini)}")
    # Preview the first five components of each document vector. Looping keeps
    # the output in step with however many documents were embedded, instead of
    # hard-coding indices 0-3.
    for doc_number, doc_vector in enumerate(doc_embeddings_gemini, start=1):
        print(f"Vector for Doc {doc_number} (start): {doc_vector[:5]}...")
    print(f"Vector for Query (start): {query_embedding_gemini[:5]}...")

except Exception as e:
    # Best-effort: the error is only printed, so on failure this cell does not
    # assign fresh embeddings and later cells that read them will not have them.
    print(f"\nError using Gemini Embedder (Check API key): {e}")

Gemini Embedding Dimension: 768
Vector for Doc 1 (start): [0.033828187733888626, -0.016320111230015755, -0.013897339813411236, -0.03159501776099205, 0.015815390273928642]...
Vector for Doc 2 (start): [-0.012195533141493797, -0.007790034636855125, -0.016390662640333176, -0.05465248227119446, -0.0024157853331416845]...
Vector for Doc 3 (start): [0.003655583830550313, -0.028434794396162033, -0.02677733637392521, -0.035773057490587234, 0.009551131166517735]...
Vector for Doc 4 (start): [0.017848238348960876, 0.026532815769314766, -0.03261523321270943, -0.08204198628664017, -0.000329095171764493]...
Vector for Query (start): [0.010527719743549824, -0.06336859613656998, 0.00984877347946167, -0.033117953687906265, 0.03267448768019676]...


2.3. Open-Source Embedding Model (Hugging Face):

In [13]:
# Rather than going through ChromaDB's embedding-function wrapper, we call the
# SentenceTransformer library directly: the wrapper does not expose
# `embed_documents` or `embed_query` methods that can be called from outside a collection.
from sentence_transformers import SentenceTransformer

# Initialize the open-source sentence transformer model by its checkpoint name
model_name = "all-MiniLM-L6-v2"
hf_model = SentenceTransformer(model_name)

# Generate embeddings using the open-source model.
# `encode` is called once with the whole document list and once with the single
# query string; `.tolist()` turns each result into plain Python lists.
doc_embeddings_hf = hf_model.encode(documents).tolist()
query_embedding_hf = hf_model.encode(user_query).tolist()

# Report the vector size and preview the first five components of Doc 1
print(f"\nHF Embedding Dimension: {len(query_embedding_hf)}")
print(f"Vector for Doc 1 (start): {doc_embeddings_hf[0][:5]}...")


HF Embedding Dimension: 384
Vector for Doc 1 (start): [0.01963353157043457, -0.01359286718070507, -0.053158652037382126, 0.01634487695991993, 0.05234808847308159]...


3. Vector Database:

In [14]:
from chromadb.api.types import EmbeddingFunction, Embeddable
from typing import List

# Adapter class: exposes a LangChain-style embedder through ChromaDB's EmbeddingFunction interface
class GeminiChromaEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function that delegates to a wrapped embedder.

    The wrapped object must provide an ``embed_documents`` method that takes a
    batch of texts and returns one vector per text.
    """

    def __init__(self, embedder_model):
        """Keep a reference to the embedder that produces the vectors."""
        self._embedder = embedder_model

    def __call__(self, input: Embeddable) -> List[List[float]]:
        """Embed a batch of inputs by forwarding it to ``embed_documents``.

        NOTE(review): ChromaDB presumably calls this for ``query_texts`` as
        well, so queries would be embedded with ``embed_documents`` rather
        than ``embed_query`` — confirm that is intended.
        """
        vectors = self._embedder.embed_documents(input)
        return vectors

    def name(self) -> str:
        """Return the fixed identifier ChromaDB uses for this embedding function."""
        return "google_gemini_text_embedding"

client = chromadb.Client()

# Adapt the LangChain embedder to the interface ChromaDB expects
chroma_gemini_ef = GeminiChromaEmbeddingFunction(gemini_embedder)

# Open the collection if it already exists, otherwise create it; either way it
# uses the wrapped Gemini embedder.
collection_gemini = client.get_or_create_collection(
    embedding_function=chroma_gemini_ef,
    name="gemini_docs_collection",
)

# Populate the collection only when it is empty, so re-running this cell does
# not insert the same documents a second time.
if collection_gemini.count() > 0:
    print("\nChromaDB collection already exists and contains documents. Skipping document addition.")
else:
    # One "type" label per document, in the same order as `documents`
    doc_types = ["Finance", "Cycling", "Biking", "Academics"]
    collection_gemini.add(
        documents=documents,
        ids=[f"doc{position}" for position in range(1, len(documents) + 1)],
        metadatas=[{"type": doc_type} for doc_type in doc_types],
    )
    print("\nChromaDB collection created and documents embedded using Gemini.")


ChromaDB collection already exists and contains documents. Skipping document addition.


4. Semantic Search & Cosine Similarity:

In [15]:
def cosine_similarity(vec_a, vec_b):
    """Return the cosine of the angle between two vectors.

    Both arguments may be NumPy arrays or plain sequences of numbers; they are
    converted with ``np.array`` first. If either vector has zero length
    (norm 0) the similarity is undefined, and 0.0 is returned instead.
    """
    lhs = np.array(vec_a)
    rhs = np.array(vec_b)

    # Numerator: the dot product of the two vectors
    numerator = np.dot(lhs, rhs)

    # Denominator terms: the Euclidean length of each vector
    magnitudes = (np.linalg.norm(lhs), np.linalg.norm(rhs))

    # Guard against dividing by zero when either vector is all zeros
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0

    return numerator / (magnitudes[0] * magnitudes[1])

# Compare the query against all documents using Gemini Embeddings
print("\n--- Manual Cosine Similarity Scores (Gemini) ---")

# Fetch every document's metadata with a single collection lookup instead of
# one lookup per loop iteration, and key it by id so each label is matched to
# its document regardless of the order the rows come back in.
doc_ids = [f"doc{i+1}" for i in range(len(doc_embeddings_gemini))]
stored_docs = collection_gemini.get(ids=doc_ids)
type_by_id = {
    stored_id: stored_meta["type"]
    for stored_id, stored_meta in zip(stored_docs["ids"], stored_docs["metadatas"])
}

for i, doc_vector in enumerate(doc_embeddings_gemini):
    score = cosine_similarity(query_embedding_gemini, doc_vector)
    print(f"Doc {i+1} ({type_by_id[doc_ids[i]]}): {score:.4f}")


--- Manual Cosine Similarity Scores (Gemini) ---
Doc 1 (Finance): 0.2788
Doc 2 (Cycling): 0.6211
Doc 3 (Biking): 0.4806
Doc 4 (Academics): 0.2878


4.2. ChromaDB Retrieval:

In [16]:
# Ask ChromaDB for the nearest documents to the query. Metadata is requested in
# the same call so no second lookup per hit is needed.
results = collection_gemini.query(
    query_texts=[user_query],
    n_results=4, # Return the top 4 results
    include=['documents', 'distances', 'metadatas']
)

print("\n--- ChromaDB Semantic Search Results ---")
print(f"Query: {user_query}")
print("-" * 40)

# Output the results. Every field is a list of lists with one inner list per
# query text; only one query was sent, so index [0] selects its hits.
hits = zip(
    results['ids'][0],
    results['documents'][0],
    results['distances'][0],
    results['metadatas'][0],
)
for rank, (doc_id, content, distance, metadata) in enumerate(hits, start=1):
    # NOTE(review): `distance` is in the collection's configured metric, which
    # is not necessarily cosine — confirm before comparing with the manual scores.
    print(f"Rank {rank}: ID {doc_id} | Type: {metadata['type']} | Distance: {distance:.4f}")
    print(f"   Content: {content[:70]}...")



--- ChromaDB Semantic Search Results ---
Query: What is the best form of exercise using wheels?
----------------------------------------
Rank 1: ID doc2 | Type: Cycling | Distance: 0.4030
   Content: Riding a bicycle provides excellent low-impact cardiovascular exercise...
Rank 2: ID doc3 | Type: Biking | Distance: 0.6020
   Content: I enjoy going cycling on the weekends, especially when the weather is ...
Rank 3: ID doc4 | Type: Academics | Distance: 0.9100
   Content: Please consult the academic handbook regarding grading policies and at...
Rank 4: ID doc1 | Type: Finance | Distance: 0.9677
   Content: The official university policy states that all faculty must submit exp...
