[![Lab Documentation and Solutions](https://img.shields.io/badge/Lab%20Documentation%20and%20Solutions-purple)](https://mongodb-developer.github.io/vector-search-lab/)

# Step 1: Setup prerequisites

Replace `<MONGODB_URI>` with your **MongoDB connection string**

In [25]:
import os
from pymongo import MongoClient

In [None]:
# Retain the quotes ("") when pasting the URI
MONGODB_URI = "<MONGODB_URI>"
# Initialize a MongoDB Python client
mongodb_client = MongoClient(MONGODB_URI, appname="devrel.workshop.vector_search")
# Check the connection to the server
mongodb_client.admin.command("ping")

### **Do not change the values assigned to the variables below**

In [27]:
# Database name
DB_NAME = "mongodb_genai_devday"
# Collection name
COLLECTION_NAME = "books"
# Name of the vector search index
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

📚 https://pymongo.readthedocs.io/en/stable/tutorial.html#getting-a-collection

In [28]:
# Connect to the `COLLECTION_NAME` collection.
# Use the `mongodb_client`, database and collection variables defined above.
collection = <CODE_BLOCK_1>

In [29]:
SERVERLESS_URL = os.environ.get("SERVERLESS_URL")

# Step 2: Import data

In [30]:
import requests

In [None]:
# Send an `import_data` task to the endpoint at `SERVERLESS_URL`, passing the
# MongoDB connection string as the task data
response = requests.post(
    url=SERVERLESS_URL, json={"task": "import_data", "data": MONGODB_URI}
)
if response.status_code == 200:
    db = mongodb_client[DB_NAME]
    # Report the document count of each collection. The collection name and count
    # are bound to variables first so that the f-string contains no nested double
    # quotes, which are a SyntaxError on Python versions older than 3.12.
    for name in ("full_docs", "chunked_docs", "books"):
        count = db[name].count_documents({})
        print(f"{count} documents ingested into the {name} collection.")
else:
    print(f"Error code {response.status_code}: Error ingesting data into MongoDB")

# Step 3: Generating Embeddings

In [32]:
from sentence_transformers import SentenceTransformer
from PIL import Image

In [58]:
# Load a multimodal embedding model using the Sentence Transformers library
embedding_model = SentenceTransformer("clip-ViT-B-32")

### For images

📚 https://huggingface.co/sentence-transformers/clip-ViT-B-32#usage

In [34]:
image_url = "https://images.isbndb.com/covers/77/44/9780789467744.jpg"
# Load the image from the URL above
image = Image.open(requests.get(image_url, stream=True).raw)
# Embed the `image` using the `embedding_model` instantiated above and return the embedding as a list
# An array can be converted to a list using the `tolist()` method
embedding = <CODE_BLOCK_2>

In [None]:
print(embedding)

### For text

In [36]:
text = "Puppy Preschool: Raising Your Puppy Right---Right from the Start!"
# Use the same `embedding_model` to embed a piece of text
embedding = embedding_model.encode(text).tolist()

In [None]:
print(embedding)

# Step 4: Adding Embeddings to Existing Data in Atlas

In [89]:
from typing import List, Dict, Optional

In [38]:
# Field in the documents to embed-- in this case, the book cover
field_to_embed = "cover"
# Name of the embedding field to add to the documents
embedding_field = "embedding"

In [59]:
def get_embedding(content: str, type: str) -> List[float]:
    """
    Generate an embedding for a piece of text or an image.

    Args:
        content (str): Content to embed. When `type` is "image", this is the URL
            the image is downloaded from.
        type (str): Type of content (Can be one of "image" or "text"). Any value
            other than "image" passes `content` to the model unchanged.

    Returns:
        List[float]: Embedding of the content as a list.

    Raises:
        requests.HTTPError: If the image URL responds with an HTTP error status.
    """
    # If an image URL is provided, first load the image
    if type == "image":
        response = requests.get(content, stream=True)
        # Surface HTTP errors (404, 403, ...) directly instead of handing an error
        # page to PIL, which would fail with an unrelated "cannot identify image" error
        response.raise_for_status()
        content = Image.open(response.raw)
    return embedding_model.encode(content).tolist()

📚 https://www.mongodb.com/docs/manual/reference/operator/query/exists/#syntax

In [43]:
# Create a query expression for documents where the `embedding_field` does not exist.
query = <CODE_BLOCK_3>

📚 https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#syntax

In [44]:
# Execute the `query` against the `collection` collection using `find()`
results = <CODE_BLOCK_4>

📚 **$set:** https://www.mongodb.com/docs/manual/reference/operator/update/set/#syntax

📚 **update_one():** https://www.mongodb.com/docs/manual/reference/method/db.collection.updateOne/#definition

In [42]:
# Update each document in the `collection` collection with embeddings
for result in results:
    content = result[field_to_embed]
    # Use the `get_embedding` function defined above to embed the `content`
    # Note that `content` contains the cover image URL for the book 
    embedding = <CODE_BLOCK_5>

    
    # Filter for the document where the `_id` field is equal to the `_id` of the current document
    filter = {"_id": result["_id"]}
    # Set the `embedding_field` field to the value `embedding` using the `$set` operator
    update = <CODE_BLOCK_6>
    # Update the documents in the `collection` collection in place using the `update_one()` operation
    # Get the right document `_id` using the `filter` and apply the `update`
    <CODE_BLOCK_7>

# Step 5: Create a vector search index

In [46]:
# Create vector index definition specifying:
# path: Path to the embeddings field
# numDimensions: Number of embedding dimensions- depends on the embedding model used
# similarity: Similarity metric. One of cosine, euclidean, dotProduct.
model = {
    "name": ATLAS_VECTOR_SEARCH_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 512,
                "similarity": "cosine",
            }
        ]
    },
}

📚 https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_search_index

In [None]:
# Create a vector search index with the above definition for the `collection` collection
<CODE_BLOCK_8>

# Step 6: Perform semantic search queries

📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/#ann-examples (Refer to the "Filter Example")

In [107]:
# Define a function to retrieve relevant documents for a user query using vector search
def vector_search(
    user_query: str, type: str, filter: Optional[Dict] = {}
) -> List[Dict]:
    """
    Retrieve relevant documents for a user query using vector search.

    Args:
    user_query (str): The user's query (can be a piece of text or a link to an image)
    type (str): Query type (image or text)
    filter (Optional[Dict], optional): Optional vector search pre-filter

    Returns:
    list: A list of matching documents.
    """
    # Generate embedding for the `user_query` using the `get_embedding` function defined in Step 4
    query_embedding = <CODE_BLOCK_9>

    # Define an aggregation pipeline consisting of a $vectorSearch stage, followed by a $project stage
    # Set the number of candidates to 50 and only return the top 5 documents from the vector search
    # Set the `filter` field to the value `filter` 
    # In the $project stage, exclude the `_id` field and include only the `title` field and `vectorSearchScore`
    # NOTE: Use variables defined previously for the `index`, `queryVector` and `path` fields in the $vectorSearch stage
    pipeline = <CODE_BLOCK_10>

    # Execute the aggregation `pipeline` and store the results in `results`
    results = <CODE_BLOCK_11>
    return list(results)

In [None]:
# Test the vector search with a text query
vector_search("Feel good books", "text")

In [None]:
# Test the vector search with an image query
vector_search("https://images.isbndb.com/covers/38/93/9780439343893.jpg", "image")

# Step 7: Adding pre-filters to your vector search

In [None]:
# Helper function to update a search index
def update_index(model: Dict) -> None:
    """
    Apply a new definition to the existing MongoDB Atlas Vector Search index.

    The index updated is always the one named `ATLAS_VECTOR_SEARCH_INDEX_NAME`
    on the `collection` collection; only the "definition" entry of `model` is used.

    Args:
        model (Dict): Index model containing the new "definition"
    """
    new_definition = model["definition"]
    collection.update_search_index(
        name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
        definition=new_definition,
    )

### Filter for books that were published after the year `2000`

📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#about-the-filter-type

In [101]:
# Modify the vector search index `model` from Step 5 to include the `year` field as a `filter` field
model = <CODE_BLOCK_11>

In [102]:
# Update the index using the `update_index` function defined above
update_index(model)

**NOTE:** Check that the update is complete before proceeding. To do so, ensure that the status of the index shows "Ready" in the Atlas UI.

📚 https://www.mongodb.com/docs/manual/reference/operator/query/gte/#syntax

In [None]:
# Create a filter definition to filter for books where the `year` field is greater than or equal to `2000` using the `$gte` operator
filter = <CODE_BLOCK_12>
# Pass the `filter` as an argument to the `vector_search` function.
# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function
vector_search("Feel good books", "text", filter)

### Filter for books that were published after the year `2000` and are under `100` pages

📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#about-the-filter-type

In [112]:
# Modify the vector search index `model` from Step 5 to include `year` and `pages` as filter fields
model = <CODE_BLOCK_13>

In [None]:
# Update the index using the `update_index` function defined above
update_index(model)

**NOTE:** Check that the update is complete before proceeding. To do so, ensure that the status of the index shows "Ready" in the Atlas UI.

📚 https://www.mongodb.com/docs/manual/reference/operator/query/lte/#mongodb-query-op.-lte

In [None]:
# Create a filter definition to filter for books where the `year` field is greater than or equal to `2000` and the `pages` field is less than or equal to 100
# Use the `$gte` and `$lte` operators
filter = <CODE_BLOCK_14>
# Pass the `filter` as an argument to the `vector_search` function.
# Notice how this filter is incorporated in the `pipeline` in the `vector_search` function
vector_search(
    "https://images.isbndb.com/covers/38/93/9780439343893.jpg", "image", filter
)

# Step 8: Tuning your vector search

#### Try a different similarity metric

📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax

In [117]:
# Modify the vector search index `model` from Step 5 to change the similarity metric to `dotProduct`
model = <CODE_BLOCK_15>

In [118]:
# Update the index using the `update_index` function defined in Step 7
update_index(model)

**NOTE:** Check that the update is complete before proceeding. To do so, ensure that the status of the index shows "Ready" in the Atlas UI.

In [None]:
# Perform a semantic search
# Note any differences in the results due to the different similarity metric
vector_search("Feel good books", "text")

### Enable vector quantization

📚 https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/#syntax

In [96]:
# Modify the vector search index `model` from Step 5 to use `scalar` quantization
model = <CODE_BLOCK_16>

In [98]:
# Update the index using the `update_index` function defined in Step 7
update_index(model)

**NOTE:** Check that the update is complete before proceeding. To do so, ensure that the status of the index shows "Ready" in the Atlas UI.

In [None]:
# Perform a semantic search
# Note any differences in the results due to vector quantization
vector_search("Feel good books", "text")

# 🦹‍♀️ Hybrid Search

In [123]:
# Name of the full-text search index
ATLAS_FTS_INDEX_NAME = "fts_index"

In [None]:
# Create full-text search index definition specifying the field mappings
model = {
    "name": ATLAS_FTS_INDEX_NAME,
    "type": "search",
    "definition": {
        "mappings": {"dynamic": False, "fields": {"synopsis": {"type": "string"}}}
    },
}

📚 https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_search_index

In [None]:
# Create a full-text search index with the above definition for the `collection` collection
<CODE_BLOCK_17>

**NOTE:** Refer to our [documentation](https://www.mongodb.com/docs/atlas/atlas-vector-search/tutorials/reciprocal-rank-fusion/#about-the-query) for a detailed explanation of the hybrid search query below.

In [155]:
def hybrid_search(
    user_query: str, vector_weight: float, full_text_weight: float
) -> List[Dict]:
    """
    Retrieve relevant documents for a user query using hybrid search.

    Runs a vector search and a full-text search, scores each result by its rank
    within its own result list, and combines both scores per document.

    Args:
        user_query (str): User query string
        vector_weight (float): Weight of vector search in the final search results
        full_text_weight (float): Weight of full-text search in the final search results

    Returns:
        List[Dict]: Up to 5 documents containing `_id`, `title`, `vs_score`,
            `fts_score` and the combined `score`, sorted by `score` in descending order.
    """
    pipeline = [
        # Vector search: top 10 documents for the embedded text query
        {
            "$vectorSearch": {
                "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
                "path": "embedding",
                "queryVector": get_embedding(user_query, "text"),
                "numCandidates": 50,
                "limit": 10,
            }
        },
        # Collect the results into one array and unwind it again so that each
        # document carries its position in the result list as `rank`
        {"$group": {"_id": None, "docs": {"$push": "$$ROOT"}}},
        {"$unwind": {"path": "$docs", "includeArrayIndex": "rank"}},
        # Weighted reciprocal-rank score: vector_weight * 1 / (rank + 60)
        {
            "$addFields": {
                "vs_score": {
                    "$multiply": [
                        vector_weight,
                        {"$divide": [1.0, {"$add": ["$rank", 60]}]},
                    ]
                }
            }
        },
        {"$project": {"vs_score": 1, "_id": "$docs._id", "title": "$docs.title"}},
        # Append the full-text search results, ranked and scored the same way
        {
            "$unionWith": {
                "coll": COLLECTION_NAME,
                "pipeline": [
                    {
                        "$search": {
                            "index": ATLAS_FTS_INDEX_NAME,
                            "text": {"query": user_query, "path": "synopsis"},
                        }
                    },
                    {"$limit": 10},
                    {"$group": {"_id": None, "docs": {"$push": "$$ROOT"}}},
                    {"$unwind": {"path": "$docs", "includeArrayIndex": "rank"}},
                    {
                        "$addFields": {
                            "fts_score": {
                                "$multiply": [
                                    full_text_weight,
                                    {"$divide": [1.0, {"$add": ["$rank", 60]}]},
                                ]
                            }
                        }
                    },
                    {
                        "$project": {
                            "fts_score": 1,
                            "_id": "$docs._id",
                            "title": "$docs.title",
                        }
                    },
                ],
            }
        },
        # Merge the entries of documents that appear in both result lists
        {
            "$group": {
                "_id": "$_id",
                "title": {"$first": "$title"},
                "vs_score": {"$max": "$vs_score"},
                "fts_score": {"$max": "$fts_score"},
            }
        },
        # A document found by only one of the searches gets a score of 0 for the other
        {
            "$project": {
                "_id": 1,
                "title": 1,
                "vs_score": {"$ifNull": ["$vs_score", 0]},
                "fts_score": {"$ifNull": ["$fts_score", 0]},
            }
        },
        # Combined score is the sum of the two weighted scores
        {
            "$project": {
                "score": {"$add": ["$fts_score", "$vs_score"]},
                "_id": 1,
                "title": 1,
                "vs_score": 1,
                "fts_score": 1,
            }
        },
        # Keep the 5 highest-scoring documents
        {"$sort": {"score": -1}},
        {"$limit": 5},
    ]

    results = collection.aggregate(pipeline)
    return list(results)

In [None]:
# Test the hybrid search query with a weight of 0.7 for vector search and 0.3 for full-text search
hybrid_search(user_query="Feel good books", vector_weight=0.7, full_text_weight=0.3)