## Vector Search - Cosmos DB Mongo vCore

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 
- Create Mongo Search Index and ingest embeddings - [data_pipeline.ipynb](./data_pipeline.ipynb)

#### Set environment variables

In [None]:
import os
from dotenv import load_dotenv
import openai

# Load variables from a .env file (python-dotenv) into the process environment
# so the os.getenv lookups performed by get_env_var can see them.
load_dotenv()


def get_env_var(name):
    """Return the value of environment variable ``name``, aborting if it is unset.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's non-empty string value.

    Raises:
        SystemExit: With exit status 1 when the variable is missing or empty.
    """
    value = os.getenv(name)
    if not value:
        print(f"{name} environment variable not set.")
        # Raise SystemExit directly rather than calling the interactive
        # ``exit()`` helper: that helper is injected by the ``site`` module
        # (so it is not guaranteed to exist) and, called without arguments,
        # reports success (status 0) for what is a configuration failure.
        raise SystemExit(1)
    return value


# --- Azure Cosmos DB for MongoDB vCore connection settings ---
mongo_clustername, mongo_username, mongo_password = (
    get_env_var(key)
    for key in ("MONGO_CLUSTERNAME", "MONGO_USERNAME", "MONGO_PASSWORD")
)

# --- Azure OpenAI settings (endpoint, API version, embedding deployment, key) ---
(
    aoai_endpoint,
    aoai_api_version,
    aoai_embedding_deployed_model,
    azure_openai_key,
) = (
    get_env_var(key)
    for key in (
        "AOAI_ENDPOINT",
        "AOAI_API_VERSION",
        "AOAI_EMBEDDING_DEPLOYED_MODEL",
        "AZURE_OPENAI_KEY",
    )
)

# --- Azure Computer Vision settings ---
com_vision_endpoint, com_vision_api_version, com_vision_key = (
    get_env_var(key)
    for key in ("COM_VISION_ENDPOINT", "COM_VISION_API_VERSION", "COM_VISION_KEY")
)

# Index names for the text, document and image samples.
# NOTE(review): not referenced elsewhere in this notebook - presumably they
# match the indexes created by data_pipeline.ipynb; confirm.
text_index_name, doc_index_name, image_index_name = (
    "text-sample",
    "doc-sample",
    "image-sample",
)

# Configure the module-level openai settings for the Azure OpenAI resource.
openai.api_type = "azure"
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version
openai.api_key = azure_openai_key

#### Helper methods

In [None]:
import pymongo
import numpy as np
from typing import List
from pymongo.errors import ConnectionFailure

from urllib.parse import quote_plus

# Connection string for the Cosmos DB for MongoDB vCore cluster.
# The credentials are percent-encoded because a username or password that
# contains reserved characters such as "@", ":" or "/" otherwise produces an
# invalid MongoDB URI. MONGO_USERNAME / MONGO_PASSWORD must therefore hold the
# raw (un-encoded) values.
conn_str = (
    f"mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}"
    f"@{mongo_clustername}.mongocluster.cosmos.azure.com/"
    "?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
)


def get_mongo_client():
    """Build a new ``pymongo.MongoClient`` from the module-level ``conn_str``.

    Every call constructs a separate client object.
    """
    mongo_client = pymongo.MongoClient(conn_str)
    return mongo_client


# TODO: use private endpoint around mongo to avoid need to add ClientIPAddress to firewall rules

# Connectivity smoke test: issue a "ping" admin command and report the outcome.
try:
    client = get_mongo_client()

    # The ping forces a round trip to the server, so an unreachable cluster
    # surfaces here rather than on the first query.
    response = client.admin.command("ping")

    if response.get("ok") == 1.0:
        print("Successful ping")

# The except clause must name the exception *class*. The previous
# ``ConnectionFailure()`` built an instance, which made Python raise
# TypeError instead of handling the connection error.
except ConnectionFailure:
    print("Server not available")


def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two Euclidean
    norms; no special handling is applied to zero-length vectors.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product


def get_embedding(text: str, engine, **kwargs) -> List[float]:
    """Return the embedding vector for ``text`` via ``openai.Embedding.create``.

    Args:
        text: Text to embed; newlines are replaced with spaces before sending.
        engine: Value passed as ``engine`` to the embedding call (the notebook
            supplies the Azure OpenAI embedding deployment name).
        **kwargs: Extra keyword arguments forwarded to ``openai.Embedding.create``.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"``.
    """
    # Newlines can negatively affect embedding quality, so flatten them first.
    single_line = text.replace("\n", " ")

    response = openai.Embedding.create(input=[single_line], engine=engine, **kwargs)
    first_item = response["data"][0]
    return first_item["embedding"]


def vectorize_text_com_vision(
    com_vision_endpoint,
    com_vision_key,
    query,
    *,
    api_version="2023-02-01-preview",
    timeout=30,
):
    """Return the vector for ``query`` from the Computer Vision vectorizeText API.

    Args:
        com_vision_endpoint: Base URL of the Computer Vision resource; a
            trailing slash is tolerated.
        com_vision_key: Subscription key sent in ``Ocp-Apim-Subscription-Key``.
        query: Text to vectorize.
        api_version: Value of the ``api-version`` query parameter. Defaults to
            the version that was previously hard-coded.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The ``"vector"`` field of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Imported locally because the notebook never imports requests at file
    # level; without this the call below fails with NameError.
    import requests

    # Strip a trailing slash so the joined URL never contains "//".
    base_url = com_vision_endpoint.rstrip("/")
    vectorize_text_url = f"{base_url}/computervision/retrieval:vectorizeText"
    params = {"api-version": api_version}
    headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": com_vision_key,
    }
    data = {"text": query}

    response = requests.post(
        vectorize_text_url,
        params=params,
        headers=headers,
        json=data,
        timeout=timeout,
    )
    # Surface HTTP errors (bad key, wrong endpoint, throttling) explicitly
    # instead of as a confusing KeyError on the missing "vector" field.
    response.raise_for_status()

    return response.json()["vector"]


def show_image(image_folder, image):
    """Display the image file ``image`` located in ``image_folder``.

    Args:
        image_folder: Directory containing the image.
        image: File name of the image, joined onto ``image_folder``.
    """
    # Imported locally because the notebook never imports these at file level;
    # without them the calls below fail with NameError.
    import matplotlib.pyplot as plt
    from PIL import Image

    image_path = os.path.join(image_folder, image)
    # The context manager closes the underlying file once the figure is shown.
    with Image.open(image_path) as img:
        plt.imshow(img)
        plt.axis("off")
        plt.show()

#### Simple vector search

This demo shows how to apply vector search on a single field.

In [None]:
# Connect and select the `semanticsearch` database.
client = get_mongo_client()
db = client.semanticsearch

# Embed the query text with the configured embedding deployment.
query_text = "tools for software development"
embedding = get_embedding(query_text, engine=aoai_embedding_deployed_model)

# Single-stage pipeline: vector search over the `title_vector` field, k = 5.
vector_stage = {
    "$search": {
        "cosmosSearch": {"vector": embedding, "path": "title_vector", "k": 5},
        "returnStoredSource": True,
    }
}
pipeline = [vector_stage]

results = db.text.aggregate(pipeline)

print("Top 5 records from vector search:")

# Trailing expression: the notebook renders the projected records as cell output.
wanted_fields = ("id", "title", "content", "category")
[{field: record[field] for field in wanted_fields} for record in results]

#### Metadata filtering with vector search

This demo shows how to apply metadata filtering (where, order by, etc.) on top of vector search.

In [None]:
# Connect and select the `semanticsearch` database.
client = get_mongo_client()
db = client.semanticsearch

# Embed the query text with the configured embedding deployment.
query_text = "tools for software development"
embedding = get_embedding(query_text, engine=aoai_embedding_deployed_model)

# Stage 1: vector search over `title_vector` with k = 15.
vector_stage = {
    "$search": {
        "cosmosSearch": {"vector": embedding, "path": "title_vector", "k": 15},
        "returnStoredSource": True,
    }
}
# Stage 2: keep only the hits whose `category` equals "Web".
category_filter = {"$match": {"category": "Web"}}
pipeline = [vector_stage, category_filter]

results = db.text.aggregate(pipeline)

print("Top records from vector search:")

# Trailing expression: the notebook renders the projected records as cell output.
wanted_fields = ("id", "title", "content", "category")
[{field: record[field] for field in wanted_fields} for record in results]

#### Document search example

This demo shows how to apply vector search for searching within documents.

In [None]:
# Connect and select the `semanticsearch` database.
client = get_mongo_client()
db = client.semanticsearch

# Embed the question with the configured embedding deployment.
query_text = "when are performance review announced?"
embedding = get_embedding(query_text, engine=aoai_embedding_deployed_model)

# Single-stage pipeline: vector search over `chunk_content_vector`, k = 5.
cosmos_search = {
    "vector": embedding,
    "path": "chunk_content_vector",
    "k": 5,
}
pipeline = [{"$search": {"cosmosSearch": cosmos_search, "returnStoredSource": True}}]

results = db.docs.aggregate(pipeline)

print("Top 5 records from vector search:")

# Trailing expression: the notebook renders the projected records as cell output.
wanted_fields = ("id", "chunk_content")
[{field: record[field] for field in wanted_fields} for record in results]

#### Image search example

This demo shows how to apply vector search for searching images.

In [None]:
query = "flower"
image_folder = "../data/images"

# Turn the text query into a vector with the Computer Vision helper.
embedding = vectorize_text_com_vision(com_vision_endpoint, com_vision_key, query)

# Connect and select the `semanticsearch` database.
client = get_mongo_client()
db = client.semanticsearch

# Single-stage pipeline: vector search over the `image_vector` field, k = 5.
vector_stage = {
    "$search": {
        "cosmosSearch": {"vector": embedding, "path": "image_vector", "k": 5},
        "returnStoredSource": True,
    }
}
pipeline = [vector_stage]

results = db.images.aggregate(pipeline)

print("Top 5 records from vector search:")

# Reduce each hit to its id and image file name.
results = [{"id": record["id"], "image": record["image"]} for record in results]

# Render every matching image, followed by the same blank-line separator.
for hit in results:
    show_image(image_folder, hit.get("image"))
    print("\n")