## Data Pipeline - Cosmos DB Mongo vCore

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 

#### Set environment variables

In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local `.env` file (if python-dotenv finds one) into the
# process environment so the `os.getenv` lookups below can see them.
load_dotenv()


def get_env_var(name):
    """Return the value of a required environment variable.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's value as a non-empty string.

    Raises:
        SystemExit: With exit status 1 when the variable is unset or empty.
            A message naming the variable is printed first.
    """
    value = os.getenv(name)
    # os.getenv returns either None or a str, so falsiness covers both the
    # "unset" and the "set but empty" cases.
    if not value:
        print(f"{name} environment variable not set.")
        # Raise explicitly instead of calling the bare `exit()` helper: that
        # helper is injected for interactive use (by `site` or the IPython
        # shell) and reports success (status 0). Raising guarantees execution
        # stops here with a failure status.
        raise SystemExit(1)
    return value


# Required connection settings, read in a fixed order:
# cluster name, then username, then password.
mongo_clustername, mongo_username, mongo_password = (
    get_env_var(setting)
    for setting in ("MONGO_CLUSTERNAME", "MONGO_USERNAME", "MONGO_PASSWORD")
)

# Names for the three sample indexes (text, document and image samples).
text_index_name, doc_index_name, image_index_name = (
    "text-sample",
    "doc-sample",
    "image-sample",
)

#### Helper methods

In [2]:
import math
from urllib.parse import quote_plus

import pymongo
from pymongo.errors import ConnectionFailure

# Percent-encode the credentials before embedding them in the URI. PyMongo
# expects the username and password to be escaped (RFC 3986); without this a
# password containing characters such as "@", ":", "/" or "%" breaks URI
# parsing. The environment variables must therefore hold the raw, un-escaped
# values.
conn_str = (
    f"mongodb+srv://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}"
    f"@{mongo_clustername}.mongocluster.cosmos.azure.com/"
    "?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
)


def get_mongo_client():
    """Build and return a new `pymongo.MongoClient` for the module-level `conn_str`."""
    mongo_client = pymongo.MongoClient(conn_str)
    return mongo_client


# Verify connectivity up front. The "ping" command is the call that contacts
# the server here, so it must sit inside the `try` for the handler to see a
# failure. The handler names the exception *class* (`ConnectionFailure`, not
# `ConnectionFailure()`); an instance in an `except` clause makes Python raise
# TypeError when it tries to match.
try:
    client = get_mongo_client()
    client.admin.command("ping")
except ConnectionFailure:
    print("Server not available")
    # Re-raise so the cell stops with the original error instead of carrying
    # on with an unusable client.
    raise

# TODO: use private endpoint around mongo to avoid need to add ClientIPAddress to firewall rules


def ivf_numlists(length):
    """Calculate the number of lists (`numLists`) for an IVF vector index.

    Below one million documents the result is `length / 1000` rounded up;
    from one million documents upwards it is the integer square root of
    `length`. The result is never smaller than 1, so an empty collection
    still yields a usable value.

    Args:
        length: Number of documents that will be indexed.

    Returns:
        The number of IVF lists, as an integer >= 1.
    """
    if length < 1000000:
        # One list per (started) block of 1000 documents.
        num_lists = math.ceil(length / 1000)
    else:
        num_lists = math.isqrt(length)

    # Clamp so that length == 0 does not produce numLists == 0.
    return max(1, num_lists)

#### Create text-sample Mongo index

In [9]:
import pandas as pd
from pymongo import UpdateOne

# Load the text embeddings file and turn each DataFrame row into a dict.
text_df = pd.read_json("../data/text/product_docs_embeddings.json")
records = text_df.to_dict(orient="records")

# New client -> `semanticsearch` database -> `text` collection handle.
client = get_mongo_client()
db = client["semanticsearch"]
collection = db["text"]

# One upsert per record, keyed on the record's own "id", so re-running the
# cell updates existing documents instead of inserting duplicates.
operations = [
    UpdateOne({"_id": record["id"]}, {"$set": record}, upsert=True)
    for record in records
]
collection.bulk_write(operations)

# Size the IVF index from the number of records just written.
num_lists = ivf_numlists(len(records))

# Uncomment to drop an existing index before re-creating it:
# db.text.drop_index("text_vector_index")

# Create the IVF (Inverted File) vector index on `title_vector`.
db.command(
    "createIndexes",
    "text",
    indexes=[
        {
            "name": "text_vector_index",
            "key": {"title_vector": "cosmosSearch"},
            "cosmosSearchOptions": {
                "kind": "vector-ivf",
                "numLists": num_lists,
                "similarity": "COS",
                "dimensions": 1536,  # 1536 for OpenAI ADA model embeddings
            },
        }
    ],
)

{'raw': {'defaultShard': {'numIndexesBefore': 1,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}

#### Create doc-sample Mongo index

In [8]:
import pandas as pd
from pymongo import UpdateOne

# Load the document-chunk embeddings file and turn each row into a dict.
doc_df = pd.read_json("../data/docs/employee_handbook_embeddings.json")
records = doc_df.to_dict(orient="records")

# New client -> `semanticsearch` database -> `docs` collection handle.
client = get_mongo_client()
db = client["semanticsearch"]
collection = db["docs"]

# One upsert per record, keyed on the record's own "id", so re-running the
# cell updates existing documents instead of inserting duplicates.
operations = [
    UpdateOne({"_id": record["id"]}, {"$set": record}, upsert=True)
    for record in records
]
collection.bulk_write(operations)

# Size the IVF index from the number of records just written.
num_lists = ivf_numlists(len(records))

# Uncomment to drop an existing index before re-creating it:
# db.docs.drop_index("docs_vector_index")

# Create the IVF (Inverted File) vector index on `chunk_content_vector`.
db.command(
    "createIndexes",
    "docs",
    indexes=[
        {
            "name": "docs_vector_index",
            "key": {"chunk_content_vector": "cosmosSearch"},
            "cosmosSearchOptions": {
                "kind": "vector-ivf",
                "numLists": num_lists,
                "similarity": "COS",
                "dimensions": 1536,  # 1536 for OpenAI ADA model embeddings
            },
        }
    ],
)

{'raw': {'defaultShard': {'numIndexesBefore': 1,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}

#### Create image-sample Mongo index

In [15]:
import pandas as pd
from pymongo import UpdateOne

# Load the image embeddings file and turn each DataFrame row into a dict.
# NOTE(review): `doc_df` is the same name the doc-sample cell uses, so after
# this cell it holds image data. Kept as-is here; consider renaming.
doc_df = pd.read_json("../data/images/images_embeddings.json")
records = doc_df.to_dict(orient="records")

# New client -> `semanticsearch` database -> `images` collection handle.
client = get_mongo_client()
db = client["semanticsearch"]
collection = db["images"]

# One upsert per record, keyed on the record's own "id", so re-running the
# cell updates existing documents instead of inserting duplicates.
operations = [
    UpdateOne({"_id": record["id"]}, {"$set": record}, upsert=True)
    for record in records
]
collection.bulk_write(operations)

# Size the IVF index from the number of records just written.
num_lists = ivf_numlists(len(records))

# Uncomment to drop an existing index before re-creating it:
# db.images.drop_index("images_vector_index")

# Create the IVF (Inverted File) vector index on `image_vector`.
db.command(
    "createIndexes",
    "images",
    indexes=[
        {
            "name": "images_vector_index",
            "key": {"image_vector": "cosmosSearch"},
            "cosmosSearchOptions": {
                "kind": "vector-ivf",
                "numLists": num_lists,
                "similarity": "COS",
                "dimensions": 1024,  # 1024 for Azure Computer Vision Service embeddings
            },
        }
    ],
)

{'raw': {'defaultShard': {'numIndexesBefore': 1,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}