# Setup, Vectorize and Load Data

In this tutorial, we'll demonstrate how to leverage a sample dataset stored in Azure Cosmos DB for MongoDB vCore to ground OpenAI models. We'll do this by taking advantage of Azure Cosmos DB for MongoDB vCore's [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) functionality. In the end, we'll create an interactive chat session with the GPT-3.5 completions model to answer questions about Azure services informed by our dataset. This process is known as Retrieval Augmented Generation, or RAG.

In [None]:
! pip install openai
! pip install pymongo
! pip install python-dotenv
! pip install azure-storage-blob
! pip install ijson

In [3]:
import json
import pymongo
import ijson
from azure.storage.blob import BlobServiceClient
from openai import AzureOpenAI
from dotenv import dotenv_values


# Load environment values and instantiate clients

In [4]:

# Name of the .env file that holds the settings for this notebook.
# Start from the example.env template and point this at your own file.
env_name = "fabcondemo.env"
config = dotenv_values(env_name)

# --- MongoDB settings and client ---
mongo_conn = config["mongo_connection_string"]
mongo_database = config["mongo_database_name"]
mongo_products_collection = config["mongo_collection_name"]
mongo_cache_collection = config["mongo_cache_collection_name"]
mongo_client = pymongo.MongoClient(mongo_conn)

# --- Blob storage settings and clients ---
storage_account_url = config["storage_account_url"]
storage_container_name = config["storage_container_name"]
storage_file_name = config["storage_file_name"]
# No credential is passed here; only the account URL is supplied.
blob_service_client = BlobServiceClient(account_url=storage_account_url)
blob_client = blob_service_client.get_blob_client(
    container=storage_container_name,
    blob=storage_file_name,
)

# --- Azure OpenAI settings and client ---
openai_endpoint = config["openai_endpoint"]
openai_key = config["openai_key"]
openai_version = config["openai_version"]
openai_embeddings_deployment = config["openai_embeddings_deployment"]
openai_embeddings_model = config["openai_embeddings_model"]
# dotenv values are strings, so the dimension count is converted to int.
openai_embeddings_dimensions = int(config["openai_embeddings_dimensions"])
openai_completions_deployment = config["openai_completions_deployment"]
openai_completions_model = config["openai_completions_model"]
openai_client = AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
    api_version=openai_version,
)


# Set up the MongoDB vCore database and collections

In [5]:
def create_collection_and_vector_index(database, collection_name):
    """Create the vector search index on a collection and return the collection.

    Args:
        database: Database handle. It is indexed by collection name and its
            ``command`` method is used to issue the ``createIndexes`` command.
        collection_name: Name of the collection that receives the index.

    Returns:
        The object obtained from ``database[collection_name]``.
    """
    target = database[collection_name]

    # HNSW index settings. The dimension count is read from the module-level
    # ``openai_embeddings_dimensions`` setting at call time.
    hnsw_options = {
        "kind": "vector-hnsw",
        "m": 16,  # default value
        "efConstruction": 64,  # default value
        "similarity": "COS",
        "dimensions": openai_embeddings_dimensions,
    }

    # The index is built over the "contentVector" field of each document.
    vector_index = {
        "name": "VectorSearchIndex",
        "key": {"contentVector": "cosmosSearch"},
        "cosmosSearchOptions": hnsw_options,
    }

    # "createIndexes" must stay the first key of the command document.
    database.command(
        {
            "createIndexes": collection_name,
            "indexes": [vector_index],
        }
    )

    return target
    

In [6]:

# If the target database already exists, drop it so the notebook starts
# from a fresh database.
existing_databases = mongo_client.list_database_names()
if mongo_database in existing_databases:
    mongo_client.drop_database(mongo_database)

# Get a handle to the database named by the mongo_database setting
db = mongo_client[mongo_database]

# Products collection, with its vector index
products_collection = create_collection_and_vector_index(
    db, mongo_products_collection
)

# Cache collection, with its vector index
cache_collection = create_collection_and_vector_index(
    db, mongo_cache_collection
)


In [7]:
def generate_embeddings(text):
    """Return the embedding vector for a string of text.

    This is used to vectorize both the data and the user input for
    interactions with Azure OpenAI.

    Args:
        text: The string to embed.

    Returns:
        The ``embedding`` value of the first item in the response data.
    """
    # The ``model`` argument is given the deployment name, not the model name.
    result = openai_client.embeddings.create(
        input=text,
        model=openai_embeddings_deployment,
        dimensions=openai_embeddings_dimensions,
    ).model_dump()

    first_item = result['data'][0]
    return first_item['embedding']

# Ingest, vectorize & store

Read the data out of blob storage, generate a vector for each item, then store the results in Azure Cosmos DB for MongoDB vCore.

In [None]:

# Download the whole blob into memory. readall() returns the complete
# content, so nothing is streamed from storage here.
stream = blob_client.download_blob().readall()

# Iterate over the elements of the top-level JSON array one at a time.
# use_float=True makes ijson return non-integer numbers as float; by default
# it returns decimal.Decimal, which json.dumps below cannot serialize.
objects = ijson.items(stream, 'item', use_float=True)

for obj in objects:
    # Serialize the object to a string; this text is what gets embedded
    serialized = json.dumps(obj)

    # Generate an embedding for the serialized object
    embedding = generate_embeddings(serialized)

    # Store the embedding under the field covered by the vector index
    obj["contentVector"] = embedding

    # Insert the object into the collection
    products_collection.insert_one(obj)

print(f"Data inserted into collection: '{products_collection.name}'.\n")