# MovieLens
A subset of the MovieLens 25M dataset.

# Setup, Vectorize and Load Data

In this tutorial, we'll demonstrate how to leverage a sample dataset stored in Azure Cosmos DB for MongoDB vCore to ground OpenAI models. We'll do this by taking advantage of Azure Cosmos DB for MongoDB vCore's [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) functionality. In the end, we'll create an interactive chat session with the GPT-3.5 completions model to answer questions about movies, informed by our dataset. This process is known as Retrieval Augmented Generation, or RAG.

In [None]:
! pip install openai
! pip install pymongo
! pip install python-dotenv
! pip install ijson
! pip install urlopen

In [3]:
import urllib
import urllib.request
from time import sleep

import ijson
import pymongo
from dotenv import dotenv_values
from openai import APIConnectionError, AzureOpenAI, InternalServerError, RateLimitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential


# Load environment values and instantiate clients

In [6]:

# Name of the .env file to load; copy the example.env template and change
# this to your own file name.
env_name = "fabcondemo.env"
config = dotenv_values(env_name)

# dotenv_values() returns an empty mapping when the file cannot be found, so
# check every setting up front and report all missing ones together instead
# of failing on the first lookup with a bare KeyError.
required_settings = (
    'mongo_connection_string',
    'mongo_database_name',
    'mongo_collection_name',
    'mongo_vector_property_name',
    'mongo_cache_collection_name',
    'storage_file_url',
    'openai_endpoint',
    'openai_key',
    'openai_version',
    'openai_embeddings_deployment',
    'openai_embeddings_model',
    'openai_embeddings_dimensions',
    'openai_completions_deployment',
    'openai_completions_model',
)
missing_settings = [name for name in required_settings if name not in config]
if missing_settings:
    raise KeyError(
        f"Missing settings in '{env_name}': {', '.join(missing_settings)}. "
        "Check that the file exists and follows the example.env template."
    )

# Azure Cosmos DB for MongoDB settings
mongo_conn = config['mongo_connection_string']
mongo_database = config['mongo_database_name']
mongo_collection = config['mongo_collection_name']
mongo_vector_property = config['mongo_vector_property_name']
mongo_cache = config['mongo_cache_collection_name']
# Create the MongoDB client
mongo_client = pymongo.MongoClient(mongo_conn)

# URL of the data file that is streamed during ingestion
storage_file_url = config['storage_file_url']

# Azure OpenAI settings
openai_endpoint = config['openai_endpoint']
openai_key = config['openai_key']
openai_version = config['openai_version']
openai_embeddings_deployment = config['openai_embeddings_deployment']
openai_embeddings_model = config['openai_embeddings_model']
# .env values are read as strings; the dimension count is needed as an int
openai_embeddings_dimensions = int(config['openai_embeddings_dimensions'])
openai_completions_deployment = config['openai_completions_deployment']
openai_completions_model = config['openai_completions_model']
# Create the OpenAI client
openai_client = AzureOpenAI(azure_endpoint=openai_endpoint, api_key=openai_key, api_version=openai_version)



#  Create a collection with a vector index

This function takes a database object, a collection name, the name of the document property that will store vectors, and the number of vector dimensions used for the embeddings.

In [11]:
def create_collection_and_vector_index(database, mongo_collection, vector_property, embeddings_dimensions):
    """Create the "VectorSearchIndex" vector index on a collection and return the collection.

    Args:
        database: Database handle; must support ``database[name]`` and
            ``database.command(...)``.
        mongo_collection: Name of the collection the index is created on.
        vector_property: Name of the document property that stores the vectors.
        embeddings_dimensions: Number of dimensions of the stored vectors.

    Returns:
        The object obtained from ``database[mongo_collection]``.
    """
    target_collection = database[mongo_collection]

    # Options of the HNSW vector index.
    hnsw_options = {
        "kind": "vector-hnsw",
        "m": 16,  # default value
        "efConstruction": 64,  # default value
        "similarity": "COS",
        "dimensions": embeddings_dimensions,
    }

    # Single index definition keyed on the vector property.
    vector_index = {
        "name": "VectorSearchIndex",
        "key": {vector_property: "cosmosSearch"},
        "cosmosSearchOptions": hnsw_options,
    }

    # "createIndexes" must stay the first key: it names the command being run.
    database.command({"createIndexes": mongo_collection, "indexes": [vector_index]})

    return target_collection
    

# Create the Database and Collections with Vector Index

Create a collection for the movie data and another as a conversation cache

In [8]:

# Start from a clean slate: if the database already exists, drop it
existing_databases = mongo_client.list_database_names()
if mongo_database in existing_databases:
    mongo_client.drop_database(mongo_database)

# Get a handle to the database named in the configuration
database = mongo_client[mongo_database]

# Create the data collection first, then the cache collection; both get the
# same vector index on the same vector property
collection, cache = (
    create_collection_and_vector_index(
        database, collection_name, mongo_vector_property, openai_embeddings_dimensions
    )
    for collection_name in (mongo_collection, mongo_cache)
)


# Generate embeddings from Azure OpenAI

Generate embeddings from the passed-in text. Add retry to handle any throttling due to quota limits.

In [12]:
# Retry only errors that can succeed on a later attempt (throttling,
# connection problems, server-side failures). Errors such as a bad request or
# invalid credentials are raised immediately instead of being retried for up
# to 20 attempts with waits of up to 200 seconds each.
@retry(
    retry=retry_if_exception_type((RateLimitError, APIConnectionError, InternalServerError)),
    wait=wait_random_exponential(min=1, max=200),
    stop=stop_after_attempt(20),
)
def generate_embeddings(text):
    """Return the embedding vector for *text* from the Azure OpenAI embeddings deployment.

    Args:
        text: The text to embed.

    Returns:
        The embedding of the first (and only) input item in the response.

    Raises:
        tenacity.RetryError: If all 20 attempts fail with a retryable error.
        openai.OpenAIError: Immediately, for errors that are not retried.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=openai_embeddings_deployment,
        dimensions=openai_embeddings_dimensions,
    )

    # A single input yields a single entry in response.data.
    return response.data[0].embedding

# Stream, vectorize & store

Stream the data out of blob storage, generate a vector for each item, then store the items in Azure Cosmos DB for MongoDB

In [None]:
# Number of documents inserted so far; stays 0 if the stream has no items
counter = 0

# Open the file as a stream; the with-block closes the HTTP response when
# ingestion finishes or fails part-way
with urllib.request.urlopen(storage_file_url) as stream:

    # Iterate through the stream one item at a time, generate vectors and
    # insert into the collection. use_float=True is kept from the original
    # call (presumably so numbers arrive as plain floats - confirm in ijson docs).
    for counter, movie in enumerate(ijson.items(stream, 'item', use_float=True), start=1):

        # Generate the embeddings and add the vector to the document.
        # NOTE(review): assumes every item has a non-empty 'overview' - confirm
        movie[mongo_vector_property] = generate_embeddings(movie['overview'])

        # Insert the document into the collection
        collection.insert_one(movie)

        if counter % 100 == 0:
            print(f"Inserted {counter} documents into collection: '{collection.name}'.")
            sleep(.5)   # sleep for 0.5 seconds to help avoid rate limiting


print(f"Data inserted into collection: '{collection.name}'.\n")