# Movie Lens
Subset of the Movie Lens 25M dataset

# Setup, Vectorize and Load Data

In this tutorial, we'll demonstrate how to leverage a sample dataset stored in Azure Cosmos DB for MongoDB to ground OpenAI models. We'll do this by taking advantage of Azure Cosmos DB for MongoDB vCore's [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) functionality. In the end, we'll create an interactive chat session with the GPT-3.5 completions model to answer questions about movies, informed by our dataset. This process is known as Retrieval Augmented Generation, or RAG.

In [None]:
! pip install openai
! pip install pymongo
! pip install python-dotenv
! pip install urlopen

In [None]:
import pymongo
import zipfile
import json
from openai import AzureOpenAI
from dotenv import dotenv_values
import urllib
from tenacity import retry, stop_after_attempt, wait_random_exponential
from time import sleep

# Load environment values and initiate clients

In [None]:
# Path of the .env file holding the settings for this notebook.
# Follow the example.env template and change this to your own .env file name.
env_name = "../fabconf.env"
config = dotenv_values(env_name)

# Azure Cosmos DB for MongoDB settings
cosmos_conn = config["cosmos_for_mongodb_connection_string"]
cosmos_database = config["cosmos_database_name"]
cosmos_collection = config["cosmos_collection_name"]
cosmos_vector_property = config["cosmos_vector_property_name"]
cosmos_cache = config["cosmos_cache_collection_name"]

# Azure OpenAI settings
openai_endpoint = config["openai_endpoint"]
openai_key = config["openai_key"]
openai_api_version = config["openai_api_version"]
openai_embeddings_deployment = config["openai_embeddings_deployment"]
openai_embeddings_model = config["openai_embeddings_model"]
# The dimension count is converted to an integer before use
openai_embeddings_dimensions = int(config["openai_embeddings_dimensions"])
openai_completions_deployment = config["openai_completions_deployment"]
openai_completions_model = config["openai_completions_model"]

In [None]:
# Azure Cosmos DB for MongoDB client, built from the configured connection string
cosmos_client = pymongo.MongoClient(cosmos_conn)

# Azure OpenAI client, built from the configured endpoint, key and API version
openai_client = AzureOpenAI(
    api_version=openai_api_version,
    api_key=openai_key,
    azure_endpoint=openai_endpoint,
)

#  Create a collection with a vector index

This function takes a database object, a collection name, the name of the document property that will store vectors, and the number of vector dimensions used for the embeddings.

In [None]:
def create_collection_and_vector_index(database, cosmos_collection, vector_property, embeddings_dimensions):
    """Create a vector index on a collection and return the collection.

    Args:
        database: Database object. It is indexed by collection name and its
            ``command`` method is used to run ``createIndexes``.
        cosmos_collection: Name of the collection to index.
        vector_property: Name of the document property that stores vectors.
        embeddings_dimensions: Number of dimensions of the stored vectors.

    Returns:
        The collection object obtained from ``database[cosmos_collection]``.
    """
    hnsw_options = {
        "kind": "vector-hnsw",
        "m": 16,  # default value
        "efConstruction": 64,  # default value
        "similarity": "COS",
        "dimensions": embeddings_dimensions,
    }
    vector_index = {
        "name": "VectorSearchIndex",
        "key": {vector_property: "cosmosSearch"},
        "cosmosSearchOptions": hnsw_options,
    }

    target = database[cosmos_collection]

    # "createIndexes" stays the first key because it names the command to run
    database.command({"createIndexes": cosmos_collection, "indexes": [vector_index]})

    return target

# Create the Database and Collections with Vector Index

Create a collection for the movie data and another as a conversation cache

In [None]:
# Start from a clean slate: drop the database if it already exists
existing_databases = cosmos_client.list_database_names()
if cosmos_database in existing_databases:
    cosmos_client.drop_database(cosmos_database)

# Get a handle to the database named in the configuration
database = cosmos_client[cosmos_database]

# Data collection with its vector index
collection = create_collection_and_vector_index(
    database,
    cosmos_collection,
    cosmos_vector_property,
    openai_embeddings_dimensions,
)

# Cache collection with its vector index (same vector property and dimensions)
cache = create_collection_and_vector_index(
    database,
    cosmos_cache,
    cosmos_vector_property,
    openai_embeddings_dimensions,
)

# Generate embeddings from Azure OpenAI

Generate embeddings from passed in text. Add retry to handle any throttling due to quota limits.

In [None]:
@retry(wait=wait_random_exponential(min=1, max=200), stop=stop_after_attempt(20))
def generate_embeddings(text):
    """Return the embedding vector for ``text`` from the embeddings deployment.

    The call is retried with random exponential backoff (up to 20 attempts)
    to ride out throttling.

    Args:
        text: The text to embed.

    Returns:
        The ``embedding`` value of the first item in the response data.
    """
    result = openai_client.embeddings.create(
        model=openai_embeddings_deployment,
        dimensions=openai_embeddings_dimensions,
        input=text,
    )

    # Convert the response to plain dicts/lists and pick the first embedding
    payload = result.model_dump()
    first_item = payload['data'][0]
    return first_item['embedding']

In [None]:
# Unzip the data file into ../Data.
# The `with` statement closes the archive on exit, so no explicit close() is needed.
with zipfile.ZipFile("../Data/MovieLens-4489-256D.zip", 'r') as zip_ref:
    zip_ref.extractall("../Data")

In [None]:
# Load the documents from the extracted JSON file.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('../Data/MovieLens-4489-256D.json', 'r', encoding='utf-8') as d:
    data = json.load(d)

In [None]:
# Peek at the first document to see what a loaded record looks like
# (as the last expression in the cell, its value is shown as the cell output)
data[0]

# Stream, vectorize & store

Iterate over the documents loaded from the local data file and store them in Azure Cosmos DB for MongoDB. The sample data is already vectorized, so the embedding-generation step is left commented out.

In [None]:
# Iterate through the loaded documents and insert each one into the collection.
# The loop variable is named `movie` so the builtin `object` is not shadowed,
# and enumerate() supplies the running count used for progress messages.
for counter, movie in enumerate(data, start=1):
    # The sample data is already vectorized, so embedding generation is left commented out.
    # Uncomment the next two lines to generate the vectors again:
    # vector_array = generate_embeddings("Title:" + movie['original_title'] + ", Tagline:" + movie['tagline'] + ", Overview:" + movie['overview'])
    # movie[cosmos_vector_property] = vector_array

    # insert the document into the collection
    collection.insert_one(movie)

    # report progress every 100 documents
    if counter % 100 == 0:
        print("Inserted {} documents into collection: '{}'.".format(counter, collection.name))

print ("Upsert completed!")

Now you're ready to start building your Chatbot!