# Introduction

In this demo, we'll demonstrate how to leverage a sample dataset stored in Azure Cosmos DB for MongoDB to ground OpenAI models. We'll do this by taking advantage of Azure Cosmos DB for MongoDB's [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) functionality. In the end, we'll create an interactive chat session with the GPT-3.5 completions model to answer questions about movies informed by our dataset. This process is known as Retrieval Augmented Generation, or RAG.

This tutorial borrows some code snippets and example data from the Azure Cognitive Search Vector Search demo.

# Preliminaries <a class="anchor" id="preliminaries"></a>
First, let's start by installing the packages that we'll need later. 

In [None]:
! pip install json
! pip install python-dotenv

! pip install pymongo
! pip install openai

! pip install gradio

In [None]:
import time
import json
from dotenv import dotenv_values

import pymongo
from openai import AzureOpenAI

import gradio as gr

Please use the example.env as a template to provide the necessary keys and endpoints in your own .env file.
Make sure to modify the env_name accordingly. 

In [3]:

# Name of the .env file to load. Copy example.env, fill in your own values,
# and point env_name at that file.
env_name = "fabcondemo.env"
config = dotenv_values(env_name)

# --- MongoDB settings read from the .env file ---
mongo_conn = config["mongo_connection_string"]
mongo_database = config["mongo_database_name"]
mongo_collection = config["mongo_collection_name"]
mongo_vector_property = config["mongo_vector_property_name"]
mongo_cache = config["mongo_cache_collection_name"]
mongo_semcache = config["mongo_semcache_collection_name"]
mongo_chat_history = config["mongo_chathistory_collection_name"]

# MongoDB client shared by the rest of the notebook.
mongo_client = pymongo.MongoClient(mongo_conn)

# --- Azure OpenAI settings read from the .env file ---
openai_endpoint = config["openai_endpoint"]
openai_key = config["openai_key"]
openai_version = config["openai_version"]
openai_embeddings_deployment = config["openai_embeddings_deployment"]
openai_embeddings_model = config["openai_embeddings_model"]
# dotenv values are strings, so the dimension count is converted explicitly.
openai_embeddings_dimensions = int(config["openai_embeddings_dimensions"])
openai_completions_deployment = config["openai_completions_deployment"]
openai_completions_model = config["openai_completions_model"]

# Azure OpenAI client shared by the rest of the notebook.
openai_client = AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
    api_version=openai_version,
)


# MongoDB database and collections

In [4]:
# Handle to the database named in the .env configuration.
database = mongo_client[mongo_database]

# Two collections are used below: the movie data that gets searched, and
# the cache of prompts with their generated completions.
movies, cache = database[mongo_collection], database[mongo_cache]


# Generate embeddings from Azure OpenAI

This is used to vectorize the user input for the vector search.

In [5]:
## Rewrite, but copy from setup.

def generate_embeddings(text):
    '''
    Return the embedding vector for a string of text.

    The text is sent to the configured Azure OpenAI embeddings deployment.
    The resulting vector is used to vectorize data and user input for
    interactions with Azure OpenAI.
    '''
    request = {
        "input": text,
        # The SDK parameter is called `model`, but the value passed here is
        # the deployment name.
        "model": openai_embeddings_deployment,
        "dimensions": openai_embeddings_dimensions,
    }
    payload = openai_client.embeddings.create(**request).model_dump()

    # Only one input is sent, so the first data item holds its embedding.
    first_item = payload['data'][0]
    return first_item['embedding']

# Vector Search in Azure Cosmos DB for MongoDB

This defines a function for performing a vector search over any collection in Azure Cosmos DB for MongoDB with a similarly configured vector index. It takes a collection reference and an array of vectors, plus two optional parameters: a minimum similarity score used to keep only the top matches, and the number of results to return.

In [6]:
def vector_search(collection, vectors, similarity_score=0.02, num_results=3):
    '''
    Run a vector similarity search against a collection.

    Args:
        collection: Collection to search; it is queried through its
            aggregate() method.
        vectors: Query embedding to compare against the stored vectors.
        similarity_score: Only matches whose score is strictly greater than
            this value are kept.
        num_results: Passed to the search as "k", the number of nearest
            matches requested.

    Returns:
        A list of dicts, each with a 'similarityScore' and a 'document'.
        The '_id' fields and the stored vector are removed from every result.
    '''
    pipeline = [
        {
            '$search': {
                "cosmosSearch": {
                    "vector": vectors,
                    "path": mongo_vector_property,
                    "k": num_results,
                    "efsearch": 40 # optional for HNSW only 
                },
                "returnStoredSource": True }},
        { '$project': { 'similarityScore': { '$meta': 'searchScore' }, 'document' : '$$ROOT' } },
        { '$match': { "similarityScore": { '$gt': similarity_score } } }
    ]

    results = list(collection.aggregate(pipeline))

    # Exclude the stored vector to reduce payload size to the LLM, and the
    # _id properties to avoid serialization issues. The vector lives under
    # the configured property name (the same one used as the search path),
    # so that name is removed rather than a hard-coded 'vector' key; pop()
    # keeps this tolerant of documents that lack a field.
    for result in results:
        result.pop('_id', None)
        document = result['document']
        document.pop(mongo_vector_property, None)
        document.pop('_id', None)

    return results

Let's run a test query below.

In [None]:
# Smoke test: embed a sample question, search the movie collection with it,
# and print the score and title of every match.
query = "What movie has Buzz Lightyear?"
embeddings = generate_embeddings(query)
results = vector_search(movies, embeddings)
for result in results:
    score_line = f"Similarity Score: {result['similarityScore']}"
    print(score_line)
    title_line = f"Title: {result['document']['title']}"
    print(title_line)

# Define Function to get recent chat history

This provides conversational context to the LLM, allowing it to better have a conversation with the user.

In [8]:
# Grab chat history to send as part of the payload to the GPT model for completion.
def get_chat_history(completions=3):
    '''
    Fetch recent prompt/completion pairs from the cache collection.

    Args:
        completions: Maximum number of cached entries to request.

    Returns:
        The query cursor over the cache collection, sorted by _id in
        descending order and limited to `completions` entries. Only the
        prompt and completion fields are projected.
    '''
    wanted_fields = {"prompt": 1, "completion": 1}
    descending_id = [("_id", -1)]

    cursor = cache.find({}, wanted_fields)
    cursor = cursor.sort(descending_id)
    return cursor.limit(completions)

# Define Chat Completion Function

This function assembles the payload to send to a GPT model to generate a completion.

In [9]:
# This function helps to ground the model with prompts and system instructions.

def generate_completion(user_prompt, vector_search_results, chat_history):
    '''
    Assemble the chat payload and request a completion from the GPT model.

    Args:
        user_prompt: The question the user just asked.
        vector_search_results: Search hits; each item's 'document' is
            serialized to JSON and sent as grounding data.
        chat_history: Iterable of earlier exchanges, each with 'prompt' and
            'completion' keys, ordered newest first (the order
            get_chat_history returns them in).

    Returns:
        The completion response converted to a plain dict via model_dump().
    '''
    system_prompt = '''
    You are an intelligent assistant for the Movie Lens Expert AI Assistant.
    You are designed to provide helpful answers to user questions about movies in your database.
    You are friendly, helpful, and informative and wee bit cheeky.
        - Only answer questions related to the information provided below.
        - Write two lines of whitespace between each answer in the list.
        - If you're unsure of an answer, you can say "I don't know" or "I'm not sure" and recommend users search themselves.
    '''

    # Create a list of messages as a payload to send to the OpenAI Completions API

    # System Prompt
    messages = [{"role": "system", "content": system_prompt}]

    # Grounding data goes right after the system prompt, which refers to
    # "the information provided below".
    for item in vector_search_results:
        messages.append({"role": "system", "content": json.dumps(item['document'])})

    # Chat history arrives newest first; replay it oldest first, with each
    # exchange expressed as a user turn followed by the assistant's reply.
    for item in reversed(list(chat_history)):
        messages.append({"role": "user", "content": item['prompt']})
        messages.append({"role": "assistant", "content": item['completion']})

    # The current question goes last so it is the turn the model answers.
    messages.append({"role": "user", "content": user_prompt})

    # Send the payload to the OpenAI Completions API
    response = openai_client.chat.completions.create(model = openai_completions_deployment, messages = messages)

    return response.model_dump()

# Cache Generated Responses

Save the user prompts and generated completions in a conversation. 

This can be used to answer the same questions from other users. Generally this is more efficient and faster.

In [11]:
def cache_response(user_prompt, prompt_vectors, response):
    '''
    Store a prompt, its completion, and its embedding in the cache collection.

    Args:
        user_prompt: The text the user asked.
        prompt_vectors: Embedding of the prompt, stored under the configured
            vector property name.
        response: Completion response as a dict; the first choice's message
            content, the token usage counts, and the model name are read
            from it.
    '''
    completion_text = response['choices'][0]['message']['content']
    usage = response['usage']

    # Token counts are stored as strings.
    record = {
        'prompt': user_prompt,
        'completion': completion_text,
        'completionTokens': str(usage['completion_tokens']),
        'promptTokens': str(usage['prompt_tokens']),
        'totalTokens': str(usage['total_tokens']),
        'model': response['model'],
        mongo_vector_property: prompt_vectors,
    }

    cache.insert_one(record)


# Define LLM Pipeline function

In [12]:
def chat_completion(user_input):
    '''
    Answer a user question, serving it from the cache when possible.

    Args:
        user_input: The question typed by the user.

    Returns:
        A dict with two keys: 'completion' (the answer text) and
        'cache_hit' (True when the answer came from the cache collection,
        False when it was freshly generated). The chat UI reads exactly
        these two keys.
    '''
    # Generate embeddings from the user input
    user_embeddings = generate_embeddings(user_input)

    # Query the cache first to see if this question has been asked before.
    # The 0.99 threshold keeps only near-identical prompts; limit to 1 result.
    cache_results = vector_search(cache, user_embeddings, similarity_score=0.99, num_results=1)

    if cache_results:
        return {
            'completion': cache_results[0]['document']['completion'],
            'cache_hit': True,
        }

    # Perform a vector search on the user input
    search_results = vector_search(movies, user_embeddings)

    # Get recent chat history to send to GPT model
    chat_history = get_chat_history(3)

    # Generate completion based on the search results and user input with chat history for context
    completions_results = generate_completion(user_input, search_results, chat_history)

    # Cache the generated completion
    cache_response(user_input, user_embeddings, completions_results)

    # Return the generated LLM completion
    return {
        'completion': completions_results['choices'][0]['message']['content'],
        'cache_hit': False,
    }
    

# Let's make a UX for this thing


In [13]:
# Module-level list; the callback below works on its own `chat_history`
# parameter (the chatbot's current value), not on this variable.
chat_history = []
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Ask me anything about movies!")
    clear = gr.Button("Clear")

    def user(user_message, chat_history):
        '''
        Handle a submitted message: get a completion and extend the chat.

        Args:
            user_message: Text entered in the textbox.
            chat_history: Current chatbot contents as [user, bot] pairs.

        Returns:
            A tuple of (an update that clears the textbox, the chat history
            with the new exchange appended).
        '''
        # Time the request with a monotonic clock so the measurement is not
        # affected by system clock adjustments.
        start_time = time.perf_counter()

        # Get LLM completion
        response_payload = chat_completion(user_message)

        elapsed_time = round((time.perf_counter() - start_time) * 1000, 2)

        # chat_completion may hand back either a dict with 'completion' and
        # 'cache_hit' keys or just the completion text; accept both so a
        # plain string does not crash the callback.
        if isinstance(response_payload, dict):
            response = response_payload['completion']
            cache_hit = response_payload['cache_hit']
        else:
            response = response_payload
            cache_hit = "unknown"

        # Start from an empty list if the chatbot has no contents yet.
        chat_history = chat_history or []

        # Append user message and response to chat history
        chat_history.append([user_message, response + f"\n (Time: {elapsed_time}ms, Cache Hit: {cache_hit})"])

        return gr.update(value=""), chat_history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)

    # Returning None to the chatbot output clears the conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

In [None]:
# Start the Gradio app with debug mode switched on.
demo.launch(debug=True)

In [None]:
# Close the Gradio app once you are done with the demo.
demo.close()