# Build a RAG solution using Azure Cosmos DB

## Install Dependencies

In [None]:
# Install the packages listed below with the %pip magic, one package per command.
%pip install numpy
%pip install openai
%pip install python-dotenv
%pip install azure-core
%pip install azure-cosmos

## Load Azure configurations

Run this cell first, and again after every kernel restart: every later cell depends on the settings it loads.

In [1]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Azure OpenAI settings: endpoint, key and deployment names come from the
# environment; API version and embedding size are fixed for this notebook.
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
azure_openai_embeddings_deployment = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
azure_openai_api_version = "2024-10-01-preview"
azure_openai_embedding_size = 1536

# Azure Cosmos DB settings: endpoint and key come from the environment;
# database and container names are fixed for this notebook.
azure_cosmosdb_endpoint = os.getenv("AZURE_COSMOSDB_ENDPOINT")
azure_cosmosdb_key = os.getenv("AZURE_COSMOSDB_KEY")
azure_cosmosdb_database = "recipes-database"
azure_cosmosdb_container = "recipes-container"

# os.getenv returns None for unset variables, which would otherwise only
# surface later as an opaque SDK error. Report the problem here instead
# (non-fatal, so cells that do not need every setting still run).
_required_settings = {
    "AZURE_OPENAI_ENDPOINT": azure_openai_endpoint,
    "AZURE_OPENAI_API_KEY": azure_openai_key,
    "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME": azure_openai_deployment,
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT": azure_openai_embeddings_deployment,
    "AZURE_COSMOSDB_ENDPOINT": azure_cosmosdb_endpoint,
    "AZURE_COSMOSDB_KEY": azure_cosmosdb_key,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    print("WARNING: missing environment variables: " + ", ".join(_missing_settings))

## Setup Azure Cosmos DB

In [2]:
from azure.cosmos import CosmosClient
from azure.cosmos import PartitionKey, exceptions

# Setup the connection
cosmos_client = CosmosClient(url=azure_cosmosdb_endpoint, credential=azure_cosmosdb_key)

# Create database
db = cosmos_client.create_database_if_not_exists(id=azure_cosmosdb_database)

# Vector embedding policy: /contentVector holds float32 vectors of
# azure_openai_embedding_size dimensions, compared with cosine distance.
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/contentVector",
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": azure_openai_embedding_size
        }
    ]
}

# Full-text policy: /name and /description are declared as en-US text paths.
full_text_policy = {
    "defaultLanguage": "en-US",
    "fullTextPaths": [
        {
            "path": "/name",
            "language": "en-US"
        },
        {
            "path": "/description",
            "language": "en-US"
        }
    ]
}

# Indexing policy: index everything except _etag and the raw vector path,
# then add full-text indexes and a quantizedFlat vector index.
indexing_policy = {
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/\"_etag\"/?"
        },
        {
            "path": "/contentVector/*"
        }
    ],
    "fullTextIndexes": [
        {
            "path": "/name"
        },
        {
            "path": "/description"
        }
    ],
    "vectorIndexes": [
        {
            "path": "/contentVector",
            "type": "quantizedFlat"
        }
    ]
}

try:
    container = db.create_container_if_not_exists(
        id=azure_cosmosdb_container,
        partition_key=PartitionKey(path='/id', kind='Hash'),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy,
        full_text_policy=full_text_policy)

    # Report the container name (the original formatted the builtin `id`).
    # The call also succeeds when the container already exists, so the
    # message does not claim that it was newly created.
    print('Container with id \'{0}\' is ready'.format(azure_cosmosdb_container))

except exceptions.CosmosResourceExistsError:
    print('A container with id \'{0}\' already exists'.format(azure_cosmosdb_container))

# Re-fetch the client so `container` is bound even if the except branch ran.
container = db.get_container_client(azure_cosmosdb_container)

Container with id '<built-in function id>' created


## Creating embeddings separately

We compute the embeddings ourselves by calling the Azure OpenAI embeddings endpoint, then store each vector on its recipe before uploading.

In [3]:
from openai import AzureOpenAI
import json

# Azure OpenAI client
openai_client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_embeddings_deployment,
    api_key=azure_openai_key)

# Read the recipes.json
path = os.path.join('../Data/recipes/', 'recipes.json')
with open(path, 'r', encoding='utf-8') as file:
    recipes = json.load(file)


def _recipe_to_text(recipe):
    """Flatten one recipe dict into the text that gets embedded.

    List values become a "key:" header followed by one "  - item" line per
    element; every other value becomes a single "key: value" line.
    """
    parts = []
    for key, value in recipe.items():
        if isinstance(value, list):
            parts.append(f"{key}:\n")
            parts.extend(f"  - {item}\n" for item in value)
        else:
            parts.append(f"{key}: {value}\n")
    return "".join(parts)


# One formatted string per recipe, in the same order as `recipes`
combined_strings = [_recipe_to_text(recipe) for recipe in recipes]

# Generate embeddings for all combined strings in a single request
content_response = openai_client.embeddings.create(
    input=combined_strings,
    model=azure_openai_embeddings_deployment,
    dimensions=azure_openai_embedding_size
)

# Order by the `index` field of each result so every vector is paired with
# the input it was computed from, rather than relying on response order.
content_embeddings = [
    entry.embedding
    for entry in sorted(content_response.data, key=lambda entry: entry.index)
]

# Fail loudly instead of silently pairing vectors with the wrong recipes.
if len(content_embeddings) != len(recipes):
    raise ValueError(
        f"Expected {len(recipes)} embeddings but received {len(content_embeddings)}"
    )

# add contentVector field in recipes
for recipe, vector in zip(recipes, content_embeddings):
    recipe['contentVector'] = vector

# Output embeddings to new json file
output_path = os.path.join('../Data/recipes/', 'recipesVectors.json')
output_directory = os.path.dirname(output_path)
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(output_directory, exist_ok=True)
# Explicit encoding, matching how recipes.json is read above
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(recipes, f)

## Upload data to the container

In [4]:
import json

# Load the recipes (with their contentVector field) written by the previous step
with open('../Data/recipes/recipesVectors.json') as vectors_file:
    data = json.load(vectors_file)

container_client = db.get_container_client(azure_cosmosdb_container)

# Upsert one document per recipe, logging each id as it is written
for recipe_doc in data:
    recipe_id = recipe_doc['id']
    print("writing item ", recipe_id)
    container_client.upsert_item(recipe_doc)

writing item  1
writing item  2
writing item  3
writing item  4
writing item  5
writing item  6
writing item  7
writing item  8
writing item  9
writing item  10
writing item  11
writing item  12
writing item  13
writing item  14
writing item  15
writing item  16
writing item  17
writing item  18
writing item  19
writing item  20


## Hybrid Search helper function

In [5]:
from openai import AzureOpenAI
from azure.cosmos import CosmosClient

# Simple function to assist with hybrid search
def hybrid_search(user_query, num_results):
    """Run a hybrid (vector + full-text) search over the recipes container.

    The query text is embedded with the Azure OpenAI embeddings deployment,
    then documents are ranked with ``ORDER BY RANK RRF`` over the vector
    distance to ``c.contentVector`` and the full-text score of
    ``c.description``.

    Args:
        user_query: Text to search for.
        num_results: Maximum number of documents to return (used as ``TOP``).

    Returns:
        A list of dicts, one per hit, shaped as
        ``{'SimilarityScore': <VectorDistance value>, 'document': <projected fields>}``.

    Raises:
        ValueError / TypeError: if ``num_results`` cannot be converted to int.
    """
    import json  # local import: only needed to escape the query text below

    # TOP takes an integer; coercing also keeps arbitrary text out of the
    # query string.
    top_n = int(num_results)

    # Setup the connection
    cosmos_client = CosmosClient(url=azure_cosmosdb_endpoint, credential=azure_cosmosdb_key)
    database = cosmos_client.get_database_client(azure_cosmosdb_database)
    container = database.get_container_client(azure_cosmosdb_container)

    # Azure OpenAI client
    openai_client = AzureOpenAI(
        api_version=azure_openai_api_version,
        azure_endpoint=azure_openai_endpoint,
        azure_deployment=azure_openai_embeddings_deployment,
        api_key=azure_openai_key)

    # get embedding of user query
    response = openai_client.embeddings.create(input=user_query,
                                               model=azure_openai_embeddings_deployment,
                                               dimensions=azure_openai_embedding_size)
    embedding = response.data[0].embedding

    # Escape the user text as a double-quoted string literal. Splicing it raw
    # between single quotes (as before) broke the query on any apostrophe and
    # let the text inject query syntax.
    search_text = json.dumps(str(user_query))

    # format the query
    query = '''
                SELECT TOP {0}
                    c.id,
                    c.name,
                    c.description,
                    c.cuisine,
                    c.difficulty,
                    c.prepTime,
                    c.cookTime,
                    c.totalTime,
                    c.servings,
                    c.ingredients,
                    c.instructions,
                    VectorDistance(c.contentVector,{1}) AS SimilarityScore
                FROM c
                ORDER BY RANK RRF
                    (VectorDistance(c.contentVector, {1}), FullTextScore(c.description, [{2}]))
            '''.format(top_n, embedding, search_text)

    results = container.query_items(
            query=query,
            enable_cross_partition_query=True)

    # Split each hit into its score and the remaining projected fields
    formatted_results = []
    for document in results:
        score = document.pop('SimilarityScore')
        formatted_results.append({
            'SimilarityScore': score,
            'document': document
        })

    return formatted_results
    

## Hybrid Query Search

In [6]:
query = "teas in recipe"
results = hybrid_search(user_query=query, num_results=3)

# Print the score, id and name of every hit
for hit in results:
    recipe = hit['document']
    print(f"Similarity Score: {hit['SimilarityScore']}")
    print(f"ID: {recipe['id']}")
    print(f"Name: {recipe['name']}")

Similarity Score: 0.5442121841874742
ID: 4
Name: Chai Tea
Similarity Score: 0.3787573327519321
ID: 3
Name: Baklava
Similarity Score: 0.3714795702906132
ID: 5
Name: Irish Coffee


In [8]:
# Same search as above, but dump the full result list returned by hybrid_search
query = "teas in recipe"
results = hybrid_search(user_query=query, num_results=3)
print("Results for query: ", results)

Results for query:  [{'SimilarityScore': 0.5442121841874742, 'document': {'id': '4', 'name': 'Chai Tea', 'description': 'A spiced Indian tea made with black tea, milk, and a blend of aromatic spices like cardamom, cinnamon, and ginger. Chai Tea is typically enjoyed as a warm and comforting beverage, perfect for relaxing moments or as an accompaniment to breakfast or afternoon tea.', 'cuisine': 'Indian', 'difficulty': 'Easy', 'prepTime': '5 minutes', 'cookTime': '15 minutes', 'totalTime': '20 minutes', 'servings': 2, 'ingredients': ['2 cups water', '2 cups milk', '2 tablespoons loose black tea leaves', '2 tablespoons sugar (adjust to taste)', '2-3 whole green cardamom pods, lightly crushed', '1 small piece of cinnamon stick', '1/4 teaspoon grated ginger', 'Pinch of ground cloves (optional)'], 'instructions': ['In a saucepan, add water, crushed cardamom pods, cinnamon stick, grated ginger, and ground cloves (if using). Bring it to a boil.', 'Reduce the heat to low and add the loose black

## Send query results to a language model to generate response

In [7]:
def azure_cosmos_db_rag(query):
    """Answer ``query`` from recipes found by hybrid search and print the reply.

    Retrieves up to five recipes with ``hybrid_search``, renders them into the
    prompt template as sources, sends the result to the chat deployment as a
    single user-role message, and prints the content of the first choice.

    Args:
        query: The question to answer.

    Returns:
        None. The model's answer is printed.
    """
    from openai import AzureOpenAI

    # Azure OpenAI client
    chat_client = AzureOpenAI(
        api_version=azure_openai_api_version,
        azure_endpoint=azure_openai_endpoint,
        api_key=azure_openai_key)

    # Prompt template; {query} and {sources} are filled in below
    SYSTEM_PROMPT="""
    You are an AI assistant that helps users learn from the information found in the source material.
    Answer the query using only the sources provided below.
    Use bullets if the answer has multiple points.
    If the answer is longer than 3 sentences, provide a summary.
    Answer ONLY with the facts listed in the list of sources below. Cite your source when you answer the question
    If there isn't enough information below, say you don't know.
    Do not generate answers that don't use the sources below.
    Query: {query}
    Sources:\n{sources}
    """

    def _render_source(recipe):
        # One text block per recipe; line breaks, indentation and trailing
        # spaces are spelled out explicitly so the rendered text is exact.
        return (
            f"Name: {recipe['name']}, \n"
            f"    Description: {recipe['description']}, \n"
            f"    Cuisine: {recipe['cuisine']},\n"
            f"    Difficulty: {recipe['difficulty']},\n"
            f"    Preparation Time: {recipe['prepTime']},\n"
            f"    Cooking Time: {recipe['cookTime']},\n"
            f"    Total Time: {recipe['totalTime']},\n"
            f"    Servings: {recipe['servings']},\n"
            f"    Ingredients: {recipe['ingredients']},\n"
            f"    Instructions: {recipe['instructions']}"
        )

    hits = hybrid_search(query, 5)

    # A run of equal signs plus a newline separates the sources; the recipe
    # texts are unlikely to contain that sequence themselves.
    separator = "=================\n"
    sources_formatted = separator.join(_render_source(hit['document']) for hit in hits)

    prompt = SYSTEM_PROMPT.format(query=query, sources=sources_formatted)
    completion = chat_client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=azure_openai_deployment
    )

    print(completion.choices[0].message.content)


In [17]:
# User query: passed to azure_cosmos_db_rag, which prints the model's answer
query = "What of the recipes use sugar?"
azure_cosmos_db_rag(query)

The recipes that use sugar are:

- **Chocolate Chip Cookies**: The ingredients include 3/4 cup granulated sugar and 3/4 cup brown sugar (source: Chocolate Chip Cookies).
- **Baklava**: The ingredients include 1 cup granulated sugar (source: Baklava).
- **Tiramisu**: The ingredients include 3/4 cup granulated sugar (source: Tiramisu).
- **Pancakes**: The ingredients include 2 tablespoons granulated sugar (source: Pancakes).


In [18]:
# User query: passed to azure_cosmos_db_rag, which prints the model's answer
query = "What pastas are available?"
azure_cosmos_db_rag(query)

The available pastas are:

- **Pasta Primavera**: Made with fresh seasonal vegetables, olive oil, and Parmesan cheese. It's a colorful, light, and healthy meal.
- **Pesto Pasta**: Prepared with fresh basil pesto, pasta, and Parmesan cheese. It's a quick and delicious option, perfect for busy weeknights or light lunches.

Citations:
- [Pasta Primavera](#)
- [Pesto Pasta](#)


In [19]:
# User query: passed to azure_cosmos_db_rag, which prints the model's answer
query = "How do we cook pesto pasta?"
azure_cosmos_db_rag(query)

To cook pesto pasta:

- **Cook the pasta**: Boil 8 oz of pasta (your choice of spaghetti, penne, etc.) according to package instructions until al dente. Drain and set aside.
- **Prepare the pesto**: In a food processor, combine 1 cup fresh basil leaves, 1/4 cup pine nuts (or walnuts), 1/2 cup grated Parmesan cheese, and 2 cloves garlic. Pulse until finely chopped.
- **Mix the pesto**: With the food processor running, slowly drizzle in 1/2 cup olive oil until the mixture becomes a smooth paste. Scrape down the sides as needed. Season with salt and black pepper to taste.
- **Combine pasta and pesto**: In a large mixing bowl, combine the cooked pasta with the pesto sauce. Toss until the pasta is evenly coated.
- **Serve**: Serve immediately, garnished with additional grated Parmesan cheese if desired. 

Enjoy this delicious and aromatic Pesto Pasta! 

Source: Pesto Pasta
