# CosmosDB for NoSQL RAG
<img src = "./cosmosdbrag.png">


### Installing important packages and libraries

In [3]:
%pip install azure-cosmos
%pip install openai

Note: you may need to restart the kernel to use updated packages.


c:\Users\dilmurodm\Desktop\Semantic_Kernel\semantic_kernel\venv_py313\Scripts\python.exe: No module named pip


Note: you may need to restart the kernel to use updated packages.


c:\Users\dilmurodm\Desktop\Semantic_Kernel\semantic_kernel\venv_py313\Scripts\python.exe: No module named pip


In [46]:
from azure.cosmos import CosmosClient, PartitionKey, exceptions
import os
import json
from dotenv import load_dotenv

load_dotenv()

# Set OpenAI API version for Azure AI Projects compatibility
if not os.getenv("OPENAI_API_VERSION"):
    os.environ["OPENAI_API_VERSION"] = "2024-10-01-preview"
    print(f"Set OPENAI_API_VERSION to {os.environ['OPENAI_API_VERSION']}")
else:
    print(f"Using existing OPENAI_API_VERSION: {os.environ['OPENAI_API_VERSION']}")

Using existing OPENAI_API_VERSION: 2024-10-01-preview


### Creating a connection to CosmosDB via connection string
and creating a database if it doesn't exist

In [47]:
cosmosdb_connection_string = os.getenv("COSMOS_DB_CONNECTION_STRING")

if cosmosdb_connection_string is None:
    raise ValueError("COSMOSDB_CONNECTION_STRING environment variable is not set.")
client = CosmosClient.from_connection_string(cosmosdb_connection_string)
database_name = os.getenv("DATABASE_NAME")

database = client.create_database_if_not_exists(id=database_name)


### Defining the vector embedding policy 

In [48]:
pk = "/category"

vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path":"/vector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":1536
        }
    ]
}

### Creating a vector index with diskANN algorithm


In [49]:
indexing_policy = {
    "vectorIndexes": [
        {
            "path":"/vector",
            "type":"diskANN"
        }

    ]
}

### Creating container inside of the database

In [50]:
try:
    container_name = os.getenv("CONTAINER_NAME")
    
    container = database.create_container_if_not_exists(
        id=container_name,
        partition_key=PartitionKey(path=pk),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy
        
    )
    
except Exception as e:
    print(e)

### Creating Azure OpenAI Client

In [51]:
from azure.identity import DefaultAzureCredential

azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")  
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")

# Check if this is an Azure AI Foundry endpoint
is_ai_foundry = "ai.azure.com" in azure_openai_endpoint or "services.ai.azure.com" in azure_openai_endpoint

if is_ai_foundry:
    # Use Azure AI Projects client for AI Foundry
    print("Detected Azure AI Foundry endpoint - using Azure AI Projects client")
    try:
        from azure.ai.projects import AIProjectClient
        
        credential = DefaultAzureCredential()
        project_client = AIProjectClient(
            endpoint=azure_openai_endpoint,
            credential=credential
        )
        
        # Get the OpenAI client from the project client
        azure_openai_client = project_client.get_openai_client()
        print("Using Azure AI Projects client for AI Foundry")
        
    except ImportError:
        print("Azure AI Projects package not installed. Installing...")
        import subprocess
        import sys
        subprocess.check_call([sys.executable, "-m", "pip", "install", "azure-ai-projects"])
        
        from azure.ai.projects import AIProjectClient
        credential = DefaultAzureCredential()
        project_client = AIProjectClient(
            endpoint=azure_openai_endpoint,
            credential=credential
        )
        azure_openai_client = project_client.get_openai_client()
        print("Installed and configured Azure AI Projects client")
        
    except Exception as e:
        print(f"Azure AI Projects client failed: {e}")
        raise e
else:
    # Use standard Azure OpenAI client for regular endpoints
    from openai import AzureOpenAI
    
    if azure_openai_key:
        azure_openai_client = AzureOpenAI(
            api_version="2024-06-01",
            api_key=azure_openai_key,
            azure_endpoint=azure_openai_endpoint
        )
        print("Using API key authentication")
    else:
        from azure.identity import get_bearer_token_provider
        
        credential = DefaultAzureCredential()
        token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
        
        azure_openai_client = AzureOpenAI(
            api_version="2024-06-01",
            azure_ad_token_provider=token_provider,
            azure_endpoint=azure_openai_endpoint
        )
        print("Using Azure AD authentication")

Detected Azure AI Foundry endpoint - using Azure AI Projects client
Using Azure AI Projects client for AI Foundry
Using Azure AI Projects client for AI Foundry


### Creating Embedding Generation Function
embedding engine to be used: text-embedding-ada-002 
<br>
vector dimensions: 1536

In [55]:
def generate_embeddings(client, text):
    """
    Generate embeddings for the given text using Azure OpenAI.
    
    Args:
        client: Azure OpenAI client
        text: Text to generate embeddings for
        
    Returns:
        List of embedding values
    """
    embedding_model = os.getenv("EMBEDDING_ENGINE")
    
    if not embedding_model:
        raise ValueError("EMBEDDING_ENGINE environment variable is not set")
    
    if not text or not isinstance(text, str):
        raise ValueError("Text input must be a non-empty string")
    
    try:
        # Limit text length to avoid token limits (typically 8192 tokens for ada-002)
        max_text_length = 8000  # Conservative limit
        if len(text) > max_text_length:
            text = text[:max_text_length]
            print(f"Text truncated to {max_text_length} characters")
        
        response = client.embeddings.create(
            input=text,
            model=embedding_model
        )
        
        embeddings = response.model_dump()
        return embeddings['data'][0]['embedding']
        
    except Exception as e:
        print(f" Error generating embeddings: {e}")
        print(f" Text length: {len(text)}")
        print(f" Model: {embedding_model}")
        raise e

### Loading food dataset
the food dataset is stored in `"./fooditems.json"`
<br>
we will generate vector embedding for the `/description` field of each food object and store it in a new field `/vector`

In [53]:
# Validate environment variables before processing
print("🔍 Environment Variables Check:")
print(f"   COSMOS_DB_CONNECTION_STRING: {'✅ Set' if os.getenv('COSMOS_DB_CONNECTION_STRING') else '❌ Missing'}")
print(f"   DATABASE_NAME: {os.getenv('DATABASE_NAME') or '❌ Missing'}")
print(f"   CONTAINER_NAME: {os.getenv('CONTAINER_NAME') or '❌ Missing'}")
print(f"   AZURE_OPENAI_ENDPOINT: {os.getenv('AZURE_OPENAI_ENDPOINT') or '❌ Missing'}")
print(f"   AZURE_OPENAI_KEY: {'✅ Set' if os.getenv('AZURE_OPENAI_KEY') else '❌ Missing'}")
print(f"   EMBEDDING_ENGINE: {os.getenv('EMBEDDING_ENGINE') or '❌ Missing'}")
print(f"   GPT_ENGINE: {os.getenv('GPT_ENGINE') or '❌ Missing'}")

# Test the embeddings function with a simple text
try:
    print("\n🧪 Testing embeddings generation...")
    test_embeddings = generate_embeddings(azure_openai_client, "This is a test sentence for embedding generation.")
    print(f"✅ Embeddings test successful! Vector dimension: {len(test_embeddings)}")
except Exception as e:
    print(f"❌ Embeddings test failed: {e}")
    print("Please check your Azure OpenAI configuration and try again.")

🔍 Environment Variables Check:
   COSMOS_DB_CONNECTION_STRING: ✅ Set
   DATABASE_NAME: makdb
   CONTAINER_NAME: makcontainer
   AZURE_OPENAI_ENDPOINT: https://semantic-aifoundry.services.ai.azure.com/api/projects/firstProject
   AZURE_OPENAI_KEY: ✅ Set
   EMBEDDING_ENGINE: text-embedding-ada-002
   GPT_ENGINE: gpt-4o

🧪 Testing embeddings generation...
✅ Embeddings test successful! Vector dimension: 1536
✅ Embeddings test successful! Vector dimension: 1536


In [56]:
import json
import uuid
import time
from typing import List, Dict

file_path = "./food_items.json"

import os
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    print("Please ensure the food_items.json file exists in the current directory")
else:
    print(f"Found data file: {file_path}")

try:
    with open(file_path) as f:
        data = json.load(f)
    
    print(f"📊 Loaded {len(data)} food items")
    
    # Process each item with progress tracking
    success_count = 0
    error_count = 0
    
    for i, obj in enumerate(data):
        try:
            print(f"Processing item {i+1}/{len(data)}: {obj.get('name', 'Unknown')}")
            
            # Generate unique ID
            guid = str(uuid.uuid4())
            
            # Check if description exists
            if 'description' not in obj or not obj['description']:
                print(f" Skipping item {i+1}: No description found")
                continue
            
            # Generate vector embeddings
            vector_embeddings = generate_embeddings(azure_openai_client, obj['description'])
            
            # Add vector and ID to the object
            obj['vector'] = vector_embeddings
            obj['id'] = guid
            
            # Upsert to Cosmos DB
            container.upsert_item(obj)
            
            success_count += 1
            print(f"✅ Successfully processed item {i+1}")
            
            # Add small delay to avoid rate limiting
            time.sleep(0.1)
            
        except Exception as e:
            error_count += 1
            print(f"❌ Error processing item {i+1}: {e}")
            # Continue with next item instead of stopping
            continue
    
    # Save updated dataset
    try:
        with open("./new_dataset.json", 'w') as f:
            json.dump(data, f, indent=2)
        print(f"💾 Saved updated dataset to new_dataset.json")
    except Exception as e:
        print(f"⚠️ Could not save dataset: {e}")
    
    # Summary
    print(f"\n📈 Processing Summary:")
    print(f"   ✅ Successful: {success_count}")
    print(f"   ❌ Errors: {error_count}")
    print(f"   📋 Total: {len(data)}")
    
except FileNotFoundError:
    print(f"❌ Could not find file: {file_path}")
except json.JSONDecodeError as e:
    print(f"❌ Invalid JSON in file: {e}")
except Exception as e:
    print(f"❌ Unexpected error: {e}")
    raise e

Found data file: ./food_items.json
📊 Loaded 151 food items
Processing item 1/151: Ashunti`Way Smoothie
✅ Successfully processed item 1
Processing item 2/151: Jimmy Jam Smoothie
✅ Successfully processed item 1
Processing item 2/151: Jimmy Jam Smoothie
✅ Successfully processed item 2
Processing item 3/151: Tejay Impact Smoothie
✅ Successfully processed item 2
Processing item 3/151: Tejay Impact Smoothie
✅ Successfully processed item 3
Processing item 4/151: Dayton 500 Smoothie
✅ Successfully processed item 3
Processing item 4/151: Dayton 500 Smoothie
✅ Successfully processed item 4
Processing item 5/151: Nappy Head Truth Smoothie
✅ Successfully processed item 4
Processing item 5/151: Nappy Head Truth Smoothie
✅ Successfully processed item 5
Processing item 6/151: Aw Shuckie Shuckie Now Smoothie
✅ Successfully processed item 5
Processing item 6/151: Aw Shuckie Shuckie Now Smoothie
✅ Successfully processed item 6
Processing item 7/151: Early Birds Get the Worm Smoothie
✅ Successfully proce

### Generating vector embeddings for the user query

In [76]:
user_query="Are there any food that is high in protein"
user_embeddings = generate_embeddings(azure_openai_client, user_query)
print(user_embeddings)


[0.003758044680580497, -0.029371747747063637, -0.02819174714386463, -0.015532395802438259, 0.01287098228931427, 0.008542176336050034, -0.014762830920517445, -0.022355876863002777, -0.03168044611811638, -0.006143697537481785, 0.02696044370532036, 0.02248413674533367, 0.010741851292550564, 0.0010052449069917202, -0.011395981535315514, 0.013980438932776451, 0.04853392764925957, 0.002608506241813302, 0.012627286836504936, -0.01421130821108818, -0.010049242526292801, 0.009420763701200485, 0.02778131328523159, -0.020406311377882957, 0.008048372343182564, 0.013287830166518688, 0.004954077769070864, -0.025241747498512268, 0.009388698264956474, -0.008728154934942722, 0.049046970903873444, 0.004460273310542107, -0.01968804933130741, 0.004604566376656294, -0.008625546470284462, 0.01154348161071539, -0.00991456862539053, 0.027088703587651253, 0.034168705344200134, 0.0060090236365795135, 0.026601312682032585, 0.0015711961314082146, 0.012794025242328644, 0.009658046066761017, -0.02283044159412384, 0

### Sending a query to database with filtering based upon VectorDistance

In [77]:
queryText = f""" SELECT TOP 5 c.category, c.name, c.description, c.price, VectorDistance(c.vector, {user_embeddings}) AS SimilarityScore
FROM c
ORDER BY VectorDistance(c.vector, {user_embeddings})"""
results = container.query_items(
    query=queryText,
    enable_cross_partition_query=True
)
dishes = []

for item in results:
    print(item)
    dishes.append(item)
    


{'category': 'Smoothies', 'name': 'Go Getter', 'description': 'banana, peanut butter, raw cacao, cinnamon, hemp seeds, walnuts, almond milk, chocolate vegan protein', 'price': '10.8 USD', 'SimilarityScore': 0.8253558126946419}
{'category': 'Smoothies', 'name': 'Go Getter', 'description': 'banana, peanut butter, raw cacao, cinnamon, hemp seeds, walnuts, almond milk, chocolate vegan protein', 'price': '10.8 USD', 'SimilarityScore': 0.8253558126946419}
{'category': 'Smoothies', 'name': 'K Blend Smoothie', 'description': 'Mango, pineapple, peaches pear apricot, cranberry, lemons, lemonade, flaxseeds, and strawberry protein. Our fruity tasty smoothies are blended to perfection.', 'price': '8.49 USD', 'SimilarityScore': 0.8198624605491195}
{'category': 'Smoothies', 'name': 'K Blend Smoothie', 'description': 'Mango, pineapple, peaches pear apricot, cranberry, lemons, lemonade, flaxseeds, and strawberry protein. Our fruity tasty smoothies are blended to perfection.', 'price': '8.49 USD', 'Simi

### Sending call to our GPT engine for summarisation 

In [78]:
system_message = f"""You are meant to behave as a RAG chatbot that derives its context from a database of food items stored in azure cosmosDB for noSQL API.
please asnwer strictly from the context from the database provided and if you dont have an answer please politely say so. dont include any extra 
information that is not in the context and dont include links as well.
the context passed to you will be in the form of a pythonic list with each object in the list containing details of a food item and
having structure as follows:

 "category": "the category of the food item like smoothies, burgers, etc",
 "name": "the name of the food item",
 "description": "the description of the food item",
"price": "the price of the food item in USD",


the pythonic list contains best 5 matches to the user query based on cosine similarity of the embeddings of the user query and the food item descriptions.
please structure your answers in a very professional manner and in such a way that the user does not get to know that its RAG working under the hood
and its as if they are talking to a human."""

user_message = f""" the user query is: {user_query}
the context is : {dishes}"""

chat_completions_response = azure_openai_client.chat.completions.create(
    model = os.getenv("GPT_ENGINE"),
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ],
    temperature=0.7
)

print(chat_completions_response.choices[0].message.content)
    

Certainly! Based on your query, here are some options that are high in protein:

1. **Go Getter (Smoothie)**  
   - **Description**: A blend of banana, peanut butter, raw cacao, cinnamon, hemp seeds, walnuts, almond milk, and chocolate vegan protein.  
   - **Price**: 10.8 USD  

2. **K Blend Smoothie**  
   - **Description**: A fruity mix of mango, pineapple, peaches, pear, apricot, cranberry, lemons, lemonade, flaxseeds, and strawberry protein. Blended to perfection for a tasty and nutritious experience.  
   - **Price**: 8.49 USD  

3. **Green Boi (Smoothie)**  
   - **Description**: A nutrient-rich combination of banana, strawberry, spinach, yogurt, flax, collagen, ashwagandha, spirulina, and agave. Designed to support strength and vitality.  
   - **Price**: 10.0 USD  

These smoothies are excellent choices for a protein boost! Let me know if you need further assistance.
