In [None]:
import asyncio
import datetime
import sys
import os
from typing import Any, Dict, List, Literal, Optional, TypedDict

import logging
import os

# Move the working directory three levels up from wherever the kernel started,
# then switch to TARGET_DIRECTORY when that environment variable is set.
# NOTE: the first chdir is relative, so re-running this cell climbs another
# three levels unless TARGET_DIRECTORY points at an absolute path.
try:
    os.chdir("../../../")
    # Falls back to the directory we just moved into when the variable is unset
    target_directory = os.getenv("TARGET_DIRECTORY", os.getcwd())
    if not os.path.exists(target_directory):
        logging.error(f"Directory does not exist: {target_directory}")
    else:
        os.chdir(target_directory)
        print(f"Changed directory to: {os.getcwd()}")
        logging.info(f"Successfully changed directory to: {os.getcwd()}")
except Exception as chdir_error:
    # Best effort: record the failure (with traceback) and let the cell continue
    logging.exception(f"An error occurred while changing directory: {chdir_error}")

from src.cosmosdb.manager import CosmosDBMongoCoreManager
from utils.ml_logging import get_logger
from pymongo.errors import NetworkTimeout, DuplicateKeyError

logger = get_logger("auth_cosmos_migration")

Changed directory to: c:\Users\pablosal\Desktop\art-voice-agent-accelerator


## Create the Azure AI Search Index

In [3]:
#!pip install azure-search-documents==11.6.0

In [11]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SimpleField,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile
)

# Connection settings are read from the environment; a missing variable raises KeyError.
SEARCH_ENDPOINT = os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"]   # e.g. https://<service>.search.windows.net
SEARCH_API_KEY  = os.environ["AZURE_AI_SEARCH_ADMIN_KEY"]  # admin key
INDEX_NAME = "clothing-index"

_admin_credential = AzureKeyCredential(SEARCH_API_KEY)
client = SearchIndexClient(SEARCH_ENDPOINT, _admin_credential)

# Shared building blocks for the field list
_string_type = SearchFieldDataType.String
_string_list_type = SearchFieldDataType.Collection(SearchFieldDataType.String)
_facet_flags = {"filterable": True, "facetable": True}

# key
_key_field = SimpleField(name="id", type=_string_type, key=True, filterable=True)

# enums / facets: single-valued strings
_scalar_facet_fields = [
    SimpleField(name=field_name, type=_string_type, **_facet_flags)
    for field_name in ("category", "gender", "formality", "fit")
]

# enums / facets: string collections
_collection_facet_fields = [
    SearchField(name=field_name, type=_string_list_type, **_facet_flags)
    for field_name in ("features", "climate", "colors", "materials")
]

_image_field = SimpleField(name="image_url", type=_string_type)

# text + vector
_description_field = SearchField(name="rich_description", type=_string_type, searchable=True)
_vector_field = SearchField(
    name="desc_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=3072,
    vector_search_profile_name="v1",
)

# Profile "v1" (referenced by desc_vector) points at the "hnsw" algorithm configuration
_vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="hnsw")],
    profiles=[VectorSearchProfile(name="v1", algorithm_configuration_name="hnsw")],
)

index = SearchIndex(
    name=INDEX_NAME,
    fields=[
        _key_field,
        *_scalar_facet_fields,
        *_collection_facet_fields,
        _image_field,
        _description_field,
        _vector_field,
    ],
    vector_search=_vector_search,
)

# Create or update idempotently
created = client.create_or_update_index(index)
print(f"Index upserted: {created.name}")

Index upserted: clothing-index



## 🏗️ Database Architecture Overview

### 🎯 **Retail Database Design**
- **Database**: `retail-db` (unified retail customer experience)
- **Universal Key**: `user_id` / `product_id` (consistent across all collections)

### 📊 **Collections Structure**
1. **`users`** - Complete customer 360° profiles (demographics, preferences, shopping patterns, Dynamics 365 integration)
2. **`products`** - Product catalog with pricing, inventory, assortment, and regional variations
3. **`shopping_sessions`** - User interaction history for personalized recommendations


In [15]:
# Database name passed to every CosmosDBMongoCoreManager built by get_collection_manager
DATABASE_NAME = "retail-db"

In [16]:
# Collection Manager Factory
def get_collection_manager(collection_name: str) -> CosmosDBMongoCoreManager:
    """Build a manager bound to one collection of the retail database.

    Args:
        collection_name: Name of the collection inside ``DATABASE_NAME``.

    Returns:
        A newly constructed ``CosmosDBMongoCoreManager`` for that collection.
    """
    return CosmosDBMongoCoreManager(
        collection_name=collection_name,
        database_name=DATABASE_NAME,
    )

# Smoke-test the connection by building a manager for the "users" collection
# and reading back the database name and cluster host it reports.
try:
    test_manager = get_collection_manager("users")
    print("‚úÖ Database Connection Successful")
    print("üè¢ Database: {}".format(test_manager.database.name))
    print("üîó Cluster: {}".format(test_manager.cluster_host))
except Exception as connection_error:
    # Report the failure, then re-raise so the notebook run stops here
    print("‚ùå Database Connection Failed: {}".format(connection_error))
    raise

‚úÖ Database Connection Successful
üè¢ Database: retail-db
üîó Cluster: cosmosdb-ai-factory-westus2.mongo.cosmos.azure.com

üè¢ Database: retail-db
üîó Cluster: cosmosdb-ai-factory-westus2.mongo.cosmos.azure.com


#### 👥 Step 1: Users Collection - Complete Customer Profiles

**Purpose**: Store complete customer information for personalized shopping experiences

**Schema**:
- `_id` / `user_id`: Unique customer identifier
- `full_name`, `age`, `gender`: Demographics
- `location`: City, state, zip for regional recommendations
- `preferences`: Style, size, color preferences
- `shopping_patterns`: Purchase history, average spend, favorite categories
- `dynamics365_data`: Customer service history, loyalty points, past orders
- `conversation_memory`: Previous chat context for continuity

In [9]:
# Create sample user profiles
def create_sample_users():
    """Create diverse customer profiles for retail voice agent.

    Returns:
        list[dict]: Three hard-coded user documents. In each one ``_id`` equals
        ``user_id``, and ``created_at`` / ``updated_at`` carry the same UTC
        timestamp (ISO 8601 with a trailing ``Z``), captured once per call.
    """
    # Capture the creation time once so every record in the batch (and both
    # timestamp fields of a record) agree. Uses a timezone-aware "now" instead
    # of the deprecated datetime.utcnow(); tzinfo is stripped only so the
    # string keeps the original "<naive-iso>Z" format.
    timestamp = (
        datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )

    users = [
        {
            "_id": "sarah_johnson",
            "user_id": "sarah_johnson",
            "full_name": "Sarah Johnson",
            "age": 28,
            "gender": "female",
            "location": {
                "city": "Seattle",
                "state": "WA",
                "zip": "98101",
                "climate": "temperate_rainy"
            },
            "contact": {
                "email": "sarah.j@email.com",
                "phone": "+12065551234",
                "phone4": "1234"
            },
            "verification": {
                "account_last4": "5678",
                "birthdate_mmdd": "0515"
            },
            "preferences": {
                "style": ["casual", "athleisure", "minimalist"],
                "colors": ["navy", "grey", "black", "white"],
                "sizes": {"tops": "M", "bottoms": "8", "shoes": "8"},
                "brands": ["Nike", "Lululemon", "Everlane"],
                "fit_preference": "relaxed",
                "price_range": "mid"
            },
            "shopping_patterns": {
                "avg_monthly_spend": 250,
                "purchase_frequency": "bi-weekly",
                "favorite_categories": ["Activewear", "Casual Tops", "Sneakers"],
                "peak_shopping_times": ["weekend_morning", "weeknight_evening"],
                "preferred_channel": "mobile_app",
                "last_purchase_date": "2025-10-20",
                "total_lifetime_value": 4500
            },
            "dynamics365_data": {
                "customer_since": "2023-03-15",
                "loyalty_tier": "Gold",
                "loyalty_points": 3200,
                "customer_service_history": [
                    {"date": "2025-09-10", "reason": "sizing_question", "resolution": "size_guide_sent"},
                    {"date": "2025-08-05", "reason": "return_request", "resolution": "return_processed"}
                ],
                "past_orders_count": 18,
                "return_rate": 12,
                "satisfaction_score": 4.6
            },
            "conversation_memory": {
                "last_interaction": "2025-10-25",
                "recent_queries": ["yoga pants size 8", "running shoes for rain"],
                "known_context": [
                    "Training for a half marathon",
                    "Prefers sustainable brands",
                    "Usually shops during lunch breaks"
                ],
                "agent_notes": "Values quality over price, eco-conscious shopper"
            },
            "created_at": timestamp,
            "updated_at": timestamp
        },
        {
            "_id": "michael_chen",
            "user_id": "michael_chen",
            "full_name": "Michael Chen",
            "age": 35,
            "gender": "male",
            "location": {
                "city": "San Francisco",
                "state": "CA",
                "zip": "94102",
                "climate": "mild_mediterranean"
            },
            "contact": {
                "email": "mchen@email.com",
                "phone": "+14155552345",
                "phone4": "2345"
            },
            "verification": {
                "account_last4": "9012",
                "birthdate_mmdd": "1203"
            },
            "preferences": {
                "style": ["business_casual", "modern", "tech_professional"],
                "colors": ["charcoal", "navy", "burgundy", "white"],
                "sizes": {"tops": "L", "bottoms": "34", "shoes": "10.5"},
                "brands": ["Bonobos", "J.Crew", "AllBirds"],
                "fit_preference": "slim",
                "price_range": "premium"
            },
            "shopping_patterns": {
                "avg_monthly_spend": 450,
                "purchase_frequency": "monthly",
                "favorite_categories": ["Dress Shirts", "Chinos", "Blazers"],
                "peak_shopping_times": ["sunday_afternoon"],
                "preferred_channel": "website",
                "last_purchase_date": "2025-10-22",
                "total_lifetime_value": 8900
            },
            "dynamics365_data": {
                "customer_since": "2021-06-10",
                "loyalty_tier": "Platinum",
                "loyalty_points": 7800,
                "customer_service_history": [
                    {"date": "2025-10-15", "reason": "tailoring_question", "resolution": "tailor_recommendation_provided"}
                ],
                "past_orders_count": 34,
                "return_rate": 8,
                "satisfaction_score": 4.8
            },
            "conversation_memory": {
                "last_interaction": "2025-10-22",
                "recent_queries": ["navy blazer 40R", "dress shirts no-iron"],
                "known_context": [
                    "Works in tech, needs business casual for office",
                    "Values fit and quality",
                    "Prefers online shopping with free returns"
                ],
                "agent_notes": "High-value customer, appreciates personalized service"
            },
            "created_at": timestamp,
            "updated_at": timestamp
        },
        {
            "_id": "emma_rodriguez",
            "user_id": "emma_rodriguez",
            "full_name": "Emma Rodriguez",
            "age": 42,
            "gender": "female",
            "location": {
                "city": "Austin",
                "state": "TX",
                "zip": "78701",
                "climate": "hot_humid"
            },
            "contact": {
                "email": "emma.r@email.com",
                "phone": "+15125553456",
                "phone4": "3456"
            },
            "verification": {
                "account_last4": "3456",
                "birthdate_mmdd": "0728"
            },
            "preferences": {
                "style": ["boho", "comfortable", "colorful"],
                "colors": ["turquoise", "coral", "cream", "earth_tones"],
                "sizes": {"tops": "L", "bottoms": "12", "shoes": "9"},
                "brands": ["Free People", "Anthropologie", "Madewell"],
                "fit_preference": "relaxed",
                "price_range": "mid_premium"
            },
            "shopping_patterns": {
                "avg_monthly_spend": 320,
                "purchase_frequency": "every_3_weeks",
                "favorite_categories": ["Dresses", "Blouses", "Sandals", "Accessories"],
                "peak_shopping_times": ["thursday_evening", "saturday_afternoon"],
                "preferred_channel": "mobile_app",
                "last_purchase_date": "2025-10-18",
                "total_lifetime_value": 6200
            },
            "dynamics365_data": {
                "customer_since": "2022-01-20",
                "loyalty_tier": "Gold",
                "loyalty_points": 4900,
                "customer_service_history": [
                    {"date": "2025-09-28", "reason": "color_match_question", "resolution": "color_swatch_sent"},
                    {"date": "2025-07-15", "reason": "exchange_request", "resolution": "exchange_completed"}
                ],
                "past_orders_count": 26,
                "return_rate": 15,
                "satisfaction_score": 4.7
            },
            "conversation_memory": {
                "last_interaction": "2025-10-18",
                "recent_queries": ["flowy summer dresses", "turquoise jewelry"],
                "known_context": [
                    "Works from home, prefers comfortable clothing",
                    "Loves color and unique pieces",
                    "Often asks for styling advice"
                ],
                "agent_notes": "Creative professional, values personal expression through fashion"
            },
            "created_at": timestamp,
            "updated_at": timestamp
        }
    ]

    return users

# Build the in-memory user fixtures and print a one-line summary for each
sample_users = create_sample_users()
print("‚úÖ Created {} sample user profiles".format(len(sample_users)))
for user in sample_users:
    city = user["location"]["city"]
    tier = user["dynamics365_data"]["loyalty_tier"]
    print("   - {} ({}, {}) - {} member".format(user["full_name"], user["age"], city, tier))

‚úÖ Created 3 sample user profiles
   - Sarah Johnson (28, Seattle) - Gold member
   - Michael Chen (35, San Francisco) - Platinum member
   - Emma Rodriguez (42, Austin) - Gold member


In [10]:
# Insert users into Cosmos DB
async def insert_users():
    """Upsert every document in ``sample_users`` into the ``users`` collection.

    Each upsert is keyed on ``_id`` and runs in a worker thread via
    ``asyncio.to_thread``. The first exception stops the loop.

    Returns:
        bool: True when every upsert succeeded, False if any step raised.
    """
    users_manager = get_collection_manager("users")

    try:
        for profile in sample_users:
            id_filter = {"_id": profile["_id"]}
            await asyncio.to_thread(
                users_manager.upsert_document,
                document=profile,
                query=id_filter,
            )
            print(f"‚úÖ Upserted user: {profile['full_name']}")

        print(f"\nüéâ Successfully inserted {len(sample_users)} users into Cosmos DB!")
    except Exception as insert_error:
        # Report and signal failure to the caller instead of raising
        print(f"‚ùå Failed to insert users: {insert_error}")
        return False
    return True

# Run the insertion.
# Top-level `await` relies on the notebook kernel's await support; in a plain
# Python script this call would need to be wrapped, e.g. asyncio.run(insert_users()).
await insert_users()

‚úÖ Upserted user: Sarah Johnson
‚úÖ Upserted user: Michael Chen
‚úÖ Upserted user: Emma Rodriguez

üéâ Successfully inserted 3 users into Cosmos DB!


True

#### 🛍️ Step 2: Products Collection - Complete Product Catalog

**Purpose**: Store detailed product information with pricing, inventory, and assortment

**Schema**:
- `_id` / `product_id`: Matches Azure AI Search `id` field (CRITICAL for RAG pattern)
- `name`, `category`, `gender`, `brand`: Basic product info
- `pricing`: Base price, discount tiers, regional pricing variations
- `inventory`: Stock levels by region/store, availability status
- `assortment`: Which regions/stores carry this product
- `specifications`: Size, color, material, care instructions
- `merchandising`: Display priority, cross-sell recommendations

In [11]:
# Create sample products with detailed pricing and assortment
def create_sample_products():
    """Create product catalog entries matching Azure AI Search schema.

    Returns:
        list[dict]: Three hard-coded product documents. In each one ``_id``
        equals ``product_id``, and ``created_at`` / ``updated_at`` carry the
        same UTC timestamp (ISO 8601 with a trailing ``Z``), captured once per
        call.
    """
    # Capture the creation time once so every record in the batch (and both
    # timestamp fields of a record) agree. Uses a timezone-aware "now" instead
    # of the deprecated datetime.utcnow(); tzinfo is stripped only so the
    # string keeps the original "<naive-iso>Z" format.
    timestamp = (
        datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )

    products = [
        {
            "_id": "PROD-WM-TOP-001",
            "product_id": "PROD-WM-TOP-001",  # Matches Azure AI Search id
            "name": "Classic Crew Neck T-Shirt",
            "category": "Tops",
            "gender": "Women",
            "brand": "Everlane",
            "formality": "casual",
            "fit": "relaxed",
            "features": ["breathable", "organic_cotton", "machine_washable"],
            "climate": ["warm", "mild"],
            "pricing": {
                "base_price": 28.00,
                "currency": "USD",
                "discount_tiers": {
                    "member": 25.20,  # 10% off
                    "gold": 23.80,     # 15% off
                    "platinum": 22.40  # 20% off
                },
                "regional_pricing": {
                    "US_WEST": 28.00,
                    "US_EAST": 28.00,
                    "US_SOUTH": 26.00  # Lower cost region
                },
                "sale_price": None,
                "on_sale": False
            },
            "inventory": {
                "total_stock": 450,
                "by_region": {
                    "US_WEST": {"stock": 180, "reserved": 12, "available": 168},
                    "US_EAST": {"stock": 150, "reserved": 8, "available": 142},
                    "US_SOUTH": {"stock": 120, "reserved": 5, "available": 115}
                },
                "low_stock_threshold": 50,
                "restock_date": None
            },
            "assortment": {
                "available_regions": ["US_WEST", "US_EAST", "US_SOUTH"],
                "stores": ["Seattle_Downtown", "SF_Union_Square", "Austin_Domain"],
                "online_only": False,
                "seasonal": False,
                "launch_date": "2024-01-15"
            },
            "specifications": {
                "colors": ["white", "black", "navy", "grey"],
                "sizes": ["XS", "S", "M", "L", "XL"],
                "materials": ["100% organic cotton"],
                "care_instructions": ["machine_wash_cold", "tumble_dry_low"],
                "country_of_origin": "USA"
            },
            "merchandising": {
                "display_priority": 85,
                "featured": True,
                "cross_sell": ["PROD-WM-BOTTOM-005", "PROD-WM-SHOE-012"],
                "frequently_bought_with": ["PROD-WM-BOTTOM-003"],
                "customer_rating": 4.7,
                "review_count": 342
            },
            "image_url": "https://retail-images.blob.core.windows.net/products/PROD-WM-TOP-001.jpg",
            "created_at": timestamp,
            "updated_at": timestamp
        },
        {
            "_id": "PROD-MN-SHIRT-022",
            "product_id": "PROD-MN-SHIRT-022",
            "name": "Slim Fit Oxford Shirt",
            "category": "Shirts",
            "gender": "Men",
            "brand": "Bonobos",
            "formality": "business_casual",
            "fit": "slim",
            "features": ["wrinkle_resistant", "stretch_fabric", "machine_washable"],
            "climate": ["warm", "mild", "cold"],
            "pricing": {
                "base_price": 89.00,
                "currency": "USD",
                "discount_tiers": {
                    "member": 80.10,
                    "gold": 75.65,
                    "platinum": 71.20
                },
                "regional_pricing": {
                    "US_WEST": 89.00,
                    "US_EAST": 89.00,
                    "US_SOUTH": 85.00
                },
                "sale_price": 69.00,
                "on_sale": True
            },
            "inventory": {
                "total_stock": 280,
                "by_region": {
                    "US_WEST": {"stock": 120, "reserved": 15, "available": 105},
                    "US_EAST": {"stock": 100, "reserved": 10, "available": 90},
                    "US_SOUTH": {"stock": 60, "reserved": 3, "available": 57}
                },
                "low_stock_threshold": 40,
                "restock_date": "2025-11-15"
            },
            "assortment": {
                "available_regions": ["US_WEST", "US_EAST", "US_SOUTH"],
                "stores": ["SF_Union_Square", "Seattle_Bellevue"],
                "online_only": False,
                "seasonal": False,
                "launch_date": "2024-08-20"
            },
            "specifications": {
                "colors": ["white", "light_blue", "navy", "charcoal"],
                "sizes": ["S", "M", "L", "XL", "XXL"],
                "materials": ["97% cotton", "3% spandex"],
                "care_instructions": ["machine_wash_cold", "hang_dry"],
                "country_of_origin": "Portugal"
            },
            "merchandising": {
                "display_priority": 92,
                "featured": True,
                "cross_sell": ["PROD-MN-PANT-008", "PROD-MN-BLAZER-003"],
                "frequently_bought_with": ["PROD-MN-PANT-008"],
                "customer_rating": 4.8,
                "review_count": 521
            },
            "image_url": "https://retail-images.blob.core.windows.net/products/PROD-MN-SHIRT-022.jpg",
            "created_at": timestamp,
            "updated_at": timestamp
        },
        {
            "_id": "PROD-WM-SHOE-045",
            "product_id": "PROD-WM-SHOE-045",
            "name": "Ultra Boost Running Shoes",
            "category": "Footwear",
            "gender": "Women",
            "brand": "Nike",
            "formality": "athletic",
            "fit": "standard",
            "features": ["cushioned", "breathable", "arch_support", "reflective"],
            "climate": ["warm", "mild", "cold", "rainy"],
            "pricing": {
                "base_price": 140.00,
                "currency": "USD",
                # NOTE(review): these tiers work out to 5% / 10% / 15% off the
                # base price, whereas the other two products use 10% / 15% / 20%.
                # Confirm whether the difference is intentional.
                "discount_tiers": {
                    "member": 133.00,
                    "gold": 126.00,
                    "platinum": 119.00
                },
                "regional_pricing": {
                    "US_WEST": 140.00,
                    "US_EAST": 140.00,
                    "US_SOUTH": 135.00
                },
                "sale_price": None,
                "on_sale": False
            },
            "inventory": {
                "total_stock": 320,
                "by_region": {
                    "US_WEST": {"stock": 150, "reserved": 22, "available": 128},
                    "US_EAST": {"stock": 110, "reserved": 18, "available": 92},
                    "US_SOUTH": {"stock": 60, "reserved": 8, "available": 52}
                },
                "low_stock_threshold": 60,
                "restock_date": None
            },
            "assortment": {
                "available_regions": ["US_WEST", "US_EAST", "US_SOUTH"],
                "stores": ["Seattle_Downtown", "SF_Union_Square", "Austin_Domain"],
                "online_only": False,
                "seasonal": False,
                "launch_date": "2024-03-10"
            },
            "specifications": {
                "colors": ["black_white", "navy_coral", "grey_pink"],
                "sizes": ["6", "6.5", "7", "7.5", "8", "8.5", "9", "9.5", "10"],
                "materials": ["synthetic_mesh", "rubber_sole"],
                "care_instructions": ["spot_clean", "air_dry"],
                "country_of_origin": "Vietnam"
            },
            "merchandising": {
                "display_priority": 95,
                "featured": True,
                "cross_sell": ["PROD-WM-APPAREL-078", "PROD-WM-ACC-034"],
                "frequently_bought_with": ["PROD-WM-SOCK-012"],
                "customer_rating": 4.9,
                "review_count": 1247
            },
            "image_url": "https://retail-images.blob.core.windows.net/products/PROD-WM-SHOE-045.jpg",
            "created_at": timestamp,
            "updated_at": timestamp
        }
    ]

    return products

# Build the in-memory product fixtures and print name, base price and total stock for each
sample_products = create_sample_products()
print("‚úÖ Created {} sample products".format(len(sample_products)))
for product in sample_products:
    price, stock = product["pricing"]["base_price"], product["inventory"]["total_stock"]
    print("   - {} (${:.2f}) - Stock: {} units".format(product["name"], price, stock))

‚úÖ Created 3 sample products
   - Classic Crew Neck T-Shirt ($28.00) - Stock: 450 units
   - Slim Fit Oxford Shirt ($89.00) - Stock: 280 units
   - Ultra Boost Running Shoes ($140.00) - Stock: 320 units


In [12]:
# Insert products into Cosmos DB
async def insert_products():
    """Upsert every document in ``sample_products`` into the ``products`` collection.

    Each upsert is keyed on ``_id`` and runs in a worker thread via
    ``asyncio.to_thread``. The first exception stops the loop.

    Returns:
        bool: True when every upsert succeeded, False if any step raised.
    """
    products_manager = get_collection_manager("products")

    try:
        for item in sample_products:
            id_filter = {"_id": item["_id"]}
            await asyncio.to_thread(
                products_manager.upsert_document,
                document=item,
                query=id_filter,
            )
            print(f"‚úÖ Upserted product: {item['name']}")

        print(f"\nüéâ Successfully inserted {len(sample_products)} products into Cosmos DB!")
        print("‚ö° Product IDs match Azure AI Search IDs for RAG pattern")
    except Exception as insert_error:
        # Report and signal failure to the caller instead of raising
        print(f"‚ùå Failed to insert products: {insert_error}")
        return False
    return True

# Run the insertion.
# Top-level `await` relies on the notebook kernel's await support; in a plain
# Python script this call would need to be wrapped, e.g. asyncio.run(insert_products()).
await insert_products()

‚úÖ Upserted product: Classic Crew Neck T-Shirt
‚úÖ Upserted product: Slim Fit Oxford Shirt
‚úÖ Upserted product: Ultra Boost Running Shoes

üéâ Successfully inserted 3 products into Cosmos DB!
‚ö° Product IDs match Azure AI Search IDs for RAG pattern


True

In [14]:
# Create sample shopping sessions
def create_sample_sessions():
    """Create shopping session examples for tracking user behavior.

    Returns:
        list[dict]: Two hard-coded session documents. In each one ``_id``
        equals ``session_id``, and ``created_at`` / ``updated_at`` carry the
        same UTC timestamp (ISO 8601 with a trailing ``Z``), captured once per
        call.
    """
    # Capture the creation time once so every record in the batch (and both
    # timestamp fields of a record) agree. Uses a timezone-aware "now" instead
    # of the deprecated datetime.utcnow(); tzinfo is stripped only so the
    # string keeps the original "<naive-iso>Z" format.
    timestamp = (
        datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )

    sessions = [
        {
            "_id": "SESSION-2025-10-25-001",
            "session_id": "SESSION-2025-10-25-001",
            "user_id": "sarah_johnson",
            "session_start": "2025-10-25T14:32:00Z",
            "session_end": "2025-10-25T14:58:00Z",
            "duration_minutes": 26,
            "products_viewed": [
                {"product_id": "PROD-WM-SHOE-045", "time_spent_seconds": 180, "timestamp": "2025-10-25T14:35:00Z"},
                {"product_id": "PROD-WM-TOP-001", "time_spent_seconds": 90, "timestamp": "2025-10-25T14:42:00Z"},
                {"product_id": "PROD-WM-APPAREL-078", "time_spent_seconds": 120, "timestamp": "2025-10-25T14:48:00Z"}
            ],
            "products_searched": [
                {"query": "running shoes for rain", "results_count": 12, "timestamp": "2025-10-25T14:33:00Z"},
                {"query": "yoga pants size 8", "results_count": 24, "timestamp": "2025-10-25T14:50:00Z"}
            ],
            "cart_items": [
                {
                    "product_id": "PROD-WM-SHOE-045",
                    "quantity": 1,
                    "size": "8",
                    "color": "navy_coral",
                    "price": 133.00,  # Member discount applied
                    "added_at": "2025-10-25T14:38:00Z"
                }
            ],
            "purchase_intent": {
                "score": 0.85,
                "signals": ["item_in_cart", "long_view_time", "member_discount_applied"],
                "predicted_conversion": True
            },
            "agent_interactions": {
                "channel": "voice",
                "query_count": 3,
                "key_questions": [
                    "Do you have running shoes good for rainy weather?",
                    "What size do you recommend for someone who usually wears 8?",
                    "Can I use my loyalty points on this purchase?"
                ],
                "recommendations_provided": ["PROD-WM-SHOE-045", "PROD-WM-APPAREL-078"],
                "satisfaction_rating": None  # Not yet completed
            },
            "context": {
                "device": "mobile",
                "location": "Seattle, WA",
                "referrer": "mobile_app_home",
                "campaign": None
            },
            "created_at": timestamp,
            "updated_at": timestamp
        },
        {
            "_id": "SESSION-2025-10-22-045",
            "session_id": "SESSION-2025-10-22-045",
            "user_id": "michael_chen",
            "session_start": "2025-10-22T19:15:00Z",
            "session_end": "2025-10-22T19:42:00Z",
            "duration_minutes": 27,
            "products_viewed": [
                {"product_id": "PROD-MN-SHIRT-022", "time_spent_seconds": 240, "timestamp": "2025-10-22T19:18:00Z"},
                {"product_id": "PROD-MN-PANT-008", "time_spent_seconds": 150, "timestamp": "2025-10-22T19:28:00Z"},
                {"product_id": "PROD-MN-BLAZER-003", "time_spent_seconds": 180, "timestamp": "2025-10-22T19:35:00Z"}
            ],
            "products_searched": [
                {"query": "navy blazer 40R", "results_count": 8, "timestamp": "2025-10-22T19:16:00Z"},
                {"query": "dress shirts no-iron", "results_count": 15, "timestamp": "2025-10-22T19:25:00Z"}
            ],
            # NOTE(review): "price" below is a line total for the quantity-2
            # item (2 x 69.00) — confirm consumers treat it as a line total
            # rather than a unit price.
            "cart_items": [
                {
                    "product_id": "PROD-MN-SHIRT-022",
                    "quantity": 2,
                    "size": "L",
                    "color": "white",
                    "price": 138.00,  # 2 x $69 sale price
                    "added_at": "2025-10-22T19:22:00Z"
                },
                {
                    "product_id": "PROD-MN-SHIRT-022",
                    "quantity": 1,
                    "size": "L",
                    "color": "light_blue",
                    "price": 69.00,
                    "added_at": "2025-10-22T19:23:00Z"
                }
            ],
            "purchase_intent": {
                "score": 0.92,
                "signals": ["multiple_items_in_cart", "sale_price", "high_value_customer", "repeat_buyer"],
                "predicted_conversion": True
            },
            "agent_interactions": {
                "channel": "chat",
                "query_count": 5,
                "key_questions": [
                    "Is the slim fit shirt true to size?",
                    "Can I get free returns if these don't fit?",
                    "Do you have matching pants for business casual?",
                    "What's your fastest shipping option?",
                    "Can I apply my loyalty points to this order?"
                ],
                "recommendations_provided": ["PROD-MN-PANT-008", "PROD-MN-BLAZER-003"],
                "satisfaction_rating": 5  # Excellent
            },
            "context": {
                "device": "desktop",
                "location": "San Francisco, CA",
                "referrer": "google_search",
                "campaign": "fall_sale_2025"
            },
            "created_at": timestamp,
            "updated_at": timestamp
        }
    ]

    return sessions

# Build the in-memory session fixtures and print user, cart size and intent score for each
sample_sessions = create_sample_sessions()
print("‚úÖ Created {} sample shopping sessions".format(len(sample_sessions)))
for session in sample_sessions:
    user, items = session["user_id"], len(session["cart_items"])
    intent = session["purchase_intent"]["score"]
    print("   - {}: User {}, {} cart items, {:.0%} purchase intent".format(session["session_id"], user, items, intent))

‚úÖ Created 2 sample shopping sessions
   - SESSION-2025-10-25-001: User sarah_johnson, 1 cart items, 85% purchase intent
   - SESSION-2025-10-22-045: User michael_chen, 2 cart items, 92% purchase intent


In [15]:
# Insert shopping sessions into Cosmos DB
async def insert_sessions():
    """Upsert every entry of `sample_sessions` into the `shopping_sessions` collection.

    Each document is matched on its `_id`, and `upsert_document` is invoked through
    `asyncio.to_thread` so the call runs on a worker thread.

    Returns:
        True when every session was written; False as soon as any write raises
        (the error is printed, not propagated).
    """
    sessions_manager = get_collection_manager("shopping_sessions")

    try:
        for doc in sample_sessions:
            await asyncio.to_thread(
                sessions_manager.upsert_document,
                document=doc,
                query={"_id": doc["_id"]},
            )
            print(f"‚úÖ Upserted session: {doc['session_id']}")
    except Exception as e:
        print(f"‚ùå Failed to insert sessions: {e}")
        return False
    else:
        print(f"\nüéâ Successfully inserted {len(sample_sessions)} shopping sessions into Cosmos DB!")
        return True

# Run the insertion (top-level await); the coroutine's boolean result is the
# last expression of the cell and is therefore shown as the cell's output.
await insert_sessions()

‚úÖ Upserted session: SESSION-2025-10-25-001
‚úÖ Upserted session: SESSION-2025-10-22-045

üéâ Successfully inserted 2 shopping sessions into Cosmos DB!


True

## 📸 Step 4: Process Images with Azure OpenAI - Index to Azure AI Search & Cosmos DB

**Pipeline Flow**:
1. Load image from `utils/data/clothes/`
2. **Azure OpenAI GPT-4o Vision**: Extract product details (category, colors, materials, style)
3. **Generate Product ID**: Unique identifier
4. **Upload to Azure Blob Storage**: Get real public URL for image retrieval
5. **Azure OpenAI Embeddings**: Generate 3072-dim vector from description
6. **Generate Realistic Data**: Pricing, inventory, assortment (using AI)
7. **Validate with Pydantic**: Schema validation
8. **Insert into Azure AI Search**: Searchable index with real image URL
9. **Insert into Cosmos DB**: Complete product details with real image URL

**Models Used**:
- Vision: `gpt-4o` for image analysis
- Embeddings: `text-embedding-3-large` (3072 dimensions)

**Storage**:
- Images: Azure Blob Storage container `clothesimages` by default (override with the `AZURE_BLOB_CONTAINER_PRODUCTS` environment variable)
- Search: Azure AI Search index `clothing-index`
- Data: Cosmos DB database `retail-db` collection `products`

In [2]:
# Step 1: Setup Azure OpenAI Clients
import os
import json
import base64
from pathlib import Path
from pydantic import BaseModel, Field
from typing import List, Optional, Literal
from openai import AzureOpenAI

# Azure OpenAI Configuration
# Key and endpoint are required: a missing variable raises KeyError right here.
AOAI_KEY = os.environ["AZURE_OPENAI_KEY"]
AOAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
AOAI_API_VERSION = "2024-08-01-preview"
# Embedding settings are optional and fall back to text-embedding-3-large / 3072.
EMBEDDING_MODEL = os.environ.get("AZURE_OPENAI_CHAT_EMBEDDING_ID", "text-embedding-3-large")
EMBEDDING_DIMENSIONS = int(os.environ.get("AZURE_OPENAI_EMBEDDING_DIMENSIONS", "3072"))
VISION_MODEL = "gpt-4o"  # For image analysis

# Initialize Azure OpenAI client
# This single client is shared by the vision, embedding and data-generation helpers below.
aoai_client = AzureOpenAI(
    api_key=AOAI_KEY,
    api_version=AOAI_API_VERSION,
    azure_endpoint=AOAI_ENDPOINT
)

# Echo the effective configuration (the API key itself is never printed).
print(f"‚úÖ Azure OpenAI Client Initialized")
print(f"   üîç Vision Model: {VISION_MODEL}")
print(f"   üìä Embedding Model: {EMBEDDING_MODEL} ({EMBEDDING_DIMENSIONS} dimensions)")
print(f"   üåê Endpoint: {AOAI_ENDPOINT}")

‚úÖ Azure OpenAI Client Initialized
   üîç Vision Model: gpt-4o
   üìä Embedding Model: text-embedding-3-large (3072 dimensions)
   üåê Endpoint: https://aoai-ai-factory-eus-dev.openai.azure.com/

   üîç Vision Model: gpt-4o
   üìä Embedding Model: text-embedding-3-large (3072 dimensions)
   üåê Endpoint: https://aoai-ai-factory-eus-dev.openai.azure.com/


In [3]:
# Step 2: Define Pydantic Schemas for Validation

class ProductExtraction(BaseModel):
    """Schema for data extracted from product image.

    analyze_product_image builds an instance of this model from the vision
    model's JSON reply. Every field is required (`...` or no default).
    """
    name: str = Field(..., description="Product name/title")
    # Closed vocabularies: the Literal options match the choices spelled out in
    # the vision system prompt.
    category: Literal["Jeans", "Sweaters", "Tops", "Bottoms", "Footwear", "Accessories"]
    gender: Literal["Men", "Women", "Unisex"]
    formality: Literal["casual", "business_casual", "formal", "athletic"]
    fit: Literal["slim", "relaxed", "standard", "oversized"]
    # Free-form string lists
    colors: List[str] = Field(..., description="Main colors visible in product")
    materials: List[str] = Field(..., description="Perceived materials (cotton, denim, wool, etc)")
    features: List[str] = Field(..., description="Special features (stretch, waterproof, pockets, etc)")
    # A list whose items are each restricted to the four climate labels
    climate: List[Literal["warm", "mild", "cold", "rainy"]] = Field(..., description="Suitable climates")
    # This text is what the pipeline embeds for vector search
    rich_description: str = Field(..., description="Detailed product description for search")
    style_tags: List[str] = Field(..., description="Style descriptors (vintage, modern, classic, etc)")

class PricingData(BaseModel):
    """Pricing tiers and regional variations.

    NOTE(review): not instantiated in the cells reviewed here; the pipeline
    passes a plain dict with the same keys instead.
    """
    base_price: float
    currency: str = "USD"
    # Defaults carry the expected keys with placeholder zeros.
    # NOTE(review): generate_realistic_product_data stores discounted *prices*
    # under these tier keys, not percentages - confirm the intended meaning.
    discount_tiers: dict = Field(default={"member": 0, "gold": 0, "platinum": 0})
    regional_pricing: dict = Field(default={"US_WEST": 0, "US_EAST": 0, "US_SOUTH": 0})
    # Optional sale price and the flag saying whether the item is on sale
    sale_price: Optional[float] = None
    on_sale: bool = False

class InventoryData(BaseModel):
    """Stock levels by region.

    NOTE(review): not instantiated in the cells reviewed here; the pipeline
    passes a plain dict with the same keys instead.
    """
    total_stock: int
    # Untyped dict; the data generator fills it with one
    # {"stock", "reserved", "available"} record per region key.
    by_region: dict
    low_stock_threshold: int
    # Optional date string; None means no restock date is set
    restock_date: Optional[str] = None

class CompleteProduct(BaseModel):
    """Complete product for Cosmos DB.

    Field names match the keys of the document assembled in process_single_image.
    NOTE(review): that function does not instantiate this model, so the assembled
    document is not validated against it in the cells reviewed here.
    """
    product_id: str
    name: str
    category: str
    gender: str
    brand: str
    formality: str
    fit: str
    features: List[str]
    climate: List[str]
    # Nested sections are untyped dicts (see generate_realistic_product_data for their keys)
    pricing: dict
    inventory: dict
    assortment: dict
    specifications: dict
    merchandising: dict
    image_url: str
    rich_description: str
    # Embedding of rich_description
    desc_vector: List[float]
    
print("‚úÖ Pydantic schemas defined for validation")

‚úÖ Pydantic schemas defined for validation



In [4]:
# Step 3: Image Processing with Azure OpenAI Vision

def encode_image_to_base64(image_path: Path) -> str:
    """Read the file at `image_path` and return its bytes as a base64 text string."""
    with open(image_path, mode="rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def analyze_product_image(image_path: Path) -> ProductExtraction:
    """Use GPT-4o Vision to extract product details from image.

    The image is base64-encoded and sent inline as a data URL, together with a
    system prompt that asks for a fixed JSON structure. The JSON reply is parsed
    and validated against ``ProductExtraction``.

    Args:
        image_path: Path to the local image file. Its suffix selects the MIME
            type declared in the data URL (.jpg/.jpeg/.png/.webp); any other
            suffix falls back to ``image/png``, the previously hard-coded value.

    Returns:
        The validated ``ProductExtraction``.

    Raises:
        Exception: Any error from the API call, JSON parsing or schema
            validation is printed and then re-raised unchanged.
    """
    
    # Encode image
    base64_image = encode_image_to_base64(image_path)

    # Declare the real media type instead of always claiming PNG; the mapping
    # mirrors the extensions handled by the blob upload step.
    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".webp": "image/webp",
    }
    mime_type = mime_by_suffix.get(image_path.suffix.lower(), "image/png")
    
    # Create prompt for structured extraction
    system_prompt = """You are a professional fashion product analyst. Analyze the clothing image and extract detailed product information.
    
Be precise and realistic. Use actual fashion industry terminology.

Return JSON with this exact structure:
{
    "name": "Product name",
    "category": "Jeans|Sweaters|Tops|Bottoms|Footwear|Accessories",
    "gender": "Men|Women|Unisex",
    "formality": "casual|business_casual|formal|athletic",
    "fit": "slim|relaxed|standard|oversized",
    "colors": ["color1", "color2"],
    "materials": ["material1", "material2"],
    "features": ["feature1", "feature2"],
    "climate": ["warm|mild|cold|rainy"],
    "rich_description": "Detailed 2-3 sentence product description for search indexing",
    "style_tags": ["tag1", "tag2"]
}"""
    
    try:
        response = aoai_client.chat.completions.create(
            model=VISION_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Analyze this product image: {image_path.name}. Extract all product details in JSON format."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            # Ask the API for a JSON object so the reply can be parsed directly
            response_format={"type": "json_object"},
            temperature=0.3,
            max_tokens=1000
        )
        
        # Parse response
        result = json.loads(response.choices[0].message.content)
        
        # Validate with Pydantic
        product_data = ProductExtraction(**result)
        
        return product_data
        
    except Exception as e:
        print(f"‚ùå Error analyzing image: {e}")
        raise

print("‚úÖ Image analysis function ready")

‚úÖ Image analysis function ready



In [5]:
# Step 4: Generate Embeddings for Text Description

def generate_embedding(text: str) -> List[float]:
    """Embed `text` with the configured embedding model.

    The model and vector size come from EMBEDDING_MODEL and EMBEDDING_DIMENSIONS.

    Returns:
        The embedding of the first (only) item in the API response.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    try:
        first_item = aoai_client.embeddings.create(
            input=text,
            model=EMBEDDING_MODEL,
            dimensions=EMBEDDING_DIMENSIONS,
        ).data[0]
        return first_item.embedding
    except Exception as e:
        print(f"‚ùå Error generating embedding: {e}")
        raise

print(f"‚úÖ Embedding generation ready ({EMBEDDING_DIMENSIONS} dimensions)")

‚úÖ Embedding generation ready (3072 dimensions)



In [6]:
# Step 5: Generate Realistic Product Data with AI

import random
import hashlib

def generate_product_id(image_path: Path, gender: str, category: str) -> str:
    """Build a deterministic product ID of the form PROD-{GENDER}-{CATEGORY}-{HASH}.

    HASH is the first 8 hex digits (upper-cased) of the MD5 of
    "<file name>_<gender>_<category>", so the same file name and metadata always
    produce the same ID. Only the file name is hashed, not its directory.
    """
    gender_codes = {"Men": "MN", "Women": "WM"}  # anything else becomes "UN"
    digest = hashlib.md5(f"{image_path.name}_{gender}_{category}".encode()).hexdigest()

    segments = [
        "PROD",
        gender_codes.get(gender, "UN"),
        category[:4].upper(),   # first four letters of the category
        digest[:8].upper(),
    ]
    return "-".join(segments)

def _build_pricing(base_price: float) -> dict:
    """Derive tier, regional and sale pricing from one base price."""
    # A single draw decides the sale, so `on_sale` and `sale_price` always agree.
    on_sale = random.random() < 0.3
    return {
        "base_price": base_price,
        "currency": "USD",
        "discount_tiers": {
            "member": round(base_price * 0.90, 2),  # 10% off
            "gold": round(base_price * 0.85, 2),     # 15% off
            "platinum": round(base_price * 0.80, 2)  # 20% off
        },
        "regional_pricing": {
            "US_WEST": base_price,
            "US_EAST": base_price,
            "US_SOUTH": round(base_price * 0.95, 2)  # 5% cheaper
        },
        "sale_price": round(base_price * 0.75, 2) if on_sale else None,
        "on_sale": on_sale
    }


def _build_region_stock(stock: int, reserved_low: int, reserved_high: int) -> dict:
    """Build one region's record so that available == stock - reserved and is never negative."""
    # One draw for `reserved`, capped at the stock on hand.
    reserved = min(random.randint(reserved_low, reserved_high), max(stock, 0))
    return {
        "stock": stock,
        "reserved": reserved,
        "available": stock - reserved
    }


async def generate_realistic_product_data(
    product_extraction: ProductExtraction,
    product_id: str,
    image_path: Path
) -> dict:
    """Use Azure OpenAI to generate realistic pricing, inventory, and assortment data.

    The chat model supplies brand, base price, total stock, sizes, rating, review
    count and display priority. Everything else (discount tiers, regional prices,
    sale flag, per-region stock, assortment, care instructions, country of origin)
    is derived or randomized locally.

    Args:
        product_extraction: Vision-extracted product attributes used in the prompt.
        product_id: Accepted for interface compatibility; not used here.
        image_path: Accepted for interface compatibility; not used here.

    Returns:
        Dict with keys "brand", "pricing", "inventory", "assortment",
        "specifications" and "merchandising".

    Raises:
        Exception: Any API, JSON or missing-key error is printed and re-raised.
    """
    
    prompt = f"""Generate realistic retail product data for this clothing item:
    
Product: {product_extraction.name}
Category: {product_extraction.category}
Gender: {product_extraction.gender}
Materials: {', '.join(product_extraction.materials)}
Features: {', '.join(product_extraction.features)}

Generate realistic data for a mid-to-premium retail brand. Return JSON with:

{{
    "brand": "realistic brand name for this product type",
    "base_price": realistic_price_in_dollars,
    "total_stock": realistic_inventory_count,
    "sizes": ["appropriate sizes for this product"],
    "customer_rating": rating_out_of_5,
    "review_count": number_of_reviews,
    "display_priority": priority_score_0_to_100
}}

Be realistic - jeans typically $60-120, sweaters $40-90, etc."""

    try:
        response = aoai_client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a retail merchandising expert. Generate realistic product data."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.7,
            max_tokens=500
        )
        
        ai_data = json.loads(response.choices[0].message.content)
        
        # Calculate pricing tiers
        pricing = _build_pricing(ai_data["base_price"])
        
        # Generate inventory by region: 40% west, 35% east, remainder south
        total_stock = ai_data["total_stock"]
        west_stock = int(total_stock * 0.40)
        east_stock = int(total_stock * 0.35)
        south_stock = total_stock - west_stock - east_stock
        
        inventory = {
            "total_stock": total_stock,
            "by_region": {
                "US_WEST": _build_region_stock(west_stock, 5, 20),
                "US_EAST": _build_region_stock(east_stock, 3, 15),
                "US_SOUTH": _build_region_stock(south_stock, 2, 10)
            },
            "low_stock_threshold": int(total_stock * 0.15),
            "restock_date": None if total_stock > 100 else "2025-11-15"
        }
        
        # Assortment data
        assortment = {
            "available_regions": ["US_WEST", "US_EAST", "US_SOUTH"],
            "stores": ["Seattle_Downtown", "SF_Union_Square", "Austin_Domain"],
            "online_only": False,
            "seasonal": "Sweaters" in product_extraction.category,
            "launch_date": "2024-09-01"
        }
        
        # Specifications
        specifications = {
            "colors": product_extraction.colors,
            "sizes": ai_data["sizes"],
            "materials": product_extraction.materials,
            # Any material mentioning "cotton" gets machine-wash care; everything else is dry clean only
            "care_instructions": ["machine_wash_cold", "tumble_dry_low"] if "cotton" in str(product_extraction.materials).lower() else ["dry_clean_only"],
            "country_of_origin": random.choice(["USA", "Italy", "Portugal", "Vietnam"])
        }
        
        # Merchandising
        merchandising = {
            "display_priority": ai_data["display_priority"],
            "featured": ai_data["display_priority"] > 85,
            "cross_sell": [],  # Will be populated later
            "frequently_bought_with": [],
            "customer_rating": ai_data["customer_rating"],
            "review_count": ai_data["review_count"]
        }
        
        return {
            "brand": ai_data["brand"],
            "pricing": pricing,
            "inventory": inventory,
            "assortment": assortment,
            "specifications": specifications,
            "merchandising": merchandising
        }
        
    except Exception as e:
        print(f"‚ùå Error generating product data: {e}")
        raise

print("‚úÖ Realistic data generation ready")

‚úÖ Realistic data generation ready



In [7]:
# Step 5.5: Upload Image to Azure Blob Storage (MANAGED IDENTITY)

from azure.storage.blob import BlobServiceClient, ContentSettings
from azure.identity import DefaultAzureCredential
from azure.core.exceptions import ResourceExistsError

# Azure Blob Storage Configuration (Managed Identity - NO CONNECTION STRING)
# Both values are optional environment overrides; the second argument of each
# `get` is the default used when the variable is unset.
AZURE_STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "storagefactoryeastus")
BLOB_CONTAINER_NAME = os.environ.get("AZURE_BLOB_CONTAINER_PRODUCTS", "clothesimages")

async def upload_image_to_blob(image_path: Path, product_id: str) -> str:
    """
    Upload product image to Azure Blob Storage using MANAGED IDENTITY
    
    NOTE: This storage account has Shared Key authorization DISABLED.
    Must use Managed Identity (DefaultAzureCredential) instead of connection string.
    
    The blob is stored as ``products/{product_id}{lower-cased suffix}`` in
    BLOB_CONTAINER_NAME and overwrites any existing blob of that name.
    
    Args:
        image_path: Path to local image file
        product_id: Unique product identifier
    
    Returns:
        URL of the uploaded blob. If anything fails, the error is printed and
        the URL the blob *would* have had is returned instead (best effort; that
        URL may not resolve to an existing blob).
    """
    # Compute the names once so the success URL and the fallback URL cannot diverge
    file_ext = image_path.suffix.lower()
    blob_name = f"products/{product_id}{file_ext}"
    account_url = f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"

    try:
        # Context managers release the credential and HTTP transport when done
        with DefaultAzureCredential() as credential, BlobServiceClient(
            account_url=account_url, credential=credential
        ) as blob_service_client:
            print(f"   üîê Using Managed Identity for storage account: {AZURE_STORAGE_ACCOUNT_NAME}")
            
            # Create container if it doesn't exist
            try:
                blob_service_client.create_container(BLOB_CONTAINER_NAME)
                print(f"   üì¶ Created container: {BLOB_CONTAINER_NAME}")
            except ResourceExistsError:
                pass  # container is already there; nothing to do
            
            # Map the extension to a content type; unknown extensions default to JPEG
            content_type_map = {
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".png": "image/png",
                ".webp": "image/webp"
            }
            content_type = content_type_map.get(file_ext, "image/jpeg")
            
            # Upload image
            blob_client = blob_service_client.get_blob_client(
                container=BLOB_CONTAINER_NAME,
                blob=blob_name
            )
            
            with open(image_path, "rb") as data:
                blob_client.upload_blob(
                    data,
                    overwrite=True,
                    content_settings=ContentSettings(content_type=content_type)
                )
            
            blob_url = blob_client.url
        
        print(f"   ‚úÖ Uploaded to blob: {blob_name}")
        print(f"   üåê URL: {blob_url}")
        
        return blob_url
        
    except Exception as e:
        print(f"   ‚ùå Failed to upload to blob storage: {e}")
        print(f"   üí° Make sure you're logged in with 'az login' and have 'Storage Blob Data Contributor' role")
        # Return a fallback URL pattern built from the same blob name as the upload
        fallback_url = f"{account_url}/{BLOB_CONTAINER_NAME}/{blob_name}"
        print(f"   ‚ö†Ô∏è  Using fallback URL: {fallback_url}")
        return fallback_url

print("‚úÖ Blob storage upload function ready (Managed Identity)")
print(f"   Storage Account: {AZURE_STORAGE_ACCOUNT_NAME}")
print(f"   Container: {BLOB_CONTAINER_NAME}")
print(f"   üîê Auth: Managed Identity (Shared Key disabled)")


‚úÖ Blob storage upload function ready (Managed Identity)
   Storage Account: storagefactoryeastus
   Container: clothesimages
   üîê Auth: Managed Identity (Shared Key disabled)

   Storage Account: storagefactoryeastus
   Container: clothesimages
   üîê Auth: Managed Identity (Shared Key disabled)


In [8]:
# Step 6: Complete Pipeline - Process ONE Image

async def process_single_image(image_path: Path) -> dict:
    """
    Complete pipeline: image -> vision analysis -> blob upload -> embedding -> data generation.
    
    Nothing is written to Azure AI Search or Cosmos DB here; the function only
    builds the two documents.
    
    Args:
        image_path: Image file, expected under ``.../<category>/<gender>/<file>``.
            The two parent folder names feed the product ID; unknown or missing
            folders map to "Clothing" / "Unisex".
    
    Returns:
        Dict with "cosmos_document" (full product), "search_document" (subset of
        fields plus the vector) and "product_id".
    
    Raises:
        Exception: Errors from vision analysis, embedding or data generation
            propagate. A blob upload failure does not (a fallback URL is used).
    """
    print(f"\n{'='*70}")
    print(f"üñºÔ∏è  Processing: {image_path.name}")
    print(f"{'='*70}")
    
    # Step 1: Extract from folder structure (guarded so short paths don't raise IndexError)
    parts = image_path.parts
    gender_folder = parts[-2] if len(parts) >= 2 else ""    # "men" or "woman"
    category_folder = parts[-3] if len(parts) >= 3 else ""  # "jeans" or "sweaters"
    
    gender_map = {"men": "Men", "woman": "Women"}
    category_map = {"jeans": "Jeans", "sweaters": "Sweaters"}
    
    detected_gender = gender_map.get(gender_folder.lower(), "Unisex")
    detected_category = category_map.get(category_folder.lower(), "Clothing")
    
    print(f"üìÅ Detected: {detected_category} / {detected_gender}")
    
    # Step 2: Analyze image with GPT-4o Vision
    print(f"\n1Ô∏è‚É£ Analyzing image with GPT-4o Vision...")
    product_extraction = analyze_product_image(image_path)
    print(f"   ‚úÖ Extracted: {product_extraction.name}")
    print(f"   üé® Colors: {', '.join(product_extraction.colors)}")
    print(f"   üìù Description: {product_extraction.rich_description[:100]}...")
    
    # Step 3: Generate product ID (from the folder-derived gender/category, not the model's)
    product_id = generate_product_id(image_path, detected_gender, detected_category)
    print(f"\n2Ô∏è‚É£ Generated Product ID: {product_id}")
    
    # Step 4: Upload image to Azure Blob Storage (IMPORTANT!)
    print(f"\n3Ô∏è‚É£ Uploading image to Azure Blob Storage...")
    blob_url = await upload_image_to_blob(image_path, product_id)
    
    # Step 5: Generate embedding
    print(f"\n4Ô∏è‚É£ Generating embedding ({EMBEDDING_DIMENSIONS} dimensions)...")
    embedding = generate_embedding(product_extraction.rich_description)
    print(f"   ‚úÖ Embedding generated: {len(embedding)} dimensions")
    
    # Step 6: Generate realistic product data with AI
    print(f"\n5Ô∏è‚É£ Generating realistic product data with AI...")
    ai_generated_data = await generate_realistic_product_data(
        product_extraction,
        product_id,
        image_path
    )
    print(f"   ‚úÖ Brand: {ai_generated_data['brand']}")
    print(f"   üí∞ Price: ${ai_generated_data['pricing']['base_price']:.2f}")
    print(f"   üì¶ Stock: {ai_generated_data['inventory']['total_stock']} units")
    
    # One UTC timestamp shared by created_at and updated_at, in the same
    # "...Z" format as before but without the deprecated utcnow().
    timestamp = (
        datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )
    
    # Step 7: Build complete product document for Cosmos DB
    # NOTE(review): this dict is not validated against CompleteProduct here.
    complete_product = {
        "_id": product_id,
        "product_id": product_id,
        "name": product_extraction.name,
        "category": product_extraction.category,
        "gender": product_extraction.gender,
        "brand": ai_generated_data["brand"],
        "formality": product_extraction.formality,
        "fit": product_extraction.fit,
        "features": product_extraction.features,
        "climate": product_extraction.climate,
        "pricing": ai_generated_data["pricing"],
        "inventory": ai_generated_data["inventory"],
        "assortment": ai_generated_data["assortment"],
        "specifications": ai_generated_data["specifications"],
        "merchandising": ai_generated_data["merchandising"],
        "image_url": blob_url,  # REAL blob storage URL!
        "rich_description": product_extraction.rich_description,
        "desc_vector": embedding,
        "created_at": timestamp,
        "updated_at": timestamp
    }
    
    # Step 8: Build document for Azure AI Search (subset of fields)
    search_document = {
        "id": product_id,
        "category": product_extraction.category,
        "gender": product_extraction.gender,
        "formality": product_extraction.formality,
        "fit": product_extraction.fit,
        "features": product_extraction.features,
        "climate": product_extraction.climate,
        "colors": product_extraction.colors,
        "materials": product_extraction.materials,
        "image_url": complete_product["image_url"],
        "rich_description": product_extraction.rich_description,
        "desc_vector": embedding
    }
    
    print(f"\n{'='*70}")
    print(f"‚úÖ Processing complete!")
    print(f"{'='*70}\n")
    
    return {
        "cosmos_document": complete_product,
        "search_document": search_document,
        "product_id": product_id
    }

print("‚úÖ Complete pipeline function ready")

‚úÖ Complete pipeline function ready



In [9]:
# Step 7: TEST with ONE IMAGE - Complete Flow

# Pick first men's jeans image
# The path is relative to the working directory set by the notebook's first cell.
test_image_path = Path("utils/data/clothes/jeans/men/Black Lenny Washed Jeans.png")

print(f"üß™ TESTING COMPLETE PIPELINE WITH ONE IMAGE")
print(f"üì∏ Test Image: {test_image_path}")
# NOTE(review): the step list printed below omits the blob upload that
# process_single_image performs between ID generation and embedding.
print(f"\nThis will:")
print(f"  1. Analyze image with GPT-4o Vision")
print(f"  2. Generate product ID")
print(f"  3. Create embedding (3072-dim)")
print(f"  4. Generate realistic pricing/inventory with AI")
print(f"  5. Prepare for Azure AI Search + Cosmos DB")
print(f"\n{'='*70}\n")

# Run the pipeline
# `result` is reused by the Azure AI Search and Cosmos DB insertion cells below.
result = await process_single_image(test_image_path)

# Report document sizes by string length rather than dumping the 3072-float vector.
print(f"\nüìä RESULTS:")
print(f"   Product ID: {result['product_id']}")
print(f"   Cosmos DB Document: {len(str(result['cosmos_document']))} chars")
print(f"   Azure AI Search Document: {len(str(result['search_document']))} chars")
print(f"   Embedding: {len(result['search_document']['desc_vector'])} dimensions")

# Show sample of Cosmos DB document
# Only a few scalar fields, with the description truncated to 150 characters.
print(f"\nüìÑ Cosmos DB Document Sample:")
print(json.dumps({
    "product_id": result['cosmos_document']['product_id'],
    "name": result['cosmos_document']['name'],
    "brand": result['cosmos_document']['brand'],
    "base_price": result['cosmos_document']['pricing']['base_price'],
    "total_stock": result['cosmos_document']['inventory']['total_stock'],
    "description": result['cosmos_document']['rich_description'][:150] + "..."
}, indent=2))

üß™ TESTING COMPLETE PIPELINE WITH ONE IMAGE
üì∏ Test Image: utils\data\clothes\jeans\men\Black Lenny Washed Jeans.png

This will:
  1. Analyze image with GPT-4o Vision
  2. Generate product ID

üì∏ Test Image: utils\data\clothes\jeans\men\Black Lenny Washed Jeans.png

This will:
  1. Analyze image with GPT-4o Vision
  2. Generate product ID
  3. Create embedding (3072-dim)
  4. Generate realistic pricing/inventory with AI
  5. Prepare for Azure AI Search + Cosmos DB



üñºÔ∏è  Processing: Black Lenny Washed Jeans.png
  4. Generate realistic pricing/inventory with AI
  5. Prepare for Azure AI Search + Cosmos DB



üñºÔ∏è  Processing: Black Lenny Washed Jeans.png
üìÅ Detected: Jeans / Men

1Ô∏è‚É£ Analyzing image with GPT-4o Vision...

üìÅ Detected: Jeans / Men

1Ô∏è‚É£ Analyzing image with GPT-4o Vision...
   ‚úÖ Extracted: Lenny Washed Wide-Leg Jeans
   üé® Colors: blue
   üìù Description: The Lenny Washed Wide-Leg Jeans offer a relaxed fit with a contemporary wide-leg silhou

In [12]:
# Step 8: Insert into Azure AI Search

# Search settings, re-declared with the same values as the index-creation cell
# at the top of the notebook. Both environment variables are required
# (KeyError if missing).
SEARCH_ENDPOINT = os.environ["AZURE_AI_SEARCH_SERVICE_ENDPOINT"]   # e.g. https://<service>.search.windows.net
SEARCH_API_KEY  = os.environ["AZURE_AI_SEARCH_ADMIN_KEY"]  # admin key
INDEX_NAME = "clothing-index"

from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

async def insert_to_azure_search(search_document: dict) -> bool:
    """Upload one document to the Azure AI Search index named by INDEX_NAME.

    Args:
        search_document: Document to upload; its 'id' value is echoed in the
            success message.

    Returns:
        True if the service reports the document as succeeded. False if it
        reports a failure or if any exception occurs (the error is printed,
        not raised).
    """
    try:
        # The context manager closes the client's HTTP transport after the upload
        with SearchClient(
            endpoint=SEARCH_ENDPOINT,
            index_name=INDEX_NAME,
            credential=AzureKeyCredential(SEARCH_API_KEY)
        ) as search_client:
            # Upload document
            result = search_client.upload_documents(documents=[search_document])
        
        # One document was sent, so the first result is the only one
        if result[0].succeeded:
            print(f"‚úÖ Inserted into Azure AI Search: {search_document['id']}")
            return True
        else:
            print(f"‚ùå Failed to insert into Azure AI Search: {result[0].error_message}")
            return False
            
    except Exception as e:
        print(f"‚ùå Azure AI Search error: {e}")
        return False

# Insert the test product into Azure AI Search
# `result` comes from the single-image pipeline cell above.
print(f"\n{'='*70}")
print(f"üì§ Inserting into Azure AI Search...")
print(f"{'='*70}\n")

# `search_success` stays available at notebook level after this cell runs.
search_success = await insert_to_azure_search(result['search_document'])

if search_success:
    print(f"\nüéâ Product indexed in Azure AI Search!")
    print(f"   Index: {INDEX_NAME}")
    print(f"   Product ID: {result['product_id']}")



üì§ Inserting into Azure AI Search...üì§ Inserting into Azure AI Search...



‚úÖ Inserted into Azure AI Search: PROD-MN-JEAN-2F9B66DF

üéâ Product indexed in Azure AI Search!
   Index: clothing-index
‚úÖ Inserted into Azure AI Search: PROD-MN-JEAN-2F9B66DF

üéâ Product indexed in Azure AI Search!
   Index: clothing-index
   Product ID: PROD-MN-JEAN-2F9B66DF
   Product ID: PROD-MN-JEAN-2F9B66DF


In [17]:
# Step 9: Insert into Cosmos DB

async def insert_to_cosmos(cosmos_document: dict) -> bool:
    """Upsert one product document into the `products` collection, matched on `_id`.

    `upsert_document` is invoked through `asyncio.to_thread` so the call runs on
    a worker thread.

    Returns:
        True once the upsert call returns; False if anything raises (the error
        is printed, not propagated).
    """
    try:
        manager = get_collection_manager("products")
        match_on_id = {"_id": cosmos_document["_id"]}

        await asyncio.to_thread(
            manager.upsert_document,
            document=cosmos_document,
            query=match_on_id,
        )

        print(f"‚úÖ Inserted into Cosmos DB: {cosmos_document['product_id']}")
    except Exception as e:
        print(f"‚ùå Cosmos DB error: {e}")
        return False
    else:
        return True

# Insert the test product into Cosmos DB
print(f"\n{'='*70}")
print(f"üì§ Inserting into Cosmos DB...")
print(f"{'='*70}\n")

cosmos_success = await insert_to_cosmos(result['cosmos_document'])

if cosmos_success:
    print(f"\nüéâ Product saved in Cosmos DB!")
    print(f"   Database: {DATABASE_NAME}")
    print(f"   Collection: products")
    print(f"   Product ID: {result['product_id']}")
    
print(f"\n{'='*70}")
print(f"‚úÖ COMPLETE PIPELINE SUCCESS!")
print(f"{'='*70}")
print(f"""
Product "{result['cosmos_document']['name']}" is now:
  ‚úÖ Indexed in Azure AI Search (searchable with vector similarity)
  ‚úÖ Stored in Cosmos DB (complete product details)
  
Product ID: {result['product_id']}
  - Azure AI Search 'id' = Cosmos DB 'product_id' ‚úÖ
  - Ready for RAG pattern! üöÄ
""")


üì§ Inserting into Cosmos DB...


üì§ Inserting into Cosmos DB...

‚úÖ Inserted into Cosmos DB: PROD-MN-JEAN-2F9B66DF

üéâ Product saved in Cosmos DB!‚úÖ Inserted into Cosmos DB: PROD-MN-JEAN-2F9B66DF

üéâ Product saved in Cosmos DB!
   Database: retail-db
   Collection: products

   Database: retail-db
   Collection: products
   Product ID: PROD-MN-JEAN-2F9B66DF

‚úÖ COMPLETE PIPELINE SUCCESS!   Product ID: PROD-MN-JEAN-2F9B66DF

‚úÖ COMPLETE PIPELINE SUCCESS!

Product "Lenny Washed Wide-Leg Jeans" is now:
  ‚úÖ Indexed in Azure AI Search (searchable with vector similarity)
  ‚úÖ Stored in Cosmos DB (complete product details)

Product ID: PROD-MN-JEAN-2F9B66DF
  - Azure AI Search 'id' = Cosmos DB 'product_id' ‚úÖ
  - Ready for RAG pattern! üöÄ



Product "Lenny Washed Wide-Leg Jeans" is now:
  ‚úÖ Indexed in Azure AI Search (searchable with vector similarity)
  ‚úÖ Stored in Cosmos DB (complete product details)

Product ID: PROD-MN-JEAN-2F9B66DF
  - Azure AI Search 'id' = Cosmos

## 🎯 Pipeline Summary - What We Just Built

### ✅ **Complete Image-to-Database Pipeline**

**Input**: Image file from `utils/data/clothes/jeans/men/Black Lenny Washed Jeans.png`

**Processing Steps**:
1. ✅ **GPT-4o Vision** - Extracted product details (name, colors, materials, description)
2. ✅ **Product ID Generation** - Created unique ID: `PROD-MN-JEAN-{HASH}`
3. ✅ **Azure Blob Storage Upload** - Uploaded image to blob, got real public URL ⭐ **NEW!**
4. ✅ **Embeddings** - Generated 3072-dim vector from description using `text-embedding-3-large`
5. ✅ **AI Data Generation** - Used GPT-4o (for brand, base price, total stock, sizes, rating, review count and display priority) plus locally derived and randomized values to create realistic:
   - Brand name
   - Pricing tiers (base, member, gold, platinum)
   - Regional pricing (US_WEST, US_EAST, US_SOUTH)
   - Inventory by region with stock levels
   - Assortment (stores, regions, availability)
   - Customer ratings & reviews
6. ✅ **Pydantic Validation** - Vision output validated against the `ProductExtraction` schema (the assembled product document is not checked against `CompleteProduct` in the cells above)
7. ✅ **Azure AI Search** - Indexed with vector for semantic search (with REAL blob URL)
8. ✅ **Cosmos DB** - Saved complete product details (with REAL blob URL)

**Output**: 
- Product searchable in Azure AI Search by text + vector similarity
- Complete product data in Cosmos DB with pricing, inventory, assortment
- **Product ID matches across both systems** (enables RAG pattern)
- **Real image URLs from Azure Blob Storage** (critical for product retrieval!)

### 🔄 **Next Steps**:
- Test with ONE image first to verify blob upload works
- Process remaining images in `utils/data/clothes/`
- Batch processing for all jeans (men + women)
- Process sweaters category
- Build product recommendation engine

## 🔍 Step 8: Test RAG Retrieval Flow (Azure AI Search → Cosmos DB → LLM Format)

**Objective**: Simulate the complete retrieval augmented generation (RAG) flow:
1. **User Query**: Natural language search (e.g., "casual jeans for men")
2. **Azure AI Search**: Semantic vector search to find matching products
3. **Get Product IDs**: Extract product identifiers from search results
4. **Cosmos DB Lookup**: Retrieve full product details (pricing, inventory, metadata)
5. **LLM Format**: Structure data as context for the voice agent to respond

This mirrors what the backend will do when a user asks about products during a conversation.


In [20]:
# Step 8.1: Initialize Azure AI Search Client for Querying

from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

# Query-side client for the clothing index. The index itself was created
# earlier with SearchIndexClient; this client reuses the same endpoint and
# admin key but is scoped to INDEX_NAME for document queries.
search_query_client = SearchClient(
    SEARCH_ENDPOINT,
    INDEX_NAME,
    AzureKeyCredential(SEARCH_API_KEY),
)

# Single print call, one line per argument (output is identical to three prints).
print(
    "‚úÖ Search query client initialized",
    f"   Index: {INDEX_NAME}",
    f"   Endpoint: {SEARCH_ENDPOINT}",
    sep="\n",
)


‚úÖ Search query client initialized
   Index: clothing-index
   Endpoint: https://search-ai-factory-centralus.search.windows.net


In [29]:
# Step 8.2: Create Semantic Search Function (Text ‚Üí Vector ‚Üí Search)

async def semantic_product_search(user_query: str, top_k: int = 3) -> List[Dict]:
    """
    Run a pure vector search over the clothing index for a natural-language query.

    The query text is embedded with ``generate_embedding`` and matched against
    the ``desc_vector`` field; no keyword (``search_text``) component is used.
    Both the embedding call and the search request are synchronous, so they are
    executed with ``asyncio.to_thread`` to avoid blocking the event loop (the
    same approach ``get_product_details_from_cosmos`` uses for its query).

    Args:
        user_query: Natural language search query (e.g., "casual jeans for men")
        top_k: Number of nearest neighbours requested and maximum rows returned

    Returns:
        List of match dicts in the order returned by the search service. Each
        dict has the keys ``product_id``, ``score``, ``category``, ``gender``,
        ``formality``, ``fit``, ``features``, ``climate``, ``colors``,
        ``materials`` and ``description``. Collection fields are always lists
        and ``description`` is always a string, even if the index returns null.
    """
    print(f"üîç Searching for: '{user_query}'")
    print(f"   Top results: {top_k}")

    # Step 1: Generate embedding for the user query.
    # generate_embedding is synchronous, so run it off the event loop.
    print("\nüìä Step 1: Generate query embedding...")
    query_embedding = await asyncio.to_thread(generate_embedding, user_query)
    print(f"   ‚úÖ Embedding generated ({len(query_embedding)} dimensions)")

    # Step 2: Perform vector search in Azure AI Search
    print("\nüîé Step 2: Vector search in Azure AI Search...")
    from azure.search.documents.models import VectorizedQuery

    vector_query = VectorizedQuery(
        vector=query_embedding,
        k_nearest_neighbors=top_k,
        fields="desc_vector"
    )

    def _run_search() -> list:
        # search() hands back a lazy pager; list() forces the HTTP round-trips
        # to happen inside the worker thread rather than during iteration below.
        return list(
            search_query_client.search(
                search_text=None,  # Pure vector search
                vector_queries=[vector_query],
                select=["id", "category", "gender", "formality", "fit", "features",
                        "climate", "colors", "materials", "rich_description"],
                top=top_k
            )
        )

    results = await asyncio.to_thread(_run_search)

    # Step 3: Collect results with scores
    matches = []
    for idx, result in enumerate(results, 1):
        match_data = {
            "product_id": result["id"],
            "score": result.get("@search.score", 0),
            "category": result.get("category"),
            "gender": result.get("gender"),
            "formality": result.get("formality"),
            "fit": result.get("fit"),
            # `or` guards against fields that are present but null, which would
            # otherwise break the slicing / join in the preview prints below.
            "features": result.get("features") or [],
            "climate": result.get("climate") or [],
            "colors": result.get("colors") or [],
            "materials": result.get("materials") or [],
            "description": result.get("rich_description") or ""
        }
        matches.append(match_data)

        print(f"\n   ‚úÖ Match {idx} (Score: {match_data['score']:.4f}):")
        print(f"      Product ID: {match_data['product_id']}")
        print(f"      Category: {match_data['category']} | Gender: {match_data['gender']}")
        print(f"      Formality: {match_data['formality']} | Fit: {match_data['fit']}")
        print(f"      Colors: {', '.join(match_data['colors'][:3])}")
        print(f"      Description: {match_data['description'][:100]}...")

    print(f"\n‚úÖ Found {len(matches)} products")
    return matches

print("‚úÖ Semantic search function ready")


‚úÖ Semantic search function ready


In [None]:
# Step 8.3: Retrieve Full Product Details from Cosmos DB

async def get_product_details_from_cosmos(product_ids: List[str]) -> List[Dict]:
    """
    Retrieve complete product documents from the "products" collection.

    Documents are looked up with a single ``$in`` query on ``product_id`` and
    returned in the same order as ``product_ids``, so a relevance ranking
    produced by the search step survives the lookup. IDs with no matching
    document are simply absent from the result.

    Args:
        product_ids: Product identifiers, typically in search-rank order

    Returns:
        List of product documents (MongoDB ``_id`` removed), ordered like
        ``product_ids``. Empty list when ``product_ids`` is empty or nothing
        matches.
    """
    print(f"üíæ Retrieving {len(product_ids)} products from Cosmos DB...")

    if not product_ids:
        # `$in` with an empty list cannot match anything; skip the round-trip.
        print(f"\n‚úÖ Retrieved 0 complete product records")
        return []

    products_manager = get_collection_manager("products")

    # Query for all matching product IDs using $in operator
    query = {"product_id": {"$in": product_ids}}

    # query_documents is synchronous; run it in a worker thread.
    documents = await asyncio.to_thread(
        products_manager.query_documents,
        query=query
    )

    # `$in` does not return documents in list order, so restore the caller's
    # order. First occurrence wins if an id is repeated; unknown ids sort last.
    requested_rank: Dict[str, int] = {}
    for position, pid in enumerate(product_ids):
        requested_rank.setdefault(pid, position)
    ordered_documents = sorted(
        documents,
        key=lambda d: requested_rank.get(d.get("product_id"), len(product_ids)),
    )

    products = []
    for doc in ordered_documents:
        # Remove MongoDB _id for cleaner output
        doc.pop("_id", None)
        products.append(doc)

        print(f"\n   ‚úÖ Retrieved: {doc['product_id']}")
        print(f"      Brand: {doc.get('brand', 'N/A')}")
        print(f"      Name: {doc.get('name', 'N/A')}")
        print(f"      Base Price: ${doc.get('pricing', {}).get('base_price', 0)}")
        print(f"      Total Stock: {doc.get('inventory', {}).get('total_stock', 0)} units")

        # Truncate long image URLs so the log line stays readable
        image_url = doc.get('image_url', 'N/A')
        if image_url and len(image_url) > 60:
            print(f"      Image URL: {image_url[:60]}...")
        else:
            print(f"      Image URL: {image_url}")

    print(f"\n‚úÖ Retrieved {len(products)} complete product records")
    return products

print("‚úÖ Cosmos DB retrieval function ready")


‚úÖ Cosmos DB retrieval function ready



In [37]:
# Step 8.4: Format Product Data for LLM Context

def format_products_for_llm(products: List[Dict], search_matches: List[Dict]) -> str:
    """
    Render product records as a markdown context block for the LLM.

    Products are emitted in descending order of their search score so the
    numbering actually matches the "Ranked by Relevance" heading. The product
    list is not guaranteed to arrive in search-rank order (the Cosmos lookup
    uses ``$in``), so the ranking is re-established here. Products with equal
    scores keep their input order; products without a search match score 0.

    Args:
        products: Full product documents. Each must contain ``product_id``;
            brand, name, category, gender, formality, fit, specifications,
            features, climate, pricing, inventory, image_url and merchandising
            are read when present.
        search_matches: Search results carrying ``product_id`` and ``score``

    Returns:
        Formatted string ready for LLM context injection. With no products,
        only the heading is returned.
    """
    # product_id -> relevance score (a null score is treated as 0)
    score_map = {match["product_id"]: match["score"] or 0 for match in search_matches}

    # sorted() is stable, so ties keep the order in which they were supplied.
    ranked_products = sorted(
        products,
        key=lambda p: score_map.get(p["product_id"], 0),
        reverse=True,
    )

    # Collect fragments and join once instead of repeated string +=.
    parts = ["### AVAILABLE PRODUCTS (Ranked by Relevance)\n\n"]

    for idx, product in enumerate(ranked_products, 1):
        relevance_score = score_map.get(product["product_id"], 0)

        parts.append(f"**Product {idx}** (Relevance: {relevance_score:.2f})\n")
        parts.append(f"- **Brand**: {product.get('brand', 'N/A')}\n")
        parts.append(f"- **Name**: {product.get('name', 'N/A')}\n")
        parts.append(f"- **Category**: {product.get('category', 'N/A')} | Gender: {product.get('gender', 'N/A')}\n")
        parts.append(f"- **Style**: {product.get('formality', 'N/A')} | Fit: {product.get('fit', 'N/A')}\n")

        # Colors and materials live under `specifications`; features and
        # climate are read from the top level of the document.
        specs = product.get('specifications', {})
        parts.append(f"- **Colors**: {', '.join(specs.get('colors', []))}\n")
        parts.append(f"- **Materials**: {', '.join(specs.get('materials', []))}\n")
        parts.append(f"- **Features**: {', '.join(product.get('features', []))}\n")
        parts.append(f"- **Climate**: {', '.join(product.get('climate', []))}\n")

        # Pricing: the values under `discount_tiers` are printed as tier prices.
        pricing = product.get('pricing', {})
        discount_tiers = pricing.get('discount_tiers', {})

        parts.append("\n**Pricing**:\n")
        parts.append(f"  - Base Price: ${pricing.get('base_price', 0):.2f}\n")

        if discount_tiers:
            parts.append(f"  - Member Price: ${discount_tiers.get('member', 0):.2f}\n")
            parts.append(f"  - Gold Price: ${discount_tiers.get('gold', 0):.2f}\n")
            parts.append(f"  - Platinum Price: ${discount_tiers.get('platinum', 0):.2f}\n")

        # Sale information
        if pricing.get('on_sale'):
            parts.append(f"  - üî• ON SALE: ${pricing.get('sale_price', 0):.2f}\n")

        # Inventory: a region entry may be a dict (available/stock) or a bare count.
        inventory = product.get('inventory', {})
        parts.append(f"\n**Inventory** (Total: {inventory.get('total_stock', 0)} units):\n")
        for region, region_data in inventory.get('by_region', {}).items():
            if isinstance(region_data, dict):
                available = region_data.get('available', region_data.get('stock', 0))
                parts.append(f"  - {region}: {available} available\n")
            else:
                parts.append(f"  - {region}: {region_data} units\n")

        # Image
        parts.append(f"\n**Image**: {product.get('image_url', 'N/A')}\n")

        # Rating line is shown only when both a rating and a review count exist.
        merch = product.get('merchandising', {})
        rating = merch.get('customer_rating', 0)
        reviews = merch.get('review_count', 0)
        if rating and reviews:
            parts.append(f"**Rating**: ‚≠ê {rating}/5.0 ({reviews} reviews)\n")

        parts.append("\n" + "-"*80 + "\n\n")

    return "".join(parts)

print("‚úÖ LLM formatting function ready")


‚úÖ LLM formatting function ready



In [32]:
# Step 8.5: Complete RAG Flow Test

async def test_rag_retrieval(user_query: str, top_k: int = 3):
    """
    Drive the full retrieval pipeline once and narrate each stage.

    Stages: vector search (semantic_product_search), Cosmos DB lookup
    (get_product_details_from_cosmos), then LLM formatting
    (format_products_for_llm). Progress banners are printed along the way.

    Args:
        user_query: Natural language product search
        top_k: Number of products to retrieve

    Returns:
        Dict with ``query``, ``search_matches``, ``products`` and
        ``llm_context`` on success; ``None`` if the search or the Cosmos DB
        lookup yields nothing.
    """
    rule = "=" * 80

    def section(title: str) -> None:
        # Blank line, rule, title, rule: the banner printed before each stage.
        print("\n" + rule)
        print(title)
        print(rule)

    print(rule)
    print("üéØ RAG RETRIEVAL TEST")
    print(rule)
    print(f"User Query: '{user_query}'")
    print(f"Top K Results: {top_k}")
    print(rule)

    # Stage 1: vector search
    section("STEP 1: AZURE AI SEARCH (Semantic Vector Search)")
    search_matches = await semantic_product_search(user_query, top_k)
    if not search_matches:
        print("‚ùå No products found matching the query")
        return None

    # Stage 2: resolve the matched ids to full product documents
    product_ids = [hit["product_id"] for hit in search_matches]
    print(f"\nüìã Product IDs to retrieve: {product_ids}")

    section("STEP 2: COSMOS DB (Retrieve Full Product Details)")
    products = await get_product_details_from_cosmos(product_ids)
    if not products:
        print("‚ùå No products found in Cosmos DB")
        return None

    # Stage 3: build the LLM context string and show it between rules
    section("STEP 3: FORMAT FOR LLM (Structured Context)")
    llm_context = format_products_for_llm(products, search_matches)

    print("‚úÖ LLM Context Generated:")
    print("\n" + rule)
    print(llm_context)
    print(rule)

    # Recap
    section("üìä RETRIEVAL SUMMARY")
    for summary_line in (
        f"‚úÖ Query: '{user_query}'",
        f"‚úÖ Azure AI Search: {len(search_matches)} matches found",
        f"‚úÖ Cosmos DB: {len(products)} products retrieved",
        f"‚úÖ LLM Context: {len(llm_context)} characters",
        "‚úÖ Ready to pass to voice agent LLM",
    ):
        print(summary_line)
    print(rule)

    return dict(
        query=user_query,
        search_matches=search_matches,
        products=products,
        llm_context=llm_context,
    )

print("‚úÖ Complete RAG test function ready")


‚úÖ Complete RAG test function ready



In [38]:
# Step 8.6: Run RAG Test with Different Queries

# Test Query 1: Casual jeans for men
print("üß™ Test 1: Casual jeans for men")
result1 = await test_rag_retrieval("casual jeans for men", top_k=3)


üß™ Test 1: Casual jeans for men
üéØ RAG RETRIEVAL TEST

üéØ RAG RETRIEVAL TEST
User Query: 'casual jeans for men'
Top K Results: 3
User Query: 'casual jeans for men'
Top K Results: 3

STEP 1: AZURE AI SEARCH (Semantic Vector Search)

STEP 1: AZURE AI SEARCH (Semantic Vector Search)
üîç Searching for: 'casual jeans for men'
   Top results: 3

üìä Step 1: Generate query embedding...

   Top results: 3

üìä Step 1: Generate query embedding...
   ‚úÖ Embedding generated (3072 dimensions)

üîé Step 2: Vector search in Azure AI Search...
   ‚úÖ Embedding generated (3072 dimensions)

üîé Step 2: Vector search in Azure AI Search...

   ‚úÖ Match 1 (Score: 0.6610):
      Product ID: PROD-MN-JEAN-2F9B66DF
      Category: Jeans | Gender: Men
      Formality: casual | Fit: relaxed
      Colors: blue
      Description: The Lenny Washed Wide-Leg Jeans offer a relaxed fit with a contemporary wide-leg silhouette, perfect...

   ‚úÖ Match 1 (Score: 0.6610):
      Product ID: PROD-MN-JEAN-2F9B6

In [40]:
# Echo the raw context string produced by Test 1 as the cell result
result1["llm_context"]

'### AVAILABLE PRODUCTS (Ranked by Relevance)\n\n**Product 1** (Relevance: 0.66)\n- **Brand**: Urban Edge\n- **Name**: Lenny Washed Wide-Leg Jeans\n- **Category**: Jeans | Gender: Men\n- **Style**: casual | Fit: relaxed\n- **Colors**: blue\n- **Materials**: denim, cotton\n- **Features**: wide-leg silhouette, washed finish, five-pocket design\n- **Climate**: mild, warm\n\n**Pricing**:\n  - Base Price: $95.00\n  - Member Price: $85.50\n  - Gold Price: $80.75\n  - Platinum Price: $76.00\n\n**Inventory** (Total: 350 units):\n  - US_WEST: 130 available\n  - US_EAST: 118 available\n  - US_SOUTH: 80 available\n\n**Image**: https://storagefactoryeastus.blob.core.windows.net/clothesimages/products/PROD-MN-JEAN-2F9B66DF.png\n**Rating**: ‚≠ê 4.5/5.0 (87 reviews)\n\n--------------------------------------------------------------------------------\n\n'

In [None]:
# Alternative Test Queries (uncomment to try different searches)
# Each call awaits test_rag_retrieval at the top level of the cell, like Test 1 above.
# Note: test_rag_retrieval returns None when the search or the Cosmos DB lookup
# finds nothing, so check the result before indexing into it.

# Test Query 2: Formal wear
# result2 = await test_rag_retrieval("formal business attire for women", top_k=2)

# Test Query 3: Winter clothing
# result3 = await test_rag_retrieval("warm winter clothing", top_k=3)

# Test Query 4: Specific colors
# result4 = await test_rag_retrieval("black jeans", top_k=2)

# Test Query 5: Comfortable fit
# result5 = await test_rag_retrieval("comfortable relaxed fit clothing", top_k=3)

print("\nüí° To test other queries, uncomment the lines above and run this cell")


## ✅ RAG Retrieval Flow Complete!

### What We Just Tested:
1. **Azure AI Search**: Converted user query → embedding → vector search → ranked results
2. **Product IDs**: Extracted product identifiers from top matches
3. **Cosmos DB**: Retrieved full product details (pricing, inventory, metadata)
4. **LLM Context**: Formatted structured data ready for voice agent

### Output Format for Voice Agent:
The `llm_context` string contains:
- **Product Rankings** with relevance scores
- **Complete Details**: Brand, name, category, style, colors, materials, features
- **Pricing**: Base price + tier discounts (member/gold/platinum)
- **Inventory**: Total stock + regional breakdown (US_WEST/EAST/SOUTH)
- **Image URLs**: Real blob storage URLs for display
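- **Descriptions**: Rich product descriptions from GPT-4o Vision are returned with each search match (`search_matches[i]["description"]`); note that `format_products_for_llm` does not currently copy them into the `llm_context` string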

### Next Steps for Backend Integration:
1. **Create Tool Function**: Wrap `test_rag_retrieval()` as a callable tool
2. **Add to Agent**: Register as available function for LLM to call
3. **Voice Flow**: 
   - User: "Show me casual jeans for men"
   - Agent calls tool → gets LLM context
   - Agent responds with product recommendations + image URLs
4. **Display Images**: Frontend shows product images from blob URLs

This is the **complete RAG pattern** ready for production! 🎉


## 🤖 Step 9: Multi-Agent Tool Functions (Concierge + Stylist Agents)

**Architecture Overview**:

We're building **two specialized agents** with distinct tool functions:

### **1️⃣ Concierge Agent** - Direct Product Search
- **User Query**: "Show me casual jeans for men"
- **Tool**: `search_products_general(query, top_k)`
- **Flow**: Simple semantic search → Return top products
- **Use Case**: Direct product lookups, inventory checks, price queries

### **2️⃣ Stylist Agent** - Personalized Recommendations
- **User Query**: "Find an outfit for my grandma's birthday party"
- **Tool**: `search_products_filtered(query, occasion, weather, formality, gender, age_group, colors, top_k)`
- **Flow**: Multi-turn conversation → Gather context → Filtered search → Personalized recommendations
- **Use Case**: Styling advice, gift recommendations, event-specific outfits

**Key Difference**: 
- Concierge = **Fast & Direct** (1 query, immediate results)
- Stylist = **Conversational & Contextual** (multiple filters, personalized)


In [42]:
# Import required types for agent tools
# NOTE: `Any` is already imported from `typing` in the notebook's first cell,
# so this import is redundant but harmless.
from typing import Any

print("‚úÖ Agent tool imports ready")


‚úÖ Agent tool imports ready


In [43]:
# Tool 1: Concierge Agent - General Product Search (Simple & Fast)

async def search_products_general(
    query: str,
    top_k: int = 5
) -> Dict[str, Any]:
    """
    Concierge agent tool: unfiltered semantic product search.

    Intended for direct product questions such as "Show me jeans" or
    "Do you have blue shirts?". Runs semantic_product_search, fetches the
    matching documents with get_product_details_from_cosmos and formats them
    with format_products_for_llm. Any exception is caught and reported in the
    returned dict instead of being raised.

    Args:
        query: Natural language product search query
        top_k: Number of products to return (default: 5)

    Returns:
        Dict that always contains ``products``, ``llm_response``, ``count``
        and ``query``. On success it additionally carries ``search_matches``
        and ``tool`` ("concierge_search"); on failure it carries ``error``.
    """
    print(f"üõéÔ∏è CONCIERGE AGENT: Searching for '{query}'...")

    def _empty_result(message: str) -> Dict[str, Any]:
        # Shared shape for every "nothing to show" outcome.
        return {
            "products": [],
            "llm_response": message,
            "count": 0,
            "query": query
        }

    try:
        # 1) Vector search for candidate products
        search_matches = await semantic_product_search(query, top_k)
        if not search_matches:
            return _empty_result(
                f"I'm sorry, I couldn't find any products matching '{query}'. Would you like to try a different search?"
            )

        # 2) Resolve candidates to full product documents
        wanted_ids = [hit["product_id"] for hit in search_matches]
        products = await get_product_details_from_cosmos(wanted_ids)
        if not products:
            return _empty_result(
                "I found some matches but couldn't retrieve the details. Please try again."
            )

        # 3) Build the context block and prepend a short spoken-style intro
        llm_context = format_products_for_llm(products, search_matches)
        found = len(products)
        plural_suffix = "s" if found > 1 else ""
        intro = f"I found {found} product{plural_suffix} for you. Here's what we have:\n\n"

        return {
            "products": products,
            "search_matches": search_matches,
            "llm_response": intro + llm_context,
            "count": found,
            "query": query,
            "tool": "concierge_search"
        }

    except Exception as e:
        print(f"‚ùå Concierge search error: {e}")
        failure = _empty_result("I encountered an error while searching. Please try again.")
        failure["error"] = str(e)
        return failure

print("‚úÖ Concierge Agent tool ready: search_products_general()")


‚úÖ Concierge Agent tool ready: search_products_general()


In [44]:
# Tool 2: Stylist Agent - Filtered Search with Context (Personalized)

def _odata_string(value: str) -> str:
    """Quote ``value`` as an OData string literal, doubling embedded single quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _build_stylist_filter(
    formality: Optional[str],
    gender: Optional[str],
    weather: Optional[str],
    colors: Optional[List[str]],
) -> Optional[str]:
    """Combine the stylist's attribute filters into one OData filter string, or None if no filter applies."""
    clauses = []

    if formality:
        clauses.append(f"formality eq {_odata_string(formality)}")

    if gender:
        clauses.append(f"gender eq {_odata_string(gender)}")

    # Only these four climate values are accepted as a filter; any other
    # weather value is ignored, exactly as before.
    if weather in ("warm", "mild", "cold", "rainy"):
        clauses.append(f"climate/any(c: c eq {_odata_string(weather)})")

    if colors:
        # A product qualifies if it has ANY of the requested colors.
        color_clauses = [f"colors/any(col: col eq {_odata_string(color)})" for color in colors]
        clauses.append(f"({' or '.join(color_clauses)})")

    # All attribute groups must hold at once.
    return " and ".join(clauses) if clauses else None


async def search_products_filtered(
    query: str,
    occasion: Optional[str] = None,
    weather: Optional[str] = None,
    formality: Optional[str] = None,
    gender: Optional[str] = None,
    age_group: Optional[str] = None,
    colors: Optional[List[str]] = None,
    top_k: int = 5
) -> Dict[str, Any]:
    """
    Stylist agent tool: vector search narrowed by attribute filters.

    Use this for COMPLEX styling queries:
    - "Find an outfit for my grandma's birthday party"
    - "What should I wear to a wedding in winter?"
    - "Help me find casual summer clothes for my teenage son"

    The agent should gather context through conversation BEFORE calling this tool:
    - Ask about the occasion (birthday, wedding, work, casual)
    - Ask about the weather/climate (hot, cold, rainy, mild)
    - Ask about formality (casual, business_casual, formal, athletic)
    - Infer or ask about gender and age group

    ``formality``, ``gender``, ``weather`` and ``colors`` become an OData
    filter on the search index. Filter values are quoted as OData string
    literals, so text containing an apostrophe (e.g. "men's") neither breaks
    nor alters the filter. ``occasion`` and ``age_group`` are NOT applied as
    search filters; ``occasion`` only shapes the response text and styling
    tip, and both are echoed back in ``filters_applied``.

    The embedding call and the search request are synchronous and are run with
    ``asyncio.to_thread`` so the event loop is not blocked. Any exception is
    caught and reported in the returned dict instead of being raised.

    Args:
        query: Natural language search query (used for semantic search)
        occasion: Event type (birthday, wedding, work, date_night, gym, casual_outing)
        weather: Climate filter; applied only for warm, mild, cold or rainy
        formality: Style level (casual, business_casual, formal, athletic)
        gender: Target gender (Men, Women, Unisex)
        age_group: Age category (teen, young_adult, adult, senior)
        colors: Preferred color palette (e.g., ["blue", "black", "navy"])
        top_k: Number of products to return (default: 5)

    Returns:
        Dict that always contains ``products``, ``llm_response``, ``count``,
        ``query`` and ``filters_applied``. On success it additionally carries
        ``search_matches`` and ``tool`` ("stylist_search_filtered"); on
        failure it carries ``error``.
    """
    print(f"üëî STYLIST AGENT: Building personalized recommendations...")
    print(f"   Query: '{query}'")
    print(f"   Filters: occasion={occasion}, weather={weather}, formality={formality}")
    print(f"   Gender: {gender}, Age: {age_group}, Colors: {colors}")

    # Built once and attached to every return path so callers can always see
    # which context the request was made with.
    filters_applied = {
        "occasion": occasion,
        "weather": weather,
        "formality": formality,
        "gender": gender,
        "age_group": age_group,
        "colors": colors
    }

    try:
        # Step 1: Build the OData filter first. It is pure string work, so it
        # happens before the (remote) embedding request is issued.
        filter_expression = _build_stylist_filter(formality, gender, weather, colors)
        print(f"\nüîç Azure AI Search Filter: {filter_expression or 'None (pure semantic search)'}")

        from azure.search.documents.models import VectorizedQuery

        # Generate embedding for semantic search (synchronous helper, so run
        # it in a worker thread).
        query_embedding = await asyncio.to_thread(generate_embedding, query)

        # Step 2: Execute filtered vector search
        vector_query = VectorizedQuery(
            vector=query_embedding,
            # Over-fetch nearest neighbours; `top` below still caps the rows
            # returned at top_k, so no manual cut-off is needed in the loop.
            k_nearest_neighbors=top_k * 2,
            fields="desc_vector"
        )

        def _run_search() -> list:
            # search() hands back a lazy pager; list() forces the HTTP
            # round-trips to happen inside the worker thread.
            return list(
                search_query_client.search(
                    search_text=None,
                    vector_queries=[vector_query],
                    filter=filter_expression,
                    select=["id", "category", "gender", "formality", "fit", "features",
                            "climate", "colors", "materials", "rich_description"],
                    top=top_k
                )
            )

        results = await asyncio.to_thread(_run_search)

        # Step 3: Collect filtered results
        search_matches = []
        for idx, result in enumerate(results, 1):
            match_data = {
                "product_id": result["id"],
                "score": result.get("@search.score", 0),
                "category": result.get("category"),
                "gender": result.get("gender"),
                "formality": result.get("formality"),
                "fit": result.get("fit"),
                "features": result.get("features", []),
                "climate": result.get("climate", []),
                "colors": result.get("colors", []),
                "materials": result.get("materials", []),
                "description": result.get("rich_description", "")
            }
            search_matches.append(match_data)

            print(f"\n   ‚úÖ Match {idx} (Score: {match_data['score']:.4f}):")
            print(f"      Product ID: {match_data['product_id']}")
            print(f"      {match_data['category']} | {match_data['gender']} | {match_data['formality']}")

        if not search_matches:
            return {
                "products": [],
                "llm_response": f"I couldn't find products matching those specific criteria. Let's try adjusting the filters - maybe try a different color or style?",
                "count": 0,
                "query": query,
                "filters_applied": filters_applied
            }

        # Step 4: Get full details from Cosmos DB
        product_ids = [match["product_id"] for match in search_matches]
        products = await get_product_details_from_cosmos(product_ids)

        if not products:
            return {
                "products": [],
                "llm_response": "I found some matches but couldn't retrieve the details. Let me try again.",
                "count": 0,
                "query": query,
                "filters_applied": filters_applied
            }

        # Step 5: Format for LLM with styling context
        llm_context = format_products_for_llm(products, search_matches)

        # Step 6: Create personalized stylist response
        llm_response = "Based on your needs, here are my personalized recommendations:\n\n"

        if occasion:
            llm_response += f"**Perfect for**: {occasion.replace('_', ' ').title()}\n"
        if weather:
            llm_response += f"**Weather**: {weather.title()}\n"
        if formality:
            llm_response += f"**Style Level**: {formality.replace('_', ' ').title()}\n"

        llm_response += f"\n**I found {len(products)} great option{'s' if len(products) > 1 else ''} for you:**\n\n"
        llm_response += llm_context

        # Occasion-specific styling tip; occasions without an entry add nothing.
        styling_tips = {
            "wedding": "\nüí° **Styling Tip**: For weddings, consider adding accessories like a tie or elegant jewelry to complete the look!",
            "birthday": "\nüí° **Styling Tip**: Birthday parties are great for showing personality - don't be afraid to add some color!",
            "work": "\nüí° **Styling Tip**: Keep it professional with neutral colors and classic cuts. You can always add personality with subtle accessories.",
        }
        llm_response += styling_tips.get(occasion, "")

        return {
            "products": products,
            "search_matches": search_matches,
            "llm_response": llm_response,
            "count": len(products),
            "query": query,
            "filters_applied": filters_applied,
            "tool": "stylist_search_filtered"
        }

    except Exception as e:
        print(f"‚ùå Stylist search error: {e}")
        import traceback
        traceback.print_exc()
        return {
            "products": [],
            "llm_response": f"I encountered an error while building recommendations. Let me try a simpler search.",
            "count": 0,
            "query": query,
            "filters_applied": filters_applied,
            "error": str(e)
        }

print("‚úÖ Stylist Agent tool ready: search_products_filtered()")


‚úÖ Stylist Agent tool ready: search_products_filtered()


In [45]:
# Test Tool 1: Concierge Agent - Simple product search

print("="*80)
print("üß™ TEST 1: CONCIERGE AGENT - Direct Product Search")
print("="*80)

result_concierge = await search_products_general("casual jeans for men", top_k=3)

print("\nüìä CONCIERGE RESULT:")
print(f"   Products Found: {result_concierge['count']}")
print(f"   Tool Used: {result_concierge.get('tool')}")
print("\nüé§ Voice Response Preview:")
print(result_concierge['llm_response'][:500] + "..." if len(result_concierge['llm_response']) > 500 else result_concierge['llm_response'])


üß™ TEST 1: CONCIERGE AGENT - Direct Product Search
üõéÔ∏è CONCIERGE AGENT: Searching for 'casual jeans for men'...
üîç Searching for: 'casual jeans for men'
   Top results: 3

üìä Step 1: Generate query embedding...
   ‚úÖ Embedding generated (3072 dimensions)

üîé Step 2: Vector search in Azure AI Search...

   ‚úÖ Match 1 (Score: 0.6610):
      Product ID: PROD-MN-JEAN-2F9B66DF
      Category: Jeans | Gender: Men
      Formality: casual | Fit: relaxed
      Colors: blue
      Description: The Lenny Washed Wide-Leg Jeans offer a relaxed fit with a contemporary wide-leg silhouette, perfect...

‚úÖ Found 1 products
üíæ Retrieving 1 products from Cosmos DB...

   ‚úÖ Retrieved: PROD-MN-JEAN-2F9B66DF
      Brand: Urban Edge
      Name: Lenny Washed Wide-Leg Jeans
      Base Price: $95
      Total Stock: 350 units
      Image URL: https://storagefactoryeastus.blob.core.windows.net/clothesim...

‚úÖ Retrieved 1 complete product records

üìä CONCIERGE RESULT:
   Products Found: 1
   

In [46]:
# Test Tool 2: Stylist Agent - Filtered search with context

print("="*80)
print("üß™ TEST 2: STYLIST AGENT - Contextual Recommendation")
print("="*80)
print("Scenario: User asks 'Find an outfit for my grandma's birthday party'")
print("Agent gathers context through conversation...")
print("="*80)

# Stylist would gather this info through multi-turn conversation:
result_stylist = await search_products_filtered(
    query="elegant comfortable clothing for older woman",
    occasion="birthday",
    weather="mild",
    formality="casual",
    gender="Women",
    age_group="senior",
    colors=["navy", "black", "grey"],
    top_k=3
)

print("\nüìä STYLIST RESULT:")
print(f"   Products Found: {result_stylist['count']}")
print(f"   Tool Used: {result_stylist.get('tool')}")
print(f"   Filters Applied: {result_stylist.get('filters_applied')}")
print("\nüé§ Voice Response Preview:")
print(result_stylist['llm_response'][:600] + "..." if len(result_stylist['llm_response']) > 600 else result_stylist['llm_response'])


üß™ TEST 2: STYLIST AGENT - Contextual Recommendation
Scenario: User asks 'Find an outfit for my grandma's birthday party'
Agent gathers context through conversation...
üëî STYLIST AGENT: Building personalized recommendations...
   Query: 'elegant comfortable clothing for older woman'
   Filters: occasion=birthday, weather=mild, formality=casual
   Gender: Women, Age: senior, Colors: ['navy', 'black', 'grey']

üîç Azure AI Search Filter: formality eq 'casual' and gender eq 'Women' and climate/any(c: c eq 'mild') and (colors/any(col: col eq 'navy') or colors/any(col: col eq 'black') or colors/any(col: col eq 'grey'))

üìä STYLIST RESULT:
   Products Found: 0
   Tool Used: None
   Filters Applied: {'occasion': 'birthday', 'weather': 'mild', 'formality': 'casual', 'gender': 'Women', 'age_group': 'senior', 'colors': ['navy', 'black', 'grey']}

üé§ Voice Response Preview:
I couldn't find products matching those specific criteria. Let's try adjusting the filters - maybe try a different 

## 📋 Multi-Agent Tool Architecture Summary

### **Tool Design Philosophy**:

#### **1️⃣ Concierge Agent Tool** (`search_products_general`)
**When to Use**: Direct, simple product queries
- "Show me jeans"
- "Do you have running shoes?"
- "What's the price of this shirt?"

**Parameters**: Just `query` and `top_k`
**Filters**: ❌ None - Pure semantic search
**Speed**: ⚡ Fast (1-2 seconds)
**Response**: Direct product list

#### **2️⃣ Stylist Agent Tool** (`search_products_filtered`)
**When to Use**: Complex styling/recommendation scenarios
- "Find an outfit for my grandma's birthday"
- "What should I wear to a wedding in winter?"
- "Help me dress for a job interview"

**Parameters**: 
- `query` - Semantic search base
- `occasion` - Event context
- `weather` - Climate filter
- `formality` - Style level
- `gender` - Target audience
- `age_group` - Age-appropriate styling
- `colors` - Color preferences
- `top_k` - Result limit

**Filters**: ✅ Azure AI Search OData filters applied
**Speed**: 🐢 Slower (2-4 seconds) - More precise
**Response**: Personalized recommendations + styling tips

### **Agent Orchestration Flow**:

```
User: "Can you help me find something for my grandma?"
  ↓
System detects: COMPLEX QUERY → Route to STYLIST AGENT
  ↓
Stylist Agent:
  - Asks: "What's the occasion?" → "Birthday party"
  - Asks: "What's the weather like?" → "Mild, spring weather"
  - Asks: "Formal or casual?" → "Casual but nice"
  - Infers: Gender=Women, Age=Senior
  ↓
Calls: search_products_filtered(
  query="elegant comfortable clothing",
  occasion="birthday",
  weather="mild",
  formality="casual",
  gender="Women",
  age_group="senior"
)
  ↓
Returns: 3-5 personalized products + styling advice
```

### **Key Benefits**:
1. ✅ **Separation of Concerns**: Simple queries don't need complex filtering
2. ✅ **Performance**: Concierge is fast, Stylist is precise
3. ✅ **User Experience**: Natural conversation flow for complex needs
4. ✅ **Scalability**: Easy to add more specialized agents (e.g., Gift Advisor, Wardrobe Builder)

### **Next Steps for Backend Integration**:
1. Register both tools in your agent framework
2. Add intent detection to route to correct agent
3. Implement multi-turn conversation state for Stylist agent
4. Add tool schemas for LLM function calling


In [47]:
# Test Tool 2b: Stylist Agent with broader filters (more realistic)

print("="*80)
print("üß™ TEST 2B: STYLIST AGENT - Wedding Outfit (Winter)")
print("="*80)
print("Scenario: User asks 'What should I wear to a winter wedding?'")
print("Agent gathers: Male user, formal event, cold weather")
print("="*80)

# More realistic scenario with products we have
result_stylist2 = await search_products_filtered(
    query="formal elegant clothing",
    occasion="wedding",
    weather="cold",
    formality="formal",
    gender="Men",
    colors=["navy", "black"],
    top_k=3
)

print("\nüìä STYLIST RESULT:")
print(f"   Products Found: {result_stylist2['count']}")
print(f"   Tool Used: {result_stylist2.get('tool')}")
print(f"   Filters Applied: {result_stylist2.get('filters_applied')}")

if result_stylist2['count'] > 0:
    print("\nüé§ Voice Response Preview:")
    print(result_stylist2['llm_response'][:700] + "...")
else:
    print(f"\nüé§ Response: {result_stylist2['llm_response']}")


üß™ TEST 2B: STYLIST AGENT - Wedding Outfit (Winter)
Scenario: User asks 'What should I wear to a winter wedding?'
Agent gathers: Male user, formal event, cold weather
üëî STYLIST AGENT: Building personalized recommendations...
   Query: 'formal elegant clothing'
   Filters: occasion=wedding, weather=cold, formality=formal
   Gender: Men, Age: None, Colors: ['navy', 'black']

üîç Azure AI Search Filter: formality eq 'formal' and gender eq 'Men' and climate/any(c: c eq 'cold') and (colors/any(col: col eq 'navy') or colors/any(col: col eq 'black'))

üìä STYLIST RESULT:
   Products Found: 0
   Tool Used: None
   Filters Applied: {'occasion': 'wedding', 'weather': 'cold', 'formality': 'formal', 'gender': 'Men', 'age_group': None, 'colors': ['navy', 'black']}

üé§ Response: I couldn't find products matching those specific criteria. Let's try adjusting the filters - maybe try a different color or style?


In [48]:
# Test Tool 2c: Stylist Agent with data we have (casual men's jeans)

print("="*80)
print("üß™ TEST 2C: STYLIST AGENT - Casual Summer Jeans")
print("="*80)
print("Scenario: User asks 'Find comfortable jeans for summer'")
print("="*80)

# Test with filters matching our sample data
result_stylist3 = await search_products_filtered(
    query="comfortable jeans summer",
    occasion="casual_outing",
    weather="warm",
    formality="casual",
    gender="Men",
    top_k=3
)

print("\nüìä STYLIST RESULT:")
print(f"   Products Found: {result_stylist3['count']}")
print(f"   Tool Used: {result_stylist3.get('tool')}")
print(f"   Filters Applied: {result_stylist3.get('filters_applied')}")

if result_stylist3['count'] > 0:
    print("\n‚úÖ SUCCESS! Found products with filtered search")
    print("\nüé§ Voice Response Preview:")
    response = result_stylist3['llm_response']
    print(response[:800] + "..." if len(response) > 800 else response)
else:
    print(f"\nüé§ Response: {result_stylist3['llm_response']}")


üß™ TEST 2C: STYLIST AGENT - Casual Summer Jeans
Scenario: User asks 'Find comfortable jeans for summer'
üëî STYLIST AGENT: Building personalized recommendations...
   Query: 'comfortable jeans summer'
   Filters: occasion=casual_outing, weather=warm, formality=casual
   Gender: Men, Age: None, Colors: None

üîç Azure AI Search Filter: formality eq 'casual' and gender eq 'Men' and climate/any(c: c eq 'warm')

   ‚úÖ Match 1 (Score: 0.6547):
      Product ID: PROD-MN-JEAN-2F9B66DF
      Jeans | Men | casual
üíæ Retrieving 1 products from Cosmos DB...

   ‚úÖ Retrieved: PROD-MN-JEAN-2F9B66DF
      Brand: Urban Edge
      Name: Lenny Washed Wide-Leg Jeans
      Base Price: $95
      Total Stock: 350 units
      Image URL: https://storagefactoryeastus.blob.core.windows.net/clothesim...

‚úÖ Retrieved 1 complete product records

üìä STYLIST RESULT:
   Products Found: 1
   Tool Used: stylist_search_filtered
   Filters Applied: {'occasion': 'casual_outing', 'weather': 'warm', 'formality'