# Generating your product search index
This notebook is designed to automatically create the product search index for you. It uses the [product catalog](products.csv) file to create the index. In order to do so it needs names and keys for the following services:

- Azure Search Service
- Azure OpenAI Service

You can find the names and keys in the Azure Portal. These need to be entered in a `.env` file in the root of this repository. The `.env` file is not checked in to source control. You can use the [`.env.sample`](../../.env.sample) file as a template.

In [32]:
import os
import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswParameters,
    HnswAlgorithmConfiguration,
    SemanticPrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticSearch,
    SemanticConfiguration,
    SemanticField,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
)
from typing import List, Dict
from openai import AzureOpenAI
from dotenv import load_dotenv

from pathlib import Path

# Load environment variables from .env file
print("üìÇ Loading environment variables from .env...")

# The notebook is at: python/data/product_info/create-azure-search.ipynb
# The .env is at: python/.env
# Go up 2 levels from the current working directory. This assumes the kernel
# was started in the notebook's own folder.
notebook_dir = Path.cwd()
env_path = notebook_dir.parent.parent / '.env'

print(f"Current directory: {notebook_dir}")
print(f"Looking for .env at: {env_path}")

if env_path.exists():
    print(f"‚úÖ Found .env file at {env_path}")
    load_dotenv(env_path, override=True)
else:
    # Try alternative path: one level above the working directory.
    # `__file__` is not defined inside a notebook kernel, so referencing it
    # here would raise NameError; derive the fallback from the working
    # directory instead (same location the `__file__`-based path pointed at).
    env_path_alt = notebook_dir.parent / '.env'
    print(f"‚ö†Ô∏è  .env not found, trying: {env_path_alt}")
    if env_path_alt.exists():
        print(f"‚úÖ Found .env file at {env_path_alt}")
        load_dotenv(env_path_alt, override=True)
    else:
        # Last resort: let python-dotenv locate a .env file on its own.
        print(f"‚ö†Ô∏è  .env file not found. Looking in current directory...")
        load_dotenv(override=True)

# Read the four service settings; a missing variable is treated as "".
search_endpoint = os.environ.get("SEARCH_ENDPOINT", "").strip()
search_key = os.environ.get("SEARCH_KEY", "").strip()
open_ai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "").strip()
open_ai_key = os.environ.get("AZURE_OPENAI_API_KEY", "").strip()


def _endpoint_preview(value: str) -> str:
    """Return the first 60 characters of `value` ('...' appended when cut), or 'NOT SET' if empty."""
    if not value:
        return 'NOT SET'
    return value[:60] + ('...' if len(value) > 60 else '')


# Summarise what was loaded. Keys are only reported as present/absent so the
# secret values never appear in the cell output.
print("\nüìä Loaded Configuration:")
print(f"  SEARCH_ENDPOINT: {_endpoint_preview(search_endpoint)}")
print(f"  SEARCH_KEY: {'Set' if search_key else 'NOT SET'}")
print(f"  AZURE_OPENAI_ENDPOINT: {_endpoint_preview(open_ai_endpoint)}")
print(f"  AZURE_OPENAI_API_KEY: {'Set' if open_ai_key else 'NOT SET'}")

# Validate: endpoints must be non-empty and must not contain the "your_"
# placeholder text; keys must be at least 10 characters long.
if not (search_endpoint and "your_" not in search_endpoint.lower()):
    raise ValueError(f"‚ùå SEARCH_ENDPOINT is invalid or not set: '{search_endpoint}'")
if not (search_key and len(search_key) >= 10):
    raise ValueError(f"‚ùå SEARCH_KEY is invalid or not set")
if not (open_ai_endpoint and "your_" not in open_ai_endpoint.lower()):
    raise ValueError(f"‚ùå AZURE_OPENAI_ENDPOINT is invalid or not set")
if not (open_ai_key and len(open_ai_key) >= 10):
    raise ValueError(f"‚ùå AZURE_OPENAI_API_KEY is invalid or not set")

print("\n‚úÖ All environment variables validated successfully!")

üìÇ Loading environment variables from .env...
Current directory: c:\Azure-Deleveries\AI-3016-ENU-PowerPoint\RAG-Live-demo\rag-app\python\data\product_info
Looking for .env at: c:\Azure-Deleveries\AI-3016-ENU-PowerPoint\RAG-Live-demo\rag-app\python\.env
‚úÖ Found .env file at c:\Azure-Deleveries\AI-3016-ENU-PowerPoint\RAG-Live-demo\rag-app\python\.env

üìä Loaded Configuration:
  SEARCH_ENDPOINT: https://apsearchfoundary123.search.windows.net/
  SEARCH_KEY: Set
  AZURE_OPENAI_ENDPOINT: https://aphubaiservices.openai.azure.com/
  AZURE_OPENAI_API_KEY: Set

‚úÖ All environment variables validated successfully!


In [33]:
def delete_index(search_index_client: SearchIndexClient, search_index: str):
    """
    Best-effort removal of the index named `search_index`.

    Progress is reported on stdout. Any exception raised by the client's
    `delete_index` call is caught and printed instead of being propagated,
    so the function returns None in both the success and the failure case.
    """
    print(f"deleting index {search_index}")
    try:
        search_index_client.delete_index(search_index)
    except Exception as err:
        # Deliberately non-fatal: the caller recreates the index right after.
        print(f"‚ö†Ô∏è could not delete index {search_index}: {err}")
        print("This is normal if the index doesn't exist yet")
    else:
        print(f"‚úÖ index {search_index} deleted")

In [34]:
def create_index_definition(name: str) -> SearchIndex:
    """
    Build the definition of a search index called `name`.

    The definition contains the text fields `id` (key), `content`, `filepath`,
    `title` and `url`, a 1536-dimension `contentVector` field bound to the
    "myHnswProfile" vector profile, a semantic configuration named "default",
    and two vector-search algorithms (HNSW and exhaustive KNN), both using
    cosine distance. This function only constructs the objects; it does not
    call the search service.
    """
    # --- Vector search: algorithms and the profiles that reference them ---
    # HNSW (Hierarchical Navigable Small World) is an approximate nearest
    # neighbor algorithm; exhaustive KNN is registered alongside it.
    hnsw_algorithm = HnswAlgorithmConfiguration(
        name="myHnsw",
        kind=VectorSearchAlgorithmKind.HNSW,
        parameters=HnswParameters(
            m=4,
            ef_construction=400,
            ef_search=500,
            metric=VectorSearchAlgorithmMetric.COSINE,
        ),
    )
    exhaustive_algorithm = ExhaustiveKnnAlgorithmConfiguration(
        name="myExhaustiveKnn",
        kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
        parameters=ExhaustiveKnnParameters(
            metric=VectorSearchAlgorithmMetric.COSINE
        ),
    )
    hnsw_profile = VectorSearchProfile(
        name="myHnswProfile",
        algorithm_configuration_name="myHnsw",
    )
    exhaustive_profile = VectorSearchProfile(
        name="myExhaustiveKnnProfile",
        algorithm_configuration_name="myExhaustiveKnn",
    )
    vector_search = VectorSearch(
        algorithms=[hnsw_algorithm, exhaustive_algorithm],
        profiles=[hnsw_profile, exhaustive_profile],
    )

    # --- Semantic ranking: "title" is the title field, "content" the body ---
    prioritized = SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        keywords_fields=[],
        content_fields=[SemanticField(field_name="content")],
    )
    semantic_search = SemanticSearch(
        configurations=[
            SemanticConfiguration(name="default", prioritized_fields=prioritized)
        ]
    )

    # --- Fields ---
    # The vector field used for vector search; it points at the HNSW profile
    # declared above.
    content_vector_field = SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Size of the vector created by the text-embedding-ada-002 model.
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    )
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SimpleField(name="filepath", type=SearchFieldDataType.String),
        SearchableField(name="title", type=SearchFieldDataType.String),
        SimpleField(name="url", type=SearchFieldDataType.String),
        content_vector_field,
    ]

    # Assemble and hand back the index definition.
    return SearchIndex(
        name=name,
        fields=fields,
        semantic_search=semantic_search,
        vector_search=vector_search,
    )

In [35]:
def gen_contoso_products(
    path: str,
    embedding_model: str = "text-embedding-ada-002",
    api_version: str = "2024-12-01-preview",
) -> List[Dict[str, object]]:
    """
    Turn the product catalog CSV into documents ready for upload to the index.

    Args:
        path: Path of a CSV file with at least the columns `id`, `name` and
            `description`.
        embedding_model: Value passed as `model` to the embeddings API (on
            Azure OpenAI this is usually the deployment name; confirm for
            your resource). The index defined in this notebook declares a
            1536-dimension vector field, so the model must produce vectors
            of that size.
        api_version: Azure OpenAI API version used by the client.

    Returns:
        One dict per CSV row with the keys `id`, `content`, `filepath`,
        `title`, `url` and `contentVector` (the embedding of the description).

    Notes:
        Relies on the notebook-level variables `open_ai_endpoint` and
        `open_ai_key` set by the configuration cell, and issues one
        embeddings request per row.
    """
    # Use the environment variables loaded in the configuration cell
    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=open_ai_endpoint,
        api_key=open_ai_key,
    )

    products = pd.read_csv(path)
    items = []
    for product in products.to_dict("records"):
        content = product["description"]
        title = product["name"]
        # Lower-cased, dash-separated name; shared by `filepath` and `url`.
        slug = title.lower().replace(' ', '-')
        emb = client.embeddings.create(input=content, model=embedding_model)
        items.append(
            {
                # The index key field is a string, so coerce the CSV value.
                "id": str(product["id"]),
                "content": content,
                "filepath": slug,
                "title": title,
                "url": f"/products/{slug}",
                "contentVector": emb.data[0].embedding,
            }
        )

    return items

In [36]:
index_name = "contoso-products"

# Ensure the endpoint URL ends with exactly what the clients are given below:
# the configured value, plus a trailing / when it does not already have one.
contoso_search = (
    search_endpoint if search_endpoint.endswith('/') else search_endpoint + '/'
)

print(f"üîç Connecting to Azure Search at: {contoso_search}")
print(f"üìã Index name: {index_name}")

try:
    search_index_client = SearchIndexClient(
        endpoint=contoso_search,
        credential=AzureKeyCredential(search_key),
    )
except Exception as err:
    print(f"‚ùå Error connecting to Azure Search: {err}")
    raise
else:
    print("‚úÖ Successfully connected to Azure Search")

# Start from a clean slate: drop any existing index, then (re)create it from
# the definition built by create_index_definition.
delete_index(search_index_client, index_name)
index = create_index_definition(index_name)
print(f"creating index {index_name}")
search_index_client.create_or_update_index(index)
print(f"index {index_name} created")

üîç Connecting to Azure Search at: https://apsearchfoundary123.search.windows.net/
üìã Index name: contoso-products
‚úÖ Successfully connected to Azure Search
deleting index contoso-products
‚úÖ index contoso-products deleted
creating index contoso-products
index contoso-products created


In [37]:
print(f"üìÑ indexing documents")

# Step 1: build one document (with its embedding) per row of the catalog.
try:
    docs = gen_contoso_products("products.csv")
except Exception as err:
    print(f"‚ùå Error generating embeddings: {err}")
    raise
else:
    print(f"‚úÖ Generated {len(docs)} product embeddings")

# Step 2: push the documents into the index and report how many succeeded.
try:
    search_client = SearchClient(
        endpoint=contoso_search,
        index_name=index_name,
        credential=AzureKeyCredential(search_key),
    )
    print(f"‚úÖ Connected to search index {index_name}")

    print(f"uploading {len(docs)} documents to index {index_name}")
    results = search_client.upload_documents(docs)

    # Each result carries a per-document success flag.
    successful = len([outcome for outcome in results if outcome.succeeded])
    print(f"‚úÖ Successfully uploaded {successful}/{len(docs)} documents")
except Exception as err:
    print(f"‚ùå Error uploading documents: {err}")
    raise

üìÑ indexing documents
‚úÖ Generated 20 product embeddings
‚úÖ Connected to search index contoso-products
uploading 20 documents to index contoso-products
‚úÖ Successfully uploaded 20/20 documents
