In [4]:
%pip install azure-ai-projects azure-search-documents azure-identity python-dotenv

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
from pathlib import Path
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
)
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv
from openai import AzureOpenAI

print("üìÇ Loading environment variables...")

# Look for the .env file two directory levels above the current working directory.
notebook_dir = Path.cwd()
env_path = notebook_dir.parent.parent / '.env'

if env_path.exists():
    print(f"‚úÖ Found .env at {env_path}")
    # override=True: values in the file replace variables already set in the process.
    load_dotenv(env_path, override=True)
else:
    print(f"‚ö†Ô∏è .env not found at {env_path}, using current directory")
    # No explicit path: python-dotenv falls back to its own .env lookup.
    load_dotenv()

# Required settings, keyed by environment variable name. Values are stripped so
# that stray whitespace in the .env file does not end up in endpoints or keys;
# an unset variable becomes "".
required_settings = {
    name: os.getenv(name, "").strip()
    for name in (
        "SEARCH_ENDPOINT",
        "SEARCH_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_KEY",
    )
}
search_endpoint = required_settings["SEARCH_ENDPOINT"]
search_key = required_settings["SEARCH_KEY"]
openai_endpoint = required_settings["AZURE_OPENAI_ENDPOINT"]
openai_key = required_settings["AZURE_OPENAI_API_KEY"]

# Keys are never echoed: only whether they are set. Endpoints are cut to 60 characters.
print("\nüìã Configuration loaded:")
print(f"  Search Endpoint: {search_endpoint[:60] if search_endpoint else 'NOT SET'}...")
print(f"  Search Key: {'Set' if search_key else 'NOT SET'}")
print(f"  OpenAI Endpoint: {openai_endpoint[:60] if openai_endpoint else 'NOT SET'}...")
print(f"  OpenAI Key: {'Set' if openai_key else 'NOT SET'}")

# Fail fast, and say exactly which variables are absent or empty so the reader
# does not have to cross-check the summary above against the .env file.
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise ValueError(
        "‚ùå Missing required environment variables: "
        f"{', '.join(missing_settings)}. Check your .env file."
    )

# Name of the search index and the directory holding the manuals
# (relative to the current working directory).
index_name = "contoso-manuals-index"
path_to_data = "./manuals"

print(f"\nüìä Index Configuration:")
print(f"  Index Name: {index_name}")
print(f"  Data Path: {path_to_data}")

üìÇ Loading environment variables...
‚úÖ Found .env at c:\Azure-Deleveries\AI-3016-ENU-PowerPoint\RAG-Live-demo\rag-app\python\.env

üìã Configuration loaded:
  Search Endpoint: https://apsearchfoundary123.search.windows.net/...
  Search Key: Set
  OpenAI Endpoint: https://aphubaiservices.openai.azure.com/...
  OpenAI Key: Set

üìä Index Configuration:
  Index Name: contoso-manuals-index
  Data Path: ./manuals


In [7]:
# Connect to Azure AI Search, then (re)create the product-manual index.
print(f"üîç Connecting to Azure Search at: {search_endpoint}")

# A single key credential is shared by the index-management client and the
# document client bound to `index_name`.
search_credential = AzureKeyCredential(search_key)
search_index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential,
)
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

# Index schema for product manuals, one named object per field.
id_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)
content_field = SearchableField(name="content", type=SearchFieldDataType.String, searchable=True)
source_field = SimpleField(name="source", type=SearchFieldDataType.String, filterable=True)
filename_field = SearchableField(name="filename", type=SearchFieldDataType.String, searchable=True)
# Vector field: a collection of single-precision floats with 1536 dimensions,
# attached to the "myHnswProfile" vector search profile defined below.
embedding_field = SearchField(
    name="embedding",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1536,
    vector_search_profile_name="myHnswProfile",
)
fields = [id_field, content_field, source_field, filename_field, embedding_field]

# Vector search: one HNSW algorithm configuration (cosine metric) and one
# profile that points at it by name.
hnsw_parameters = {
    "m": 4,
    "efConstruction": 400,
    "efSearch": 500,
    "metric": "cosine",
}
hnsw_algorithm = HnswAlgorithmConfiguration(name="myHnsw", parameters=hnsw_parameters)
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)
vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[hnsw_profile])

# Full index definition: name + fields + vector search configuration.
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)

# Drop any previous copy of the index. This step is best-effort: a failure is
# reported and the cell carries on to the create step.
try:
    print(f"üóëÔ∏è Attempting to delete existing index '{index_name}'...")
    search_index_client.delete_index(index_name)
    print("‚úÖ Deleted existing index")
except Exception as delete_error:
    print(f"‚ö†Ô∏è Index doesn't exist or couldn't be deleted (this is normal on first run): {delete_error}")

# Create the index from the definition above. A failure here is reported and
# then re-raised so the cell stops.
try:
    print(f"üìù Creating new index '{index_name}'...")
    search_index_client.create_index(index)
    print(f"‚úÖ Index '{index_name}' created successfully")
except Exception as create_error:
    print(f"‚ùå Error creating index: {create_error}")
    raise

üîç Connecting to Azure Search at: https://apsearchfoundary123.search.windows.net/
üóëÔ∏è Attempting to delete existing index 'contoso-manuals-index'...
‚úÖ Deleted existing index
üìù Creating new index 'contoso-manuals-index'...
‚úÖ Index 'contoso-manuals-index' created successfully


In [8]:
# Read every markdown manual, embed it, and upload the resulting documents to
# the search index.
print(f"\nüìÑ Loading manual files from {path_to_data}...")

# Azure OpenAI client used only for generating embeddings.
openai_client = AzureOpenAI(
    api_version="2024-12-01-preview",
    azure_endpoint=openai_endpoint,
    api_key=openai_key
)

manual_dir = Path(path_to_data)
documents = []
doc_id = 0

if not manual_dir.exists():
    print(f"‚ùå Directory not found: {path_to_data}")
    raise FileNotFoundError(f"Manual directory not found: {path_to_data}")

# Sort the matches: glob() makes no ordering promise, and the running counter
# below turns file order into document ids. Sorting keeps the id <-> file
# mapping identical on every run and platform.
markdown_files = sorted(manual_dir.glob("*.md"))
print(f"Found {len(markdown_files)} markdown files")

for md_file in markdown_files:
    try:
        print(f"  Processing {md_file.name}...", end=" ")

        content = md_file.read_text(encoding='utf-8')

        # The embedding is computed from the complete file text.
        # NOTE(review): with AzureOpenAI, `model` presumably has to match the
        # embedding deployment name on the resource — confirm.
        response = openai_client.embeddings.create(
            input=content,
            model="text-embedding-ada-002"
        )
        embedding = response.data[0].embedding

        documents.append({
            "id": str(doc_id),
            # Only the first 2000 characters are stored in the index, even
            # though the embedding above covers the whole file.
            "content": content[:2000],
            "source": "product_manual",
            "filename": md_file.name,
            "embedding": embedding,
        })
        # Ids are consumed only by files that were prepared successfully.
        doc_id += 1

        print("‚úÖ")

    except Exception as e:
        # Best-effort: report the failing file and move on to the next one.
        print(f"‚ùå Error: {e}")

print(f"\n‚úÖ Prepared {len(documents)} documents for upload")

# Upload the prepared documents to Azure Search in a single call.
if documents:
    try:
        print(f"üì§ Uploading {len(documents)} documents to Azure Search...")
        results = search_client.upload_documents(documents)

        # Each result carries its own `succeeded` flag; split them in one pass
        # so rejected documents can be listed instead of only being counted.
        successful = 0
        failed_results = []
        for result in results:
            if result.succeeded:
                successful += 1
            else:
                failed_results.append(result)

        print(f"‚úÖ Successfully uploaded {successful}/{len(documents)} documents")

        # Surface per-document failures so a partial upload does not pass
        # unnoticed behind the summary line.
        for failed in failed_results:
            print(f"  Failed to upload document '{failed.key}': {failed.error_message}")

    except Exception as e:
        print(f"‚ùå Error uploading documents: {e}")
        raise
else:
    print("‚ö†Ô∏è No documents to upload")


üìÑ Loading manual files from ./manuals...
Found 20 markdown files
  Processing product_info_1.md... ‚úÖ
  Processing product_info_10.md... ‚úÖ
  Processing product_info_11.md... ‚úÖ
  Processing product_info_12.md... ‚úÖ
  Processing product_info_13.md... ‚úÖ
  Processing product_info_14.md... ‚úÖ
  Processing product_info_15.md... ‚úÖ
  Processing product_info_16.md... ‚úÖ
  Processing product_info_17.md... ‚úÖ
  Processing product_info_18.md... ‚úÖ
  Processing product_info_19.md... ‚úÖ
  Processing product_info_2.md... ‚úÖ
  Processing product_info_20.md... ‚úÖ
  Processing product_info_3.md... ‚úÖ
  Processing product_info_4.md... ‚úÖ
  Processing product_info_5.md... ‚úÖ
  Processing product_info_6.md... ‚úÖ
  Processing product_info_7.md... ‚úÖ
  Processing product_info_8.md... ‚úÖ
  Processing product_info_9.md... ‚úÖ

‚úÖ Prepared 20 documents for upload
üì§ Uploading 20 documents to Azure Search...
‚úÖ Successfully uploaded 20/20 documents


In [10]:
# Closing banner for the index-building steps above.
completion_message = "\n‚úÖ Manual index creation complete!"
print(completion_message)


‚úÖ Manual index creation complete!


In [11]:
# Optional sanity check: fetch the index definition back from the service and
# print its name and field list. Any failure is reported, not raised.
try:
    index_stats = search_index_client.get_index(index_name)
    field_names = [field.name for field in index_stats.fields]
    field_count = len(index_stats.fields)
    print("\nüìä Index Statistics:")
    print(f"  Index Name: {index_stats.name}")
    print(f"  Number of Fields: {field_count}")
    print(f"  Fields: {', '.join(field_names)}")
except Exception as lookup_error:
    print(f"Could not retrieve index stats: {lookup_error}")


üìä Index Statistics:
  Index Name: contoso-manuals-index
  Number of Fields: 5
  Fields: id, content, source, filename, embedding
