# Generating your document search index
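This notebook automatically creates a document search index for you. It reads markdown files from a local folder, splits them into chunks, embeds each chunk, and uploads the results to the index. To do so, it needs the endpoints and resource names of the following services (authentication uses `DefaultAzureCredential`, so no API keys are required):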

- Azure Search Service
- Azure OpenAI Service

You can find these values in the Azure Portal. Enter them in a `.env` file in the root of this repository; the notebook reads `AZURE_OPENAI_ENDPOINT`, `AZURE_EMBEDDING_NAME`, `AZURE_SEARCH_ENDPOINT`, and `AZURE_SEARCH_INDEX`. The `.env` file is not checked in to source control. You can use the [`.env.sample`](../../.env.sample) file as a template.

In [7]:
import os
import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswParameters,
    HnswAlgorithmConfiguration,
    SemanticPrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticSearch,
    SemanticConfiguration,
    SemanticField,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)
from typing import List, Dict
from openai import AzureOpenAI
from dotenv import load_dotenv

from pathlib import Path

load_dotenv()

True

In [8]:
def delete_index(search_index_client: SearchIndexClient, search_index: str):
    """
    Log which index is about to be removed, then delete it through the client.

    Args:
        search_index_client: Index-management client used to issue the delete.
        search_index: Name of the index to delete.
    """
    announcement = f"deleting index {search_index}"
    print(announcement)
    search_index_client.delete_index(search_index)

In [9]:
def create_index_definition(name: str, embedding_dimensions: int = 3072) -> SearchIndex:
    """
    Build the Azure AI Search index definition used for the markdown documents.

    This only constructs the `SearchIndex` object; nothing is sent to the search
    service until the caller passes the result to an index client.

    Args:
        name: Name of the index.
        embedding_dimensions: Length of the vectors stored in the `contentVector`
            and `titleVector` fields. The default (3072) is the size of the
            vectors created by the text-embedding-3-large model; it must match
            the embedding deployment that produces the vectors.

    Returns:
        The index definition with text, metadata and vector fields, a semantic
        configuration named "default", and HNSW / exhaustive-KNN vector profiles.

    Raises:
        KeyError: If `AZURE_OPENAI_ENDPOINT` or `AZURE_EMBEDDING_NAME` is not set
            in the environment.
    """
    # Both vector fields share the same shape and profile, so build them from a
    # single template instead of repeating the definition.
    vector_fields = [
        SearchField(
            name=vector_field_name,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=embedding_dimensions,
            vector_search_profile_name="myHnswProfile",
        )
        for vector_field_name in ("contentVector", "titleVector")
    ]

    # The fields we want to index. The vector fields are appended last and are
    # the ones used for vector search.
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SimpleField(name="filepath", type=SearchFieldDataType.String),
        SearchableField(name="title", type=SearchFieldDataType.String),
        SimpleField(name="url", type=SearchFieldDataType.String),
        # Additional metadata fields for better referencing
        SimpleField(name="originalFilename", type=SearchFieldDataType.String),
        SearchableField(name="originalTitle", type=SearchFieldDataType.String),
        SimpleField(name="documentStem", type=SearchFieldDataType.String),
        SimpleField(name="chunkIndex", type=SearchFieldDataType.Int32),
        SimpleField(name="totalChunks", type=SearchFieldDataType.Int32),
        SimpleField(name="isChunked", type=SearchFieldDataType.Boolean),
        *vector_fields,
    ]

    # The "content" field should be prioritized for semantic ranking.
    semantic_config = SemanticConfiguration(
        name="default",
        prioritized_fields=SemanticPrioritizedFields(
            title_field=SemanticField(field_name="title"),
            keywords_fields=[],
            content_fields=[SemanticField(field_name="content")],
        ),
    )

    # For vector search, we want to use the HNSW (Hierarchical Navigable Small World)
    # algorithm (a type of approximate nearest neighbor search algorithm) with cosine
    # distance.
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    m=4,
                    ef_construction=400,
                    ef_search=500,
                    metric=VectorSearchAlgorithmMetric.COSINE,
                ),
            ),
            ExhaustiveKnnAlgorithmConfiguration(
                name="myExhaustiveKnn",
                kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
                parameters=ExhaustiveKnnParameters(
                    metric=VectorSearchAlgorithmMetric.COSINE
                ),
            ),
        ],
        profiles=[
            # Only the HNSW profile is wired to the vectorizer; it is the profile
            # referenced by both vector fields above.
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
                vectorizer_name="myvectorizer"
            ),
            VectorSearchProfile(
                name="myExhaustiveKnnProfile",
                algorithm_configuration_name="myExhaustiveKnn",
            ),
        ],
        vectorizers=[
            AzureOpenAIVectorizer(
                vectorizer_name="myvectorizer",
                kind="azureOpenAI",
                parameters=AzureOpenAIVectorizerParameters(
                    resource_url=os.environ["AZURE_OPENAI_ENDPOINT"],
                    deployment_name=os.environ["AZURE_EMBEDDING_NAME"],
                    # NOTE(review): model_name reuses the deployment name, which
                    # presumably only works when the deployment is named after
                    # the model - confirm.
                    model_name=os.environ["AZURE_EMBEDDING_NAME"],
                    # TODO: there is some issue with managed identity for AI search.
                    # When it is resolved, pass the user-assigned identity here; do
                    # not hard-code its resource ID
                    # (/subscriptions/<subscription-id>/resourcegroups/<resource-group>/
                    # providers/Microsoft.ManagedIdentity/userAssignedIdentities/<name>)
                    # in the notebook - read it from the environment instead.
                ),
            ),
        ],
    )

    # Create the semantic settings with the configuration
    semantic_search = SemanticSearch(configurations=[semantic_config])

    # Assemble the index definition.
    index = SearchIndex(
        name=name,
        fields=fields,
        semantic_search=semantic_search,
        vector_search=vector_search,
    )

    return index

In [None]:
def _pack_pieces(pieces: List[str], joiner: str, max_chars: int) -> List[str]:
    """
    Greedily join consecutive pieces with `joiner` without exceeding `max_chars`.

    A piece that is longer than `max_chars` on its own is emitted alone (still
    oversized); the caller is responsible for splitting it further.
    """
    packed: List[str] = []
    current = ""
    for piece in pieces:
        if current and len(current) + len(joiner) + len(piece) > max_chars:
            # Adding this piece would overflow: close the current group.
            packed.append(current)
            current = piece
        elif current:
            current += joiner + piece
        else:
            current = piece
    if current:
        packed.append(current)
    return packed


def chunk_text(text: str, max_tokens: int = 6500) -> List[str]:
    """
    Split text into chunks that fit within the token limit.

    Token counts are estimated conservatively as 1 token per 3 characters, so
    every returned chunk is at most `max_tokens * 3` characters long. Splitting
    is attempted at progressively finer boundaries and only falls through to
    the next level for pieces that are still too long:

    1. paragraphs (double newlines),
    2. sentences (". "),
    3. words (whitespace),
    4. hard character slices, for a single whitespace-free run that exceeds
       the limit.

    Args:
        text: The text to split.
        max_tokens: Estimated token budget per chunk. Must be positive.

    Returns:
        The non-empty chunks in document order. Empty or whitespace-only input
        yields an empty list.

    Raises:
        ValueError: If `max_tokens` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    # Conservative estimate: 1 token ≈ 3 characters.
    max_chars = max_tokens * 3

    chunks: List[str] = []
    for paragraph_chunk in _pack_pieces(text.split("\n\n"), "\n\n", max_chars):
        paragraph_chunk = paragraph_chunk.strip()
        if len(paragraph_chunk) <= max_chars:
            chunks.append(paragraph_chunk)
            continue

        # A single paragraph is too long: split it by sentences. The period is
        # re-attached to every sentence except the last so that it is not lost
        # when a chunk boundary falls between two sentences.
        parts = paragraph_chunk.split(". ")
        sentences = [part + "." for part in parts[:-1]] + parts[-1:]
        for sentence_chunk in _pack_pieces(sentences, " ", max_chars):
            sentence_chunk = sentence_chunk.strip()
            if len(sentence_chunk) <= max_chars:
                chunks.append(sentence_chunk)
                continue

            # Emergency word-level splitting. Words that are themselves longer
            # than the limit are sliced so that no chunk can exceed max_chars.
            words = [
                word[start:start + max_chars]
                for word in sentence_chunk.split()
                for start in range(0, len(word), max_chars)
            ]
            chunks.extend(_pack_pieces(words, " ", max_chars))

    # Never hand empty chunks to the caller (e.g. whitespace-only input).
    return [chunk for chunk in chunks if chunk]

def gen_markdown_documents(
    folder_path: str,
) -> List[Dict[str, object]]:
    """
    Turn every markdown file under `folder_path` into embedded search documents.

    Each file is split with `chunk_text`; every chunk becomes one record holding
    the chunk text (prefixed with source information), metadata about the
    originating file, and embeddings of the content and of the display title.
    Files are processed in sorted path order so that document IDs are stable
    between runs over the same folder.

    A chunk whose embedding request fails is reported on stdout and skipped;
    the remaining chunks are still processed.

    Args:
        folder_path: Folder that is searched recursively for `*.md` files.

    Returns:
        One record per successfully embedded chunk, keyed by the field names of
        the search index ("id", "content", "contentVector", ...).

    Raises:
        KeyError: If `AZURE_OPENAI_ENDPOINT` or `AZURE_EMBEDDING_NAME` is not set
            in the environment.
    """
    openai_service_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
    openai_deployment = os.environ["AZURE_EMBEDDING_NAME"]

    token_provider = get_bearer_token_provider(
        DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
    )
    client = AzureOpenAI(
        api_version="2023-07-01-preview",
        azure_endpoint=openai_service_endpoint,
        azure_deployment=openai_deployment,
        azure_ad_token_provider=token_provider,
    )

    items = []
    folder = Path(folder_path)

    # Find all markdown files in the folder and subfolders. glob() yields files
    # in filesystem order, so sort to keep the generated IDs reproducible.
    md_files = sorted(folder.glob("**/*.md"))

    for doc_id, md_file in enumerate(md_files, start=1):
        # Read the markdown file content
        content = md_file.read_text(encoding="utf-8")

        # Extract title from the first level-1 heading, or fall back to the
        # filename without extension.
        title = next(
            (line[2:].strip() for line in content.split("\n") if line.startswith("# ")),
            md_file.stem,
        )

        # Preserve document metadata for referencing
        relative_path = md_file.relative_to(folder)
        original_filename = md_file.name
        document_path = str(relative_path)
        document_stem = md_file.stem

        # URL slug: drop only the file extension (not every ".md" substring) and
        # use "/"-separated parts regardless of the OS path separator.
        url_slug = relative_path.with_suffix("").as_posix().replace("/", "-").lower()

        # Split content into chunks
        content_chunks = chunk_text(content)
        total_chunks = len(content_chunks)
        is_chunked = total_chunks > 1

        print(f"Document: {title} - Split into {len(content_chunks)} chunks")

        # Process each chunk as a separate document
        for chunk_idx, chunk in enumerate(content_chunks):
            part_number = chunk_idx + 1

            # Unique ID per chunk; unchunked documents keep the plain document ID.
            chunk_id = f"{doc_id}_{part_number}" if is_chunked else str(doc_id)

            # The original title is kept for reference; chunk info is added only
            # to the display title.
            display_title = f"{title} (Part {part_number})" if is_chunked else title

            # Enhanced content with document reference information
            enhanced_content = f"Source Document: {original_filename}\n"
            if is_chunked:
                enhanced_content += f"Document Section: Part {part_number} of {total_chunks}\n"
            enhanced_content += f"Original Title: {title}\n\n{chunk}"

            url = f"/docs/{url_slug}"
            if is_chunked:
                url += f"-part-{part_number}"

            # Generate embeddings for enhanced content and title
            try:
                print(f"Processing chunk {chunk_idx + 1}: {len(enhanced_content)} chars (estimated {len(enhanced_content)//3} tokens)")

                content_emb = client.embeddings.create(input=enhanced_content, model=openai_deployment)
                title_emb = client.embeddings.create(input=display_title, model=openai_deployment)

                rec = {
                    "id": chunk_id,
                    "content": enhanced_content,  # Contains source reference
                    "filepath": document_path,     # Original relative path
                    "title": display_title,       # Display title with chunk info
                    "url": url,
                    "contentVector": content_emb.data[0].embedding,
                    "titleVector": title_emb.data[0].embedding,
                    # Additional metadata for better referencing
                    "originalFilename": original_filename,
                    "originalTitle": title,
                    "documentStem": document_stem,
                    "chunkIndex": part_number,
                    "totalChunks": total_chunks,
                    "isChunked": is_chunked,
                }
                items.append(rec)
                print(f"✓ Successfully processed: {display_title}")

            except Exception as e:
                # Best effort: report the failing chunk and carry on with the rest.
                print(f"✗ Error processing chunk {chunk_idx + 1} of {title}: {e}")
                print(f"  Enhanced content length: {len(enhanced_content)} characters (estimated {len(enhanced_content)//3} tokens)")
                print(f"  Original chunk length: {len(chunk)} characters")

    return items

In [11]:
# Target search service endpoint and index name come from the environment
# (a missing variable raises KeyError here). Both names are reused by the
# upload cell further down.
search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]
index_name = os.environ["AZURE_SEARCH_INDEX"]

# Index-management client authenticated with DefaultAzureCredential.
search_index_client = SearchIndexClient(
    search_endpoint, DefaultAzureCredential()
)

# Rebuild from scratch: delete the index with this name, then create it again
# from a fresh definition. NOTE: deleting the index discards its documents.
delete_index(search_index_client, index_name)
index = create_index_definition(index_name)
print(f"creating index {index_name}")
search_index_client.create_or_update_index(index)
print(f"index {index_name} created")

deleting index parssedindexer1
creating index parssedindexer1
creating index parssedindexer1
index parssedindexer1 created
index parssedindexer1 created


In [12]:
print("indexing documents")
# Change this path to point to your folder containing markdown files
markdown_folder = "./data/"  # Update this path as needed
docs = gen_markdown_documents(markdown_folder)

# Upload our data to the index.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=DefaultAzureCredential(),
)
print(f"uploading {len(docs)} documents to index {index_name}")

# Azure AI Search limits the number of documents and the payload size of one
# indexing request, and every document here carries two large vectors, so
# upload in modest batches instead of one request. An empty `docs` list sends
# no request at all.
UPLOAD_BATCH_SIZE = 100
ds = []
for batch_start in range(0, len(docs), UPLOAD_BATCH_SIZE):
    ds.extend(
        search_client.upload_documents(docs[batch_start:batch_start + UPLOAD_BATCH_SIZE])
    )

# upload_documents reports per-document outcomes in its return value; surface
# the failures instead of silently discarding them.
failed_uploads = [result for result in ds if not result.succeeded]
for result in failed_uploads:
    print(f"✗ Failed to index document {result.key}: {result.error_message}")
print(f"indexed {len(ds) - len(failed_uploads)} of {len(docs)} documents")

indexing documents
Error processing chunk 1 of Goeckenjan_2019_FluchtalsSicherheitsproblem_converted: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 8725 tokens (8725 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Chunk length: 27513 characters
Error processing chunk 2 of Goeckenjan_2019_FluchtalsSicherheitsproblem_converted: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 8408 tokens (8408 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}
Chunk length: 25783 characters
Error processing chunk 1 of Goeckenjan_2019_FluchtalsSicherheitsproblem_converted: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, howev