## Import libraries, load configuration variables and create clients

In [None]:
#%pip install azure-ai-documentintelligence
#%pip install langchain
#%pip install python-dotenv
#%pip install tiktoken
#%pip install openai
#%pip install azure-search-documents
#%pip install pg8000

In [1]:
# Import libraries
import os
import sys
import requests
import json
import time
import pandas as pd

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import DocumentContentFormat
from langchain.text_splitter import TokenTextSplitter

sys.path.append(os.path.abspath('../..'))
from common_utils import *

# Load Azure OpenAI and AI Search variables and create clients
# (load_config is expected to come from the common_utils star import above)
openai_config, ai_search_config = load_config()

# Load Document Intelligence configuration from environment variables
doc_intel_endpoint = os.getenv("DOC_INTEL_ENDPOINT")
doc_intel_key = os.getenv("DOC_INTEL_KEY")

# Fail fast with an explicit message when a setting is missing or empty,
# instead of handing None to the credential/client constructors, which
# produces an error that does not mention the environment variable.
_missing_doc_intel_settings = [
    env_name
    for env_name, env_value in (
        ("DOC_INTEL_ENDPOINT", doc_intel_endpoint),
        ("DOC_INTEL_KEY", doc_intel_key),
    )
    if not env_value
]
if _missing_doc_intel_settings:
    raise RuntimeError(
        "Missing Document Intelligence configuration; set environment variable(s): "
        + ", ".join(_missing_doc_intel_settings)
    )

# AzureKeyCredential is presumably re-exported by common_utils -- TODO confirm
doc_intel_client = DocumentIntelligenceClient(endpoint=doc_intel_endpoint, credential=AzureKeyCredential(doc_intel_key))
print(f'doc_intel_endpoint: {doc_intel_endpoint}')

# Token budget constants (presumably consumed by the TokenTextSplitter
# imported above; the usage is outside this section)
MAX_TOKENS = 512
OVERLAP_TOKENS = MAX_TOKENS // 4  # 25% of MAX_TOKENS -> 128 tokens

aoai_endpoint: https://openai-asc-swit-north.openai.azure.com/
aoai_deployment_name: gpt-4o
oai_embedding_model: ada
aoai_rerank_model: gpt-4o-mini
ai_search_index_name_regs: rag-index-regs
ai_search_index_name_docs: rag-index-docs
doc_intel_endpoint: https://doc-intel-asc.cognitiveservices.azure.com/


In [2]:
# Create AI Search index
def create_index(index_name, analyzer_name=None):
    """Create or update an Azure AI Search index with vector and semantic search.

    The index holds two filterable identifier fields (``doc_id`` and the key
    ``chunk_id``), seven searchable text fields, and two embedding vector
    fields that share one HNSW profile. A semantic configuration named
    ``semantic-config`` prioritizes ``title``, ``description`` and ``content``.

    Args:
        index_name: Name of the index to create or update.
        analyzer_name: Optional analyzer name (for example ``"es.microsoft"``)
            applied to every searchable text field. When ``None`` (the
            default) no analyzer is passed, exactly as before this parameter
            existed.

    Returns:
        None. The name reported back by the service is printed.
    """
    # Create an Azure AI Search index client
    index_client = SearchIndexClient(
        endpoint=ai_search_config["ai_search_endpoint"],
        credential=ai_search_config["ai_search_credential"],
    )

    # Names are listed in the order the fields appear in the index schema.
    searchable_text_fields = (
        "title",
        "content",
        "description",
        "author",
        "content_id",
        "creation_date",
        "update_date",
    )
    embedding_fields = ("embeddingTitle", "embeddingContent")

    # Only forward the analyzer when one was requested, so the default call
    # passes exactly the same arguments as the original implementation.
    analyzer_kwargs = {} if analyzer_name is None else {"analyzer_name": analyzer_name}

    # Fields definition
    fields = [
        SimpleField(name="doc_id", type=SearchFieldDataType.String, filterable=True),
        SimpleField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
        *[
            SearchableField(name=field_name, type=SearchFieldDataType.String, **analyzer_kwargs)
            for field_name in searchable_text_fields
        ],
        *[
            SearchField(
                name=field_name,
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=EMBEDDINGS_DIMENSIONS,
                vector_search_profile_name="myHnswProfile",
            )
            for field_name in embedding_fields
        ],
    ]

    # Configure the vector search: one HNSW algorithm (cosine metric) exposed
    # through the profile referenced by both embedding fields.
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    m=4,
                    ef_construction=400,
                    ef_search=500,
                    metric=VectorSearchAlgorithmMetric.COSINE,
                ),
            )
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
            )
        ],
    )

    # Semantic ranker configuration
    semantic_config = SemanticConfiguration(
        name="semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            title_field=SemanticField(field_name="title"),
            content_fields=[SemanticField(field_name="description"), SemanticField(field_name="content")],
        ),
    )

    # Create the semantic settings with the configuration
    semantic_search = SemanticSearch(configurations=[semantic_config])

    # Create the search index with the semantic settings; create_or_update_index
    # also accepts an index name that already exists.
    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
    result = index_client.create_or_update_index(index)
    print(f"Index '{result.name}' created")




In [3]:
# Create the "cms_index" search index (create_index uses create_or_update_index,
# so re-running this cell updates the index if it already exists)
create_index("cms_index")

Index 'cms_index' created
