# Creación de índice en Azure AI Search

### Definir un Azure AI Search índice

In [1]:
#%pip install azure-search-documents
#%pip install azure-identity

import os
import dotenv

from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchIndex,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    SemanticSearch
)


In [2]:
def delete_index(search_index_client: SearchIndexClient, search_index: str):
    print(f"deleting index {search_index}")
    search_index_client.delete_index(search_index)

In [None]:
def create_index_definition(name: str) -> SearchIndex:
    """
    Returns an Azure Cognitive Search index with the given name.
    The index includes a vector search with the default HNSW algorithm
    """
    # The fields we want to index. The "embedding" field is a vector field that will
    # be used for vector search.
    fields=[
            SimpleField(name="id", type=SearchFieldDataType.String, key=True),
            SearchableField(name="title", type=SearchFieldDataType.String, searchable=True, filterable=True, facetable=True),
            SearchableField(name="content", type=SearchFieldDataType.String, searchable=True),
            SearchableField(name="keywords", type=SearchFieldDataType.String, searchable=True, filterable=True, facetable=True),
            SearchField(
                name="embedding",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                # Size of the vector created by the text-embedding-3-small model.
                vector_search_dimensions=1536,
                vector_search_profile_name="myHnswProfile",
            ),
        ]

    # The "content" field should be prioritized for semantic ranking.
    semantic_config = SemanticConfiguration(
        name="default",
        prioritized_fields=SemanticPrioritizedFields(
            content_fields=[SemanticField(field_name="content")],
            title_field=SemanticField(field_name="title"),
            keywords_fields=[SemanticField(field_name="keywords")],
        ),
    )

    # For vector search, we want to use the HNSW (Hierarchical Navigable Small World)
    # algorithm (a type of approximate nearest neighbor search algorithm) with cosine
    # distance.
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw")
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
            )
        ]
    )

    # Create the semantic settings with the configuration
    semantic_search = SemanticSearch(configurations=[semantic_config])

    # Create the search index.
    index = SearchIndex(
        name=name,
        fields=fields,
        semantic_search=semantic_search,
        vector_search=vector_search,
    )

    return index

In [None]:
def create_index_definition_vs(name: str) -> SearchIndex:
    """
    Returns an Azure Cognitive Search index with the given name.
    This version includes both HNSW and exhaustive KNN algorithms for vector search.
    """
    # The fields we want to index. The "embedding" field is a vector field that will
    # be used for vector search.
    fields=[
            SimpleField(name="id", type=SearchFieldDataType.String, key=True),
            SearchableField(name="title", type=SearchFieldDataType.String, searchable=True, filterable=True, facetable=True),
            SearchableField(name="content", type=SearchFieldDataType.String, searchable=True),
            SearchableField(name="keywords", type=SearchFieldDataType.String, searchable=True, filterable=True, facetable=True),
            SearchField(
                name="embedding",
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                # Size of the vector created by the text-embedding-3-small model.
                vector_search_dimensions=1536,
                vector_search_profile_name="myHnswProfile",
            ),
        ]

    # The "content" field should be prioritized for semantic ranking.
    semantic_config = SemanticConfiguration(
        name="default",
        prioritized_fields=SemanticPrioritizedFields(
            content_fields=[SemanticField(field_name="content")],
            title_field=SemanticField(field_name="title"),
            keywords_fields=[SemanticField(field_name="keywords")],
        ),
    )

    # For vector search, we want to use the HNSW (Hierarchical Navigable Small World)
    # algorithm (a type of approximate nearest neighbor search algorithm) with cosine
    # distance.
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    m=4,
                    ef_construction=400,
                    ef_search=500,
                    metric=VectorSearchAlgorithmMetric.COSINE,
                ),
            ),
            ExhaustiveKnnAlgorithmConfiguration(
                name="myExhaustiveKnn",
                kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
                parameters=ExhaustiveKnnParameters(
                    metric=VectorSearchAlgorithmMetric.COSINE
                ),
            ),
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
            ),
            VectorSearchProfile(
                name="myExhaustiveKnnProfile",
                algorithm_configuration_name="myExhaustiveKnn",
            ),
        ],
    )

    # Create the semantic settings with the configuration
    semantic_search = SemanticSearch(configurations=[semantic_config])

    # Create the search index.
    index = SearchIndex(
        name=name,
        fields=fields,
        semantic_search=semantic_search,
        vector_search=vector_search,
    )

    return index

### Crear el índice en Azure AI Search

In [5]:
#!%az login --scope https://search.azure.com/.default

In [27]:
dotenv.load_dotenv()

azure_credential = DefaultAzureCredential()
azure_search_endpoint = os.getenv("AZURE_AI_SEARCH_ENDPOINT")

search_index_client = SearchIndexClient(
    endpoint=azure_search_endpoint,
    credential=azure_credential,
)
index_name = "my-index"

delete_index(search_index_client, index_name)
index = create_index_definition(index_name) # Creates an index with default vector search algorithm
# index = create_index_definition_vs(index_name) # creates an index with two vector search algorithms
print(f"creating index {index_name}")
search_index_client.create_or_update_index(index)
print(f"index {index_name} created")

deleting index my-index
creating index my-index
index my-index created


### Indexar un documento en Azure AI Search

In [12]:
import openai
from azure.search.documents import SearchClient
from typing import Dict, List

dotenv.load_dotenv()


True

In [8]:
content_input = "Hoja de vida: Lionel Messi. Futbolista argentino, considerado uno de los mejores jugadores de fútbol de todos los tiempos."

In [25]:
def gen_index_document() -> List[Dict[str, any]]:
    
    openai_client = openai.OpenAI(
    base_url="https://models.inference.ai.azure.com",
    api_key=os.environ["GITHUB_TOKEN"]
    )
    MODEL_NAME = "text-embedding-3-small"

    content_input = [{
        "Title": "Lionel Messi",
        "Content": "Hoja de vida: Lionel Messi. Futbolista argentino, considerado uno de los mejores jugadores de fútbol de todos los tiempos."
    },
    {
        "Title": "Diego Zumárraga Mera",
        "Content": "Hoja de Vida: Diego Zumárraga Mera. Ingeniero de software con experiencia en desarrollo de aplicaciones web y móviles, apasionado por la inteligencia artificial y el aprendizaje automático."
    }]
    
    items = []
    for ix, item in enumerate(content_input):
        content = item["Content"]
        print(f"Processing item {ix}: {content}")
        embeddings_response = openai_client.embeddings.create(
            model=MODEL_NAME,
            input=content,
        )
        embedding = embeddings_response.data[0].embedding
        print(len(embedding))
        print(embedding)

        items.append({
            "id": f"doc-{ix}",
            "title": item["Title"],
            "content": content,
            "keywords": item["Title"].replace(" ", ", "), # split the title into keywords
            "embedding": embedding
        })

    return items

In [26]:
docs = gen_index_document()
print(f"Generated {len(docs)} documents for indexing.")

Processing item 0: Hoja de vida: Lionel Messi. Futbolista argentino, considerado uno de los mejores jugadores de fútbol de todos los tiempos.
1536
[0.01874757558107376, -0.014915247447788715, -0.011708425357937813, 0.006682347971946001, 0.005184656009078026, -0.010924339294433594, -0.005924691911786795, 0.03307255730032921, -0.014457129873335361, 0.0005310748820193112, -0.022694434970617294, 0.018959015607833862, -0.004288243595510721, -0.07058532536029816, 0.01264228019863367, 0.04563554748892784, -0.04767945408821106, 0.022588714957237244, 0.012677519582211971, -0.003032825654372573, 0.011109348386526108, 0.021901538595557213, -0.014897627755999565, 0.014959297142922878, 0.06618035584688187, -0.0017619902500882745, -0.012977058067917824, -0.01072171051055193, -0.01226345170289278, 0.013514464721083641, 0.052190151065588, -0.013796383515000343, 0.0053035905584692955, -0.009016985073685646, -0.02380448766052723, -0.020403847098350525, 0.025566477328538895, -0.013250166550278664, -0.001

In [28]:
print(f"indexing documents")

# Upload our data to the index.
search_client = SearchClient(
    endpoint=azure_search_endpoint,
    index_name=index_name,
    credential=DefaultAzureCredential(),
)
print(f"uploading {len(docs)} documents to index {index_name}")
ds = search_client.upload_documents(docs)

indexing documents
uploading 2 documents to index my-index
