# Azure AI Search con Embeddings Pre-calculados

Este notebook utiliza los datos generados en el módulo anterior (04-document-ingestion) que ya incluyen:
- Embeddings vectoriales
- Categorías
- Tags

Aquí NO necesitamos generar embeddings durante la ingesta: simplemente indexamos los vectores que ya existen.


In [None]:
from __future__ import annotations

import json
import os
import pathlib
import time
from typing import Any, Dict, List

from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceExistsError
from azure.storage.blob import BlobServiceClient

from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    IndexingParameters,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SimpleField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
)
from dotenv import load_dotenv
from openai import AzureOpenAI

## 1. Configuración


In [None]:
# Read settings from the local .env file (override=True is passed to
# load_dotenv) before any of the lookups below.
load_dotenv(override=True)

# Azure AI Search endpoint and key.
AZURE_SEARCH_ENDPOINT = os.environ.get("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.environ.get("AZURE_SEARCH_KEY")

# Azure Blob Storage: connection string, container and blob name prefix.
AZURE_STORAGE_CONNECTION_STRING = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
BLOB_CONTAINER_NAME = os.environ.get(
    "AZURE_STORAGE_CONTAINER",
    "documents-precomputed-vectors",
)
BLOB_PREFIX = "json-chunks"

# Names of the search resources this notebook creates.
INDEX_NAME = "chunks-precomputed-vectors-index"
DATA_SOURCE_NAME = "chunks-blob-datasource"
INDEXER_NAME = "chunks-blob-indexer"
VECTOR_PROFILE_NAME = "chunksProfile"
ALGORITHM_NAME = "chunksHnsw"

# JSON file written by the previous module (04-document-ingestion).
DATA_FILE = pathlib.Path("..", "04-document-ingestion", "data", "rag_ingested_chunks.json")

# Echo the effective settings, one per line.
print(
    "Configuración cargada",
    f"  - Archivo de datos: {DATA_FILE}",
    f"  - Índice: {INDEX_NAME}",
    f"  - Contenedor: {BLOB_CONTAINER_NAME}",
    sep="\n",
)


## 2. Inicializar Clientes


In [None]:
# Fail fast when a setting used by this notebook is missing. The list covers
# every variable read below through os.environ[...], so a gap is reported
# with one clear message instead of a bare KeyError further down.
required_vars = [
    "AZURE_SEARCH_ENDPOINT",
    "AZURE_SEARCH_KEY",
    "AZURE_STORAGE_CONNECTION_STRING",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_API_VERSION",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_ENDPOINT_US",
    "AZURE_OPENAI_DEPLOYMENT_EMBEDDING",
]

# An empty value is as unusable as an absent one, so both count as missing.
missing = [var for var in required_vars if not os.getenv(var)]
if missing:
    raise EnvironmentError(f"Faltan variables de entorno: {', '.join(missing)}")

# Azure AI Search clients: index management, indexer/data-source management,
# and document queries against INDEX_NAME. All share one key credential.
credential = AzureKeyCredential(AZURE_SEARCH_KEY)
index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=credential)
indexer_client = SearchIndexerClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=credential)
search_client = SearchClient(endpoint=AZURE_SEARCH_ENDPOINT, index_name=INDEX_NAME, credential=credential)

# Azure OpenAI client for chat, bound to the AZURE_OPENAI_ENDPOINT_US endpoint.
chat_client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT_US"],
)

# Azure OpenAI client for embeddings, bound to the default endpoint.
embeddings_client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
)

# Deployment name passed as `model` when embedding queries.
EMBEDDING_DEPLOYMENT = os.environ["AZURE_OPENAI_DEPLOYMENT_EMBEDDING"]

## 3. Cargar Documentos con Embeddings Pre-calculados


In [None]:
# Load the chunk records produced by the previous module. A missing file is
# reported with the configured path so the fix is obvious.
if DATA_FILE.exists():
    with DATA_FILE.open("r", encoding="utf-8") as source:
        documents = json.load(source)
else:
    raise FileNotFoundError(f"No se encuentra el archivo: {DATA_FILE}")

print(f"Cargados {len(documents)} documentos")


## 4. Subir Documentos a Blob Storage


In [None]:
print("\nSubiendo documentos al contenedor de Blob Storage...")
blob_service = BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING)
container_client = blob_service.get_container_client(BLOB_CONTAINER_NAME)

# Create the container on first use; an existing one is reused as-is.
try:
    container_client.create_container()
except ResourceExistsError:
    print(f"  Contenedor '{BLOB_CONTAINER_NAME}' ya existe, se reutiliza")
else:
    print(f"  Contenedor '{BLOB_CONTAINER_NAME}' creado")

# One blob per document, named "<BLOB_PREFIX>/<id>.json" and holding the
# document serialized as UTF-8 JSON. Existing blobs are overwritten.
total_documents = len(documents)
uploaded_count = 0
for position, doc in enumerate(documents, start=1):
    container_client.upload_blob(
        name=f"{BLOB_PREFIX}/{doc['id']}.json",
        data=json.dumps(doc, ensure_ascii=False).encode("utf-8"),
        overwrite=True,
    )
    # Only count a document once its upload has returned.
    uploaded_count = position
    if not uploaded_count % 100:
        print(f"  Subidos {uploaded_count}/{total_documents} documentos...")

print(f"  {uploaded_count} documentos subidos correctamente")


## 5. Crear Índice de Azure AI Search

El índice acepta vectores pre-calculados, por lo que no hace falta configurar un vectorizador para la ingesta.


In [None]:
print("\nCreando índice para vectores pre-calculados...")

# Index schema: a string key, the searchable chunk text, filterable/facetable
# metadata, and the vector field that stores the pre-computed embedding.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
        sortable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SimpleField(
        name="category",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True,
        sortable=True,
    ),
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
        facetable=True,
    ),
    SimpleField(
        name="tags",
        type=SearchFieldDataType.Collection(SearchFieldDataType.String),
        filterable=True,
        facetable=True,
    ),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # NOTE(review): 1536 must equal the length of the vectors stored in
        # the data file -- confirm against the embedding model used upstream.
        vector_search_dimensions=1536,
        vector_search_profile_name=VECTOR_PROFILE_NAME,
    ),
]

# Vector search setup: a single profile that points at a single HNSW
# algorithm configuration with default parameters.
vector_search = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name=VECTOR_PROFILE_NAME,
            algorithm_configuration_name=ALGORITHM_NAME,
        )
    ],
    algorithms=[HnswAlgorithmConfiguration(name=ALGORITHM_NAME)],
)

index = SearchIndex(name=INDEX_NAME, fields=fields, vector_search=vector_search)

# Drop any previous index so it is rebuilt from scratch. The cleanup stays
# best-effort, but a failure is reported instead of being silently discarded
# so the cause is visible if create_index fails right afterwards.
try:
    index_client.delete_index(INDEX_NAME)
except Exception as exc:
    print(f"  Aviso: no se pudo eliminar el índice anterior: {exc}")
else:
    print("  Índice anterior eliminado")

index_client.create_index(index)
print(f"  Índice '{INDEX_NAME}' creado")


## 6. Crear Data Source


In [None]:
print("\nCreando data source del contenedor...")

# Remove the indexer first: it refers to this data source by name. The
# cleanup is best-effort, but a failure is reported rather than hidden.
try:
    indexer_client.delete_indexer(INDEXER_NAME)
except Exception as exc:
    print(f"  Aviso: no se pudo eliminar el indexer anterior: {exc}")

# Point the data source at the blobs uploaded under BLOB_PREFIX.
container = SearchIndexerDataContainer(name=BLOB_CONTAINER_NAME, query=BLOB_PREFIX)
data_source = SearchIndexerDataSourceConnection(
    name=DATA_SOURCE_NAME,
    type="azureblob",
    connection_string=AZURE_STORAGE_CONNECTION_STRING,
    container=container,
    description="Blob container con chunks y embeddings pre-calculados",
)

# Drop any previous definition so the new one is created from scratch.
try:
    indexer_client.delete_data_source_connection(DATA_SOURCE_NAME)
except Exception as exc:
    print(f"  Aviso: no se pudo eliminar el data source anterior: {exc}")

indexer_client.create_data_source_connection(data_source)
print(f"  Data source '{DATA_SOURCE_NAME}' creada")


## 7. Crear Indexer (SIN Skillset)

Como los embeddings ya están calculados, NO necesitamos un skillset.
El indexer simplemente lee los campos del JSON y los mapea al índice.


In [None]:
print("\nCreando indexer (sin skillset, vectores pre-calculados)...")

# No skillset is attached: the indexer is configured with parsingMode "json"
# and reads from DATA_SOURCE_NAME into INDEX_NAME.
indexer = SearchIndexer(
    name=INDEXER_NAME,
    data_source_name=DATA_SOURCE_NAME,
    target_index_name=INDEX_NAME,
    description="Indexer para chunks con embeddings pre-calculados",
    parameters=IndexingParameters(configuration={"parsingMode": "json"}),
)

# Drop any previous indexer with the same name. The cleanup is best-effort,
# but a failure is reported instead of being silently discarded.
try:
    indexer_client.delete_indexer(INDEXER_NAME)
except Exception as exc:
    print(f"  Aviso: no se pudo eliminar el indexer anterior: {exc}")

indexer_client.create_indexer(indexer)
print(f"  Indexer '{INDEXER_NAME}' creado")


## 8. Ejecutar Indexer y Esperar


In [None]:
print("\nEjecutando indexer y esperando a que finalice...")
indexer_client.run_indexer(INDEXER_NAME)

# Poll the indexer status every `poll_interval` seconds until the last run
# succeeds, a failure is reported, or `timeout_seconds` have been waited.
timeout_seconds = 300
waited = 0
poll_interval = 5

while waited < timeout_seconds:
    status = indexer_client.get_indexer_status(INDEXER_NAME)
    last_result = status.last_result
    # Status of the most recent execution; None until one has been recorded.
    run_state = last_result.status if last_result else None

    if run_state == "success":
        print("  Indexer completado exitosamente")
        print(f"    - Documentos procesados: {last_result.item_count}")
        print(f"    - Documentos fallidos: {last_result.failed_item_count}")
        if last_result.failed_item_count > 0:
            # Per-document failures are listed in `errors`; fall back to the
            # run-level message when that list is absent or empty.
            item_errors = getattr(last_result, "errors", None) or []
            if item_errors:
                print("    Errores:")
                for item_error in item_errors:
                    print(f"      - {item_error.key}: {item_error.error_message}")
            else:
                print(f"    Errores: {last_result.error_message}")
        break

    # Treat both the indexer-level "error" state and a failed execution as
    # fatal, so the loop does not keep polling until the timeout.
    # NOTE(review): status values are taken from the service documentation
    # listed in the references -- confirm against the SDK version in use.
    if status.status == "error" or run_state in ("error", "persistentFailure"):
        detail = last_result.error_message if last_result else None
        raise RuntimeError(f"Indexer falló: {detail}")

    if run_state == "transientFailure":
        print(f"  Fallo transitorio: {last_result.error_message}")

    print(f"  Estado: {status.status} (esperados {waited}/{timeout_seconds}s)")
    time.sleep(poll_interval)
    waited += poll_interval
else:
    # The while condition became false without hitting `break`.
    raise TimeoutError("El indexer no completó dentro del tiempo esperado")


## 9. Función auxiliar para generar embeddings de queries


In [None]:
def generate_query_embedding(text: str) -> List[float]:
    """Return the embedding vector for *text*.

    Sends a single input to the embeddings client using the deployment named
    by ``EMBEDDING_DEPLOYMENT`` and returns the embedding of the first (only)
    item in the response.
    """
    embedding_items = embeddings_client.embeddings.create(
        input=text,
        model=EMBEDDING_DEPLOYMENT,
    ).data
    first_item = embedding_items[0]
    return first_item.embedding


print("Función de embedding lista")

## 10. Búsqueda Híbrida (Texto + Vector)


In [None]:
def hybrid_search(query: str, top_k: int = 3) -> None:
    """Run a combined keyword + vector query and print the hits.

    Args:
        query: Text used both as the keyword query and, once embedded, as the
            vector query against the ``embedding`` field.
        top_k: Number of nearest neighbours requested from the vector query
            and maximum number of results requested from the search call.
    """
    print(f"\nBúsqueda híbrida (texto + vector): '{query}'")

    # Vector leg: embed the query text and ask for its top_k nearest neighbours.
    knn_query = VectorizedQuery(
        vector=generate_query_embedding(query),
        k_nearest_neighbors=top_k,
        fields="embedding",
    )

    # Keyword leg and vector leg are sent in the same request.
    hits = search_client.search(
        search_text=query,
        vector_queries=[knn_query],
        select=["id", "content", "category", "tags"],
        top=top_k,
    )

    print("\nResultados:")
    hit_count = 0
    for hit_count, hit in enumerate(hits, start=1):
        # Flatten newlines so each hit's content prints on one line.
        flat_content = hit["content"].replace("\n", " ")
        relevance = hit.get("@search.score", 0.0)
        hit_category = hit.get("category", "N/A")
        hit_tags = hit.get("tags", [])

        print(f"\n{hit_count}. Score: {relevance:.4f}")
        print(f"   ID: {hit['id']}")
        print(f"   Categoría: {hit_category}")
        print(f"   Tags: {', '.join(hit_tags) if hit_tags else 'N/A'}")
        print(f"   Contenido: {flat_content}...")

    # hit_count stays at 0 only when the loop body never ran.
    if hit_count == 0:
        print("  (No se encontraron resultados)")



## 11. Búsqueda con Filtros (por categoría y tags)


In [None]:
def _odata_string_literal(value: str) -> str:
    """Return *value* as a single-quoted OData string literal.

    A single quote inside the value is doubled, so a value such as
    ``O'Brien`` does not terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


def filtered_search(query: str, category: str = None, tags: List[str] = None, top_k: int = 3) -> None:
    """Run a hybrid query restricted by category and/or tags and print the hits.

    Args:
        query: Text used both as the keyword query and, once embedded, as the
            vector query against the ``embedding`` field.
        category: When given, only documents whose ``category`` equals this
            value are returned.
        tags: When given, a document must contain every listed tag in its
            ``tags`` collection (one ``any`` clause per tag, joined by ``and``).
        top_k: Maximum number of results requested from the search call.
    """
    # Build one OData clause per condition; values are escaped so that quotes
    # in them cannot break or alter the filter expression.
    clauses = []
    if category:
        clauses.append(f"category eq {_odata_string_literal(category)}")
    if tags:
        clauses.extend(
            f"tags/any(t: t eq {_odata_string_literal(tag)})" for tag in tags
        )

    filter_str = " and ".join(clauses) if clauses else None

    print(f"\nBúsqueda con filtros: '{query}'")
    if filter_str:
        print(f"   Filtros: {filter_str}")

    # Request a wider candidate pool than top_k from the vector query, and
    # never fewer than top_k itself when a caller asks for more than 50.
    vector_query = VectorizedQuery(
        vector=generate_query_embedding(query),
        k_nearest_neighbors=max(50, top_k),
        fields="embedding",
    )

    results = search_client.search(
        search_text=query,
        vector_queries=[vector_query],
        filter=filter_str,
        select=["id", "content", "category", "tags"],
        top=top_k,
    )

    print("\nResultados:")
    found = False
    for i, result in enumerate(results, 1):
        found = True
        # Flatten newlines so each hit's content prints on one line.
        snippet = result["content"].replace("\n", " ")
        score = result.get("@search.score", 0.0)
        # Distinct names: the `category` and `tags` parameters are not rebound.
        result_category = result.get("category", "N/A")
        result_tags = result.get("tags", [])

        print(f"\n{i}. Score: {score:.4f}")
        print(f"   ID: {result['id']}")
        print(f"   Categoría: {result_category}")
        print(f"   Tags: {', '.join(result_tags) if result_tags else 'N/A'}")
        print(f"   Contenido: {snippet}...")

    if not found:
        print("  (No se encontraron resultados)")

## 12. Ejemplos de búsqueda con filtros


In [None]:
# Two filtered queries: one restricted by category, one restricted by tag.
for example in (
    {"query": "comer", "category": "Bilbao"},
    {"query": "museos", "tags": ["museos"]},
):
    filtered_search(**example)


## 13. Realizar búsquedas híbridas


In [None]:
# Run the unfiltered hybrid search for each sample query, in order.
for sample_query in ("Guggenheim", "deportes"):
    hybrid_search(sample_query)
