## Vector Search on Documents: Azure Cognitive Search via Python SDK

### Load environment variables

In [None]:
import os
from dotenv import load_dotenv

def _require_env(name):
    """Return the value of environment variable *name*.

    Raises:
        SystemExit: if the variable is unset or empty. The message is
            passed to ``SystemExit`` so the interpreter reports it on
            stderr and stops with a non-zero status, instead of the
            status 0 that a bare ``exit()`` call produces.
    """
    value = os.getenv(name)
    if not value:
        raise SystemExit(f"{name} environment variable not set.")
    return value


# Run python-dotenv's loader before reading the keys below.
load_dotenv()

# API keys are mandatory: stop here if either one is missing.
acs_key = _require_env("COGNITIVE_SEARCH_KEY")
aoai_key = _require_env("AZURE_OPENAI_KEY")

# Azure Cognitive Search settings.
index_name = 'doc-sample'
acs_endpoint = 'https://cogsearch02.search.windows.net'
acs_index_definition = 'index_definition/index_definition_text.json'
acs_api_version = '2023-07-01-Preview'

# Azure OpenAI settings.
aoai_endpoint = 'https://azure-openai-dnai.openai.azure.com'
aoai_api_version = '2023-08-01-preview'
aoai_embedding_deployed_model = 'embedding-ada'

### Helper methods

In [None]:
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
    SemanticSettings,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchIndex
)

def get_index_client() -> SearchIndexClient:
    """Build a ``SearchIndexClient`` from the module-level ``acs_endpoint`` and ``acs_key``."""
    credential = AzureKeyCredential(acs_key)
    return SearchIndexClient(acs_endpoint, credential)

def create_index(index_name, fields, vector_search, semantic_title_field_name, semantic_content_field_names):
    """Create a search index with a single semantic configuration named 'default'.

    Args:
        index_name: Name given to the new index.
        fields: Field definitions passed through to ``SearchIndex``.
        vector_search: Vector search configuration passed through to ``SearchIndex``.
        semantic_title_field_name: Field used as the semantic title field.
        semantic_content_field_names: Iterable of field names used as the
            prioritized semantic content fields, in the given order.

    Returns:
        Whatever ``SearchIndexClient.create_index`` returns for the new index.
    """
    title_field = SemanticField(field_name=semantic_title_field_name)
    content_fields = []
    for content_field_name in semantic_content_field_names:
        content_fields.append(SemanticField(field_name=content_field_name))

    prioritized = PrioritizedFields(
        title_field=title_field,
        prioritized_content_fields=content_fields,
    )
    default_configuration = SemanticConfiguration(
        name='default',
        prioritized_fields=prioritized,
    )

    search_index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_settings=SemanticSettings(configurations=[default_configuration]),
    )
    return get_index_client().create_index(search_index)

### Create ACS index

In [None]:
# NOTE(review): `name` is not referenced by the visible cells; the
# create_index call below uses `index_name` instead.
name = 'doc-sample'

# Index schema: a string key, the searchable chunk text, and its embedding
# vector bound to the "vector_config" algorithm configuration defined below.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="chunk_content", type=SearchFieldDataType.String),
    SearchField(
        name="chunk_content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # presumably the output size of the embedding model -- TODO confirm
        vector_search_dimensions=1536,
        vector_search_configuration="vector_config",
    ),
]

# HNSW settings used by the "vector_config" algorithm configuration.
hnsw_parameters = {
    "m": 4,
    "efConstruction": 400,
    "efSearch": 1000,
    "metric": "cosine",
}
hnsw_configuration = HnswVectorSearchAlgorithmConfiguration(
    name="vector_config",
    kind="hnsw",
    parameters=hnsw_parameters,
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_configuration])

# "chunk_content" serves as both the semantic title and the only content field.
index = create_index(
    index_name,
    fields,
    vector_search=vector_search,
    semantic_title_field_name="chunk_content",
    semantic_content_field_names=["chunk_content"],
)

### Load embeddings

In [None]:
# pandas is not imported by any earlier cell, so import it here.
import pandas as pd

# Read the JSON file into a dataframe; judging by the file name it holds
# pre-computed chunk embeddings for the employee handbook (TODO confirm schema).
df = pd.read_json('../../data/docs/employee_handbook_chunk_embeddings.json')

### Ingest into Azure Cognitive Search

This cell works because the dataframe and the ACS index have the same columns. If the dataframe's columns differ from the ACS index fields (in name or in number), add a preprocessing step that restructures the dataframe to match the index fields.

In [None]:
import requests
import json
from azure.search.documents import SearchClient  

batch_size = 10
total_records = df.shape[0]
# Convert the ids to strings before upload.
df['id'] = df['id'].astype(str)

# One dict per dataframe row, keyed by column name. Building the list this
# way avoids reusing the names `index` and `fields` for loop bookkeeping.
records = df.to_dict(orient="records")

# A single client is enough for every batch.
search_client = SearchClient(acs_endpoint, index_name, AzureKeyCredential(acs_key))

# Slice by position so each batch holds up to `batch_size` documents no
# matter what the dataframe's index labels are; the final slice carries
# the remainder.
for batch_start in range(0, total_records, batch_size):
    batch = records[batch_start:batch_start + batch_size]
    result = search_client.upload_documents(documents=batch)

    # Report per-document failures instead of discarding the response.
    failed_keys = [item.key for item in result if not item.succeeded]
    if failed_keys:
        print(f"Failed to upload documents with ids: {failed_keys}")

### Perform a vector similarity search

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
from azure.search.documents.models import Vector  

query = 'when are performance review announced?'
# NOTE(review): `get_embedding` is not defined or imported in the visible
# cells -- confirm where it comes from before running this cell.
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

credential = AzureKeyCredential(acs_key)
search_client = SearchClient(acs_endpoint, index_name, credential)

# Vector query against the "chunk_content_vector" field with k=3.
vector = Vector(value=query_vector, k=3, fields="chunk_content_vector")

# Vector-only query (no search text) that selects just the chunk text.
results = search_client.search(
    search_text=None,
    vectors=[vector],
    select=["chunk_content"],
)

# Print each hit followed by the same blank-line spacing as two print calls.
for hit in results:
    print(hit['chunk_content'], end="\n\n\n")