## Vector Search on Documents: Azure Cognitive Search via Python SDK

### Load environment variables

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

acs_key  = os.getenv("COGNITIVE_SEARCH_KEY")
if acs_key is None or acs_key == "":
    print("COGNITIVE_SEARCH_KEY environment variable not set.")
    exit()

aoai_key  = os.getenv("AZURE_OPENAI_KEY")
if aoai_key is None or aoai_key == "":
    print("AZURE_OPENAI_KEY environment variable not set.")
    exit()
    
index_name = 'doc-sample'
acs_endpoint = 'https://cogsearch02.search.windows.net'
acs_index_definition = 'index_definition/index_definition_text.json'
acs_api_version = '2023-07-01-Preview'
aoai_endpoint = 'https://azure-openai-dnai.openai.azure.com'
aoai_api_version = '2023-08-01-preview'
aoai_embedding_deployed_model = 'embedding-ada'

### Helper Methods

In [2]:
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
    SemanticSettings,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchIndex
)

def get_index_client() -> SearchIndexClient:
    return SearchIndexClient(acs_endpoint, AzureKeyCredential(acs_key))

def create_index(index_name, fields, vector_search, semantic_title_field_name, semantic_content_field_names):
    semantic_settings = SemanticSettings(
        configurations=[SemanticConfiguration(
            name='default',
            prioritized_fields=PrioritizedFields(
                title_field=SemanticField(field_name=semantic_title_field_name), prioritized_content_fields=[SemanticField(field_name=field_name) for field_name in semantic_content_field_names]))])
    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_settings=semantic_settings)
    index_client = get_index_client()
    return index_client.create_index(index)

### Create ACS Index

In [8]:
name = 'doc-sample'
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="vector_config",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 1000,
                "metric": "cosine"
            }
        )
    ]
)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="chunk_content", type=SearchFieldDataType.String),
    SearchField(name="chunk_content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=1536, vector_search_configuration="vector_config")   
]

index = create_index(index_name, fields, vector_search=vector_search, semantic_title_field_name="chunk_content", semantic_content_field_names=["chunk_content"])

HttpResponseError: (ResourceNameAlreadyInUse) Cannot create index 'doc-sample' because it already exists.
Code: ResourceNameAlreadyInUse
Message: Cannot create index 'doc-sample' because it already exists.
Exception Details:	(CannotCreateExistingIndex) Cannot create index 'doc-sample' because it already exists.
	Code: CannotCreateExistingIndex
	Message: Cannot create index 'doc-sample' because it already exists.

### Chunk Document

In [5]:
from PyPDF2 import PdfReader
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter

pdf_reader = PdfReader('../../data/docs/employee_handbook.pdf')
pages = [page.extract_text() for page in pdf_reader.pages]
text = " ".join(pages)

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
chunks = text_splitter.split_text(text)

df = pd.DataFrame(chunks, columns=["chunk_content"])

print(df.head())

                                       chunk_content
0  Contoso Electronics \nEmployee Handbook  \n \n...
1  edge systems that are both reliable and effici...
2  edge systems that are both reliable and effici...
3  customers.  \n \nCompany Values:  \n1. Quality...
4  we work and live.  \nPerformance Reviews  \n \...


### Create embeddings

In [6]:
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas as pd
import json

openai.api_type = "azure"
openai.api_key = aoai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version

df['chunk_content_vector'] = df['chunk_content'].apply(lambda x : get_embedding(x, engine = aoai_embedding_deployed_model)) 

df['id'] = df.index

print(df.head())

                                       chunk_content  \
0  Contoso Electronics \nEmployee Handbook  \n \n...   
1  edge systems that are both reliable and effici...   
2  edge systems that are both reliable and effici...   
3  customers.  \n \nCompany Values:  \n1. Quality...   
4  we work and live.  \nPerformance Reviews  \n \...   

                                chunk_content_vector  id  
0  [-0.013424188829958439, 0.008336983621120453, ...   0  
1  [-0.007864218205213547, 0.003030280815437436, ...   1  
2  [-0.010799335315823555, 0.003672731574624777, ...   2  
3  [-0.018283184617757797, -0.002287083538249135,...   3  
4  [-0.01662578247487545, -6.200416828505695e-05,...   4  


### Ingest to Azure Cognitive Search

This cell works because the dataframe and the ACS Index both have same columns. If the dataframe doesn't have the same columns (column names or numbers) as the ACS Index, add a preprocessing step to it to structure the dataframe according to the ACS columns.

In [9]:
import requests
import json
from azure.search.documents import SearchClient  

batch_size = 10
total_records = df.shape[0]
fields = df.columns.to_numpy()
df['id'] = df['id'].astype(str)

records = []

for index, row in df.iterrows():
    record = {}
    for field in fields:
            record[field] = row[field]

    records.append(
        record
    )

    if index % batch_size == 0 or (index+1 == total_records):
        search_client = SearchClient(acs_endpoint, index_name, AzureKeyCredential(acs_key))
        result = search_client.upload_documents(documents=records)
        records = []

### Perform a vector similarity search

In [11]:
import matplotlib.pyplot as plt
from PIL import Image
from azure.search.documents.models import Vector  

query = 'when are performance review announced?'
query_vector = get_embedding(query, engine = aoai_embedding_deployed_model)

search_client = SearchClient(acs_endpoint, index_name, AzureKeyCredential(acs_key))
vector = Vector(value=query_vector, k=3, fields="chunk_content_vector")  

results = search_client.search(  
    search_text=None,  
    vectors=[vector],
    select=["chunk_content"]  
)   

for result in results:
    print(result['chunk_content'])
    print("\n")

we work and live.  
Performance Reviews  
 
Performance Reviews at Contoso Electronics  
 
At Contoso Electronics, we strive to ensure our employees are getting the feedback they 
need to continue growing and developing in their roles. We understand that performance 
reviews are a key part of this process and it is important to us that they are conducted in an 
effective and efficient manner.  
 
Performance reviews are conducted annually a nd are an important part of your career 
development. During the review, your supervisor will discuss your performance over the 
past year and provide feedback on areas for improvement. They will also provide you with 
an opportunity to discuss your goals and  objectives for the upcoming year.  
 
Performance reviews are a two -way dialogue between managers and employees. We 
encourage all employees to be honest and open during the review process, as it is an 
important opportunity to discuss successes and challenges  in the workplace.


encourage a