# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This code demonstrates how to use Azure Cognitive Search with Azure OpenAI and the Azure SDK for Python.

Author: [Vlad Feigin - Cloud Solution Architect - Microsoft](<https://www.linkedin.com/in/vladifeigin/>)



## Import required libraries and environment variables

In [10]:
! pip install azure-search-documents --pre --upgrade
! pip install azure-identity




In [11]:
# Import required libraries
import os
import json
from openai import AzureOpenAI
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from tenacity import retry, wait_random_exponential, stop_after_attempt
from langchain.document_loaders import PyPDFLoader


# Configure environment variables: load_dotenv() reads a .env file (if one is
# found) so the os.getenv() lookups in the configuration cell can see its values
load_dotenv()

True

In [12]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
)

In [13]:
# --- Azure Cognitive Search settings (read from the environment) ---
index_name = "mydocs"
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# --- Azure OpenAI settings (read from the environment) ---
azure_openai_api_version = "2024-02-15-preview"
azure_openai_key = os.environ.get("OPENAI_API_KEY")
azure_openai_endpoint = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
azure_openai_embedding_deployment = os.environ.get("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")

# Azure AD bearer-token provider; it is handed to the client below only when
# no API key was found in the environment
openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    openai_credential,
    "https://cognitiveservices.azure.com/.default",
)

# Azure OpenAI client used to create embeddings in the cells that follow
client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_embedding_deployment,
    api_version=azure_openai_api_version,
    api_key=azure_openai_key,
    azure_ad_token_provider=None if azure_openai_key else token_provider,
)

# Key credential used by the Azure Cognitive Search clients
credential = AzureKeyCredential(key)

## Create embeddings
Read your data, generate OpenAI embeddings, and export them in a format that can be inserted into your Azure Cognitive Search index:

## Prepare data for loading into Azure Cognitive Search - DO THIS ONLY ONCE !!!

In [14]:
doc_title = "Semantic Kernel"
# Load the PDF and split it into page-level chunks
fileName = "./data/semantic-kernel.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split()
print("Number of pages: ", len(pages))

# Every page shares the same title, so embed it once up front instead of
# issuing an identical embeddings request for each page inside the loop
title_vector = client.embeddings.create(
    input=doc_title, model=azure_openai_embedding_deployment
).data[0].embedding

# Build one search document per page: id, title, content and both vectors
doc_with_vector_list = []
for doc_id, page in enumerate(pages):
    content_vector = client.embeddings.create(
        input=page.page_content, model=azure_openai_embedding_deployment
    ).data[0].embedding
    doc_with_vector_list.append(
        {
            'id': str(doc_id),  # the document key is stored as a string
            'title': doc_title,
            'titleVector': title_vector,
            'content': page.page_content,
            'contentVector': content_vector,
        }
    )

# Write the documents and their embeddings to sk_Vectors1.json
with open("./sk_Vectors1.json", "w") as f:
    json.dump(doc_with_vector_list, f)

Number of pages:  187


## Create search index - DO THIS ONLY ONCE !!!
Create your search index schema and vector search configuration:

In [15]:
# Create a search index
# Note: the Cognitive Search resource must already exist; its endpoint and
# admin key are read in the configuration cell
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Both vector fields share the same element type, dimensions and profile
vector_field_type = SearchFieldDataType.Collection(SearchFieldDataType.Single)

# Index schema: a string key, three searchable text fields, two vector fields
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    *[
        SearchField(
            name=vector_field_name,
            type=vector_field_type,
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="myHnswProfile",
        )
        for vector_field_name in ("titleVector", "contentVector")
    ],
]

# Vector search configuration: one HNSW algorithm and one profile that uses it
hnsw_algorithm = HnswAlgorithmConfiguration(name="myHnsw")
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)
vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[hnsw_profile])

# Semantic configuration: which fields act as title, keywords and content
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="title"),
    keywords_fields=[SemanticField(field_name="category")],
    content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create (or update) the index with the vector and semantic settings
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 mydocs created


## Insert text and embeddings into vector store - DO THIS ONLY ONCE !!
Add texts and metadata from the JSON data to the vector store:

In [16]:
# Upload documents to the index
with open('./sk_Vectors1.json', 'r') as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one indexing result per document; inspect the
# per-document `succeeded` flag instead of assuming every upload worked
failed_keys = [item.key for item in result if not item.succeeded]
if failed_keys:
    print(f"Failed to upload {len(failed_keys)} documents (ids: {failed_keys})")
print(f"Uploaded {len(documents) - len(failed_keys)} documents")

Uploaded 187 documents


## Perform a vector similarity search

In [17]:
from azure.search.documents.models import VectorizedQuery

# Pure Vector Search
query = "what is semantic kernel?"

# Embed the query text with the configured embedding deployment
embedding_response = client.embeddings.create(
    input=query,
    model=azure_openai_embedding_deployment,
)
embedding = embedding_response.data[0].embedding

# Ask for the 3 nearest neighbours on the contentVector field
vector_query = VectorizedQuery(
    vector=embedding,
    k_nearest_neighbors=3,
    fields="contentVector",
)

# search_text=None: no text query is sent alongside the vector query
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["title", "content", "category"],
)

# Print each hit as four lines followed by a blank line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}",
        f"Category: {result['category']}\n",
        sep="\n",
    )

Title: Semantic Kernel
Score: 0.8932474
Content: Tell us about y our PDF experience.
What is Semantic Kernel?
Article •07/11/2023
Semantic K ernel is an open-source SDK that lets you easily combine AI services like
OpenAI , Azure OpenAI , and Hugging F ace  with conventional programming
languages like C# and Python. By doing so, you can create AI apps that combine the
best of both worlds.
During K evin Scott's talk The era of the AI Copilot , he showed how Microsoft powers its
Copilot system  with a stack of AI models and plugins. At the center of this stack is an AI
orchestration layer that allows us to combine AI models and plugins together to create
brand new experiences for users.
Semantic Kernel is at the center of the copilot
stack
Category: None

Title: Semantic Kernel
Score: 0.8851806
Content: Additional learning for Semantic Kernel
Article •07/11/2023
Want to learn more about Semantic K ernel? Check out these in-depth tutorials and
videos. W e will add more content over time f

In [18]:
# Pure vector search for a second query, limited to the 2 nearest neighbours
query = "semantic kernel planner and kernel"

# Embed the query text with the configured embedding deployment
embedding_response = client.embeddings.create(
    input=query,
    model=azure_openai_embedding_deployment,
)
embedding = embedding_response.data[0].embedding
vector_query = VectorizedQuery(
    vector=embedding,
    k_nearest_neighbors=2,
    fields="contentVector",
)

# search_text=None: no text query is sent alongside the vector query
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["title", "content"],
)

# Print title, score and content of each hit, one value per line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}",
        sep="\n",
    )

Title: Semantic Kernel
Score: 0.87037426
Content: To simplify the creation of AI apps, open source projects like LangChain  have
emerged. Semantic K ernel is Microsoft's contribution to this space and is designed to
support enterprise app developers who want to integrate AI into their existing apps.
By using multiple AI models, plugins, and memory all together within Semantic K ernel,
you can create sophisticated pipelines that allow AI to automate complex tasks for users.
For example, with Semantic K ernel, you could create a pipeline that helps a user send an
email to their marketing team. With memory , you could retrieve information about the
project and then use planner  to autogenerate the remaining steps using available
plugins (e.g., ground the user's ask with Microsoft Graph data, generate a response with
GPT-4, and send the email). Finally, you can display a success message back to your user
in your app using a custom plugin.
Step Component Descr iption
1 Ask It starts with a 

## Perform a Hybrid Search

In [19]:
# Hybrid Search: the same query is sent both as text and as a vector
query = "semantic kernel planner and kernel"

# Embed the query text with the configured embedding deployment
embedding_response = client.embeddings.create(
    input=query,
    model=azure_openai_embedding_deployment,
)
embedding = embedding_response.data[0].embedding
vector_query = VectorizedQuery(
    vector=embedding,
    k_nearest_neighbors=3,
    fields="contentVector",
)

# Passing search_text together with vector_queries makes the request hybrid;
# top=3 caps the number of results returned
results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["title", "content", "category"],
    top=3,
)

# Print each hit as four lines followed by a blank line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}",
        f"Category: {result['category']}\n",
        sep="\n",
    )

Title: Semantic Kernel
Score: 0.03306011110544205
Content: To simplify the creation of AI apps, open source projects like LangChain  have
emerged. Semantic K ernel is Microsoft's contribution to this space and is designed to
support enterprise app developers who want to integrate AI into their existing apps.
By using multiple AI models, plugins, and memory all together within Semantic K ernel,
you can create sophisticated pipelines that allow AI to automate complex tasks for users.
For example, with Semantic K ernel, you could create a pipeline that helps a user send an
email to their marketing team. With memory , you could retrieve information about the
project and then use planner  to autogenerate the remaining steps using available
plugins (e.g., ground the user's ask with Microsoft Graph data, generate a response with
GPT-4, and send the email). Finally, you can display a success message back to your user
in your app using a custom plugin.
Step Component Descr iption
1 Ask It start

## Perform a Semantic Hybrid Search

In [None]:
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
# Semantic Hybrid Search: text + vector query, re-ranked with the semantic
# configuration defined on the index, with extractive captions and answers
query = "semantic kernel planner and kernel"

embedding = client.embeddings.create(input=query, model=azure_openai_embedding_deployment).data[0].embedding
vector_query = VectorizedQuery(vector=embedding, k_nearest_neighbors=3, fields="contentVector")

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["title", "content", "category"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=3
)

# get_answers() may return None when the service produced no semantic answer;
# fall back to an empty list so the loop is skipped instead of raising TypeError
semantic_answers = results.get_answers()
for answer in semantic_answers or []:
    # Prefer the highlighted form of the answer when one is available
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['content']}")
    print(f"Category: {result['category']}")

    # Captions can be missing or empty; print only the first one when present
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")