# Prepare Azure Cognitive Search with Vector Search

This notebook demonstrates how to use the Azure Cognitive Search SDK to create a search index over the AdventureWorksLT sample database in Azure SQL. The resulting index is used by the Azure SQL promptflow demo.


### Prerequisites

To run the code, you need to install the packages in the requirements.txt file. You can do this by running the following command:

```bash
pip install -r requirements.txt
```

Copyright (c) Microsoft Corporation.
Licensed under the MIT license.

In [1]:
import pandas as pd
import pyodbc
import json
import openai
from tqdm.auto import tqdm
from dotenv import load_dotenv
import os
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
import requests
load_dotenv()

True

In [2]:
# Configure the module-level openai client for the Azure OpenAI resource that
# serves the embedding model; endpoint and key are read from the environment.
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = os.environ.get("OPENAI_API_BASE_Embeddings")
openai.api_key = os.environ.get("OPENAI_API_KEY_AZURE_Embeddings")

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, embedding_model_deploy_id="text-embedding-ada-002"):
    """Return the embedding vector for ``text``.

    Used both for the document fields that get indexed and for query text.

    Args:
        text: The string to embed.
        embedding_model_deploy_id: Name of the embedding deployment passed to
            the API as ``engine``.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.

    The call is wrapped in a retry policy: randomized exponential waits
    (configured with min=1, max=20) and at most 6 attempts.
    """
    result = openai.Embedding.create(
        engine=embedding_model_deploy_id,
        input=text,
    )
    return result['data'][0]['embedding']

In [3]:
# ODBC connection string for the database; must be set in the environment.
connectionString = os.environ["connectionString"]

# SQL QUERY to get products details from the database
# (one row per product, joined with its category and its English description)
sqlQuery="""SELECT PC.Name AS ProductCategoryName, SP.ProductId, SP.Name, SP.ProductNumber, SP.Color, SP.ListPrice, SP.Size, SP.ProductCategoryID, SP.ProductModelID, PD.ProductDescriptionID, PD.Description
from [SalesLT].[Product] SP
INNER JOIN SalesLT.ProductCategory PC ON PC.ProductCategoryID = SP.ProductCategoryID
INNER JOIN [SalesLT].[ProductModelProductDescription] PMPD ON PMPD.ProductModelID = SP.ProductModelID
INNER JOIN [SalesLT].[ProductDescription] PD ON PD.ProductDescriptionID = PMPD.ProductDescriptionID
WHERE PMPD.Culture = 'en'"""

# Connect to the database and execute the query to get the data for indexing.
# Both the cursor and the connection are released even when the query fails.
conn = pyodbc.connect(connectionString)
try:
    cursor = conn.cursor()
    try:
        cursor.execute(sqlQuery)
        records = cursor.fetchall()
        # Column names come from the cursor metadata so they match the SELECT list.
        queryResults = pd.DataFrame.from_records(records, columns=[col[0] for col in cursor.description])
    finally:
        cursor.close()
except Exception as e:
    # The connection was already established at this point, so report the
    # failure accurately and stop: continuing with an empty DataFrame would
    # only fail later with an unrelated KeyError on 'Description'.
    print(f"Failed to fetch product data from the database: {e}")
    raise
finally:
    conn.close()

# Round-trip through JSON to get a list of plain dicts (one per row) for indexing.
queryResultsJson = json.loads(queryResults.to_json(orient='records'))
print(f"Total records to be indexed: {len(queryResultsJson)}, the maximum length of the description field is {queryResults['Description'].str.len().max()} characters.")

Total records to be indexed: 294, the maximum length of the description field is 221 characters.


In [4]:
# Generate embeddings for the product description and product category name fields.
print("Generating embeddings for the product name and product description fields.")

# Many products share the same category name, so each distinct name is embedded
# only once instead of issuing one API call per product.
category_vectors = {}
for doc in tqdm(queryResultsJson):
    doc['DescriptionVector'] = generate_embeddings(doc['Description'].strip())
    category = doc['ProductCategoryName']
    if category not in category_vectors:
        category_vectors[category] = generate_embeddings(category)
    # Store a copy so documents never share one mutable vector object.
    doc['ProductCategoryNameVector'] = list(category_vectors[category])

Generating embeddings for the product name and product description fields.


  0%|          | 0/294 [00:00<?, ?it/s]

### Create your search index

In [5]:
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryLanguage,
    QueryType,
    RawVectorQuery,
    VectorizableTextQuery,
    VectorFilterMode,
)
from azure.search.documents.indexes.models import (
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    VectorSearchVectorizer,
    VectorSearchVectorizerKind,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,
    SearchField,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
)


In [7]:
# Azure Cognitive Search settings: the endpoint and API key are read from the
# environment; the index name and embedding deployment name are fixed here.
model = "text-embedding-ada-002"
index_name = "promptflow-demo-product-description"
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
key = os.environ.get("AZURE_SEARCH_KEY")
credential = AzureKeyCredential(key)

In [8]:
# Client used to create and update index definitions on the search service
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: the product columns returned by the SQL query plus the two
# embedding fields. ProductId is the document key.
# NOTE(review): SimpleField appears to force searchable=False, so the
# searchable=True arguments on the Int32 fields below likely have no effect —
# confirm against the installed azure-search-documents version.
fields = [
    SearchableField(
        name="ProductCategoryName",
        type=SearchFieldDataType.String,
        searchable=True,
        filterable=True,
        facetable=False,
        retrievable=True,
    ),
    SearchableField(
        name="ProductId",
        type=SearchFieldDataType.String,
        searchable=True,
        filterable=True,
        key=True,
    ),
    SearchableField(
        name="Name",
        type=SearchFieldDataType.String,
        key=False,
        searchable=True,
        sortable=False,
        filterable=True,
        facetable=False,
        retrievable=True,
    ),
    SearchableField(
        name="ProductNumber",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchableField(
        name="Color",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SimpleField(
        name="ListPrice",
        type=SearchFieldDataType.Double,
        searchable=False,
        filterable=True,
    ),
    SimpleField(
        name="Size",
        type=SearchFieldDataType.String,
        searchable=False,
        filterable=True,
    ),
    SimpleField(
        name="ProductCategoryID",
        type=SearchFieldDataType.Int32,
        searchable=True,
        filterable=True,
    ),
    SimpleField(
        name="ProductModelID",
        type=SearchFieldDataType.Int32,
        searchable=True,
        filterable=True,
    ),
    SimpleField(
        name="ProductDescriptionID",
        type=SearchFieldDataType.Int32,
        searchable=True,
        filterable=True,
    ),
    SearchableField(
        name="Description",
        type=SearchFieldDataType.String,
        filterable=True,
        searchable=True,
    ),
    # Embedding fields: 1536-dimensional float vectors bound to the HNSW profile
    SearchField(
        name="DescriptionVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile="myHnswProfile",
    ),
    SearchField(
        name="ProductCategoryNameVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile="myHnswProfile",
    ),
]

# Vector search setup: two algorithms (HNSW and exhaustive KNN, both using the
# cosine metric), one profile per algorithm, and a single Azure OpenAI
# vectorizer that both profiles reference by name.
vector_search = VectorSearch(
    algorithms=[
        HnswVectorSearchAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric="cosine",
            ),
        ),
        ExhaustiveKnnVectorSearchAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(metric="cosine"),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm="myHnsw",
            vectorizer="myOpenAI",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm="myExhaustiveKnn",
            vectorizer="myOpenAI",
        ),
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myOpenAI",
            kind="azureOpenAI",
            # Same Azure OpenAI resource and deployment used to embed the documents
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=os.environ.get("OPENAI_API_BASE_Embeddings"),
                deployment_id=model,
                api_key=os.environ.get("OPENAI_API_KEY_AZURE_Embeddings"),
            ),
        ),
    ],
)

# Semantic configuration: the product name is the title field, and the
# description is used as both the content field and the keywords field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="Name"),
        prioritized_content_fields=[SemanticField(field_name="Description")],
        prioritized_keywords_fields=[SemanticField(field_name="Description")],
    ),
)

# Wrap the semantic configuration in the settings object passed to the index
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition (schema, vector search, semantic settings)
# and create it on the service, or update it if it already exists
index = SearchIndex(
    name=index_name,
    fields=fields,
    semantic_settings=semantic_settings,
    vector_search=vector_search,
)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

promptflow-demo-product-description created


### Index and upload embedded documents into vector store

In [9]:
# Since ProductId is the key field, it needs to be a string.
for doc in queryResultsJson:
    doc['ProductId'] = str(doc['ProductId'])

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(queryResultsJson)

# upload_documents returns one result per document, and an individual document
# can be rejected even when the request itself succeeds, so report the number
# that were actually indexed rather than the number that were sent.
failed_keys = [item.key for item in result if not item.succeeded]
print(f"Uploaded {len(result) - len(failed_keys)} documents")
if failed_keys:
    print(f"Failed to upload {len(failed_keys)} documents, keys: {failed_keys}")

Uploaded 294 documents


### Vector Search Example

In [12]:
# NOTE: search_service is not used by the request below; the URL is built
# from service_endpoint.
search_service = "sqldricopilot"
query = "Do you have something that can help me exercise at home?"
top_k = 5
api_version = "2023-07-01-Preview"

headers = {
    'Content-Type': 'application/json',
    'api-key': key,
}
params = {
    'api-version': api_version,
}
# Vector query: embed the question with the same function used for the
# documents and search both vector fields for the top_k nearest matches.
body = {
    "vector": {
        "value": generate_embeddings(query),
        "fields": "ProductCategoryNameVector, DescriptionVector",
        "k": top_k
    },
    "select": "ProductId, ProductCategoryName, Name, ProductNumber, Color, ListPrice, Size, ProductCategoryID, ProductModelID, ProductDescriptionID, Description",
    "top": top_k,
}
response = requests.post(
    f"{service_endpoint}/indexes/{index_name}/docs/search", headers=headers, params=params, json=body)
# Fail with the HTTP error itself (bad key, wrong API version, ...) instead of
# an opaque KeyError when the error payload has no 'value' entry.
response.raise_for_status()
response_json = response.json()['value']

In [13]:
# Display the list of hits taken from the 'value' entry of the search response
response_json

[{'@search.score': 0.01666666753590107,
  'ProductCategoryName': 'Socks',
  'ProductId': '710',
  'Name': 'Mountain Bike Socks, L',
  'ProductNumber': 'SO-B909-L',
  'Color': 'White',
  'ListPrice': 9.5,
  'Size': 'L',
  'ProductCategoryID': 27,
  'ProductModelID': 18,
  'ProductDescriptionID': 1189,
  'Description': 'Combination of natural and synthetic fibers stays dry and provides just the right cushioning.'},
 {'@search.score': 0.01666666753590107,
  'ProductCategoryName': 'Bike Stands',
  'ProductId': '879',
  'Name': 'All-Purpose Bike Stand',
  'ProductNumber': 'ST-1401',
  'Color': None,
  'ListPrice': 159.0,
  'Size': None,
  'ProductCategoryID': 31,
  'ProductModelID': 122,
  'ProductDescriptionID': 1201,
  'Description': 'Perfect all-purpose bike stand for working on your bike at home. Quick-adjusting clamps and steel construction.'},
 {'@search.score': 0.016393441706895828,
  'ProductCategoryName': 'Socks',
  'ProductId': '709',
  'Name': 'Mountain Bike Socks, M',
  'Product