## Vector Search on Text: Azure Cognitive Search via REST Endpoint

### Set environment variables

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        # Raise instead of calling exit(): inside a notebook exit() terminates the
        # kernel (discarding all state), whereas an exception just stops this cell
        # and leaves the reason visible in the traceback.
        raise RuntimeError(f"{name} environment variable not set.")
    return value


# API keys are read from the environment (optionally populated from a .env file above).
acs_key = _require_env("COGNITIVE_SEARCH_KEY")
aoai_key = _require_env("AZURE_OPENAI_KEY")

# Azure Cognitive Search settings
acs_endpoint = 'https://cogsearch02.search.windows.net'
acs_index_definition = 'index_definition/index_definition_text.json'
acs_api_version = '2023-07-01-Preview'

# Azure OpenAI settings
aoai_endpoint = 'https://azure-openai-dnai.openai.azure.com'
aoai_api_version = '2023-08-01-preview'
aoai_embedding_deployed_model = 'embedding-ada'

### Helper methods

In [None]:
import requests
import json

def insert_record(acs_endpoint, acs_index, data, acs_key, acs_api_version):
    """POST `data` to the `docs/index` endpoint of the given search index.

    Args:
        acs_endpoint: base URL of the search service.
        acs_index: name of the target index.
        data: request body, sent as-is.
        acs_key: value sent in the `api-key` header.
        acs_api_version: value of the `api-version` query parameter.

    Prints the HTTP status code and the raw response body; returns nothing.
    """
    target = f"{acs_endpoint}/indexes/{acs_index}/docs/index?api-version={acs_api_version}"
    result = requests.request(
        'POST',
        target,
        headers={"Content-Type": "application/json", "api-key": acs_key},
        data=data,
    )
    for detail in (result.status_code, result.content):
        print(detail)

def create_index(acs_endpoint, json_content, index_name, api_key, acs_api_version):
    """PUT an index definition to `{acs_endpoint}/indexes/{index_name}`.

    Args:
        acs_endpoint: base URL of the search service.
        json_content: request body, sent as-is.
        index_name: name of the index addressed in the URL.
        api_key: value sent in the `api-key` header.
        acs_api_version: value of the `api-version` query parameter.

    Prints the HTTP status code and the raw response body; returns nothing.
    """
    target = f"{acs_endpoint}/indexes/{index_name}?api-version={acs_api_version}"
    auth_headers = {"Content-Type": "application/json", "api-key": api_key}
    result = requests.put(target, data=json_content, headers=auth_headers)
    print(result.status_code)
    print(result.content)

def search_vector_similarity(query_vector, top_doc_count, acs_endpoint, acs_index,acs_key, acs_api_version):
    """Run a vector query against a search index and return the matching titles.

    Args:
        query_vector: embedding sent as the `value` of the vector query.
        top_doc_count: sent as `k` in the vector query.
        acs_endpoint: base URL of the search service.
        acs_index: name of the index to query.
        acs_key: value sent in the `api-key` header.
        acs_api_version: value of the `api-version` query parameter.

    Returns:
        A list with the `title` of every item in the response's `value` array.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    url = f"{acs_endpoint}/indexes/{acs_index}/docs/search?api-version={acs_api_version}"

    headers = {
        "Content-Type": "application/json",
        "api-key": acs_key
    }

    # The query targets the `content_vector` field and asks only for `title` back.
    request_body = {
        "vectors": [{
            "value": query_vector,
            "fields": "content_vector",
            "k": top_doc_count
        }],
        "select": "title"
    }
    response = requests.request('POST', url, headers=headers, data=json.dumps(request_body))

    # Surface a rejected request as an HTTP error here; otherwise the failure shows up
    # below as an unrelated KeyError when the body has no 'value' entry.
    response.raise_for_status()

    return [item['title'] for item in response.json()['value']]

def get_acs_index_name(acs_index_definition):
    """Return the `name` entry of the JSON index definition stored at the given path."""
    definition = json.loads(read_json_file(acs_index_definition))
    return definition['name']

def read_json_file(file_path):
    """Return the full text content of the file at `file_path` (not parsed).

    The file is decoded as UTF-8 explicitly so the result does not depend on the
    platform's default locale encoding.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

### Create ACS index

In [None]:
# Resolve the index name from the definition file and load the raw definition text.
index_name = get_acs_index_name(acs_index_definition)
index_definition = read_json_file(acs_index_definition)

# Send the definition to the search service under that name.
create_index(
    acs_endpoint=acs_endpoint,
    json_content=index_definition,
    index_name=index_name,
    api_key=acs_key,
    acs_api_version=acs_api_version,
)

### Load embeddings

In [None]:
# pandas is not imported by any of the cells above, so import it here to keep this
# cell runnable on a fresh kernel.
import pandas as pd

df = pd.read_json('../../data/text/product_docs_embeddings.json')

### Ingest to Azure Cognitive Search

This cell works because the DataFrame and the ACS index have the same columns. If the DataFrame's columns differ from the index's fields (in name or in number), add a preprocessing step that restructures the DataFrame to match the index schema before ingesting.

In [None]:
import requests
import json

# Upload the DataFrame rows to the search index in batches of `batch_size` documents.
batch_size = 10
total_records = df.shape[0]
fields = df.columns.to_numpy()
# NOTE(review): `id` is presumably the index key, which the service expects as a
# string - confirm against the index definition.
df['id'] = df['id'].astype(str)

records = {
    'value': []
}

# Count rows by position (1-based) rather than by the DataFrame's index labels:
# the labels are not guaranteed to be 0..n-1, and using them would both send row 0
# as a batch of one and risk never flushing the final partial batch.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    records['value'].append({field: row[field] for field in fields})

    # Flush when the batch is full or when the last row has been added.
    if position % batch_size == 0 or position == total_records:
        json_data = json.dumps(records)
        insert_record(acs_endpoint, index_name, json_data, acs_key, acs_api_version)
        records['value'] = []

### Perform a vector similarity search

In [None]:
query = 'tools for software development'


def get_embedding(text, engine):
    """Return the embedding vector for `text` from an Azure OpenAI deployment.

    Calls the embeddings REST endpoint of the deployment named `engine`, using the
    `aoai_endpoint`, `aoai_api_version` and `aoai_key` settings defined at the top of
    the notebook.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    url = f"{aoai_endpoint}/openai/deployments/{engine}/embeddings?api-version={aoai_api_version}"
    headers = {
        "Content-Type": "application/json",
        "api-key": aoai_key
    }
    response = requests.post(url, headers=headers, data=json.dumps({"input": text}))
    response.raise_for_status()
    # The response lists one embedding per input; a single input was sent.
    return response.json()["data"][0]["embedding"]


# Embed the query text, then look up the 5 nearest documents in the index.
query_vector = get_embedding(query, engine = aoai_embedding_deployed_model)

search_results = search_vector_similarity(query_vector, 5, acs_endpoint, index_name,acs_key, acs_api_version)

print(search_results)