## Vector Search on Text: Azure Cognitive Search via REST Endpoint

This notebook demonstrates how to use the Azure Cognitive Search REST endpoint with OpenAI to work with simple tabular text datasets. It uses [product_docs.json](../../data/text/product_docs.json) as the source dataset.

Key steps in the notebook -

- Create ACS Index from index definition
- Load the source dataset and generate embeddings
- Ingest embeddings into the ACS Index
- Multiple search queries
  
### Prerequisites

- Create a conda environment using the [cognitive_search_rest_conda.yml](/cognitive_search_rest_conda.yml) file to include all the python dependencies.
- Create a *.env* file from the *.env-template* and populate it with all necessary endpoint links and keys. 

### Load environment variables

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()


def _require_env(var_name):
    """Return the value of the environment variable ``var_name``.

    When the variable is unset or empty, print a notice naming it and call
    ``exit()``, exactly as each hand-written check did.
    """
    value = os.getenv(var_name)
    if not value:
        print(f"{var_name} environment variable not set.")
        exit()
    return value


# Azure Cognitive Search settings.
acs_endpoint = _require_env("COGNITIVE_SEARCH_ENDPOINT")
acs_index = _require_env("COGNITIVE_SEARCH_INDEX")
acs_key = _require_env("COGNITIVE_SEARCH_KEY")
acs_api_version = _require_env("COGNITIVE_SEARCH_API_VERSION")

# Azure OpenAI settings.
aoai_endpoint = _require_env("AZURE_OPENAI_ENDPOINT")
aoai_key = _require_env("AZURE_OPENAI_KEY")
aoai_api_version = _require_env("AZURE_OPENAI_API_VERSION")
aoai_embedding_deployed_model = _require_env("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL")

### Helper Methods

In [None]:
import requests

def insert_record(acs_endpoint, acs_index, data, acs_key, acs_api_version):
    """POST a document batch to an index and print the outcome.

    Sends ``data`` to the ``docs/index`` route of ``acs_index`` and prints
    the HTTP status code followed by the raw response body. Returns nothing.

    Args:
        acs_endpoint: URL prefix that the index path is appended to.
        acs_index: Name of the target index.
        data: Request body, handed to ``requests`` unchanged.
        acs_key: Value sent in the ``api-key`` header.
        acs_api_version: Value of the ``api-version`` query parameter.
    """
    endpoint = f"{acs_endpoint}/indexes/{acs_index}/docs/index?api-version={acs_api_version}"
    reply = requests.request(
        "POST",
        endpoint,
        data=data,
        headers={"Content-Type": "application/json", "api-key": acs_key},
    )
    for part in (reply.status_code, reply.content):
        print(part)

def create_index(acs_endpoint, json_content, acs_index, api_key, acs_api_version):
    """PUT an index definition to the search service and print the outcome.

    Sends ``json_content`` to the ``indexes/<acs_index>`` route and prints
    the HTTP status code followed by the raw response body. Returns nothing.

    Args:
        acs_endpoint: URL prefix that the index path is appended to.
        json_content: Request body, handed to ``requests`` unchanged.
        acs_index: Name of the index addressed by the request.
        api_key: Value sent in the ``api-key`` header.
        acs_api_version: Value of the ``api-version`` query parameter.
    """
    endpoint = f"{acs_endpoint}/indexes/{acs_index}?api-version={acs_api_version}"
    reply = requests.put(
        endpoint,
        headers={"Content-Type": "application/json", "api-key": api_key},
        data=json_content,
    )
    for part in (reply.status_code, reply.content):
        print(part)

def search_vector_similarity(query_vector, top_doc_count, acs_endpoint, acs_index,acs_key, acs_api_version):
    """Run a vector query against an index and return the matching titles.

    POSTs a search request that compares ``query_vector`` with the
    ``content_vector`` field, asks for ``top_doc_count`` neighbours, and
    selects only the ``title`` field of each hit.

    Args:
        query_vector: Embedding of the query, sent as the vector ``value``.
        top_doc_count: Number of nearest neighbours to request (``k``).
        acs_endpoint: URL prefix that the index path is appended to.
        acs_index: Name of the index to search.
        acs_key: Value sent in the ``api-key`` header.
        acs_api_version: Value of the ``api-version`` query parameter.

    Returns:
        A list with the ``title`` of every item in the response's ``value``
        array, in the order the service returned them.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Imported here because the cell defining this helper only imports
    # `requests`; depending on a later cell's `import json` would raise
    # NameError if the helper were called before that cell had run.
    import json

    url = f"{acs_endpoint}/indexes/{acs_index}/docs/search?api-version={acs_api_version}"

    headers = {
        "Content-Type": "application/json",
        "api-key": acs_key
    }

    request_body = json.dumps({
        "vectors": [{
            "value": query_vector,
            "fields": "content_vector",
            "k": top_doc_count
        }],
        "select": "title"
    })

    response = requests.post(url, headers=headers, data=request_body)

    # Fail with the HTTP status instead of letting a failed request surface
    # as an unexplained KeyError on 'value' below.
    response.raise_for_status()

    return [item['title'] for item in response.json()['value']]

def read_json_file(file_path):
    """Return the full text of the file at ``file_path``.

    The content is returned as a string and is not parsed. The file is
    decoded as UTF-8 explicitly so the result does not depend on the
    platform's locale encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content as a single ``str``.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

### Create ACS Index

In [None]:
# Read the index definition from disk as raw text and send it to the
# search service under the index name taken from the environment.
index_definition_path = 'index_definition.json'
index_definition = read_json_file(file_path=index_definition_path)

create_index(
    acs_endpoint=acs_endpoint,
    json_content=index_definition,
    acs_index=acs_index,
    api_key=acs_key,
    acs_api_version=acs_api_version,
)

## Create embeddings
Read your data, generate OpenAI embeddings, and store them in a format that can be inserted into your Azure Cognitive Search index:

In [None]:
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas as pd
import json

df = pd.read_json('../../data/text/product_docs.json')

# Configure the OpenAI SDK with the Azure OpenAI settings read from the
# environment.
openai.api_type = "azure"
openai.api_key = aoai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version

# Embed the title column first, then the content column, one value at a
# time, storing each result in a sibling '<column>_vector' column.
for source_column in ('title', 'content'):
    df[f'{source_column}_vector'] = df[source_column].apply(
        lambda text: get_embedding(text, engine=aoai_embedding_deployed_model)
    )

### Ingest to Azure Cognitive Search

This cell works because the dataframe and the ACS Index have the same columns. If the dataframe's columns differ from those of the ACS Index (in name or in number), add a preprocessing step that restructures the dataframe to match the ACS Index columns.

In [None]:
import requests
import json

# Upload the dataframe to the index in batches of `batch_size` documents.
batch_size = 10
total_records = df.shape[0]

# The id column is converted to text before the rows are serialised.
df['id'] = df['id'].astype(str)

# One plain dict per row, keyed by column name. The previous loop built such
# a dict but appended the pandas Series `row` instead, which json.dumps
# cannot serialise.
documents = df.to_dict(orient='records')

# Slice by position rather than testing the dataframe's index label, so that
# every batch (including the first) holds up to `batch_size` documents and
# the final partial batch is always sent, whatever the index looks like.
for start in range(0, total_records, batch_size):
    records = {
        'value': documents[start:start + batch_size]
    }
    json_data = json.dumps(records)
    insert_record(acs_endpoint, acs_index, json_data, acs_key, acs_api_version)

### Perform a vector similarity search

In [None]:
# Embed the free-text query with the same deployment used for the documents,
# then request the five nearest documents from the index and print the result.
query = 'tools for software development'
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

search_results = search_vector_similarity(
    query_vector=query_vector,
    top_doc_count=5,
    acs_endpoint=acs_endpoint,
    acs_index=acs_index,
    acs_key=acs_key,
    acs_api_version=acs_api_version,
)

print(search_results)