# This notebook shows how to create an Azure Cognitive Search index
### The algorithm used for our index is Hierarchical Navigable Small World (HNSW)
github url: https://github.com/Azure/cognitive-search-vector-pr

In [4]:
import ast
import json
import os
import uuid

import openai
import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    HnswParameters,
    HnswVectorSearchAlgorithmConfiguration,
    PrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticSettings,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    VectorSearchVectorizer,
    VectorSearchVectorizerKind,
)
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryLanguage,
    QueryType,
    VectorizableTextQuery,
)

In [5]:
# Connection settings for the search service.
# Each value is read from an environment variable when that variable is set and
# otherwise falls back to the placeholder string, which must then be replaced by
# hand. Prefer the environment variables so that real keys are never saved
# inside the notebook.
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT", "YOUR-ACS-SERVICE-ENPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME", "YOUR-INDEX-NAME")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "YOUR-CREDENTIAL-KEY")

# Module-level settings of the `openai` package (Azure API type).
openai.api_type = "azure"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY", "YOUR_EMBEDDING-MODEL-API-KEY")
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "YOUR-EMBEDDING-MODEL-API-BASE")
openai.api_version = "2023-07-01-preview"

# Credential object handed to the search clients created further down.
credential = AzureKeyCredential(key)

In [7]:
# Read the source CSV into a DataFrame.
csv_filepath = "YOUR-CSV-FILEPATH"
df = pd.read_csv(csv_filepath)

# Generate one random UUID4 string per row and store it in the "id" column.
ids = [str(uuid.uuid4()) for _ in df.index]
df["id"] = ids

Each JSON document should contain the text, the embedding of that text, and any other metadata you require. <br>
Change this structure according to your use case.

In [6]:
# Build one document (a plain dict) per DataFrame row.
# The "embeddings" value is parsed with ast.literal_eval, which only accepts
# Python literals (e.g. "[0.1, 0.2, ...]"). Unlike eval(), it cannot execute
# arbitrary code that might be present in the CSV.
# NOTE(review): assumes the "embeddings" column holds the text form of a list
# of numbers — confirm against how the CSV was produced.
json_file = [
    {
        "id": row["id"],
        "chunked_data": row["chunked_data"],
        "file_name": row["file_name"],
        "file_path": row["file_path"],
        "embeddings": ast.literal_eval(row["embeddings"]),
    }
    for _, row in df.iterrows()
]

We want almost all fields to be searchable. If you want to filter documents by a certain field, set `filterable=True` for that field.

In [8]:
# Client for index management; it is used below to create or update the index.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: the key field, three text fields and one vector field.
fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # Text fields.
    SearchableField(
        name="file_name",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchableField(
        name="chunked_data",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchableField(
        name="file_path",
        type=SearchFieldDataType.String,
    ),
    # Vector field: a collection of single-precision floats with 1536 dimensions,
    # bound to the vector search profile named "myHnswProfile".
    SearchField(
        name="embeddings",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile="myHnswProfile",
    ),
]

In [9]:
# Vector search configuration for the index:
#   - one HNSW algorithm configuration named "myHnsw",
#   - one profile named "myHnswProfile" that pairs "myHnsw" with the "myOpenAI" vectorizer,
#   - one Azure OpenAI vectorizer named "myOpenAI".
# The "embeddings" field refers to the profile by its name.
vector_search = VectorSearch(
    algorithms=[
        HnswVectorSearchAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric="cosine",
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm="myHnsw",
            vectorizer="myOpenAI",
        ),
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myOpenAI",
            kind="azureOpenAI",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri="YOUR-EMBEDDING-MODEL-ENDPOINT",
                deployment_id="YOUR-DEPLOYMENT-ID",
                # Placeholder: replace the whole string with the embedding
                # model's API key (the stray leading comma was removed).
                api_key="YOUR-EMBEDDING-MODEL-API-KEY",
            ),
        ),
    ],
)

In [10]:
# Semantic configuration: "file_name" is the title field and "chunked_data"
# is the single prioritized content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="file_name"),
        prioritized_content_fields=[
            SemanticField(field_name="chunked_data"),
        ],
    ),
)

In [11]:
# Wrap the semantic configuration so it can be attached to the index definition.
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition from the schema, vector search and semantic settings.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)

# Send the definition to the service and report the name of the returned index.
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

In [12]:
# Client bound to the index created above; used for uploading and querying documents.
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

# Upload in batches instead of one request holding every document.
# NOTE(review): 1000 is the documented per-request document limit of the
# service; lower it if requests are rejected for payload size — confirm for
# your service tier.
UPLOAD_BATCH_SIZE = 1000

# `result` collects the per-document indexing results of every batch.
result = []
for start in range(0, len(json_file), UPLOAD_BATCH_SIZE):
    batch = json_file[start:start + UPLOAD_BATCH_SIZE]
    result.extend(search_client.upload_documents(documents=batch))

# Report how many documents the service actually accepted, not just how many were sent.
succeeded_count = sum(1 for indexing_result in result if indexing_result.succeeded)
print(f"Uploaded {succeeded_count} of {len(json_file)} documents")

In [13]:
# Free-text query to search for.
query = "WRITE YOUR QUERY HERE"

# Text-based vector query against the "embeddings" field, asking for the 3 nearest matches.
# NOTE(review): presumably the text is embedded service-side by the vectorizer
# configured on the index — confirm.
vector_query = VectorizableTextQuery(
    text=query,
    k=3,
    fields="embeddings",
)

# Pure vector search: no keyword text is sent.
# For hybrid search, pass the query text as well, e.g. search_text=query.
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["chunked_data", "file_name"],
)

In [None]:
# Print the file name, search score and text of every returned document,
# one value per line.
for result in results:
    print(
        f"file_name: {result['file_name']}",
        f"Score: {result['@search.score']}",
        f"chunked_data: {result['chunked_data']}",
        sep="\n",
    )