## Vector Search - Azure Cosmos DB for NoSQL

####  Libraries


In [3]:
import json
import datetime
import time

from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
from azure.cosmos import exceptions, CosmosClient, PartitionKey
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    ##QueryLanguage,
    QueryType
    #Vector  
)
from azure.search.documents.indexes.models import (
    IndexingSchedule,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SimpleField,
    SemanticField,
    VectorSearch,
   SearchIndexerDataSourceConnection    
)


from tenacity import retry, wait_random_exponential, stop_after_attempt


import os
from dotenv import load_dotenv

import openai
from openai.embeddings_utils import  cosine_similarity


#### Environment variables

In [None]:

load_dotenv()


def _require_env(name):
    """Return the value of the environment variable ``name``.

    If the variable is unset or empty, print "<name> environment variable not set."
    and call ``exit()``, exactly as the per-variable checks this helper replaces did.
    The (missing) value is still returned so the surrounding assignments behave the
    same way in environments where ``exit()`` does not immediately stop execution.
    """
    value = os.getenv(name)
    if value is None or value == "":
        print(f"{name} environment variable not set.")
        exit()
    return value


## Cosmos db endpoint format: https://<nameofyourcosmosservice>.documents.azure.com
cosmos_db_api_endpoint = _require_env("cosmos_db_api_endpoint")

## Cosmos db API Key
cosmos_db_api_key = _require_env("cosmos_db_api_key")

##Cosmos Connection String. Format:
##AccountEndpoint=https://<nameofthesevice>.documents.azure.com;AccountKey=<value of the key>;Database=<name of the database, suggested here Vector_DB>;
cosmos_db_connection_string = _require_env("cosmos_db_connection_string")

##Cognitive Search Service Name, you need to deploy this service. Format: https://<nameoftheservice>.search.windows.net
cog_search_endpoint = _require_env("cog_search_endpoint")

##Cognitive Search Service Key
cog_search_key = _require_env("cog_search_key")

##Open AI Service. This must be deployed. Format:https://nameoftheservice.azure.com/
aoai_endpoint = _require_env("AOAI_ENDPOINT")  ##api_base

##Version of the Open AI Service. This was built with the "2023-05-15" version
aoai_api_version = _require_env("AOAI_API_VERSION")

##Model of the Open AI Service. This must be deployed: "text-embedding-ada-002"
aoai_embedding_deployed_model = _require_env("AOAI_EMBEDDING_DEPLOYED_MODEL")

##Open AI Service Key.
azure_openai_key = _require_env("AZURE_OPENAI_KEY")

##Container names for the CosmosDB
text_table_name = 'text_sample'
doc_table_name = 'doc_sample'
image_table_name = 'image_sample'

database_name = "Vector_DB"
# Key credential shared by every SearchClient created in the cells below.
credential = AzureKeyCredential(str(cog_search_key))
openai.api_type = "azure"


#### Initializing Open AI

In [None]:
import openai
from openai.embeddings_utils import  cosine_similarity


# Point the module-level openai configuration at the Azure OpenAI resource
# described by the environment variables read earlier.
openai.api_type = "azure"
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version  ##2023-05-15##
openai.api_key = azure_openai_key



#### Initialize CosmosDB

In [6]:
# Build the Cosmos DB client from the account endpoint and key, then take a
# handle on the database whose containers the search cells below refer to.
cosmos_client = CosmosClient(url=cosmos_db_api_endpoint, credential=cosmos_db_api_key)
database = cosmos_client.get_database_client(database=database_name)


#### Embedding and Cosine Functions

In [7]:
import numpy as np
from typing import List


def get_embedding(text: str, engine, **kwargs) -> List[float]:
    """Return the embedding vector for ``text``.

    Args:
        text: Input string; every newline is replaced by a single space before
            the request is made.
        engine: Passed through as the ``engine`` argument of
            ``openai.Embedding.create``.
        **kwargs: Forwarded unchanged to ``openai.Embedding.create``.

    Returns:
        The ``"embedding"`` entry of the first item in the response's ``"data"``.
    """
    # Newlines can negatively affect performance, so flatten them to spaces.
    flattened = " ".join(text.split("\n"))

    response = openai.Embedding.create(input=[flattened], engine=engine, **kwargs)
    first_item = response["data"][0]
    return first_item["embedding"]


def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Both inputs are converted to float64 numpy arrays. If either array is empty
    or contains only zeros, ``0.0`` is returned instead of dividing by a zero
    norm; otherwise the result is ``dot(a, b) / (norm(a) * norm(b))``.
    """
    vec_a = np.asarray(a, dtype=np.float64)
    vec_b = np.asarray(b, dtype=np.float64)

    # An array with no non-zero entry (this includes the empty array) has no direction.
    if not vec_a.any() or not vec_b.any():
        return 0.0

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / denominator



#### Vector Search
##### Use the index names that you created in the ingestion steps for the respective containers


#### Simple Vector Search

In [None]:
import openai

container_name = 'text_sample'
index_name = "text_sample_index"

# Cosmos DB container handle for the sample data (not read again in this cell).
container = database.get_container_client(container_name)
search_client = SearchClient(cog_search_endpoint, index_name, credential)


query = 'tools for software development'
# Embed the query text. Only `engine` is passed: get_embedding forwards every
# extra keyword straight to openai.Embedding.create, and `k` is not an
# embeddings request parameter.
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)


# Perform Azure Cognitive Search query. The stored vectors are selected so each
# hit can be compared with the query embedding locally.
search_results = search_client.search(search_text=query, select=["title", "content", "category", "title_vector", "content_vector"])

for result in search_results:
    result_vector = result.get("content_vector", None)

    if result_vector is not None and len(result_vector) > 0:
        # Cosine similarity between the query embedding and the stored content embedding.
        similarity_score = cosine_similarity(query_vector, result_vector)

        print(f"Title: {result['title']}")
        print(f"Score: {result['@search.score']}")
        print(f"Content: {result['content']}")
        print(f"Category: {result['category']}")
        print(f"Cosine Similarity: {similarity_score}\n")
    else:
        print("Skipping result with empty or missing vector.\n")

#### Cross Search (two columns)

In [None]:
container_name = "text_sample"
index_name = "text_sample_index"

container = database.get_container_client(container_name)
search_client = SearchClient(
    endpoint=cog_search_endpoint,
    index_name=index_name,
    credential=credential,
)

query = 'tools for software development'
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

# Run the text query and pull back both stored vectors for every hit.
search_results = search_client.search(
    search_text=query,
    select=["title", "content", "category", "title_vector", "content_vector"],
)

# Keep only the hits whose title or content embedding is close enough to the query.
for result in search_results:
    title_vector = result.get("title_vector")
    content_vector = result.get("content_vector")

    # Both vectors are required to score a hit.
    if title_vector is None or content_vector is None:
        print("Skipping result with empty or missing vector.\n")
        continue

    title_similarity = cosine_similarity(query_vector, title_vector)
    content_similarity = cosine_similarity(query_vector, content_vector)

    # Adjust the threshold as needed
    if not (title_similarity > 0.7 or content_similarity > 0.7):
        continue

    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}")
    print(f"Category: {result['category']}")
    print(f"Title Cosine Similarity: {title_similarity}")
    print(f"Content Cosine Similarity: {content_similarity}\n")


### Hybrid search + Semantic Rank

In [None]:
##You must Enable the service: https://learn.microsoft.com/en-us/azure/search/semantic-how-to-enable-disable?tabs=enable-portal
##Use the Semantic configuration defined at the Ingestion

container_name = "text_sample"
index_name = "text_sample_index"
semantic_configuration = 'ConfigSemantictext'

container = database.get_container_client(container_name)
search_client = SearchClient(cog_search_endpoint, index_name, credential)

query = 'Azure DevOps is a suite of services that help you plan'

# Semantic query requesting extractive captions and answers, limited to the top 5 hits.
search_results = search_client.search(
    search_text=query,
    select=["title", "content", "category", "title_vector", "content_vector"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name=semantic_configuration,
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=5
)

# get_answers() can return None when the service sends back no answers;
# fall back to an empty list so the loop below is simply skipped.
_answers = search_results.get_answers() or []
for answer in _answers:
    print(f"Semantic Answer: {answer}")
    if answer.highlights:
        print(f"Semantic Answer highlight: {answer.highlights}")
    else:
        print(f"Semantic Answer Text : {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")


# Print every returned hit, followed by its first caption when one is present.
for row in search_results:
    print(f"Title: {row['title']}")
    print(f"Score: {row['@search.score']}")
    print(f"Content: {row['content']}")
    print(f"Category: {row['category']}")

    captions = row["@search.captions"]
    if captions:
        caption = captions[0]
        # Prefer the highlighted caption; fall back to the plain caption text.
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


#### Document Vector Search

In [None]:
container_name = 'doc_sample'
index_name = "doc_sample_index"

container = database.get_container_client(container_name)
search_client = SearchClient(cog_search_endpoint, index_name, credential)

#  Query Cosmos DB using Azure Cognitive Search

query = 'web hosting services'
# Embed the query text. Only `engine` is passed: get_embedding forwards every
# extra keyword straight to openai.Embedding.create, and `k` is not an
# embeddings request parameter.
# NOTE(review): query_vector is not read again in this cell.
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

# Perform Azure Cognitive Search query
search_results = search_client.search(search_text=query, select=["chunk_content", "chunk_content_vector"])


# Print each hit's chunk text, search score and stored chunk vector.
for result in search_results:
    print(f"chunk_content: {result['chunk_content']}")
    print(f"Score: {result['@search.score']}")
    print(f"chunk_content_vector: {result['chunk_content_vector']}")

### Hybrid search + Semantic Rank

In [None]:
##Enable the service: https://learn.microsoft.com/en-us/azure/search/semantic-how-to-enable-disable?tabs=enable-portal
##Use the Semantic configuration defined at the Ingestion

container_name = "doc_sample"
index_name = "doc_sample_index"
semantic_configuration = 'ConfigSemanticdoc'

container = database.get_container_client(container_name)
search_client = SearchClient(cog_search_endpoint, index_name, credential)

query = 'This policy applies to all Contoso Electronics employees'
# Perform Azure Cognitive Search query with semantic search, requesting
# extractive captions and answers and limiting the output to the top 5 hits.
search_results = search_client.search(
    search_text=query,
    select=["chunk_content", "chunk_content_vector"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name=semantic_configuration,
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=5
)


# get_answers() can return None when the service sends back no answers;
# fall back to an empty list so the loop below is simply skipped.
_answers = search_results.get_answers() or []
for answer in _answers:
    print(f"Semantic Answer: {answer}")
    if answer.highlights:
        print(f"Semantic Answer highlight: {answer.highlights}")
    else:
        print(f"Semantic Answer Text : {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

# Print every returned hit, followed by its first caption when one is present.
for row in search_results:
    print(f"chunk_content: {row['chunk_content']}")
    print(f"Score: {row['@search.score']}")
    print(f"chunk_content_vector: {row['chunk_content_vector']}")

    captions = row["@search.captions"]
    if captions:
        caption = captions[0]
        # Prefer the highlighted caption; fall back to the plain caption text.
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


