# Search methods in Azure Cosmos DB for NoSQL
- Full Text Search
- Full Text Search ordered by BM25 score
- Vector Search
- Hybrid Search (Vector Search + BM25 merged by RRF)

In [None]:
import time
import json
from dotenv import dotenv_values
import textwrap
from openai import AzureOpenAI
from azure.cosmos import CosmosClient, PartitionKey, exceptions

In [None]:
# Name of the .env file to load. Copy the example.env template and change
# this to your own .env file name.
env_name = "config.env"
config = dotenv_values(env_name)

# Settings later passed to the Cosmos DB client.
cosmos_connection = config["cosmos_connection"]
cosmos_key = config["cosmos_key"]

# Settings later passed to the Azure OpenAI client.
openai_endpoint = config["openai_endpoint"]
openai_key = config["openai_key"]

In [None]:
# Cosmos DB database and container names.
cosmos_database = "a-ignite2024demo"
cosmos_collection = "search_diskann"

# Azure OpenAI service type and API version.
openai_type = "azure"
openai_api_version = "2023-05-15"

# Embeddings deployment/model. The dimension count is kept as a string,
# exactly as it was originally defined.
openai_embeddings_deployment = "embeddings"
openai_embeddings_model = "text-embedding-ada-002"
openai_embeddings_dimensions = "1536"

# Completions deployment/model.
openai_completions_deployment = "completions"
openai_completions_model = "gpt-35-turbo"

In [None]:
# Cosmos DB: account client -> database client -> container client.
cosmos_client = CosmosClient(
    url=cosmos_connection,
    credential=cosmos_key,
)
db = cosmos_client.get_database_client(cosmos_database)
container = db.get_container_client(cosmos_collection)

# Azure OpenAI client; generate_embeddings() uses it to embed the query.
openai_client = AzureOpenAI(
    api_version=openai_api_version,
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
)

In [None]:
def generate_embeddings(text):
    """Return the embedding produced for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level
    ``openai_client`` using the ``openai_embeddings_deployment`` deployment,
    converts the response to a plain dict and returns the ``embedding``
    field of the first entry in its ``data`` list.
    """
    payload = openai_client.embeddings.create(
        model=openai_embeddings_deployment,
        input=text,
    ).model_dump()
    first_entry = payload['data'][0]
    return first_entry['embedding']

In [None]:
# Search phrase used by the query cells below.
search_terms = "quantum particles and electron interactions"

# Embedding of the whole phrase, used by the VectorDistance queries.
emb = generate_embeddings(search_terms)

# Whitespace-separated keywords, used by the full-text functions.
full_text = search_terms.split()

# Full Text Search (FullTextContainsAny)
Matches documents that contain at least one of the keywords.

In [None]:
# Keywords joined as  quantum', 'particles', ...  -- the opening and closing
# quotes are supplied by the query text itself.
any_terms = "', '".join(full_text)
contains_any_query = f'''
        SELECT TOP 5 c.id, c.title, c.abstract
        FROM c
        WHERE FullTextContainsAny(c.abstract,'{any_terms}')
        '''

results = container.query_items(
    query=contains_any_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch all results first, then pretty-print each one.
for doc in list(results):
    print(json.dumps(doc, indent=4), "\n")

# Full Text Search (FullTextContainsAll)
Matches only documents that contain ALL of the keywords; for this search phrase, no documents do.

In [None]:
# Keywords joined as  quantum', 'particles', ...  -- the opening and closing
# quotes are supplied by the query text itself.
all_terms = "', '".join(full_text)
contains_all_query = f'''
        SELECT TOP 5 c.id, c.title, c.abstract
        FROM c
        WHERE FullTextContainsAll(c.abstract,'{all_terms}')
        '''

results = container.query_items(
    query=contains_all_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch all results first, then pretty-print each one.
for doc in list(results):
    print(json.dumps(doc, indent=4), "\n")

# Full Text Ranking with BM25
Documents ranked by:
- Frequency of keywords/terms in property path
- Normalized by overall property text length
- Normalized by frequency of the keywords/terms in the entire collection

In [None]:
# The keyword list is embedded using Python's list formatting,
# i.e. ['quantum', 'particles', ...], exactly as before.
keyword_literal = str(full_text)
bm25_query = f'''
        SELECT TOP 5 c.id, c.title, c.abstract
        FROM c
        ORDER BY RANK FullTextScore(c.abstract, {keyword_literal})
        '''

results = container.query_items(
    query=bm25_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch all results first, then pretty-print each one.
for doc in list(results):
    print(json.dumps(doc, indent=4), "\n")

# Vector Similarity Search

In [None]:
# Top-5 documents ordered by VectorDistance between c.Embedding and the
# query embedding `emb`.
#
# The embedding is passed as the @embedding query parameter instead of being
# formatted into the SQL text. Previously the whole float list was
# interpolated twice through Python's repr, which made the query text very
# large and tied its validity to how Python prints floats.
vector_query = '''
        SELECT TOP 5 c.id, c.title, c.abstract, VectorDistance(c.Embedding, @embedding) as SimilarityScore
        FROM c
        ORDER BY VectorDistance(c.Embedding, @embedding)
        '''

results = container.query_items(
    query=vector_query,
    parameters=[{"name": "@embedding", "value": emb}],
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch all results first, then pretty-print each one.
for r in list(results):
    print(json.dumps(r, indent=4), "\n")

# Hybrid Search

In [None]:
# Hybrid ranking: RRF over the FullTextScore of the keywords and the
# VectorDistance to the query embedding. The keyword list is embedded using
# Python's list formatting, exactly as before.
keyword_literal = str(full_text)
hybrid_query = f'''
        SELECT TOP 5 c.id, c.title, c.abstract, VectorDistance(c.Embedding, {emb}) as SimilarityScore
        FROM c
        ORDER BY RANK RRF (FullTextScore(c.abstract, {keyword_literal}), VectorDistance(c.Embedding, {emb}))
        '''

results = container.query_items(
    query=hybrid_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch all results first, then pretty-print each one.
for doc in list(results):
    print(json.dumps(doc, indent=4), "\n")