# Quick sample to get started with basic string search, full-text search, vector search and hybrid search

In [14]:
from dotenv import dotenv_values
import json
from azure.cosmos import CosmosClient, PartitionKey, ThroughputProperties, exceptions
from azure.cosmos.aio import CosmosClient as AsyncClient
from openai import AzureOpenAI
import asyncio

import nest_asyncio
nest_asyncio.apply()

In [12]:
# Replace credentials and endpoints in .env with your own. You will find these in the Azure portal.
config = dotenv_values(".env")

# Azure Cosmos DB: account endpoint/key plus the database and container names.
cosmos_endpoint, cosmos_key = config["cosmos_endpoint"], config["cosmos_key"]
cosmos_database, cosmos_container = config["cosmos_database"], config["cosmos_container"]

# Azure OpenAI: service endpoint/key, the embeddings deployment and the API version.
openai_endpoint, openai_key = config["openai_endpoint"], config["openai_key"]
openai_embeddings_deployment = config["openai_embeddings_deployment"]
openai_embeddings_dimensions = config["openai_embeddings_dimensions"]
openai_api_version = config["openai_api_version"]

In [None]:
# Create the two service clients used by every cell below:
# one for the Cosmos DB account and one for the Azure OpenAI resource.
cosmos_client = CosmosClient(
    url=cosmos_endpoint,
    credential=cosmos_key,
)
openai_client = AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
    api_version=openai_api_version,
)

## Setup Azure Cosmos DB with container and indexing policies for search

In [None]:
# Vector embedding policy: tells Cosmos DB which document property holds the
# vector and how distances between vectors are computed.
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/embedding",
            "dataType": "float32",
            "distanceFunction": "cosine",
            # Must equal the length of the stored vectors; the sample data file
            # used below is named "...-3072D.json".
            "dimensions": 3072,
        },
    ]
}

# Full-text policy: the properties that full-text functions may be used on.
full_text_policy = {
    "defaultLanguage": "en-US",
    "fullTextPaths": [
        {
            "path": "/description",
            "language": "en-US",
        },
        {
            "path": "/summary_review",
            "language": "en-US",
        },
    ],
}

# Indexing policy: index everything by default, add a DiskANN vector index on the
# embedding and full-text indexes on the two text properties.
indexing_policy = {
    "includedPaths": [
        {
            "path": "/*",
        },
    ],
    "excludedPaths": [
        # Each excluded path needs its own object. Repeating the "path" key inside
        # a single dict literal keeps only the last value, which would silently
        # drop the _etag exclusion.
        {
            "path": "/\"_etag\"/?",
        },
        {
            "path": "/embedding/*",
        },
    ],
    "vectorIndexes": [
        {
            "path": "/embedding",
            "type": "diskANN",
        },
    ],
    "fullTextIndexes": [
        {
            "path": "/description",
        },
        {
            "path": "/summary_review",
        },
    ],
}

# Create Cosmos DB Database
db = cosmos_client.create_database_if_not_exists(cosmos_database)

# Create Cosmos DB Container
# NOTE(review): the container id is hard-coded to 'Products' while the
# `cosmos_container` setting read from .env is not used here - confirm which one
# is intended.
container = db.create_container_if_not_exists(
    id='Products',
    partition_key=PartitionKey(path='/id'),
    indexing_policy=indexing_policy,
    vector_embedding_policy=vector_embedding_policy,
    full_text_policy=full_text_policy,
    offer_throughput=ThroughputProperties(auto_scale_max_throughput=10000, auto_scale_increment_percent=0),
)

## Insert data for search examples

In [None]:
# Load sample data
# The encoding is given explicitly so the JSON file is decoded as UTF-8 on every
# platform instead of with the locale's default encoding.
with open('data/e-retail-data-3072D.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Insert Data into Cosmos DB
# Note: Our data already contains embeddings, so we don't need to generate them again. In real scenarios,
# you would first pass your documents through an embedding model and then insert them into Cosmos DB.
# upsert_item is used so that this cell can be re-run against a container that
# already holds the sample documents without failing on the existing ids.
for item in data:
    container.upsert_item(body=item)

## Define search terms and generate embeddings

In [None]:
# Helper that asks the Azure OpenAI embeddings deployment for a vector
def generate_embeddings(text):
    """Return the embedding produced for ``text``.

    Sends ``text`` to the deployment named by ``openai_embeddings_deployment``
    through ``openai_client`` and returns the ``embedding`` field of the first
    entry in the response's ``data`` list.
    """
    payload = openai_client.embeddings.create(
        model=openai_embeddings_deployment,
        input=text,
    ).model_dump()
    first_entry = payload['data'][0]
    return first_entry['embedding']

In [19]:
# The phrase every search example below looks for.
search_terms = "luxury bags"
# Embedding of the whole phrase, used by the vector and hybrid queries.
emb = generate_embeddings(search_terms)
# Individual whitespace-separated words, used by the keyword/full-text queries.
full_text = search_terms.split()

## Classic string search with `CONTAINS` (equivalent to LIKE in SQL)

In [None]:
# Classic substring search: one CONTAINS predicate per search term, OR-ed together.
# Terms are bound as query parameters instead of being pasted into the SQL text,
# so a term containing a quote (e.g. "women's") cannot break or alter the query,
# and any number of terms (not just exactly two) is supported.
if not full_text:
    raise ValueError("search_terms must contain at least one word")

contains_parameters = [
    {"name": f"@term{position}", "value": term}
    for position, term in enumerate(full_text)
]
contains_predicate = " OR ".join(
    f"CONTAINS(c.description, {parameter['name']})"
    for parameter in contains_parameters
)

results = container.query_items(
    query=f'''
        SELECT TOP 5 c.product_id, c.product_name, c.description
        FROM c
        WHERE {contains_predicate}
        ORDER BY c.product_name
        ''',
    parameters=contains_parameters,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Print each returned document as indented JSON.
for r in results:
    print(json.dumps(r, indent=4))

### With a little more data, this will become very expensive and very slow, because the CONTAINS function performs a full scan (refer: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/contains).
### We also do not have any meaningful way to rank the results. We can rank using product id or product name, but that is not very useful.

## A Simple Text search with `FullTextContainsAny`

In [None]:
# Keyword search: keep documents whose description matches any of the search
# terms, passed to FullTextContainsAny as a comma-separated list of quoted words.
any_term_query = f'''
        SELECT TOP 5 c.product_id, c.product_name, c.description
        FROM c
        WHERE FullTextContainsAny(c.description,'{"', '".join(full_text)}')
        '''
results = container.query_items(
    query=any_term_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch every result first, then print each document as indented JSON.
any_term_hits = list(results)
for r in any_term_hits:
    print(json.dumps(r, indent=4))

### With Full text index we can scale to larger volumes of data while keeping cost in check. Refer: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/fulltextcontainsany, https://learn.microsoft.com/en-us/azure/cosmos-db/index-policy#full-text-indexes
### However, results are still in random order, which is not useful. We need to rank the results based on relevance.

## Search and order by BM25 with `FullTextScore`

In [None]:
# Ranked keyword search: no WHERE filter; the top 5 documents are chosen by
# ORDER BY RANK on the FullTextScore of the description for the search terms.
ranked_text_query = f'''
        SELECT TOP 5 c.product_id, c.product_name, c.description
        FROM c
        ORDER BY RANK FullTextScore(c.description,'{"', '".join(full_text)}')
        '''
results = container.query_items(
    query=ranked_text_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch every result first, then print each document as indented JSON.
ranked_text_hits = list(results)
for r in ranked_text_hits:
    print(json.dumps(r, indent=4))

### With Full text index we can scale to larger volumes of data while keeping cost in check.
### Thanks to the FullTextScore function, which leverages the industry-standard BM25 algorithm, results are now ranked based on relevance. Refer: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/fulltextscore

### For reference - 
### BM25 helps search engines and systems prioritize and rank information by balancing the frequency of keywords with document length, so shorter, more focused documents are fairly compared against longer ones. 
### BM25 is widely used to improve the accuracy and usefulness of search results across many platforms, ensuring users find what they’re looking for efficiently without getting overwhelmed by irrelevant or excessively long content. 

### So, the results have improved, but we are still searching on exact terms. We would like our search to be more flexible, allowing for synonyms, related terms, and similar concepts.
### Instead of only searching for "luxury bags", we want to find products that are similar or related, such as "designer purses". That's what vector search is for!

## Search and order by vector similarity with `VectorDistance`

In [None]:
# Vector search: return the 5 documents ordered by VectorDistance between the
# stored embedding and the embedding of the search phrase, and project that
# distance as Score.
# NOTE(review): the ORDER BY call also passes `false` and a
# searchListSizeMultiplier option that the projected Score call does not -
# confirm their meaning against the VectorDistance reference.
vector_query = f'''
        SELECT TOP 5 c.id, c.product_name, c.description, VectorDistance(c.embedding, {emb}) as Score
        FROM c
        ORDER BY VectorDistance(c.embedding, {emb}, false, {{"searchListSizeMultiplier": 5}})
    '''
results = container.query_items(
    query=vector_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch every result first, then print each document as indented JSON.
vector_hits = list(results)
for r in vector_hits:
    print(json.dumps(r, indent=4))

### Now we are matching all sorts of similar content, but "Rose Gold Quartz Watch" or "Sapphire Blue Silk Scarf" are not that close to "luxury bags". 
### They are luxury products, and perhaps a user browsing a product catalog on our website could also be interested in them, but we should first surface products in this order: luxury handbags -> other luxury items -> everything else.
### The Reciprocal Rank Fusion (RRF) function will allow us to do just that. We will combine the full text search we tried before with the vector search and weight the results. Refer: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/query/rrf

In [None]:
# Hybrid search: ORDER BY RANK RRF(...) fuses the FullTextScore ranking of the
# description with the VectorDistance ranking of the embedding; the vector
# distance is also projected as SimilarityScore.
hybrid_query = f'''
        SELECT TOP 5 c.product_id, c.product_name, c.description, VectorDistance(c.embedding, {emb}) as SimilarityScore
        FROM c
        ORDER BY RANK RRF(FullTextScore(c.description,'{"', '".join(full_text)}'),VectorDistance(c.embedding, {emb}))
        '''
results = container.query_items(
    query=hybrid_query,
    enable_cross_partition_query=True,
    populate_query_metrics=True,
)

# Fetch every result first, then print each document as indented JSON.
hybrid_hits = list(results)
for r in hybrid_hits:
    print(json.dumps(r, indent=4))

### Perfect! Now the results seem highly relevant to the search term "luxury bags".
### Feel free to try different search terms or try adjusting the weighting for full text and vector search like this: 
### RRF(FullTextScore(c.description,'{"', '".join(full_text)}'),VectorDistance(c.embedding, {emb}), [2, 1])  //<- this will put twice as much weight on exact full-text matches as on semantic similarity (2:1 weighting).