## Vector Search - Fabric Real-Time Analytics (Kusto) using Python SDK

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 
- Create Kusto Tables to store vectors and ingest embeddings - [fabric_kusto_data_pipeline.ipynb](./fabric_kusto_data_pipeline.ipynb)

#### Set environment variables

In [None]:
import os
from dotenv import load_dotenv
import openai

load_dotenv()


def _require_env(name):
    """Return environment variable `name`; report and exit when it is unset or empty."""
    value = os.getenv(name)
    if not value:
        print(f"{name} environment variable not set.")
        exit()
    return value


# Kusto cluster and database that hold the embeddings.
kdb_cluster_name = _require_env("KUSTO_CLUSTER_NAME")
kdb_database_name = _require_env("KUSTO_DATABASE_NAME")

# Azure OpenAI settings used to embed text queries.
aoai_endpoint = _require_env("AOAI_ENDPOINT")
aoai_api_version = _require_env("AOAI_API_VERSION")
aoai_embedding_deployed_model = _require_env("AOAI_EMBEDDING_DEPLOYED_MODEL")
azure_openai_key = _require_env("AZURE_OPENAI_KEY")

# Computer Vision settings used to embed image-search queries.
com_vision_endpoint = _require_env("COM_VISION_ENDPOINT")
com_vision_api_version = _require_env("COM_VISION_API_VERSION")
com_vision_key = _require_env("COM_VISION_KEY")

# Names of the Kusto tables queried by the demos below.
text_index_name = 'text-sample'
doc_index_name = 'doc-sample'
image_index_name = 'image-sample'

# Module-level OpenAI configuration.
openai.api_type = "azure"
openai.api_key = azure_openai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version

In [None]:
# (Optional) Read the service principal settings; only needed when
# authenticating with client-secret.
def _require_env(name):
    """Return environment variable `name`; report and exit when it is unset or empty."""
    value = os.getenv(name)
    if not value:
        print(f"{name} environment variable not set.")
        exit()
    return value


kdb_client_id = _require_env("KUSTO_CLIENT_ID")
kdb_client_secret = _require_env("KUSTO_CLIENT_SECRET")
kdb_authority_id = _require_env("KUSTO_AUTHORITY_ID")

In [None]:
import requests

import matplotlib.pyplot as plt
from openai import AzureOpenAI
from PIL import Image

from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.helpers import dataframe_from_result_table


# Cluster URL built from the KUSTO_CLUSTER_NAME setting; passed to the
# connection-string builders in the authentication cells below.
cluster = f"https://{kdb_cluster_name}.kusto.fabric.microsoft.com/"

#### [Option 1] AAD Device Authentication

In [None]:
# Build a connection string for `cluster` using AAD device authentication and
# create the Kusto client that the search cells below pass to `search_kusto`.
kcsb = KustoConnectionStringBuilder.with_aad_device_authentication(cluster)
client = KustoClient(kcsb)

#### [Option 2] Client-Secret Authentication 
To use client-secret authentication, you need to set the following environment variables in the `.env` file:
```bash
KUSTO_CLIENT_ID=<client id of the service principal>
KUSTO_CLIENT_SECRET=<client secret of the service principal>
KUSTO_AUTHORITY_ID=<authority id>
```

In [None]:
# Build a connection string for `cluster` from the service principal settings
# (kdb_client_id, kdb_client_secret, kdb_authority_id) read from the environment
# in the optional cell above, then create the Kusto client used by the searches.
kcsb = KustoConnectionStringBuilder.with_aad_application_key_authentication(cluster, kdb_client_id, kdb_client_secret, kdb_authority_id)
client = KustoClient(kcsb)

#### Helper methods
The native KQL function `series_cosine_similarity` is used in the `search_kusto` function to calculate the cosine similarity between the input vector and the vectors stored in the Kusto table. For more details, see the [`series_cosine_similarity` documentation](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/series-cosine-similarity-function).

In [None]:
def search_kusto(
    kusto_client: "KustoClient",
    kusto_db: str,
    query_vector: list,
    index_name: str,
    vector_fields: list,
    key_field: str = "id",
    return_fields: list = None,
    k: int = 20,
    filter: str = ''
):
    """Run a cosine-similarity vector search against a Kusto table.

    For every column in `vector_fields` a view is built over `index_name` that
    keeps the most recently ingested row per `key_field` and adds a
    `similarity` column computed with `series_cosine_similarity` against
    `query_vector`. The views are unioned and the `k` most similar rows kept.

    Args:
        kusto_client: Client used to execute the query.
        kusto_db: Database to run the query against.
        query_vector: Query embedding, interpolated into the KQL text as a
            `dynamic(...)` literal.
        index_name: Name of the table to search.
        vector_fields: Vector columns to compare against; must be non-empty.
        key_field: Column identifying a logical row, used for de-duplication.
        return_fields: Columns to project; all columns are returned when empty
            or omitted.
        k: Number of most similar rows to keep.
        filter: Optional KQL predicate placed after `| where`. It is applied
            to the top `k` rows and before the projection, so it may reference
            any column, not only those in `return_fields`.

    Returns:
        A DataFrame built from the first primary result table.

    Raises:
        ValueError: If `vector_fields` is empty.
    """
    if not vector_fields:
        raise ValueError("vector_fields is required")

    # One view per vector column: latest row per key plus its similarity score.
    view_definitions = "".join(
        f"""
        let {v_field}_view = view() {{
            ['{index_name}']
            | summarize arg_max(ingestion_time(), *) by ['{key_field}']
            | extend similarity=series_cosine_similarity(dynamic({query_vector}), ['{v_field}'])
        }};
        """
        for v_field in vector_fields
    )
    view_names = ",".join(f"{v_field}_view" for v_field in vector_fields)

    query = view_definitions + f"""
    union withsource = '{index_name}' {view_names}
    | top {k} by similarity desc
    """

    # NOTE(review): the filter keeps its original position relative to `top`
    # (it narrows the top-k rows rather than pre-filtering the table), but it
    # now runs before `project` so filtering on a non-projected column works.
    if filter:
        query += f"""
    | where {filter}
    """

    if return_fields:
        projected_columns = ",".join(f"['{field}']" for field in return_fields)
        query += f"""
    | project {projected_columns}
    """

    result = kusto_client.execute_query(kusto_db, query)
    return dataframe_from_result_table(result.primary_results[0])


def vectorize_text_com_vision(com_vision_endpoint, com_vision_key, query, api_version=None, timeout=30.0):
    """Return the embedding of `query` from the Computer Vision vectorizeText API.

    Args:
        com_vision_endpoint: Base URL of the Computer Vision resource.
        com_vision_key: Subscription key sent in `Ocp-Apim-Subscription-Key`.
        query: Text to vectorize.
        api_version: Value for the `api-version` query parameter. Defaults to
            the notebook-level `com_vision_api_version` setting.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The value of the `"vector"` field of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within `timeout`.
    """
    if api_version is None:
        # Fall back to the setting loaded from the environment at notebook start.
        api_version = com_vision_api_version

    # Strip a trailing slash so the joined URL never contains "//".
    vectorize_text_url = f"{com_vision_endpoint.rstrip('/')}/computervision/retrieval:vectorizeText"
    params = {
        "api-version": api_version
    }
    headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": com_vision_key
    }
    data = {
        'text': query
    }

    response = requests.post(
        vectorize_text_url, params=params, headers=headers, json=data, timeout=timeout
    )
    # Surface HTTP failures directly instead of a KeyError on "vector" below.
    response.raise_for_status()

    return response.json()["vector"]

def show_image(image_folder, image):
    """Display the image file `image` located in `image_folder` without axes.

    Args:
        image_folder: Directory that contains the image file.
        image: File name of the image, joined onto `image_folder`.
    """
    image_path = os.path.join(image_folder, image)
    # Open via a context manager so the file handle is released after drawing.
    with Image.open(image_path) as img:
        plt.imshow(img)
    plt.axis('off')
    plt.show()

#### Simple vector search

This demo shows how to apply vector search on a single field.

In [None]:
# Azure OpenAI client used to embed text queries; reused by the cells below.
azure_oai_client = AzureOpenAI(
    api_key=azure_openai_key,
    api_version=aoai_api_version,
    azure_endpoint=aoai_endpoint,
)

# Embed the query text with the deployed embedding model.
query = 'find me a distributed system'
embedding_response = azure_oai_client.embeddings.create(
    input=[query],
    model=aoai_embedding_deployed_model,
)
query_vector = embedding_response.data[0].embedding

# Search a single vector column and keep the five most similar rows.
vector_field = "content_vector"
return_fields = ["title", "category", "content", "similarity"]

results_df = search_kusto(
    client,
    kusto_db=kdb_database_name,
    query_vector=query_vector,
    index_name=text_index_name,
    vector_fields=[vector_field],
    return_fields=return_fields,
    k=5,
)

results_df

#### Metadata filtering with vector search

This demo shows how to apply metadata filtering (a KQL `where` predicate) on top of vector search.

In [None]:
# Embed the query text with the deployed embedding model.
query = 'find me a distributed system'
query_vector = azure_oai_client.embeddings.create(input = [query], model=aoai_embedding_deployed_model).data[0].embedding

vector_field = "content_vector"
return_fields = ["title", "category", "content", "similarity"]

# KQL predicate handed to `search_kusto`, which places it after `| where`.
# Named `metadata_filter` so the builtin `filter` is not shadowed in the notebook.
metadata_filter = "category == 'Compute'"

results_df = search_kusto(client, 
                       kusto_db=kdb_database_name, 
                       query_vector=query_vector, 
                       index_name=text_index_name, 
                       vector_fields=[vector_field], 
                       return_fields=return_fields,
                       filter=metadata_filter
)

results_df

#### Cross column vector search

This demo shows how to apply vector search on multiple columns.

In [None]:
# Embed the query text with the deployed embedding model.
query = 'find me a distributed system'
embedding_response = azure_oai_client.embeddings.create(
    input=[query],
    model=aoai_embedding_deployed_model,
)
query_vector = embedding_response.data[0].embedding

# Compare the query against both vector columns; the five best rows are kept.
vector_fields = ["title_vector", "content_vector"]
return_fields = ["title", "category", "content", "similarity"]

results_df = search_kusto(
    client,
    kusto_db=kdb_database_name,
    query_vector=query_vector,
    index_name=text_index_name,
    vector_fields=vector_fields,
    return_fields=return_fields,
    k=5,
)

results_df

#### Document search example

This demo shows how to apply vector search for searching within documents.

In [None]:

# Embed the question with the deployed embedding model.
query = 'when are performance review announced?'
embedding_response = azure_oai_client.embeddings.create(
    input=[query],
    model=aoai_embedding_deployed_model,
)
query_vector = embedding_response.data[0].embedding

# Search the chunk vector column of the document table; keep the five best rows.
vector_fields = ["chunk_content_vector"]
return_fields = ["chunk_content", "similarity"]

results_df = search_kusto(
    client,
    kusto_db=kdb_database_name,
    query_vector=query_vector,
    index_name=doc_index_name,
    vector_fields=vector_fields,
    return_fields=return_fields,
    k=5,
)

results_df

#### Image search example

This demo shows how to apply vector search for searching images.

In [None]:
# Embed the text query through the Computer Vision helper defined above.
query = 'white flower'
query_vector = vectorize_text_com_vision(com_vision_endpoint, com_vision_key, query)

# Search the image vector column and keep only the single best match.
vector_fields = ["image_vector"]
return_fields = ["image", "similarity"]

results_df = search_kusto(
    client,
    kusto_db=kdb_database_name,
    query_vector=query_vector,
    index_name=image_index_name,
    vector_fields=vector_fields,
    return_fields=return_fields,
    k=1,
)

# Display every returned image file from the local sample folder.
image_folder = "../data/images"
for image_name in results_df['image']:
    show_image(image_folder, image_name)
    print("\n")
