## Vector Search on PostgreSQL


### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../common/generate_embeddings.ipynb)
- Create table and ingest embeddings - [postgree_ingestion.ipynb](.../postgree_ingestion.ipynb)

### Libraries


In [11]:
import json
import datetime
import time

from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
from azure.cosmos import exceptions, CosmosClient, PartitionKey
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    IndexingSchedule,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SimpleField,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    SearchIndexerDataSourceConnection
)

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

import os
from dotenv import load_dotenv


#### Set environment variables

In [None]:

load_dotenv()

cosmos_db_api_endpoint  = os.getenv("cosmos_db_api_endpoint")
if cosmos_db_api_endpoint is None or cosmos_db_api_endpoint == "":
    print("cosmos_db_api_endpoint environment variable not set.")
    exit()

cosmos_db_api_key  = os.getenv("cosmos_db_api_key")
if cosmos_db_api_key is None or cosmos_db_api_key == "":
    print("cosmos_db_api_key environment variable not set.")
    exit()

cog_search_endpoint  = os.getenv("cog_search_endpoint")
if cog_search_endpoint is None or cog_search_endpoint == "":
    print("cog_search_endpoint environment variable not set.")
    exit()

cog_search_key  = os.getenv("cog_search_key")
if cog_search_key is None or cog_search_key == "":
    print("cog_search_key environment variable not set.")
    exit()

aoai_embedding_deployed_model  = os.getenv("aoai_embedding_deployed_model")
if aoai_embedding_deployed_model is None or aoai_embedding_deployed_model == "":
    print("aoai_embedding_deployed_model environment variable not set.")
    exit()   


aoai_key  = os.getenv("aoai_key")
if aoai_key is None or aoai_key == "":
    print("aoai_key environment variable not set.")
    exit()   



text_table_name = 'text_sample'
doc_table_name = 'doc_sample'
image_table_name = 'image_sample'





#### Helper method

In [None]:
import requests

def vectorize_text_com_vision(com_vision_endpoint,com_vision_key,query):
    vectorize_text_url = f"{com_vision_endpoint}/computervision/retrieval:vectorizeText"  
    params = {  
        "api-version": "2023-02-01-preview"  
    } 
    headers = {  
        "Content-Type": "application/json",  
        "Ocp-Apim-Subscription-Key": com_vision_key  
    }  
    data = {
        'text':query
    }

    response = requests.post(vectorize_text_url, params=params, headers=headers, json=data)
    query_vector = response.json()["vector"]

    return query_vector

def show_image(image_folder, image):
    image_path = os.path.join(image_folder, image)
    plt.imshow(Image.open(image_path))
    plt.axis('off')
    plt.show()

In [25]:
##secrets
##todo

##https://learn.microsoft.com/en-us/training/modules/search-azure-cosmos-db-sql-api-data-azure-cognitive-search/

##https://learn.microsoft.com/en-us/azure/search/search-howto-index-cosmosdb

##https://github.com/microsoft/AzureDataRetrievalAugmentedGenerationSamples/blob/main/Python/AzureSQL_CognitiveSearch/AzureSQL_CogSearch.ipynb
##https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/how-to-python-create-container  
aoai_endpoint = "https://openaitesteise.openai.azure.com/"

##https://learn.microsoft.com/en-gb/azure/search/search-create-service-portal
cog_search_endpoint  = "https://cosmossearch.search.windows.net"
cog_search_key  = "VYHMZj5OotW3UQ3CXYsM0vOFmmh6wCHbokIIEoXze4AzSeANr6lK"

openai.api_type = "azure"  
openai.api_base = aoai_endpoint
openai.api_version = "2023-05-15" ##"2023-06-01-preview"
aoai_api_version= "2023-06-01-preview"
aoai_embedding_deployed_model = "text-embedding-ada-002"
aoai_key = "b3319bcf227a428aa5df532e84716f7e"


openai_completions_deployment = "gpt-35-turbo" ##completions_deployment , embeddings_deployment 


text_table_name = 'text_sample'
doc_table_name = 'doc_sample'
image_table_name = 'image_sample'

##connection params
cosmos_db_api_endpoint= "https://lilem.documents.azure.com:443/"
cosmos_db_api_key= "TqDjr7pe4Xo64MeZfIItqKjgeFByOduzFFTwJSVGsSqLnB0ZTTxagQ6qOH9mpmwJEEeFy5nVOz7RACDbMpyXww=="   

client = CosmosClient(cosmos_db_api_endpoint, credential=cosmos_db_api_key)


#cosmosdb_database_name = "Vector_DB"
#cosmosdb_container_name = "text_sample"

database_name = "Vector_DB"
container_name = 'text_sample'



# Configure Azure Cognitive Search
cog_search_endpoint  = "https://cosmossearch.search.windows.net"
cog_search_key  = "VYHMZj5OotW3UQ3CXYsM0vOFmmh6wCHbokIIEoXze4AzSeANr6lK"
index_name = "https://cognitivepoc.search.windows.net"

container_name = 'text_sample'



#### Vector search


In [23]:
from azure.cosmos import CosmosClient
##from azure.search.documents.indexes import SimpleField, SearchableField, Index
from azure.search.documents import SearchClient





# Initialize Cosmos DB client
cosmos_client = CosmosClient(cosmos_db_api_endpoint, cosmos_db_api_key)
database = cosmos_client.get_database_client(database_name)
container = database.get_container_client(container_name)

# Initialize Azure Cognitive Search client
#search_client = SearchClient(account_name=cog_search_endpoint, index_name=index_name, index_version="2020-06-30", credential=cog_search_key)
search_client = SearchClient(endpoint=cog_search_endpoint, index_name=index_name, credential=cog_search_key)

#index = SearchIndex(
#    name=index_name,
#    fields=[
##        SimpleField(name="title", type="Edm.String", key=True, searchable=True),
#        SearchableField(name="content_vector", type="Collection(Edm.Double)")
 #   ]
#)



# Define the fields for the index
fields = [
    SimpleField(name="title", type="Edm.String", key=True, searchable=True),
    SearchableField(name="content_vector", type="Collection(Edm.Double)")
]

# Create or update the search index with the defined fields
index = SearchIndex(name=index_name, fields=fields)
#search_client.create_index(index)

#  Query Cosmos DB using Azure Cognitive Search
query = 'web hosting services'

# Query Azure Cognitive Search
results = search_client.search(search_text=query, include_total_count=True, top=5)

# Display results
#for result in results:
 #   print(result['title'])


In [24]:
from openai.embeddings_utils import get_embedding, cosine_similarity
from azure.search.documents import SearchClient, SearchIndexingBufferedSender  

query = 'tools for software development'
query_vector = get_embedding(query, k=3,  engine=aoai_embedding_deployed_model )
search_client = SearchClient(cog_search_endpoint,  AzureKeyCredential(aoai_key))


RetryError: RetryError[<Future at 0x1c9f6b68410 state=finished raised AuthenticationError>]

In [None]:
query = 'tools for software development'
query_vector = get_embedding(query,   engine=aoai_embedding_deployed_model )

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        # A higher value of probes provides better recall at the cost of speed.
        query_sql = f"SET ivfflat.probes = 10;"
        cursor.execute(query_sql)

        query_sql = f'''
            (
                SELECT title, ((title_vector <-> '{query_vector}')) AS content_similarity FROM text_sample
                union
                SELECT title, ((content_vector <-> '{query_vector}')) AS content_similarity FROM text_sample
            ) ORDER BY content_similarity LIMIT 5;
        '''
        cursor.execute(query_sql)
        records = cursor.fetchall()

        for row in records:
                print(row[0])

#### Hybrid search

- This demo shows how to apply vector search in in conjunction with additional search methods, such as lexical search. 

- Implement a hybrid search that combines semantic keyword search by reranking. Details - https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search.py

In [None]:
import itertools
from sentence_transformers import CrossEncoder, SentenceTransformer

query = 'database'

def semantic_search(query):
    query_vector = get_embedding(query, engine = aoai_embedding_deployed_model)

    with connect(**postgresql_params) as connection:
        with connection.cursor() as cursor:
            # A higher value of probes provides better recall at the cost of speed.
            query_sql = f"SET ivfflat.probes = 10;"
            cursor.execute(query_sql)

            # Postgres supports L2 distance (<->), inner product (<#>) and cosine distance (<=>)
            query_sql = f"SELECT title FROM text_sample ORDER BY ((content_vector <=> '{query_vector}')) LIMIT 5;"
            cursor.execute(query_sql)
            return cursor.fetchall()

def keyword_search(query):
    with connect(**postgresql_params) as connection:
        with connection.cursor() as cursor:
            cursor.execute("SELECT title FROM text_sample, plainto_tsquery('english', %s) query WHERE to_tsvector('english', content) @@ query ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC LIMIT 5", (query,))
            return cursor.fetchall()

def rerank(query, records):
    # deduplicate
    results = set(itertools.chain(*records))

    # re-rank
    encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    scores = encoder.predict([(query, item[1]) for item in results])
    return [v for _, v in sorted(zip(scores, results), reverse=True)]

keyword_search_records = keyword_search(query)
semantic_search_records = semantic_search(query)

records = list(semantic_search_records) + (list(keyword_search_records))
results = rerank(query, records)

print(results)

#### Document search example

This demo shows how to apply vector search for srarching within documents.

In [None]:
query = 'web hosting services'
query_vector = get_embedding(query, engine = aoai_embedding_deployed_model)

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        # A higher value of probes provides better recall at the cost of speed.
        query_sql = f"SET ivfflat.probes = 10;"
        cursor.execute(query_sql)

        # Postgres supports L2 distance (<->), inner product (<#>) and cosine distance (<=>)
        query_sql = f"SELECT chunk_content FROM doc_sample ORDER BY ((chunk_content_vector <=> '{query_vector}')) LIMIT 5;"
        cursor.execute(query_sql)
        records = cursor.fetchall()

        for row in records:
                print(row[0], )

#### Image search example

This demo shows how to apply vector search for searching images.

In [None]:
from psycopg2 import connect

query = 'flower with hand'
image_folder = "../data/images"
query_vector = vectorize_text_com_vision(com_vision_endpoint,com_vision_key,query)

with connect(**postgresql_params) as connection:
    with connection.cursor() as cursor:
        # A higher value of probes provides better recall at the cost of speed.
        query_sql = f"SET ivfflat.probes = 10;"
        cursor.execute(query_sql)

        # Postgres supports L2 distance (<->), inner product (<#>) and cosine distance (<=>)
        query_sql = f"SELECT image FROM image_sample ORDER BY ((image_vector <=> '{query_vector}')) LIMIT 5;"
        cursor.execute(query_sql)
        records = cursor.fetchall()

        for result in records:
            show_image(image_folder, result[0])
            print("\n")