## Vector Search on PostgreSQL


### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../common/generate_embeddings.ipynb) 
- Create table and ingest embeddings - [postgree_ingestion.ipynb](./postgree_ingestion.ipynb)

### Set environment variables

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()


def _require_env(name):
    """Return the value of environment variable *name*.

    Prints a message and calls exit() when the variable is unset or empty,
    so the notebook stops at the first missing required setting.
    """
    value = os.getenv(name)
    if not value:
        print(f"{name} environment variable not set.")
        exit()
    return value


# Required settings, checked in the same order as before so the first
# missing variable is the one that gets reported.
pg_host = _require_env("POSTGRESQL_HOST")
pg_user = _require_env("POSTGRESQL_USERNAME")
pg_password = _require_env("POSTGRESQL_PASSWORD")
db_name = _require_env("POSTGRESQL_DATABASE")
aoai_key = _require_env("AZURE_OPENAI_KEY")

# Azure OpenAI settings. The endpoint can be overridden with
# AZURE_OPENAI_ENDPOINT; the previous hard-coded value remains the default.
aoai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") or 'https://azure-openai-dnai.openai.azure.com'
aoai_api_version = '2023-08-01-preview'
aoai_embedding_deployed_model = 'embedding-ada'

# Tables queried by the search cells below.
text_table_name = 'text_sample'
doc_table_name = 'doc_sample'
image_table_name = 'image_sample'

# Keyword arguments for psycopg2.connect(). The port can be overridden with
# POSTGRESQL_PORT and defaults to "5432" as before.
postgresql_params = {
    "host": pg_host,
    "port": os.getenv("POSTGRESQL_PORT") or "5432",
    "dbname": db_name,
    "user": pg_user,
    "password": pg_password,
}




#### Simple vector search

In [None]:
import psycopg2 
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity

query = 'web hosting services'

openai.api_type = "azure"
openai.api_key = aoai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version

# Embed the query text with the deployed Azure OpenAI embedding model.
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

# pgvector supports L2 distance (<->), negative inner product (<#>) and cosine distance (<=>).
# The table name is a constant defined in this notebook, so it is formatted into the
# statement; the query vector is bound as a parameter instead of being spliced into the SQL.
query_sql = f"SELECT title FROM {text_table_name} ORDER BY content_vector <=> %s::vector LIMIT 5;"

connection = psycopg2.connect(**postgresql_params)
print("Connection established.")
try:
    # The cursor context manager closes the cursor; the connection is closed
    # in `finally` so it is released even if the query raises.
    with connection.cursor() as cursor:
        # str() renders the embedding as '[x, y, ...]', the text form pgvector parses.
        cursor.execute(query_sql, (str(query_vector),))
        records = cursor.fetchall()
finally:
    connection.close()

# Print the titles of the five nearest rows.
for row in records:
    print(row[0])

### Helper function: convert DataFrame vector values for the search

In [14]:

# Function to convert a stored vector value to a list of floats (PostgreSQL double precision[])
def to_double_precision_array(value):
    """Convert a vector value to a list of floats.

    Args:
        value: Either the text form of a vector such as "[0.1, 0.2]", or a
            list/tuple of numbers.

    Returns:
        A list of floats. An empty vector ("[]" or a blank string) gives [],
        as does any value that is neither a string nor a list/tuple.

    Raises:
        ValueError: If an element cannot be parsed as a float.
    """
    # Values that are already sequences are kept, not discarded.
    if isinstance(value, (list, tuple)):
        return [float(item) for item in value]
    if not isinstance(value, str):
        return []

    # Drop surrounding whitespace and brackets; an empty body means an empty
    # vector (splitting "" would otherwise yield [''] and float('') raises).
    inner = value.strip().strip('[]').strip()
    if not inner:
        return []
    # float() itself tolerates whitespace around each number.
    return [float(part) for part in inner.split(',')]


### Cross column vector similarity search
#### Filter at the dataframe

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
from azure.search.documents.models import Vector 
import pandas as pd 
from openai.embeddings_utils import get_embedding, cosine_similarity
import psycopg2 
import openai

# Authenticate against Azure OpenAI
openai.api_type = "azure"
openai.api_key = aoai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version

query = 'tools for software development'

# Fetch both vector columns and their source text for every row of the table;
# the similarity filtering happens client-side in the DataFrame.
query_sql = f'''
    SELECT  title_vector,
            title,
            content_vector,
            content
    FROM {text_table_name} ;
'''
print("Query table")

# Read everything needed from the database first and release the connection
# (even on error) before the embedding call and the result printing.
connection = psycopg2.connect(**postgresql_params)
try:
    with connection.cursor() as cursor:
        cursor.execute(query_sql)
        records = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]
finally:
    connection.close()

# Build a DataFrame from the query results
df_query_results = pd.DataFrame(records, columns=column_names)

# Create the embedding for the query text
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

# Convert the stored vector values to lists of floats
df_query_results['content_vector_array'] = df_query_results['content_vector'].apply(to_double_precision_array)
df_query_results['title_vector_array'] = df_query_results['title_vector'].apply(to_double_precision_array)

# Cosine similarity of each column against the query. The scores are kept in
# the "similarities_*" columns and the ranks in separate "rank_*" columns
# (rank 1 = most similar), so the printed labels match the values.
df_query_results["similarities_content"] = df_query_results['content_vector_array'].apply(
    lambda vec: cosine_similarity(vec, query_vector)
)
df_query_results["similarities_title"] = df_query_results['title_vector_array'].apply(
    lambda vec: cosine_similarity(vec, query_vector)
)
df_query_results["rank_content"] = df_query_results["similarities_content"].rank(ascending=False)
df_query_results["rank_title"] = df_query_results["similarities_title"].rank(ascending=False)

# Display the results, similarities and ranks
for _, row in df_query_results.iterrows():
    print("Content :", row["content"])
    print("Content Vector:", row["content_vector_array"])
    print("Title :", row["title"])
    print("Title Vector:", row["title_vector_array"])
    print("similarities Content:", row["similarities_content"])
    print("Rank Content:", row["rank_content"])
    print("similarities Title:", row["similarities_title"])
    print("Rank Title:", row["rank_title"])
    print("\n")

#### Filter at the source

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
from azure.search.documents.models import Vector 
import pandas as pd 
from openai.embeddings_utils import get_embedding, cosine_similarity
import psycopg2 
import openai

# Authenticate against Azure OpenAI
openai.api_type = "azure"
openai.api_key = aoai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version

query = 'tools for software development'

# Create the embedding for the query text
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

# Minimum cosine similarity a row must exceed to be returned
content_similarity_threshold = 0.1
title_similarity_threshold = 0.1


def _similarity_sql(vector_column, alias):
    """Build the top-5 cosine-similarity query for one vector column.

    Similarity is computed as 1 - cosine distance (pgvector `<=>`), so higher
    means closer; rows must exceed %(threshold)s and are returned best-first.
    The column, alias and table name are constants from this notebook; the
    query vector and threshold are bound as named parameters at execute time.
    """
    return f'''
    SELECT
        title_vector,
        title,
        content_vector,
        content,
        1 - ({vector_column} <=> %(query_vector)s::vector) AS {alias}
    FROM {text_table_name}
    WHERE 1 - ({vector_column} <=> %(query_vector)s::vector) > %(threshold)s
    ORDER BY {alias} DESC
    LIMIT 5;
'''


# SQL query for content vector
query_sql_content = _similarity_sql("content_vector", "content_similarity")

# SQL query for title vector
query_sql_title = _similarity_sql("title_vector", "title_similarity")

# str() renders the embedding as '[x, y, ...]', the text form pgvector parses.
query_vector_text = str(query_vector)

connection = psycopg2.connect(**postgresql_params)
try:
    with connection.cursor() as cursor:
        # Fetch the results for the content vector
        cursor.execute(
            query_sql_content,
            {"query_vector": query_vector_text, "threshold": content_similarity_threshold},
        )
        content = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]
        df_query_content = pd.DataFrame(content, columns=column_names)

        # Fetch the results for the title vector
        cursor.execute(
            query_sql_title,
            {"query_vector": query_vector_text, "threshold": title_similarity_threshold},
        )
        title = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]
        df_query_title = pd.DataFrame(title, columns=column_names)
finally:
    # Release the connection even if one of the queries raises.
    connection.close()

# Merge both result sets. Each frame carries only its own similarity column,
# so the other similarity column is NaN for its rows after the concat.
df_query_results = pd.concat([df_query_content, df_query_title], ignore_index=True)

# Rank within each similarity column (rank 1 = most similar)
df_query_results['content_rank'] = df_query_results['content_similarity'].rank(ascending=False)
df_query_results['title_rank'] = df_query_results['title_similarity'].rank(ascending=False)

# Display the results with ranks
for _, row in df_query_results.iterrows():
    print("Content:", row["content"])
    print("Content Vector:", row["content_vector"])
    print("Title:", row["title"])
    print("Title Vector:", row["title_vector"])
    print("Content Similarity:", row["content_similarity"])
    print("Content Rank:", row["content_rank"])
    print("Title Similarity:", row["title_similarity"])
    print("Title Rank:", row["title_rank"])
    print("\n")




### Hybrid search

In [None]:
import openai
from azure.search.documents.models import Vector 
from openai.embeddings_utils import get_embedding, cosine_similarity
import psycopg2 
import openai

# Define your search query
# NOTE: this cell does not set the openai.api_* values itself; it relies on
# an earlier cell having configured them.
query = 'Azure Application'
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

# Connect to the PostgreSQL database
connection = psycopg2.connect(**postgresql_params)
try:
    with connection.cursor() as cursor:
        # Hybrid search: full-text search selects the rows that match the
        # keywords, and cosine distance to the query embedding orders them
        # (closest first). The set of matching rows is the same as for the
        # full-text filter alone.
        cursor.execute("""
            SELECT id, title, content
            FROM text_sample
            WHERE to_tsvector('english', title || ' ' || content) @@ plainto_tsquery('english', %s)
            ORDER BY content_vector <=> %s::vector
        """, (query, str(query_vector)))

        fts_results = cursor.fetchall()
finally:
    # Close the database connection even if the query raises.
    connection.close()

# Result
for row in fts_results:
    print("Title:", row[1])
    print("Content:", row[2])
    print("\n")

#### Document search example

In [None]:
import matplotlib.pyplot as plt
from PIL import Image
from azure.search.documents.models import Vector 
import pandas as pd 
from openai.embeddings_utils import get_embedding, cosine_similarity
import psycopg2 
import openai

# Authenticate against Azure OpenAI
openai.api_type = "azure"
openai.api_key = aoai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version

query = 'tools for software development'
query_vector = get_embedding(query, engine=aoai_embedding_deployed_model)

# Minimum cosine similarity a chunk must exceed to be returned
similarity_threshold = 0.1

# Similarity is 1 - cosine distance (pgvector `<=>`), so higher means closer.
# ORDER BY runs before LIMIT so the 100 rows kept are the 100 best matches
# rather than an arbitrary subset. The table name is a constant from this
# notebook; the vector and threshold are bound as named parameters.
query_sql = f'''
    SELECT chunk_content_vector as chunk_content_vector,
           chunk_content as chunk_content,
           1 - (chunk_content_vector <=> %(query_vector)s::vector) AS similarities
    FROM {doc_table_name}
    WHERE 1 - (chunk_content_vector <=> %(query_vector)s::vector) > %(threshold)s
    ORDER BY similarities DESC
    LIMIT 100;
'''

print("Query table")
connection = psycopg2.connect(**postgresql_params)
try:
    with connection.cursor() as cursor:
        # str() renders the embedding as '[x, y, ...]', the text form pgvector parses.
        cursor.execute(
            query_sql,
            {"query_vector": str(query_vector), "threshold": similarity_threshold},
        )
        records = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]
finally:
    # Close the database connection even if the query raises.
    connection.close()

# Build a DataFrame from the query results
df_query_results = pd.DataFrame(records, columns=column_names)

# Sort by similarity in descending order (most similar first)
df_query_results = df_query_results.sort_values(by="similarities", ascending=False)

# Display the results and similarities
for _, row in df_query_results.iterrows():
    print("Content:", row["chunk_content"])
    print("Content Vector:", row["chunk_content_vector"])
    print("Similarity:", row["similarities"])
    print("\n")



#### Image search example

In [None]:
## TODO