# Azure Cognitive Search Vector Search via Python SDK
This code demonstrates how to use Azure Cognitive Search with OpenAI and the Azure Python SDK.
## Prerequisites
To run the code, set up the conda environment using the environment.yml.

## Load environment variables


Adjusting the file.
The `vector` extension (pgvector) only works up to PostgreSQL version 14.
So I calculate the vector distance using the Euclidean distance function from NumPy. For now, I did a cross-join between the values in the same column of the file imported with the embeddings.
The vector comparison between the table and the text is still not working. -> See the section "Similarity Vector Search with the results".

In [None]:
# Install the packages for this notebook. The leading "!" runs each line as a
# shell command from the notebook, so the order below is the install order.
!pip install openai
!pip install openai[datalib]
!pip install python-dotenv
!pip install azure-ai-textanalytics
# --pre lets pip select pre-release versions of the Azure packages.
!pip install azure-search-documents --pre
!pip install azure-search --pre --upgrade
!pip install azure-core --pre --upgrade
!pip install azure-storage-blob
# Alternative version pin, currently disabled:
#!pip install azure-search-documents==11.4.0
!pip install azure-identity
# Pins azure-search-documents to one specific dev build from the Azure SDK
# package feed. It runs last, so presumably this is the version that ends up
# installed -- TODO confirm.
!pip install azure-search-documents==11.4.0a20230509004 -i https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ --no-cache-dir
# NOTE(review): openai is installed without a version pin, while later cells
# use openai.embeddings_utils and the module-level openai.api_* settings --
# confirm the installed openai release still provides them.


In [None]:
import os  
import json  
import openai  
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential  

# Load settings from a local .env file into the process environment before
# any of the lookups below run.
load_dotenv()

# Azure Cognitive Search settings (each is None when the variable is unset).
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI settings, applied as module-level configuration on `openai`.
openai.api_type = "azure"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY")
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")

# Credential object wrapping the search admin key.
credential = AzureKeyCredential(key)



### Helper methods
Create your search index schema and vector search configuration:

In [None]:
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)
##https://azuresdkdocs.blob.core.windows.net/$web/python/azure-search-documents/latest/index.html
def get_index_client() -> SearchIndexClient:
    """Build a SearchIndexClient for the configured search service.

    Reads the module-level ``service_endpoint`` and ``key`` values and wraps
    the key in a new ``AzureKeyCredential`` on every call.

    Returns:
        A ``SearchIndexClient`` constructed from that endpoint and credential.
    """
    endpoint = service_endpoint
    admin_credential = AzureKeyCredential(key)
    client = SearchIndexClient(endpoint, admin_credential)
    return client

def create_index(index_name, fields, vector_search, semantic_title_field_name, semantic_content_field_names):
    """Create a search index with vector search and one semantic configuration.

    Args:
        index_name: Name of the index to create.
        fields: Field definitions passed through to ``SearchIndex``.
        vector_search: Vector search configuration passed through to
            ``SearchIndex``.
        semantic_title_field_name: Name of the field used as the semantic
            title field.
        semantic_content_field_names: Iterable of field names used as the
            prioritized semantic content fields, in the given order.

    Returns:
        Whatever ``SearchIndexClient.create_index`` returns for the new index.
    """
    # The semantic model classes are not in this cell's import list, so the
    # function previously failed with NameError as soon as it was called.
    # Import them here so the function is self-contained.
    # NOTE(review): these class names match the 11.4.0 pre-release builds this
    # notebook installs -- confirm against the installed SDK version.
    from azure.search.documents.indexes.models import (
        PrioritizedFields,
        SemanticConfiguration,
        SemanticField,
        SemanticSettings,
    )

    content_fields = [
        SemanticField(field_name=field_name)
        for field_name in semantic_content_field_names
    ]
    prioritized_fields = PrioritizedFields(
        title_field=SemanticField(field_name=semantic_title_field_name),
        prioritized_content_fields=content_fields,
    )
    # A single semantic configuration, registered under the name 'default'.
    semantic_settings = SemanticSettings(
        configurations=[
            SemanticConfiguration(
                name='default',
                prioritized_fields=prioritized_fields,
            )
        ]
    )
    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_settings=semantic_settings)
    index_client = get_index_client()
    return index_client.create_index(index)

### Load embeddings

In [None]:
import pandas as pd
# Read the pre-computed chunk embeddings from JSON into a DataFrame; later
# cells read its 'chunk_content' and 'chunk_content_vector' columns.
df = pd.read_json(path_or_buf='../data/pdf/employee_handbook_chunk_embeddings.json')
# Preview the first five rows (shown as the cell output in the notebook).
df.head(n=5)

## Create PostgreSQL Table

In [None]:
import psycopg2
import re


# Update connection string information
postgree_params = {
    "host": "server.postgres.database.azure.com",
    "port": "5432",  # postgres default
    "dbname": "postgres",  # postgres default
    "user": "user name",
    "password": " password"
}

# Schema and table that will hold the chunk texts and their embeddings.
table_name = "chunk_content_embeddings"
table_schema = "chunk"

conn = psycopg2.connect(**postgree_params)
print("Connection established")

# try/finally guarantees the connection is released even when one of the
# statements below fails; uncommitted work is discarded on close.
try:
    cursor = conn.cursor()

    # Ask the server for its version banner.
    query = "SELECT version();"
    cursor.execute(query)
    version_string = cursor.fetchone()[0]

    # Extract the "<major>.<minor>" part of the banner.
    numeric_version_match = re.search(r'PostgreSQL (\d+\.\d+)', version_string)
    if numeric_version_match:
        Sversion = numeric_version_match.group(1)
    else:
        Sversion = "Version not found"

    # Print the extracted PostgreSQL version
    print(f"Numeric PostgreSQL Version: {Sversion}")

    if not numeric_version_match:
        # Fail with a clear message instead of an opaque float-conversion
        # error on the placeholder text.
        raise ValueError(
            f"Could not parse a PostgreSQL version from: {version_string!r}"
        )

    # Compare on the major version only. Stripping the dot and converting to
    # float ("14.5" -> 145.0) made every real server compare greater than 14,
    # so the vector branch below could never run.
    version = int(Sversion.split(".")[0])

    if version > 14:
        # Servers newer than 14: store the embedding as a plain float array.
        print("Greater than 15 configuration, table column use Array")
        table_schema_data = """
      id_serial UUID DEFAULT gen_random_uuid() PRIMARY KEY,
      chunk_content  text,
      chunk_content_vector  double precision[]
"""
    else:
        # Version 14 and earlier: use the pgvector column type. The extension
        # has to exist in the database before the table is created.
        print("Smaller than 15 configuration, table column use Vector")
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        conn.commit()
        print("Adding extension - vector")

        table_schema_data = """
            id_serial UUID DEFAULT gen_random_uuid() PRIMARY KEY,
            chunk_content text,
            chunk_content_vector VECTOR(1536)
      """

    cursor.execute(f"CREATE schema IF NOT EXISTS {table_schema}")
    conn.commit()

    # Drop any previous table of the same name, then create it fresh.
    cursor.execute(f"DROP TABLE IF  EXISTS {table_schema}.{table_name} ")
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_schema}.{table_name} ({table_schema_data});")
    print("Drop the old table and Create a new(if old existed)")

    conn.commit()
    cursor.close()
finally:
    # Clean up: always close the connection.
    conn.close()
    print("Connection Closed")


### Populate table with Embeddings file

In [None]:
import psycopg2

# Update connection string information

postgree_params = {
    "host": "server.postgres.database.azure.com",
    "port": "5432",  # postgres default
    "dbname": "postgres",  # postgres default
    "user": "user name",
    "password": " password"
}

table_name = "chunk_content_embeddings"
table_schema = "chunk"

# The statement is the same for every row, so build it once. Only the schema
# and table names (constants defined above) are interpolated; the row values
# are passed as query parameters.
# psycopg2 renders a Python list parameter as a complete ARRAY[...] literal,
# so wrapping the placeholder in ARRAY[%s] would store a nested (2-D) array.
# The placeholder is therefore cast directly.
insert_sql = f'''
    INSERT INTO {table_schema}.{table_name} (chunk_content, chunk_content_vector)
    VALUES (%s, %s::double precision[]);
'''

# Assumes the `df` DataFrame was loaded by an earlier cell and holds the
# 'chunk_content' and 'chunk_content_vector' columns.
conn = psycopg2.connect(**postgree_params)
print("Connection established")

# try/finally guarantees the connection is released even if an insert fails;
# nothing is committed in that case.
try:
    cursor = conn.cursor()

    # Number of rows in the DataFrame being loaded.
    total_records = df.shape[0]

    # Insert one table row per DataFrame row.
    for chunk_content, chunk_content_vector in zip(df['chunk_content'], df['chunk_content_vector']):
        cursor.execute(insert_sql, (chunk_content, chunk_content_vector))

    # Commit all inserts together, then release the cursor.
    conn.commit()
    cursor.close()
finally:
    conn.close()
    print("Connection Closed")


### Similarity Vector Search with the results
##### Read table
##### Create Dataframe
##### Apply cosine function to get the similarities.


In [None]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from azure.search.documents.models import Vector  
from openai.embeddings_utils import get_embedding, cosine_similarity
import openai
import pandas as pd


# Connection settings for the PostgreSQL server (replace the placeholders).
postgree_params = dict(
    host="server.postgres.database.azure.com",
    port="5432",  # postgres default
    dbname="postgres",  # postgres default
    user="user name",
    password=" password",
)

# Search inputs: the text to look for and where the embeddings are stored.
query_text = "tools for software development"
table_schema = "chunk"
table_name = "chunk_content_embeddings"
column_name_chunk = "chunk_content_vector"
# Similarity threshold (adjust as needed); not referenced by the code in
# this cell.
similarity_threshold = 0.5

conn = psycopg2.connect(**postgree_params)
print("Connection established")
cursor = conn.cursor()

# Fetch up to 100 rows of stored vectors together with their chunk text.
query = f'''
    SELECT t1.chunk_content_vector as chunk_content_vector,
           t1.chunk_content as chunk_content
    FROM {table_schema}.{table_name} t1
    LIMIT 100;
'''
print("Query table")
cursor.execute(query)

# Turn the result set into a DataFrame whose columns are named after the
# result columns reported by the cursor.
query_results = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df_query_results = pd.DataFrame(data=query_results, columns=column_names)


# Rank stored chunks by similarity to a free-text query.
# Adapted from: https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/embeddings?tabs=command-line
def search_docs(
    df: pd.DataFrame,
    user_query: str,
    column_name_chunk: str,
    top_n: int = 3,
    to_print: bool = True,
    engine: str = "text-embedding-ada-002",
) -> pd.DataFrame:
    """Return the ``top_n`` rows of ``df`` most similar to ``user_query``.

    The query text is embedded with ``get_embedding`` and compared to every
    value in ``df[column_name_chunk]`` with ``cosine_similarity``.

    Args:
        df: DataFrame with one stored embedding per row in
            ``column_name_chunk``. A ``similarities`` column is added to (or
            overwritten on) this DataFrame in place.
        user_query: Text to search for.
        column_name_chunk: Name of the column holding the stored embeddings.
        top_n: Number of best-matching rows to return.
        to_print: When true, show the result with ``display`` (available in
            notebook environments).
        engine: Embedding deployment to call. On Azure OpenAI this should be
            the deployment name chosen when the text-embedding-ada-002
            (Version 2) model was deployed. Defaults to the model name.

    Returns:
        The ``top_n`` rows sorted by descending similarity.
    """
    embedding = get_embedding(user_query, engine=engine)

    # Score every stored vector against the query embedding.
    df["similarities"] = df[column_name_chunk].apply(
        lambda stored_vector: cosine_similarity(stored_vector, embedding)
    )

    res = df.sort_values("similarities", ascending=False).head(top_n)
    if to_print:
        display(res)
    return res


# Rank the rows fetched from the table against the query text and keep the
# three best matches.
res = search_docs(
    df=df_query_results,
    user_query=query_text,
    column_name_chunk=column_name_chunk,
    top_n=3,
)

# Release the PostgreSQL resources: the cursor first, then the connection.
cursor.close()
conn.close()
