# Azure Cognitive Search Vector Search via Python SDK
This code demonstrates how to use Azure Cognitive Search with OpenAI and Azure Python SDK
## Prerequisites
To run the code, set up the conda environment using the environment.yml.

## Load environment variables

In [7]:
!pip install openai
!pip install openai[datalib]
!pip install python-dotenv
!pip install azure-ai-textanalytics
!pip install azure-search-documents --pre
!pip install azure-search --pre --upgrade
!pip install azure-core --pre --upgrade
!pip install azure-storage-blob
#!pip install azure-search-documents==11.4.0
!pip install azure-identity
!pip install azure-search-documents==11.4.0a20230509004 -i https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ --no-cache-dir



ERROR: Could not find a version that satisfies the requirement azure-search-documents==11.4.0 (from versions: 1.0.0b2, 1.0.0b3, 1.0.0b4, 11.0.0, 11.1.0b1, 11.1.0b2, 11.1.0b3, 11.1.0b4, 11.1.0, 11.2.0b1, 11.2.0b2, 11.2.0b3, 11.2.0, 11.2.1, 11.2.2, 11.3.0b1, 11.3.0b2, 11.3.0b3, 11.3.0b4, 11.3.0b5, 11.3.0b6, 11.3.0b7, 11.3.0b8, 11.3.0, 11.4.0b1, 11.4.0b2, 11.4.0b3, 11.4.0b4, 11.4.0b5, 11.4.0b6, 11.4.0b7, 11.4.0b8, 11.4.0b9)
ERROR: No matching distribution found for azure-search-documents==11.4.0


Looking in indexes: https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/


In [16]:
import os  
import json  
import openai  
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential  

load_dotenv()  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT") 
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME") 
key = os.getenv("AZURE_SEARCH_ADMIN_KEY") 
openai.api_type = "azure"  
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")  
credential = AzureKeyCredential(key)




### Helper methods
Create your search index schema and vector search configuration:

In [9]:
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)
##https://azuresdkdocs.blob.core.windows.net/$web/python/azure-search-documents/latest/index.html
def get_index_client() -> SearchIndexClient:
    return SearchIndexClient(service_endpoint, AzureKeyCredential(key))

def create_index(index_name, fields, vector_search, semantic_title_field_name, semantic_content_field_names):
    semantic_settings = SemanticSettings(
        configurations=[SemanticConfiguration(
            name='default',
            prioritized_fields=PrioritizedFields(
                title_field=SemanticField(field_name=semantic_title_field_name), prioritized_content_fields=[SemanticField(field_name=field_name) for field_name in semantic_content_field_names]))])
    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_settings=semantic_settings)
    index_client = get_index_client()
    return index_client.create_index(index)

### Create Embeddings


### Load embeddings

In [10]:
import pandas as pd
df = pd.read_json('../data/pdf/employee_handbook_chunk_embeddings.json')
df.head()

Unnamed: 0,chunk_content,chunk_content_vector,id
0,Contoso Electronics \nEmployee Handbook \n \n...,"[-0.0134241888, 0.0083369836, 0.00018061460000...",0
1,edge systems that are both reliable and effici...,"[-0.0078642182, 0.0030302808, -0.0163918491, -...",1
2,edge systems that are both reliable and effici...,"[-0.0107993353, 0.0036727316, -0.009540895, -0...",2
3,customers. \n \nCompany Values: \n1. Quality...,"[-0.018283184600000002, -0.0022870835000000003...",3
4,we work and live. \nPerformance Reviews \n \...,"[-0.016625782500000002, -6.20042e-05, 0.031033...",4


## Create PostGres Table

In [12]:
import psycopg2
import re


# Update connection string information
postgree_params = {
    "host": "pvector.postgres.database.azure.com",
    "port": "5432",
    "dbname": "postgres",
    "user": "administrators",
    "password": "Contoso!0000"
}

conn = psycopg2.connect(**postgree_params)
print("Connection established")


cursor = conn.cursor()


# SQL query to fetch PostgreSQL version
query = "SELECT version();"

# Execute the query
cursor.execute(query)

# Fetch the result
version_string = cursor.fetchone()[0]


# Use regular expression to extract the PostgreSQL version
numeric_version_match = re.search(r'PostgreSQL (\d+\.\d+)', version_string)
if numeric_version_match:
    Sversion = numeric_version_match.group(1)
else:
    Sversion = "Version not found"

# Print the extracted PostgreSQL version
print(f"Numeric PostgreSQL Version: {Sversion}")


# Remove the period and convert to float
version = float(Sversion.replace(".", ""))

if version > 14:
    print("Greater than 15 configuration, table column use Array")
    ##Postgree version<14 has no extension vector
    table_schema_data = """
      id_serial UUID DEFAULT gen_random_uuid() PRIMARY KEY,
      chunk_content  text,
      chunk_content_vector  double precision[]
"""
if version <= 14:
      print("Smaller than 15 configuration, table column use Vector")
      #install pgvector -> need to add the extension at the database before create.
      ##Postgree version<14
      cursor.execute("CREATE EXTENSION IF NOT EXISTS vector"); 
      conn.commit()
      print("Adding extension - vector")

      # Define the table schema if needed
      table_schema_data = """
            id_serial UUID DEFAULT gen_random_uuid() PRIMARY KEY,
            chunk_content text,
            chunk_content_vector VECTOR(1536)
      """


# Drop previous table of same name if one exists
# Replace 'your_table_name' with the name of your PostgreSQL table
table_name = "chunk_content_embeddings"
table_schema = "chunk"

cursor.execute(f"CREATE schema IF NOT EXISTS {table_schema}"); ##postgis
conn.commit()




#Drop table
cursor.execute(f"DROP TABLE IF  EXISTS {table_schema}.{table_name} ")
#Table Creatin
cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_schema}.{table_name} ({table_schema_data});")
print("Drop the old table and Create a new(if old existed)")



# Clean up
# Close the cursor and connection
conn.commit()
cursor.close()
conn.close()
print("Connection Closed")


Connection established
Numeric PostgreSQL Version: 15.3
Greater than 15 configuration, table column use Array
Drop the old table and Create a new(if old existed)
Connection Closed


In [13]:
import psycopg2

# Update connection string information

postgree_params = {
    "host": "pvector.postgres.database.azure.com",
    "port": "5432",
    "dbname": "postgres",
    "user": "administrators",
    "password": "Contoso!0000"
}


table_name = "chunk_content_embeddings"
table_schema = "chunk"

#host = "server.postgres.database.azure.com"
#dbname = "dataabse"
#user = "user name"
#password = " password"
#port = "port - postgree usually use 5432"
#sslmode = "require"


# Assuming you have already defined your df DataFrame and postgree_params
conn = psycopg2.connect(**postgree_params)
print("Connection established")
cursor = conn.cursor()

total_records = df.shape[0]

# Iterate through your DataFrame and insert embeddings into PostgreSQL
for index, row in df.iterrows():
    insert_sql = f'''
       INSERT INTO {table_schema}.{table_name} (chunk_content, chunk_content_vector)
        VALUES (%s, ARRAY[%s]::double precision[]);
    '''
    # Use parameterized queries to safely insert data
    cursor.execute(insert_sql, (row['chunk_content'], row['chunk_content_vector']))


# Commit the changes and close the cursor and connection
conn.commit()
cursor.close()
conn.close()
print("Connection Closed")


Connection established
Connection Closed


### Create Query Vector using Euclidean distance

In [75]:
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np  # for vector operations
import psycopg2  

# Update connection string information

postgree_params = {
    "host": "server.postgres.database.azure.com",
    "port": "5432",
    "dbname": "postgres",
    "user": "user name",
    "password": " password"
}


table_name = "chunk_content_embeddings"
table_schema = "chunk"

# Define a threshold for similarity (adjust as needed)
threshold = 0.5  

# Create a list to store similar vector pairs
similar_pairs = []


conn = psycopg2.connect(**postgree_params)
print("Connection established")
cursor = conn.cursor()

##Calculate the euclidean distance under the same column
##using limit (like top) to filter the sample

query = f'''
    SELECT t1.chunk_content_vector as chunk_content_vector_1,
           t2.chunk_content_vector as chunk_content_vector_2,
           t1.chunk_content as chunk_content_t1,
           t2.chunk_content as chunk_content_t2
    FROM {table_schema}.{table_name} t1
    CROSS JOIN
        {table_schema}.{table_name} t2
WHERE
    t1.chunk_content_vector < t2.chunk_content_vector
    LIMIT 100;
'''
print("Query table")

cursor.execute(query)

# Fetch and process the results
query = cursor.fetchall()
#for row in query:
#    print(f"chunk_content: {row[0]}, chunk_content_vector: {row[1]}")


# Function to calculate Euclidean distance between two vectors
def euclidean_distance(vector1, vector2):
    return np.linalg.norm(np.array(vector1) - np.array(vector2))

# Iterate through the results and calculate the Euclidean distance
for row in query:
    vector1 = row[0]
    vector2 = row[1]
    
    distance = euclidean_distance(vector1, vector2)
    #print(f"Euclidean Distance: {distance}")

    # Check if the distance is below the threshold
    if distance < threshold:
        similar_pairs.append((vector1, vector2, distance, ))

# Print or process the similar pairs
for pair in similar_pairs:
    vector1, vector2, distance = pair
    print(f"Similar Pair - Euclidean Distance: {distance}")

    
# Close the database connection
conn.commit()
cursor.close()
conn.close()
print("Connection Closed")

# Iterate through the results and calculate the Euclidean distance
#for row in query:
#    vector1 = row[0]
#    vector2 = row[1]
    #vector3 = row[2]
    #vector4 = row[3]
 #   distance = euclidean_distance(vector1, vector2)
    #print(f"Euclidean Distance: {distance}")

    # Check if the distance is below the threshold
  #  if distance < threshold:
        #similar_pairs.append((vector1, vector2, distance,vector3, vector4 ))
   #     similar_pairs.append((vector1, vector2, distance))

# Print or process the similar pairs
#for pair in similar_pairs:
 #   vector1, vector2,vector3, vector4, distance = pair
  #  print(f"Similar Pair - Euclidean Distance: {distance} \n\n")
    #print(f"Values are 1: {vector3} \n\n")
    #print(f"Values are 2:  {vector4}")
    


Connection established
Query table
Similar Pair - Euclidean Distance: 0.23853913387441963
Similar Pair - Euclidean Distance: 0.48445206589993106
Similar Pair - Euclidean Distance: 0.4735859061045146
Similar Pair - Euclidean Distance: 0.42803151232637265
Similar Pair - Euclidean Distance: 0.40827244979223315
Similar Pair - Euclidean Distance: 0.4730061688620234
Similar Pair - Euclidean Distance: 0.42493653521070196
Similar Pair - Euclidean Distance: 0.45966613627218855
Connection Closed


### Similarity Vector Search with the results

In [63]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from azure.search.documents.models import Vector  
from openai.embeddings_utils import get_embedding, cosine_similarity
import openai


# Update connection string information

postgree_params = {
    "host": "server.postgres.database.azure.com",
    "port": "5432",
    "dbname": "postgres",
    "user": "user name",
    "password": " password"
}

#host = "server.postgres.database.azure.com"
#dbname = "dataabse"
#user = "user name"
#password = " password"
#port = "port - postgree usually use 5432"
#sslmode = "require"

conn = psycopg2.connect(**postgree_params)
print("Connection established")
cursor = conn.cursor()

# Define the query

table_name = "chunk_content_embeddings"
table_schema = "chunk"
# Define the similarity threshold (adjust as needed)
similarity_threshold = 0.5

query = "tools for software development"
# Extract the embedding vector
embedding = openai.Embedding.create(input=query, engine=AZURE_OPENAI_EMBEDDING_DEPLOYMENT, deployment =AZURE_OPENAI_EMBEDDING_DEPLOYMENT)
query_vector = embedding["data"][0]["embedding"]




# Calculate the query vector
#query_vector=get_embedding(query, engine = AZURE_OPENAI_EMBEDDING_DEPLOYMENT)##"text-embedding-ada-002" 

query = f'''
    SELECT t1.chunk_content_vector as chunk_content_vector,
           t2.chunk_content as chunk_content
    FROM {table_schema}.{table_name} t1
    LIMIT 1;
'''
print("Query table")

# Fetch and process the results
query = cursor.fetchall()

# Function to calculate Euclidean distance between two vectors
def euclidean_distance(vector1, vector2):
    return np.linalg.norm(np.array(vector1) - np.array(vector2))

# Iterate through the results and calculate the Euclidean distance
for row in query:
    vector1 = row[0]
    distance = euclidean_distance(vector1, query_vector)

    # Check if the distance is below the threshold
    if distance < similarity_threshold:
        similar_pairs.append((row[0], vector1, distance))

# Print similar pairs
#for chunk_content, chunk_content_vector, distance in similar_pairs:
#    print(f"Chunk Content: {chunk_content}")
#    print(f"Chunk Content Vector: {chunk_content_vector}")
#    print(f"Euclidean Distance: {distance}\n")

# Close the PostgreSQL connection
cursor.close()
conn.close()


Connection established


InvalidRequestError: Resource not found