### Library Requirements
#### Install these packages with pip before proceeding

In [None]:
!pip install psycopg2
!pip install load_dotenv
!pip install requests
!pip install plotly
!pip install scipy
!pip install scikit-learn
!pip install openai
# The 'sklearn' PyPI package is deprecated, use 'scikit-learn'

### Setting environment variables

In [15]:
import os  
import json  
import openai  
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential  

# Load variables from a .env file into the process environment.
load_dotenv()

# Azure AI Search settings.
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI settings, applied as module-level configuration on `openai`.
openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
##AZURE_OPENAI_EMBEDDING_DEPLOYMENT = 'text-embedding-ada-002'

# os.getenv returns None for unset variables, and str(None) below would quietly
# become the literal key "None". Report missing settings up front (non-fatal, so
# cells that do not need a given service still run).
_missing_settings = [
    setting_name
    for setting_name in (
        "AZURE_SEARCH_SERVICE_ENDPOINT",
        "AZURE_SEARCH_INDEX_NAME",
        "AZURE_SEARCH_ADMIN_KEY",
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_EMBEDDING_DEPLOYMENT",
    )
    if not os.getenv(setting_name)
]
if _missing_settings:
    print("Warning: environment variables not set: " + ", ".join(_missing_settings))

credential = AzureKeyCredential(str(key))

### Helper Methods

In [4]:
import requests

def insert_record(acs_endpoint, acs_index, data, acs_key, acs_api_version):
    """POST `data` to the index's `docs/index` endpoint and print the reply.

    Args:
        acs_endpoint: Base URL of the search service.
        acs_index: Name of the target index.
        data: Request body, passed to `requests.post` unchanged.
        acs_key: Value sent in the `api-key` header.
        acs_api_version: Value of the `api-version` query parameter.

    Prints the response status code and the raw response content; returns None.
    """
    endpoint_url = "{}/indexes/{}/docs/index?api-version={}".format(
        acs_endpoint, acs_index, acs_api_version
    )
    request_headers = {"Content-Type": "application/json", "api-key": acs_key}

    reply = requests.post(endpoint_url, data=data, headers=request_headers)

    for detail in (reply.status_code, reply.content):
        print(detail)

def create_index(acs_endpoint, json_content, acs_index, api_key, acs_api_version):
    """PUT an index definition to the search service and print the reply.

    Args:
        acs_endpoint: Base URL of the search service.
        json_content: Request body, sent unchanged as the PUT payload.
        acs_index: Name of the index addressed by the URL.
        api_key: Value sent in the `api-key` header.
        acs_api_version: Value of the `api-version` query parameter.

    Prints the response status code and the raw response content; returns None.
    """
    index_url = "{}/indexes/{}?api-version={}".format(
        acs_endpoint, acs_index, acs_api_version
    )
    request_headers = {"Content-Type": "application/json", "api-key": api_key}

    reply = requests.request('PUT', index_url, headers=request_headers, data=json_content)

    for detail in (reply.status_code, reply.content):
        print(detail)

def search_vector_similarity(query_vector, top_doc_count, acs_endpoint, acs_index,acs_key, acs_api_version):
    """Run a vector query against the `content_vector` field and return the titles.

    Args:
        query_vector: Vector sent as the query's `value`.
        top_doc_count: Sent as `k` in the vector clause.
        acs_endpoint: Base URL of the search service.
        acs_index: Name of the index to search.
        acs_key: Value sent in the `api-key` header.
        acs_api_version: Value of the `api-version` query parameter.

    Returns:
        A list holding the `title` of each item in the response's `value` array.
    """
    search_url = "{}/indexes/{}/docs/search?api-version={}".format(
        acs_endpoint, acs_index, acs_api_version
    )

    # Only the title field is requested back from the service.
    vector_clause = {
        "value": query_vector,
        "fields": "content_vector",
        "k": top_doc_count,
    }
    payload = json.dumps({"vectors": [vector_clause], "select": "title"})

    reply = requests.request(
        'POST',
        search_url,
        headers={"Content-Type": "application/json", "api-key": acs_key},
        data=payload,
    )

    titles = []
    for hit in reply.json()['value']:
        titles.append(hit['title'])
    return titles

def read_json_file(file_path):
    """Return the full text of the file at `file_path`.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default locale encoding. The text is returned as-is; it is
    not parsed as JSON.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

## Create embeddings
Read your data and generate OpenAI embeddings for it.
The batch size limits how many rows are loaded into the DataFrame, which also helps keep the number of OpenAI requests within the API rate limit.


In [13]:

import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import pandas as pd
import json

# Number of rows kept from the source file. Every kept row triggers one
# embedding request per text column, so keep this small to respect rate limits.
batch_size = 2

# Load the product documents and keep only the first `batch_size` rows.
df = pd.read_json('../data/text/product_docs.json').head(batch_size)

# Add one "<column>_vector" column per text column, holding its embedding.
for text_column in ('title', 'content'):
    df[f'{text_column}_vector'] = df[text_column].apply(
        lambda text: get_embedding(text, engine=AZURE_OPENAI_EMBEDDING_DEPLOYMENT)
    )




#### Create Table and Schema in Postgres
##### Different PostgreSQL versions may be handled differently

In [138]:
import psycopg2 #postgres conexion
import re ##using this library to get the precise version of the database by searching the string version query results

# Update connection string information
postgree_params = {
    "host": "server.postgres.database.azure.com",
    "port": "5432", ##default 5432
    "dbname": "postgres", ##default database, change if needed
    "user": "user name",
    "password": "password"
}

# Replace 'vctor_embeddings' with the name of your PostgreSQL table
table_name = "vctor_embeddings"
# Replace 'vctor' schema with the schema name of your PostgreSQL table
table_schema = "vctor"

conn = psycopg2.connect(**postgree_params)
cursor = conn.cursor()
print("Connection established")

try:
    # SQL query to fetch the PostgreSQL version banner
    query = "SELECT version();"
    cursor.execute(query)
    version_string = cursor.fetchone()[0]

    # Extract "<major>" or "<major>.<minor>" from the banner.
    numeric_version_match = re.search(r'PostgreSQL (\d+)(?:\.\d+)?', version_string)
    if numeric_version_match is None:
        raise ValueError(f"Could not find a PostgreSQL version in: {version_string!r}")

    Sversion = numeric_version_match.group(0).split(" ", 1)[1]
    print(f"Numeric PostgreSQL Version: {Sversion}")

    # Compare on the major version only. Stripping the dot ("15.3" -> 153) would
    # make every server look newer than 14 and the vector branch unreachable.
    version = int(numeric_version_match.group(1))

    # Servers newer than 14 get plain array columns; 14 and older get pgvector
    # VECTOR columns. NOTE(review): this split follows the notebook's assumption
    # about where the vector extension is usable - confirm for your deployment.
    if version > 14:
        print("Greater than 15 configuration, table column use Array")
        table_schema_data = """
            id_serial UUID DEFAULT gen_random_uuid() PRIMARY KEY,
            title text,
            content text,
            title_vector double precision[],
            content_vector double precision[]
        """
    else:
        print("Smaller than 15 configuration, table column use Vector")
        cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        conn.commit()
        print("Adding extension")

        # Define the table schema if needed
        # Make sure it matches the structure of your DataFrame
        table_schema_data = """
            id_serial UUID DEFAULT gen_random_uuid() PRIMARY KEY,
            title text,
            content text,
            title_vector VECTOR(1536),
            content_vector VECTOR(1536)
        """

    # Create the schema
    cursor.execute(f"CREATE schema IF NOT EXISTS {table_schema}")
    conn.commit()
    print("Create Schema extension")

    # Drop any previous table of the same name, then create it afresh
    cursor.execute(f"DROP TABLE IF  EXISTS {table_schema}.{table_name} ")
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_schema}.{table_name} ({table_schema_data});")
    print("Drop the old table and Create a new(if old existed)")

    conn.commit()
finally:
    # Clean up: always release the cursor and connection, even if a statement failed
    cursor.close()
    conn.close()
    print("Connection Closed")






Connection established
Numeric PostgreSQL Version: 15.3
Greater than 15 configuration, table column use Array
Create Schema extension
Drop the old table and Create a new(if old existed)
Connection Closed


### Inserting Data into Postgres

In [144]:
# Update connection string information
postgree_params = {
    "host": "server.postgres.database.azure.com",
    "port": "5432", ##default 5432
    "dbname": "postgres", ##default database, change if needed
    "user": "user name",
    "password": "password"
}

# Replace 'vctor_embeddings' with the name of your PostgreSQL table
table_name = "vctor_embeddings"
# Replace 'vctor' schema with the schema name of your PostgreSQL table
table_schema = "vctor"

conn = psycopg2.connect(**postgree_params)
print("Connection established")
cursor = conn.cursor()

total_records = df.shape[0]

# Row values are passed as query parameters (%s), so quotes or other special
# characters in the text cannot break or alter the statement. The schema and
# table names are notebook constants and are interpolated into the SQL text.
insert_sql = f'''
    INSERT INTO {table_schema}.{table_name} (title, content, title_vector, content_vector)
    VALUES (%s, %s, %s::double precision[], %s::double precision[]);
'''

try:
    # Iterate through the DataFrame and insert each row with its embeddings
    for inserted_count, (_, row) in enumerate(df.iterrows(), start=1):
        cursor.execute(
            insert_sql,
            (
                row['title'],
                row['content'],
                # Python lists are sent as SQL arrays; float() normalises the elements.
                [float(component) for component in row['title_vector']],
                [float(component) for component in row['content_vector']],
            ),
        )

        # Report progress after every full batch and after the final row
        if inserted_count % batch_size == 0 or inserted_count == total_records:
            print(f"Inserted {inserted_count} records into PostgreSQL")

    conn.commit()
    print(f"Finished inserting {total_records} records into PostgreSQL")
finally:
    # Close the database connection, even if an insert failed
    cursor.close()
    conn.close()
    print("Connection Closed")

Connection established
Inserted 3 records into PostgreSQL
Finished inserting 3 records into PostgreSQL
Connection Closed


### Query the data in Postgres and check the results

In [None]:
### Check that the data was inserted successfully

# Update connection string information
postgree_params = {
    "host": "server.postgres.database.azure.com",
    "port": "5432", ##default 5432
    "dbname": "postgres", ##default database, change if needed
    "user": "user name",
    "password": "password"
}

table_name = "vctor_embeddings"
table_schema = "vctor"

conn = psycopg2.connect(**postgree_params)
print("Connection established")
cursor = conn.cursor()

try:
    # Define and run the SELECT query
    select_query = f'SELECT * FROM {table_schema}.{table_name} ;'
    cursor.execute(select_query)

    # Fetch all rows and print each one
    rows = cursor.fetchall()
    for row in rows:
        print(row)

    conn.commit()
finally:
    # Close the database connection, even if the query failed
    cursor.close()
    conn.close()
    print("Connection Closed")