In [1]:
# Import the required libraries
import openai
import os
import load_env_var
import json
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    SearchIndex, 
    SearchFieldDataType,
    SimpleField,
    SearchField,
    SearchableField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration)

In [2]:
# Read the search service settings (endpoint, key and index name, in that
# order) through the project's load_env_var helper module
(
    search_endpoint,
    search_key,
    search_index_name,
) = load_env_var.load_env_variables_azure_search()

In [3]:
# Configure the openai module and pick up the model names. The helper's six
# return values are assigned, in order, to the four openai settings below and
# then to llm_model (chat deployment) and emb_model (embeddings engine).
(
    openai.api_type,
    openai.api_base,
    openai.api_version,
    openai.api_key,
    llm_model,
    emb_model,
) = load_env_var.load_env_variables_openai()

In [140]:
# Vector search settings: a single HNSW algorithm configuration named
# "my_vector_config"; the vector fields of the index refer to it by that name
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my_vector_config",
            kind="hnsw",
            parameters=dict(
                m=4,
                efConstruction=400,
                efSearch=500,
                metric="cosine",
            ),
        ),
    ],
)

##### Explanation of search parameters used in the vector configuration: https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md

In [141]:
# Index-management client for the search service, authenticated with its key
client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=AzureKeyCredential(search_key),
)

In [143]:
# Index schema: a string key, two searchable text fields and two
# 1536-dimension vector fields that use the "my_vector_config" configuration
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        filterable=True,
        sortable=True,
        key=True,
    ),
    *[
        SearchableField(name=text_field, type=SearchFieldDataType.String)
        for text_field in ("category", "content")
    ],
    *[
        SearchField(
            name=vector_field,
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_configuration="my_vector_config",
        )
        for vector_field in ("categoryVector", "contentVector")
    ],
]

In [144]:
# Assemble the index definition from the schema and the vector search settings
# (once created, verify the index in Azure Portal to check if everything is correct)
index = SearchIndex(
    name=search_index_name,
    fields=fields,
    vector_search=vector_search,
)

In [145]:
# Create the index on the service. create_or_update_index is used instead of
# create_index so that re-running this cell (e.g. Restart Kernel -> Run All)
# does not fail just because the index already exists.
result = client.create_or_update_index(index)

In [4]:
# Document-level client bound to the index; used to upload the documents
# obtained from the json_emb_files_generator and to query them
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=search_index_name,
    credential=AzureKeyCredential(search_key),
)

In [5]:
# Function to upload documents to the Azure service
def upload_documents_to_azure_search(folder_path):
    """Upload every JSON file found directly in ``folder_path`` to the search index.

    Each ``*.json`` file is parsed and its content (a single document or a
    list of documents) is sent through the module-level ``search_client``.
    Sub-directories and files without a ``.json`` extension are skipped.

    Args:
        folder_path: Directory that contains the JSON document files.

    Raises:
        RuntimeError: If the service reports at least one document as not
            indexed. Every file is attempted before the error is raised.
    """
    failed_keys = []

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Ignore folders and stray files (.ipynb_checkpoints, .DS_Store, ...)
        # that would otherwise make open()/json.load() fail.
        if not (filename.lower().endswith('.json') and os.path.isfile(file_path)):
            continue

        # Explicit UTF-8 so the files are decoded the same way on every platform.
        with open(file_path, 'r', encoding='utf-8') as file:
            document = json.load(file)

        # upload_documents is documented to take a list of documents; wrap a
        # file that holds a single document object.
        documents = [document] if isinstance(document, dict) else document
        if not documents:
            continue  # empty file content: nothing to send

        # One result per document is returned; an individual document can be
        # rejected even though the request as a whole went through.
        results = search_client.upload_documents(documents)
        failed_keys.extend(item.key for item in results if not item.succeeded)

    if failed_keys:
        raise RuntimeError(
            f'{len(failed_keys)} document(s) failed to upload to Azure Search '
            f'index {search_index_name}: {failed_keys}'
        )

    print(f'Uploaded documents to Azure Search index {search_index_name} successfully!')

In [149]:
# Upload the JSON files stored in data/json_unit
upload_documents_to_azure_search(folder_path='data/json_unit')

Uploaded documents to Azure Search index vectorindex successfully!


In [150]:
# Upload the JSON files stored in data/json_state
upload_documents_to_azure_search(folder_path='data/json_state')

Uploaded documents to Azure Search index vectorindex successfully!


In [152]:
# Upload the JSON files stored in data/json_city
upload_documents_to_azure_search(folder_path='data/json_city')

Uploaded documents to Azure Search index vectorindex successfully!


In [153]:
# Upload the JSON files stored in data/json_entity_name
upload_documents_to_azure_search(folder_path='data/json_entity_name')

Uploaded documents to Azure Search index vectorindex successfully!


In [151]:
# Upload the JSON files stored in data/json_var_name_def
upload_documents_to_azure_search(folder_path='data/json_var_name_def')

Uploaded documents to Azure Search index vectorindex successfully!


In [6]:
# Upload the JSON files stored in data/json_variable_name
upload_documents_to_azure_search(folder_path='data/json_variable_name')

Uploaded documents to Azure Search index vectorindex successfully!


In [8]:
# TEST THE VECTOR SEARCH
# Function to transform text into embeddings
def generate_embeddings(text, model=emb_model):
    """Return the embedding of ``text`` computed by the OpenAI embeddings API.

    Args:
        text: Input passed to the embeddings endpoint.
        model: Engine used for the request; defaults to the value ``emb_model``
            had when this function was defined.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    embedding_response = openai.Embedding.create(engine=model, input=text)
    first_item = embedding_response['data'][0]
    return first_item['embedding']

In [13]:
# Write a question about the documents to test the search service
query = '¿Hay algun banco que se llame valley bank o similar?'

# Use the generate_embeddings function to transform the user query into
# embeddings, then search both vector fields with k=2 nearest neighbours
vector = Vector(value=generate_embeddings(query), k=2, fields='categoryVector, contentVector')

# search_text=None makes this a vector-only query. `select` takes one field
# name per list item (a single 'category, content' string only worked because
# the list items end up joined with commas).
results = search_client.search(search_text=None, vectors=[vector], select=['category', 'content'])

# Content of every hit, collected to be passed to the LLM as context
llm_input = []

for result in results:
    print(f"Category: {result['category']}")  
    print(f"Content: {result['content']}")
    print(f"Score: {result['@search.score']}\n")
    llm_input.append(result['content'])


print(llm_input)

Category: entity_name
Content: the valley state bank
Score: 0.01666666753590107

Category: variable_name
Content: all real estate loans
Score: 0.01666666753590107

Category: entity_name
Content: valley capital bank, national  association
Score: 0.016393441706895828

Category: variable_name
Content: total assets: the sum of all assets owned by the institution including cash, loans, securities, bank premises and other assets. this total does not include off-balance-sheet accounts.
Score: 0.016393441706895828

['the valley state bank', 'all real estate loans', 'valley capital bank, national  association', 'total assets: the sum of all assets owned by the institution including cash, loans, securities, bank premises and other assets. this total does not include off-balance-sheet accounts.']


In [14]:
# Hand the retrieved content to the LLM so that it answers the question using
# the search results
system_message = f'Answer the question based on the given input text. Input: {llm_input}'

user_message = f'Question: {query}'

message = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_message),
]

completion = openai.ChatCompletion.create(
    deployment_id=llm_model,
    messages=message,
    max_tokens=100,
    temperature=0,
)

print(completion['choices'][0]['message']['content'])

Answer: Sí, hay dos bancos con nombres similares: 'the valley state bank' y 'valley capital bank, national association'.
