In [11]:
import langchain
import os, json
import openai
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch

In [12]:
# Load environment variables from a .env file using load_dotenv():
load_dotenv()


openai.api_key = os.environ.get("OPENAI_API_KEY")
model: str = "text-embedding-ada-002"

In [13]:
vector_store_address: str = "https://mysuperdbapp.search.windows.net"
vector_store_password: str = "71D0w2gklO9fteqWgfLVxrWwqyLK0dOBjnYleT0MjsAzSeBgJaDE"
index_name: str = "langchain-vector-demo"

In [14]:
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=model, chunk_size=1)
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

In [15]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

loader = TextLoader("./text.txt", encoding="utf-8")

documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
for index, doc in enumerate(docs, start=0):
    doc.metadata = {"id": index, "brand": "test"}

vector_store.add_documents(documents=docs)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


['YzdmNmRiNDEtZTI5Mi00OGM4LTgwODAtZGViOThmMDQzNWY3']

In [16]:
vector_store.similarity_search(query="wann sind die Öffnungszeiten?", search_type="hybrid")

KWARGS {'search_type': 'hybrid'}
Key: search_type, Value: hybrid
KWARGS {'search_type': 'hybrid'}
FILTERS None


[Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'}),
 Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'}),
 Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'}),
 Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'})]

In [17]:
vector_store.similarity_search(query="wann sind die Öffnungszeiten?", filters = "search.ismatch('\"brand: notthere\"')")

KWARGS {'filters': 'search.ismatch(\'"brand: notthere"\')'}
Key: filters, Value: search.ismatch('"brand: notthere"')
KWARGS {'filters': 'search.ismatch(\'"brand: notthere"\')'}
FILTERS search.ismatch('"brand: notthere"')


[]

In [18]:
# Does not work
retriever = vector_store.as_retriever()
docs = retriever.get_relevant_documents(
    query="when does the restaurant open?", filters="search.ismatch('\"brand: notthere\"')"
)
print(docs)

KWARGS... {'filters': 'search.ismatch(\'"brand: notthere"\')'}
KWARGS {}
KWARGS {}
FILTERS None
[Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'}), Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'}), Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'}), Document(page_content='Die Öffnungszeiten sind um 18 Uhr\nEssen ist super\nhallo', metadata={'id': 0, 'brand': 'test'})]


In [20]:
filters = f"search.ismatch('\"brand: notthere\"')"
search_kwargs = {"filters": filters}

retriever = vector_store.as_retriever(search_kwargs=search_kwargs)
retriever.get_relevant_documents(query="when does the restaurant open?")


KWARGS... {}
KWARGS {'filters': 'search.ismatch(\'"brand: notthere"\')'}
Key: filters, Value: search.ismatch('"brand: notthere"')
KWARGS {'filters': 'search.ismatch(\'"brand: notthere"\')'}
FILTERS search.ismatch('"brand: notthere"')


[]

In [None]:
# function based solution
def get_relevant_documents(query, brand):
    filters = f"search.ismatch('\"brand: {brand}\"')"
    search_kwargs = {"filters": filters}
    retriever = vector_store.as_retriever(search_kwargs=search_kwargs)
    return retriever.get_relevant_documents(query=query)

get_relevant_documents("when does the restaurant open?", "notthere")


In [None]:
# class based solution
class DocumentRetriever:
    def __init__(self, vector_database):
        self.vector_database = vector_database

    def get_documents(self, search_query, brand_filter):
        filter_string = f"search.ismatch('\"brand: {brand_filter}\"')"
        search_params = {"filters": filter_string}
        retriever_instance = self.vector_database.as_retriever(search_params=search_params)
        return retriever_instance.get_relevant_documents(query=search_query)

document_retriever = DocumentRetriever(vector_database=vector_store)
document_retriever.get_documents("when does the restaurant open?", "notthere")
