In [1]:
from dotenv import load_dotenv, find_dotenv

# Locate the .env file first, then load its entries so that later cells can
# read their settings from the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [2]:
import os
from azure.storage.blob import BlobServiceClient

# Local folder whose files are uploaded to the blob container.
folder_path = "./hotels"

# Connection settings are read from the environment.
conn_str = os.getenv("BLOB_CONN_STRING")
container_name = os.getenv("BLOB_CONTAINER")

# Fail fast with a readable message instead of passing None into the SDK.
if not conn_str or not container_name:
    raise RuntimeError(
        "BLOB_CONN_STRING and BLOB_CONTAINER must be set (e.g. in your .env file)."
    )

blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)
container_client = blob_service_client.get_container_client(container_name)

# Upload every regular file directly inside folder_path; anything that is not
# a file (e.g. a sub-directory) is skipped.
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue

    # The blob is named after the bare filename inside the target container.
    blob_client = container_client.get_blob_client(filename)

    # overwrite=True replaces an existing blob of the same name, so the cell
    # can be re-run without failing on already-uploaded files.
    with open(file_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)


Now we can link Azure Blob Storage with the Azure Cognitive Search (ACS) vector store.

In [3]:
!pip install unstructured




[notice] A new release of pip available: 22.3 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from langchain.document_loaders import AzureBlobStorageContainerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the contents of the blob container as LangChain documents.
loader = AzureBlobStorageContainerLoader(
    conn_str=conn_str,
    container=container_name,
)
data = loader.load()

# Splitter settings: chunk length is measured with len (characters), and the
# separators are treated as plain strings rather than regular expressions.
splitter_options = dict(
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into the chunks that will be embedded later.
docs = text_splitter.split_documents(data)

In [5]:
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch

In [6]:
# Embedding client; its embed_query method is handed to the vector store below.
# NOTE(review): chunk_size=1 is presumably a batch-size limit of the embedding
# deployment - confirm before raising it.
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=1)
index_name: str = "langchain-vector-demo"

# os.environ[...] raises KeyError right here when a setting is missing, instead
# of silently passing None to AzureSearch; this matches how the same settings
# are read further down in the notebook.
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=os.environ["SEARCH_ENDPOINT"],
    azure_search_key=os.environ["SEARCH_API_KEY"],
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

In [7]:
# Push the split chunks into the vector store. Re-running this cell adds the
# same documents again; nothing here de-duplicates them.
result = vector_store.add_documents(docs)

If you run that code multiple times, the same documents are added to the index again and again. A quick fix is to delete the index and create it again from scratch.

In [8]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient


# Connection details for the search index, read from the environment.
index_name = "langchain-vector-demo"
endpoint = os.environ["SEARCH_ENDPOINT"]
api_key = os.environ["SEARCH_API_KEY"]

credential = AzureKeyCredential(api_key)
client = SearchClient(
    endpoint=endpoint,
    index_name=index_name,
    credential=credential,
)

# Run a wildcard search and materialise the returned iterator so the hits
# can be counted.
results = client.search(search_text="*")
documents = list(results)

print(len(documents))


0


In [9]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
import os

# Index management (as opposed to querying) goes through SearchIndexClient.
index_client = SearchIndexClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(api_key),
)

# Remove the whole index by name.
index_client.delete_index(index_name)

print(f"Index '{index_name}' has been deleted.")


Index 'langchain-vector-demo' has been deleted.


Now we could create the Index again - but it would be actually better to:

1. Not have duplicated documents in the vectorstore
2. Not drop and recreate indexes every time a source document changes

The indexing API was developed to solve exactly this problem. Unfortunately, the indexing API does NOT work in combination with ACS.
That's why we will continue with PGVector on Azure.
