In [None]:
from dotenv import load_dotenv, find_dotenv

# Locate the application's .env file first, then load its variables into the
# process environment so the later os.getenv() calls can read them.
env_file = find_dotenv('../application/.env')
load_dotenv(env_file)

In [None]:
import os
from urllib.parse import quote

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes import SQLRecordManager, index
from langchain.vectorstores.pgvector import PGVector


# Connection settings are read from the environment.
host = os.getenv("PG_VECTOR_HOST")
user = os.getenv("PG_VECTOR_USER")
password = os.getenv("PG_VECTOR_PASSWORD")
COLLECTION_NAME = os.getenv("PGDATABASE")

# Fail fast with a readable message instead of silently building a URL that
# contains the literal text "None".
_missing = [
    name
    for name, value in {
        "PG_VECTOR_HOST": host,
        "PG_VECTOR_USER": user,
        "PG_VECTOR_PASSWORD": password,
        "PGDATABASE": COLLECTION_NAME,
    }.items()
    if value is None
]
if _missing:
    raise ValueError(f"Environment variables not set: {', '.join(_missing)}")

# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# user name or password cannot corrupt the database URL.
CONNECTION_STRING = (
    f"postgresql+psycopg2://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{host}:5432/{COLLECTION_NAME}"
)

# The record manager tracks which documents have already been written to the
# vector store; it lives in the same database as the vectors.
namespace = f"pgvector/{COLLECTION_NAME}"
record_manager = SQLRecordManager(namespace, db_url=CONNECTION_STRING)
# The LangChain indexing docs require the schema to be created before the
# record manager is used; without it index() fails on a fresh database.
record_manager.create_schema()

embeddings = OpenAIEmbeddings()

# NOTE(review): the database name (PGDATABASE) doubles as the collection name.
vector_store = PGVector(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [None]:
# Index an empty document list with cleanup="full".
# NOTE(review): per the LangChain indexing docs this should remove every
# previously indexed document, i.e. start from a clean slate — confirm.
index(
    [],
    record_manager,
    vector_store,
    cleanup="full",
    source_id_key="source",
)

In [None]:
import os
from langchain.document_loaders import AzureBlobStorageContainerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_split_documents(chunk_size: int = 200, chunk_overlap: int = 20) -> list:
    """Load the documents of the configured blob container and split them into chunks.

    The connection string and container name are read from the
    ``BLOB_CONN_STRING`` and ``BLOB_CONTAINER`` environment variables.

    Args:
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.

    Raises:
        ValueError: If either environment variable is unset or empty.
    """
    conn_str = os.getenv("BLOB_CONN_STRING")
    container_name = os.getenv("BLOB_CONTAINER")

    # Treat empty values like missing ones and name the offending variable(s),
    # so a misconfigured .env fails here rather than deep inside the loader.
    missing = [
        name
        for name, value in (
            ("BLOB_CONN_STRING", conn_str),
            ("BLOB_CONTAINER", container_name),
        )
        if not value
    ]
    if missing:
        raise ValueError(f"Environment variables not set or empty: {', '.join(missing)}")

    loader = AzureBlobStorageContainerLoader(conn_str=conn_str, container=container_name)
    data = loader.load()
    for doc in data:
        # Keep only the file name as the source id.
        # NOTE(review): presumably the loader reports a full (temporary) path
        # that would differ between runs — confirm. Blobs sharing a file name
        # in different virtual folders would collide on this key.
        doc.metadata["source"] = os.path.basename(doc.metadata["source"])

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(data)


In [None]:
docs = load_and_split_documents()
# Show the chunk count and a small sample instead of dumping every chunk into
# the cell output, which bloats the notebook and hides the narrative.
print(f"{len(docs)} chunks")
docs[:3]

In [None]:
# Write the freshly split chunks to the vector store; the record manager
# tracks them by their "source" metadata key.
index(
    docs,
    record_manager,
    vector_store,
    cleanup="full",
    source_id_key="source",
)

Let's now update something in the raw data and upload it again.

In [None]:
import os
from azure.storage.blob import BlobServiceClient

# Local folder whose files are uploaded to the blob container.
folder_path = "./restaurant"

conn_str = os.getenv("BLOB_CONN_STRING")
container_name = os.getenv("BLOB_CONTAINER")

# Same guard as load_and_split_documents: stop with a clear message rather
# than letting the Azure client fail on a missing value.
if not conn_str or not container_name:
    raise ValueError("Environment variables for BLOB_CONN_STRING or BLOB_CONTAINER are not set.")

blob_service_client = BlobServiceClient.from_connection_string(conn_str=conn_str)

# sorted() makes the upload order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue  # skip sub-directories and other non-file entries

    blob_client = blob_service_client.get_blob_client(container=container_name, blob=filename)

    # overwrite=True replaces an existing blob of the same name.
    with open(file_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)

In [None]:
# Reload and re-split after the upload, then peek at the first three chunks.
docs = load_and_split_documents()
for doc in docs[:3]:
    print(doc)

In [None]:
# Re-run the indexing with the reloaded chunks.
# NOTE(review): with cleanup="full" the LangChain docs describe unchanged
# chunks as skipped and chunks no longer present as deleted — confirm.
index(
    docs,
    record_manager,
    vector_store,
    cleanup="full",
    source_id_key="source",
)

We have now linked Blob Storage and PGVector and can keep the raw documents and the indexed documents in sync. In a real app, however, we want this to happen automatically. That's what Azure Functions can do for us. We will take a look at Azure Functions later in the course.