### Indexing API

In [5]:
from dotenv import load_dotenv, find_dotenv

# Locate the project's .env file first, then load its variables into the
# process environment. The call is the cell's last expression, so its boolean
# result is what the notebook shows as this cell's output.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

Let's add documents and embeddings!

In [6]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Load every .txt file found under ./hotels (the glob also matches
# sub-directories) using the plain-text loader.
loader = DirectoryLoader(
    "./hotels",
    glob="**/*.txt",
    loader_cls=TextLoader,
)
data = loader.load()

# Split the loaded files into small, slightly overlapping chunks.
# NOTE(review): the recorded output of this cell contains several
# "Created a chunk of size N, which is longer than the specified 150" warnings,
# i.e. some chunks exceed chunk_size. Consider a larger chunk_size or a
# different splitter if a hard size limit matters.
text_splitter = CharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=20,
)
docs = text_splitter.split_documents(data)

# Number of chunks that will be indexed below.
print(len(docs))

Created a chunk of size 244, which is longer than the specified 150
Created a chunk of size 225, which is longer than the specified 150
Created a chunk of size 236, which is longer than the specified 150
Created a chunk of size 214, which is longer than the specified 150
Created a chunk of size 267, which is longer than the specified 150
Created a chunk of size 237, which is longer than the specified 150
Created a chunk of size 216, which is longer than the specified 150
Created a chunk of size 230, which is longer than the specified 150
Created a chunk of size 261, which is longer than the specified 150
Created a chunk of size 214, which is longer than the specified 150
Created a chunk of size 206, which is longer than the specified 150
Created a chunk of size 230, which is longer than the specified 150


15


In [7]:
import os
import warnings
from urllib.parse import quote

PG_DRIVER = "postgresql+psycopg2"
PG_PORT = 5432


def build_connection_string(user, password, host, database, port=PG_PORT, mask_password=False):
    """Build the SQLAlchemy-style PostgreSQL URL used by this notebook.

    Args:
        user: Database user name (percent-encoded before insertion).
        password: Database password (percent-encoded before insertion).
        host: Database server host name.
        database: Name of the database to connect to.
        port: TCP port of the server; defaults to 5432.
        mask_password: When True, the password is replaced by ``***`` so the
            result is safe to display or log. Such a URL is NOT usable for
            connecting.

    Returns:
        The connection URL as a string.
    """
    # safe="" makes sure characters such as '@', '/', ':' and '+' inside the
    # credentials are escaped and cannot be mistaken for URL delimiters.
    encoded_user = quote(user or "", safe="")
    secret = "***" if mask_password else quote(password or "", safe="")
    return f"{PG_DRIVER}://{encoded_user}:{secret}@{host}:{port}/{database}"


host = os.getenv("PG_VECTOR_HOST")
user = os.getenv("PG_VECTOR_USER")
password = os.getenv("PG_VECTOR_PASSWORD")
# NOTE(review): this single variable is used both as the database name in the
# URL below and as the pgvector collection name in later cells.
COLLECTION_NAME = os.getenv("PG_COLLECTION_NAME")

# Surface missing configuration here instead of through an obscure connection
# error in a later cell.
_missing = [
    name
    for name, value in (
        ("PG_VECTOR_HOST", host),
        ("PG_VECTOR_USER", user),
        ("PG_VECTOR_PASSWORD", password),
        ("PG_COLLECTION_NAME", COLLECTION_NAME),
    )
    if not value
]
if _missing:
    warnings.warn("Missing environment variables: " + ", ".join(_missing))

CONNECTION_STRING = build_connection_string(user, password, host, COLLECTION_NAME)

# Display the URL with the password masked: notebook outputs are saved with
# the file and must never contain credentials.
build_connection_string(user, password, host, COLLECTION_NAME, mask_password=True)

'postgresql+psycopg2://codingcrashcourses:***@langchainazure.postgres.database.azure.com:5432/pgvector'

Prerequisite: create an instance of Azure Database for PostgreSQL flexible server.

Then allow and enable the `vector` extension (pgvector) on that server: https://learn.microsoft.com/de-de/azure/postgresql/flexible-server/concepts-extensions#how-to-use-postgresql-extensions

In [21]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

# Embedding model, created with its default settings.
# NOTE(review): presumably reads the OpenAI API key from the environment
# populated by load_dotenv earlier — confirm.
embeddings = OpenAIEmbeddings()

# pgvector-backed vector store: connects through CONNECTION_STRING and keeps
# the vectors in the collection named by COLLECTION_NAME.
vector_store = PGVector(
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
)

In [18]:
from langchain.indexes import SQLRecordManager, index

In [19]:
# Namespace under which the record manager tracks what has been indexed,
# derived from the collection name.
namespace = "pgvector/{}".format(COLLECTION_NAME)

# The record manager keeps its bookkeeping in the same PostgreSQL database
# that the vector store uses (same connection string).
record_manager = SQLRecordManager(namespace, db_url=CONNECTION_STRING)

In [20]:
# Set up the record manager's database schema before the first call to
# index(). NOTE(review): presumably a no-op when it already exists — confirm.
record_manager.create_schema()

Index the documents for the first time (1st run). The cells after that update the documents and index them again to show what changes (2nd run).

In [22]:
# First indexing run: nothing has been recorded yet, so every chunk is
# embedded and written to the vector store. cleanup=None means index() does
# not delete anything from the store.
first_run_stats = index(
    docs, record_manager, vector_store,
    cleanup=None,
    source_id_key="source",
)
first_run_stats

{'num_added': 15, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [23]:
from langchain.schema import Document

# Simulate changes to the source material: edit one chunk, drop one chunk and
# add a brand-new document.
# NOTE: this mutates `docs` in place, so re-running the cell changes the list again.
docs[1].page_content = "updated"
docs.pop(6)
new_doc = Document(page_content="new content", metadata={"source": "important"})
docs.append(new_doc)

In [24]:
# Second run, still with cleanup=None: unchanged chunks are skipped and only
# the edited and the new document are added. No cleanup happens, so the old
# version of the edited chunk and the removed chunk stay in the vector store.
second_run_stats = index(
    docs, record_manager, vector_store,
    cleanup=None,
    source_id_key="source",
)
second_run_stats

{'num_added': 2, 'num_updated': 0, 'num_skipped': 13, 'num_deleted': 0}

In [25]:
# Second round of simulated changes.
docs[1].page_content = "updated again"

# Each deletion shifts the remaining items one slot to the left, so deleting
# at indices 2, 3 and 4 in sequence removes the elements that sat at positions
# 2, 4 and 6 before this loop started.
for position in (2, 3, 4):
    del docs[position]

extra_doc = Document(page_content="more new content", metadata={"source": "important"})
docs.append(extra_doc)

In [26]:
# Incremental cleanup: new and changed documents are added, and stale records
# belonging to the sources present in this batch (matched via the "source"
# metadata key) are deleted from the vector store.
incremental_stats = index(
    docs, record_manager, vector_store,
    cleanup="incremental",
    source_id_key="source",
)
incremental_stats

{'num_added': 2, 'num_updated': 0, 'num_skipped': 11, 'num_deleted': 6}

In [27]:
# Incremental cleanup with an empty batch: no source appears in the input, so
# nothing is added and nothing is deleted (every counter in the result is 0).
empty_incremental_stats = index(
    [], record_manager, vector_store,
    cleanup="incremental",
    source_id_key="source",
)
empty_incremental_stats

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [28]:
# Full cleanup with an empty batch: every record that is not part of the
# current input is deleted, which empties the collection.
full_cleanup_stats = index(
    [],
    record_manager,
    vector_store,
    cleanup="full",
    source_id_key="source",
)
full_cleanup_stats

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 13}