### Process all documents from a directory and collect the resulting chunks in a list of LangChain documents

In [None]:
from load_chunk_to_langchain_docs import process_documents_to_langchain

# Relative path to the directory holding the raw source documents
raw_documents_dir = "../documents/raw"
# Process every document in that directory; the resulting chunks are kept in
# `all_chunks` and ingested into the vector store further down
all_chunks = process_documents_to_langchain(documents_dir=raw_documents_dir)

In [None]:
# Sanity check: uncomment to inspect the chunking output
# print(len(all_chunks))
# print(all_chunks[150].page_content)

### Vector storage -> Postgres/pgvector

In [None]:
# import basics
import os
from dotenv import load_dotenv
from langchain_postgres import PGVector
from langchain_openai import OpenAIEmbeddings

# Percent-encoding helper for the credentials embedded in the connection URL
from urllib.parse import quote


def _require_env(name: str) -> str:
    """Return the value of environment variable *name*.

    Raises:
        RuntimeError: If the variable is unset or empty, so a missing setting
            is reported here instead of surfacing later as a literal "None"
            inside the connection string.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; define it in your .env file."
        )
    return value


# Load environment variables from .env file
load_dotenv()

# PostgreSQL configuration (all four settings are required)
POSTGRES_USER = _require_env("POSTGRES_USER")
POSTGRES_PASSWORD = _require_env("POSTGRES_PASSWORD")
POSTGRES_DB = _require_env("POSTGRES_DB")
POSTGRES_PORT = _require_env("POSTGRES_PORT")

# Initiate the embeddings model
# NOTE(review): presumably reads the OpenAI API key from the environment loaded
# above -- confirm the key is present in .env
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Connection string. User and password are percent-encoded so that characters
# such as "@", "/", ":" or "+" cannot corrupt the URL; values without special
# characters yield exactly the same string as plain interpolation.
CONNECTION_STRING = (
    f"postgresql+psycopg://{quote(POSTGRES_USER, safe='')}"
    f":{quote(POSTGRES_PASSWORD, safe='')}"
    f"@localhost:{POSTGRES_PORT}/{POSTGRES_DB}"
)

# Initialize vector store
vectorstore_test = PGVector(
    connection=CONNECTION_STRING,
    embeddings=embeddings,
    # Logical collection name; per the langchain-postgres docs this is NOT a
    # table name
    collection_name="my_test_documents",
    use_jsonb=True,
)

### Ingestion

In [None]:
# Embed every chunk and write it to the vector store.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# (no explicit ids are passed) -- confirm before re-executing.
vectorstore_test.add_documents(documents=all_chunks)

### Querying the vector store

In [None]:
# Sample question to run against the ingested chunks
query = "What is the Q1 2025 revenue target?"

# Similarity search limited to the three best matches
results = vectorstore_test.similarity_search(query=query, k=3)

# One separator line followed by the chunk text and its metadata per hit
print("Retrieved Document:")
for doc in results:
    print("=" * 60, f"* {doc.page_content} [{doc.metadata}]", sep="\n")

In [None]:
def retrieve_context(query: str, k: int = 5) -> tuple[str, list]:
    """Retrieve the chunks most similar to *query* and format them as context.

    Args:
        query: Text to run the similarity search with.
        k: Number of documents to request from the vector store. Defaults to
            5, the value that was previously hard-coded.

    Returns:
        A ``(serialized, retrieved_docs)`` pair. ``serialized`` holds one
        "Source: <metadata>" / "Content: <page_content>" entry per retrieved
        document, with entries separated by a blank line; ``retrieved_docs``
        is the list returned by ``vectorstore_test.similarity_search``.
    """
    retrieved_docs = vectorstore_test.similarity_search(query, k=k)
    serialized = "\n\n".join(
        f"Source: {doc.metadata}\nContent: {doc.page_content}"
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [None]:
# Exercise the retrieval helper on the sample question
query = "What is the Q1 2025 revenue target?"
serialized, retrieved_docs = retrieve_context(query=query)

In [None]:
# Display the formatted context string returned by retrieve_context
print(serialized)