In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_postgres.vectorstores import PGVector
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_core.runnables import RunnablePassthrough
from langchain_core.callbacks import StdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
import sys
from dotenv import load_dotenv
import os

In [2]:
# Load variables from a local .env file. override=True asks python-dotenv to
# let values from .env replace variables already set in the process environment.
load_dotenv(override=True)

# OPENAI_API_KEY only matters when the ChatOpenAI model is selected. A missing
# key is reported instead of being replaced by a placeholder string: a fake
# value would hide the misconfiguration until an API call is rejected.
if not os.getenv('OPENAI_API_KEY'):
    print(
        "OPENAI_API_KEY is not set; configure it (e.g. in .env) before using ChatOpenAI.",
        file=sys.stderr,
    )

In [3]:
# Collect every PDF matched by the "**/*.pdf" pattern under ./Bookshelf and
# parse each one with PyPDFLoader.
# NOTE(review): PyPDFLoader typically yields one Document per page, so the
# count printed below is probably pages rather than files — confirm.
loader = DirectoryLoader(path="./Bookshelf", glob="**/*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
print(f"Loaded {len(documents)} documents.")

Loaded 31 documents.


In [4]:
# Chunking parameters, named so they are easy to find and tune.
# Sizes are measured by the splitter's length function (characters by default).
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

# Split the loaded documents into overlapping chunks ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(documents)

In [5]:
# Hugging Face model used to embed both the document chunks and the queries.
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [6]:
from langchain.evaluation import load_evaluator

# Quick sanity check of the embedding model: score the distance between two
# related city names. The result is a distance, so lower presumably means
# "more similar" — confirm against the evaluator's configured metric.
evaluator = load_evaluator(evaluator="embedding_distance", embeddings=embeddings)
evaluator.evaluate_strings(reference="Bordeaux", prediction="Paris")

{'score': 0.31489751577569425}

In [7]:
# Connection string for the Postgres/pgvector database. It can be overridden
# with the PGVECTOR_CONNECTION environment variable (e.g. from .env) so real
# credentials never need to be written into the notebook; the fallback is the
# original local development database.
PG_CONNECTION = os.getenv(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://pguser:pgpass@localhost:5432/pgdb",
)

try:
    # Embed every chunk and store it in the 'knowledge_base' collection.
    # pre_delete_collection=True asks PGVector to drop an existing collection
    # of that name first, so re-running this cell rebuilds the index instead
    # of appending duplicates.
    vectorstore = PGVector.from_documents(
        documents=chunks,
        collection_name='knowledge_base',
        connection=PG_CONNECTION,
        embedding=embeddings,
        pre_delete_collection=True,
    )
except Exception as e:
    # Any step can fail here (connection, embedding, insertion), so the message
    # does not blame the connection specifically. Re-raising already displays
    # the full traceback, so it is not printed a second time.
    print(f"Error populating the vector store: {e}")
    raise
else:
    print("✅ RAG database populated successfully!")

✅ RAG database populated successfully!


In [60]:
# Prompt sent to the chat model. {context} receives the retrieved chunks and
# {question} the user's query. The leading and trailing empty entries keep the
# newline at the start and end of the template text.
PROMPT_TEMPLATE = "\n".join(
    (
        "",
        "You are an assistant for question-answering tasks.",
        "Use the following pieces of retrieved context to answer the question.",
        "If you don't know the answer, say that you don't know. Don't Make Up Anything.",
        "",
        "{context}",
        "",
        "---",
        "",
        "Answer the question based on the above context: {question}",
        "",
    )
)

In [61]:
# Wrap the template text in a PromptTemplate, declaring its two placeholders.
prompt = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=["context", "question"])

In [62]:
# Chat model reached through the Ollama endpoint at localhost:11434.
# Alternative backend (needs OPENAI_API_KEY):
#   llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
llm = ChatOllama(base_url="http://localhost:11434", model="llama3.2")

In [63]:
# Expose the vector store as a retriever that returns 15 chunks per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=15))

In [64]:
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The page contents in input order, separated by a blank line; an empty
        string when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = [doc.page_content for doc in docs]
    return separator.join(page_texts)

In [65]:
# Inputs for the prompt: the query is sent to the retriever, whose documents
# are flattened by format_docs into "context", while the raw query is passed
# through unchanged as "question".
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: gather inputs -> fill the prompt -> call the model -> plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [66]:
def ask_question(query: str, verbose: bool = True) -> str:
    """Answer a question with the RAG chain.

    Args:
        query: Natural-language question; it is used both to retrieve context
            and as the question placed in the prompt.
        verbose: When True (the default, matching the previous behaviour), a
            StdOutCallbackHandler prints the "Entering/Finished chain" trace
            for every step. Pass False to get only the answer.

    Returns:
        The model's answer as a plain string.
    """
    # Tracing is opt-out so that answer cells are not flooded with chain logs.
    config = {"callbacks": [StdOutCallbackHandler()]} if verbose else None
    return rag_chain.invoke(query, config=config)

In [67]:
# Broad query: ask for everything the indexed PDFs say about one person.
# NOTE(review): the answer can surface personal data taken from the documents;
# clear this cell's output before sharing the notebook.
ask_question("What are the information on Ruan Coelho ?")



[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableParallel<context,question> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnablePassthrough chain...[0m

[1m> Finished chain.[0m


[1m> Entering new format_docs chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new PromptTemplate chain...[0m

[1m> Finished chain.[0m


[1m> Entering new StrOutputParser chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


'The retrieved context provides information about Ruan Nunes Coelho as follows:\n\n- Name: RUAN NUNES COELHO\n- CPF: 428.189.658-96\n- CNPJ: 55.323.138/0001-13 (as the employer)\n- Empresarial name: 55.323.138 RUAN NUNES COELHO \n- Capital Social: R$100,00\n- Address: RUA SHITIRO MAEJI 377, CENTRO - Município REGISTRO SP, CEP 11900-000\n- Profession/Occupation: Promotor(a) of sales, independent (7319-0/02)\n- Additional information:\n  - E-mail: RUANCOELHO3009@GMAIL.COM\n  - Telephone: (13) 9738-2652'

In [68]:
# Targeted query: look up a single field (the CNPJ) for the same person.
ask_question("What is Ruan Coelho CNPJ ?")



[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableParallel<context,question> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnablePassthrough chain...[0m

[1m> Finished chain.[0m


[1m> Entering new format_docs chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new PromptTemplate chain...[0m

[1m> Finished chain.[0m


[1m> Entering new StrOutputParser chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


'The CNPJ of Ruan Nunes Coelho is 55.323.138/0001-13.'

In [69]:
# Query about the invoice: ask whether a payment is recorded in the NF-e.
ask_question("Did Ruan Nunes paid something in the NF-e ?")



[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnableParallel<context,question> chain...[0m


[1m> Entering new RunnableSequence chain...[0m


[1m> Entering new RunnablePassthrough chain...[0m

[1m> Finished chain.[0m


[1m> Entering new format_docs chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[1m> Entering new PromptTemplate chain...[0m

[1m> Finished chain.[0m


[1m> Entering new StrOutputParser chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


'Yes, according to the text, Ruan Nunes paid an amount of R$ 1.488,66 as ISSQN (Imposto sobre Serviços de Valor Agregado) in the NFS-e (Nota Fiscal Eletrônica).'