### Imports

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

import langchain

# LLM
from langchain_community.llms import Ollama

# Document loader
from langchain_community.document_loaders import PyPDFLoader
# from langchain_community.document_loaders import DirectoryLoader

# Text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embedding
from langchain_community.embeddings.ollama import OllamaEmbeddings

# Vector store
from langchain.vectorstores.chroma import Chroma

# Prompt
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

# Name of the Ollama model passed to `Ollama(model=...)`.
MODEL = 'llama3.1'
# Directory that is scanned for the `.pdf` files to index.
DATA_PATH = 'data'
# Directory handed to Chroma as `persist_directory`.
CHROMA_PATH = 'chroma'

In [2]:
# Generation model: the Ollama model named by MODEL.
# Quick smoke test while developing: model.invoke('Olá, tudo bem?')
model = Ollama(model=MODEL)

# Embedding model used to vectorise the document chunks.
embeddings = OllamaEmbeddings(model='nomic-embed-text')

### Reading and chunking the documents

In [3]:
# Collect the PDF files that sit directly inside DATA_PATH.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
#   the document/chunk order stable between runs.
# - lower(): match the extension case-insensitively so `*.PDF` is not skipped.
pdf_filenames = sorted(
    name for name in os.listdir(DATA_PATH) if name.lower().endswith('.pdf')
)

# Load every PDF and accumulate the resulting documents in a single list.
documents = []
for filename in pdf_filenames:
    file_path = os.path.join(DATA_PATH, filename)
    documents.extend(PyPDFLoader(file_path).load())

# Fail here with a clear message instead of handing an empty corpus to the
# vector-store cell further down.
if not documents:
    raise FileNotFoundError(f'No PDF content could be loaded from {DATA_PATH!r}')

# Sizes are measured in characters because length_function is `len`.
# NOTE(review): 100-character chunks with a 20-character overlap are very small
# and may leave the retriever with too little context per hit — consider
# tuning (e.g. around 1000 / 200) and re-indexing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

chunks = text_splitter.split_documents(documents)

### Creating the Vector DB

In [4]:
# Embed every chunk and store the vectors in a Chroma collection that lives
# under CHROMA_PATH.
# NOTE(review): re-running this cell against an existing CHROMA_PATH presumably
# adds the same chunks again — confirm, and clear the directory first if so.
db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)

# Explicit flush to disk.
# NOTE(review): likely redundant with recent Chroma releases — confirm.
db.persist()

: 

### Using Llama with RAG

In [140]:
# Question-answering prompt. `{summaries}` receives the retrieved chunks and
# `{question}` the user's question; the model is told to answer in Portuguese
# and to always include a "SOURCES" section.
prompt_template = """
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer. Answer in Portuguese.

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["summaries", "question"]
)

# How each retrieved chunk is rendered inside `{summaries}`.
# Without this, only the chunk text is injected, so the model has nothing real
# to cite in the "SOURCES" section the prompt demands and makes sources up.
# Assumes every chunk carries `source` and `page` metadata (as set by
# PyPDFLoader) — TODO confirm if other loaders are added.
DOCUMENT_PROMPT = PromptTemplate(
    template="Content: {page_content}\nSource: {source} (page {page})",
    input_variables=["page_content", "source", "page"],
)

langchain.verbose = False

# Retrieval-augmented QA chain: fetch chunks from the vector store, render
# them with DOCUMENT_PROMPT into `{summaries}`, then ask the LLM with PROMPT.
chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=db.as_retriever(),
    chain_type_kwargs={
        'prompt': PROMPT,
        'document_variable_name': 'summaries',
        'document_prompt': DOCUMENT_PROMPT,
    },
)

In [141]:
# Question sent to the RAG chain.
QUERY = 'Eu posso receber propina?'

# Run retrieval + generation; the answer text is stored under 'result'.
response = chain.invoke({'query': QUERY})

In [142]:
# Pull the generated answer out of the chain response.
resposta = response['result']

# Show question and answer separated by blank lines (same output as three
# consecutive print calls: question, '\n', answer).
print(f'Pergunta: {QUERY}', '\n', f'Resposta: {resposta}', sep='\n')

Pergunta: Eu posso receber propina?


Resposta: Desculpe, mas não posso responder a essa pergunta. 

SOURCES:

* Não há informações no texto fornecido que indiquem se é permitido receber propina ou pagamentos indevidos. 
* O texto parece enfatizar a importância de uma relação transparente e ética com fornecedores, mas não aborda especificamente o recebimento de propina.
