<a href="https://colab.research.google.com/github/Fhupsel/RAG/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
# Instalar bibliotecas necessárias (caso ainda não tenha)
# !pip install langchain langchain-community chromadb sentence-transformers pypdf transformers huggingface_hub -q

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from google.colab import userdata

import os

# Embedding model (free Hugging Face sentence-transformers model) used to
# vectorize the document chunks.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Free Hugging Face hosted LLM. The API token is read from the Colab user
# secret named 'API_KEY' and exported as an environment variable before the
# client is created (presumably so HuggingFaceHub can pick it up from there).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get("API_KEY")
llm = HuggingFaceHub(
    repo_id="meta-llama/Llama-2-7b-chat-hf",
    model_kwargs=dict(temperature=0.7, max_length=200),
)

# Load the PDF used as the RAG knowledge source.
# `load()` is used instead of `load_and_split()`: the latter already chunks
# every page with LangChain's default text splitter, which would make the
# explicitly configured splitter below redundant (its `chunk_overlap` would
# never take effect) and would make `start_index` relative to those pre-made
# chunks instead of the page text.
pdf_path = "/content/DOC-SF238339076816-20230503.pdf"
loader = PyPDFLoader(pdf_path, extract_images=False)
pages = loader.load()

# Split the loaded pages into chunks.
# NOTE(review): 4000-character chunks are probably longer than what the
# all-MiniLM-L6-v2 embedding model actually encodes (excess input is likely
# truncated) - confirm against the model card and consider a smaller size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,  # maximum chunk size, measured with `length_function`
    chunk_overlap=20,  # overlap between consecutive chunks
    length_function=len,
    add_start_index=True,  # store each chunk's start offset in its metadata
)
chunks = text_splitter.split_documents(pages)

# Build the vector store from the chunks and persist it in the "text_index"
# directory.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings_model,
    persist_directory="text_index",
)
vectordb.persist()

# Retriever over the vector store, configured with k=3 documents per query.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

# Question-answering chain of type "stuff", driven by the LLM configured above.
chain = load_qa_chain(llm, chain_type="stuff")


In [48]:
def ask(question: str) -> str:
    """Answer *question* from the documents retrieved for it.

    The module-level ``retriever`` fetches the documents relevant to the
    question and the module-level QA ``chain`` produces the answer from them.

    Args:
        question: The question to answer.

    Returns:
        The value stored under the chain output's ``'output_text'`` key.
    """
    # `invoke` replaces the deprecated `get_relevant_documents(...)` and
    # `chain(...)` call styles; inputs and outputs are otherwise unchanged.
    context = retriever.invoke(question)
    result = chain.invoke(
        {'input_documents': context, 'question': question},
        return_only_outputs=True,
    )
    return result['output_text']

In [None]:
# Read one question from the user, answer it with `ask`, and print the answer.
user_question = input("User: ")
response = ask(user_question)
# Sample question (in Portuguese): "Quais os principais pontos da lei que devo me atentar?"
# (roughly: "Which main points of the law should I pay attention to?")
print(response)