In [8]:
import os
from dotenv import load_dotenv

from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma

In [9]:
# Load variables from a .env file into the process environment so the
# os.getenv lookups below can see them; the call's return value is shown as
# the cell output.
load_dotenv()

True

In [10]:
# Notebook settings, each read from the environment variable of the same name.
# A variable that is not set ends up as None in the dict.
config = {
    env_key: os.getenv(env_key)
    for env_key in ("INPUT_FOLDER", "GOOGLE_API_KEY", "CHROMA_DIR")
}

In [11]:
# Display the CHROMA_DIR value currently visible in the environment.
os.getenv("CHROMA_DIR")

'/home/arthur/argus/chroma'

In [12]:
# Collect the plain-text documents to index from the configured input folder.
input_folder = config["INPUT_FOLDER"]
if input_folder is None:
    # Without this guard Path(None) raises an unhelpful TypeError.
    raise ValueError(
        "INPUT_FOLDER is not set; define it in the environment or in the .env file"
    )

input_path = Path(input_folder)
if not input_path.is_dir():
    # A missing folder would otherwise just produce an empty file list.
    raise NotADirectoryError(f"INPUT_FOLDER is not a directory: {input_path}")

# Path.glob yields matches in arbitrary order; sort so every run processes
# the files in the same sequence.
text_files = sorted(input_path.glob("*.txt"))

In [13]:
# Display the collected list of input text files.
text_files

[PosixPath('/home/arthur/argus/data/processed/Counterexample-Guided Repair of Reinforcement Learning Systems Using Safety Critics.txt'),
 PosixPath('/home/arthur/argus/data/processed/Some Insights into Lifelong Reinforcement Learning Systems.txt')]

In [14]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [15]:
# Embedding model used to vectorise the document chunks. The API key is read
# from the shared `config` dict, like the chat model further down, so all
# Google clients in this notebook take the credential from one place.
embedding_model = GoogleGenerativeAIEmbeddings(
    google_api_key=config["GOOGLE_API_KEY"],
    model="models/text-embedding-004"
)

In [16]:
## Creating the vector store

# Chroma store bound to the configured persistence directory and to the
# embedding model above. The directory comes from the shared `config` dict
# rather than a second environment lookup.
vector_db = Chroma(
    persist_directory=config["CHROMA_DIR"],
    embedding_function=embedding_model
)

In [17]:
# Split every input text into overlapping chunks and add them to the vector
# store. The splitter configuration does not depend on the file, so it is
# built once instead of once per iteration.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)

for file in text_files:
    loader = TextLoader(file.as_posix())
    data = loader.load_and_split(text_splitter=splitter)

    print(f"Inserindo: {file.stem}")

    if not data:
        # An empty file produces no chunks; there is nothing to embed or store.
        continue

    # Stable, file-derived ids: re-running this cell against the persisted
    # collection rewrites the same entries instead of appending a second copy
    # of every chunk under fresh random ids.
    chunk_ids = [f"{file.stem}:{chunk_index}" for chunk_index in range(len(data))]

    vector_db.add_documents(documents=data, ids=chunk_ids)

Inserindo: Counterexample-Guided Repair of Reinforcement Learning Systems Using Safety Critics
Inserindo: Some Insights into Lifelong Reinforcement Learning Systems


In [29]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA


In [30]:
# Chat model handed to the QA chain below; the API key comes from `config`.
llm = ChatGoogleGenerativeAI(
    google_api_key=config["GOOGLE_API_KEY"],
    model="gemini-1.5-flash",
    temperature=0.3,
)

In [27]:
# Retriever view over the vector store, configured with search kwarg k=3.
retriever = vector_db.as_retriever(search_kwargs=dict(k=3))

In [31]:
# Question-answering chain wiring the retriever to the chat model with the
# "stuff" chain type.
# NOTE(review): RetrievalQA is flagged as deprecated in recent LangChain
# releases — confirm against the installed version before building on it.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
)