In [2]:
import requests
import time
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from langchain_community.embeddings.edenai import EdenAiEmbeddings
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import Chroma

In [4]:
def is_allowed(url, rp):
    return rp.can_fetch("*", url)

def get_all_links(url, base_url, pdf_links, rp):
    try:
        if not is_allowed(url, rp):
            print(f"Skipping {url} due to robots.txt")
            return set()
        
        response = requests.get(url, headers={'User-Agent': 'MyWebScraper'}, timeout=10)
        if response.status_code != 200:
            return set()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        links = set()
        
        for a_tag in soup.find_all('a', href=True):
            link = a_tag['href']
            full_url = urljoin(base_url, link)
            
            if full_url.endswith('.pdf'):
                pdf_links.add(full_url)
            
            if full_url.startswith(base_url):  
                links.add(full_url)
        
        return links
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making a request: {e}")
        return set()

def get_visible_text_from_url(url, rp):
    try:
        if not is_allowed(url, rp):
            print(f"Skipping {url} due to robots.txt")
            return None
        
        response = requests.get(url, headers={'User-Agent': 'MyWebScraper'}, timeout=10)
        if response.status_code != 200 or 'text/html' not in response.headers['Content-Type']:
            return None
        
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(['script', 'style']):
            script.extract()
        texts = soup.stripped_strings
        return " ".join(texts)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making a request: {e}")
        return None

def download_pdfs(pdf_links):
    if not os.path.exists('pdfs'):
        os.makedirs('pdfs')
    
    for pdf_url in pdf_links:
        try:
            response = requests.get(pdf_url, headers={'User-Agent': 'MyWebScraper'}, timeout=10)
            pdf_name = pdf_url.split('/')[-1]
            with open(f'pdfs/{pdf_name}', 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {pdf_name}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {pdf_url}: {e}")

def main():
    base_url = "https://www.exercitodesalvacao.org.br/"
    rp = RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()
    
    all_links = set()
    pdf_links = set()
    to_visit = {base_url}
    
    with open("./data/doc_modules.txt", "w", encoding="utf-8") as f:
        while to_visit:
            current_url = to_visit.pop()
            print(f"Visiting {current_url}...")
            
            if current_url not in all_links:
                all_links.add(current_url)
                links = get_all_links(current_url, base_url, pdf_links, rp)
                to_visit.update(links)
            
            print(f"Scraping {current_url}...")
            text = get_visible_text_from_url(current_url, rp)
            if text:
                f.write(f"Data from {current_url}:\n")
                f.write(text)
                f.write("\n\n")
            
            time.sleep(1)  # Delay to avoid overloading the server
                
    print("Web scraping completed. Data saved to 'data_LC.txt'.")
    
    print("Downloading PDFs...")
    download_pdfs(pdf_links)
    print("All PDFs downloaded.")

if __name__ == "__main__":
    main()

Visiting https://www.exercitodesalvacao.org.br/...
Scraping https://www.exercitodesalvacao.org.br/...
Visiting https://www.exercitodesalvacao.org.br/política-territorial-de-denúncias...
Scraping https://www.exercitodesalvacao.org.br/política-territorial-de-denúncias...
Visiting https://www.exercitodesalvacao.org.br/tomada-de-posicao-internacional...
Scraping https://www.exercitodesalvacao.org.br/tomada-de-posicao-internacional...
Visiting https://www.exercitodesalvacao.org.br/_files/ugd/5e1ee4_24242d29004e46a39c99d1719b1690b3.pdf...
Scraping https://www.exercitodesalvacao.org.br/_files/ugd/5e1ee4_24242d29004e46a39c99d1719b1690b3.pdf...
Visiting https://www.exercitodesalvacao.org.br/_files/ugd/5e1ee4_e2a9bdb5ff984a3e96135ba45440a839.pdf...
Scraping https://www.exercitodesalvacao.org.br/_files/ugd/5e1ee4_e2a9bdb5ff984a3e96135ba45440a839.pdf...
Visiting https://www.exercitodesalvacao.org.br/nota-fiscal-paulista...
Scraping https://www.exercitodesalvacao.org.br/nota-fiscal-paulista...
Visi

In [3]:
load_dotenv()

True

In [4]:
key = os.getenv("EDENAI_API_KEY")

In [5]:
embeddings = EdenAiEmbeddings(provider="openai", edenai_api_key=key)

In [6]:
loader_txt = DirectoryLoader('data/', glob="**/*.txt")
loader_txt

<langchain_community.document_loaders.directory.DirectoryLoader at 0x26dabd40cb0>

In [7]:
docs = loader_txt.load()

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000, chunk_overlap = 100)
splitData = text_splitter.split_documents(docs)

In [9]:
persist_directory = 'exercito_embedding_2'
colletion_vector_exercito = 'exercito_collection_2'

In [10]:
db = Chroma.from_documents(splitData, 
                           embeddings,
                           persist_directory=persist_directory,
                           collection_name=colletion_vector_exercito
                           )
db

<langchain_community.vectorstores.chroma.Chroma at 0x26db23f1790>

In [11]:
db.persist()

  warn_deprecated(


In [12]:
question = "Tem alguma unidade no RJ?"
v = db.similarity_search(query=question, k=2)
v

[Document(page_content='# DIVISÃO REGIONAL RIO DE JANEIRO, MINAS GERAIS, DISTRITO FEDERAL E RORAIMA\n\nRio de Janeiro: Escritório Regional: Rua José do Patrocínio, nº 240 – Ed. César Bordalho – Apto. 201 Bairro Grajaú Rio de Janeiro - RJ Cep: 20560-160 Tel: (21) 2186-8442\n\nOficiais Responsáveis:\n\nMajor Ricardo Iung\n\nMajor Cindy Meylan Iung\n\nIgrejas:\n\nBANGU\n\nRJ\n\nRua Cel. Tamarindo, 576\n\nBangu\n\nRio de Janeiro\n\nRJ | 21870\n\n002\n\nTel: (21) 3331\n\n5721\n\nCAMPOS - RJ Rua Cap. Júlio Nogueira, 39 - Jd. Carioca Campos dos Goytacazes - RJ | 28080-470 Tel: (22) 2722-2722\n\nGUARUS - RJ Rua Operário João de Barros, 169 - Parque Prazeres Campos dos Goytacazes - RJ | 28080-095 Tel: (22) 2738-9305\n\nMÉIER\n\nRJ\n\nRua Getúlio, 432\n\nTodos os Santos\n\nRio de Janeiro\n\nRJ | 20775\n\n001\n\nTel: (21) 3549\n\n7692 / (21) 96426\n\n4610 (cel)\n\nAVANÇADA MORRO AGUDO\n\nRua Santa Rosa, 200\n\nComendador Soares\n\nNova Iguaçu\n\nRJ | 26277\n\n345\n\nTel: (21) 2032\n\n2672\n\nNITE