In [2]:
import requests
import time
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from langchain_community.embeddings.edenai import EdenAiEmbeddings
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import Chroma

In [6]:
def is_allowed(url, rp, user_agent="MyWebScraper"):
    """Return True when robots.txt permits ``user_agent`` to fetch ``url``.

    Args:
        url: Absolute URL to check.
        rp: A ``urllib.robotparser.RobotFileParser`` whose rules are loaded.
        user_agent: Agent name matched against the robots.txt groups. It
            defaults to the ``User-Agent`` header the scraper sends with its
            requests, so rules written specifically for this crawler are
            honoured; pass ``"*"`` to check only the wildcard rules.

    Returns:
        bool: the result of ``rp.can_fetch(user_agent, url)``.
    """
    return rp.can_fetch(user_agent, url)

def get_all_links(url, base_url, pdf_links, rp):
    """Fetch ``url`` and return the in-site page links found on it.

    Args:
        url: Page to download and parse.
        base_url: Site prefix; only links starting with it are returned.
        pdf_links: Set that is mutated in place; every link ending in
            ``.pdf`` (case-insensitive) is added to it.
        rp: Loaded ``RobotFileParser`` used to check whether ``url`` may be
            fetched.

    Returns:
        set[str]: absolute, fragment-free URLs under ``base_url`` that are not
        PDFs. An empty set is returned when the page is disallowed, the
        response status is not 200, or the request fails.
    """
    try:
        if not is_allowed(url, rp):
            print(f"Skipping {url} due to robots.txt")
            return set()

        response = requests.get(url, headers={'User-Agent': 'MyWebScraper'}, timeout=10)
        if response.status_code != 200:
            return set()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Relative hrefs are relative to the document that contains them (or
        # to its <base href>), not to the site root.
        page_url = response.url or url
        base_tag = soup.find('base', href=True)
        if base_tag:
            page_url = urljoin(page_url, base_tag['href'])

        links = set()
        for a_tag in soup.find_all('a', href=True):
            full_url = urljoin(page_url, a_tag['href'].strip())
            # "#section" variants point at the same page; drop the fragment so
            # each page is queued only once.
            full_url = full_url.split('#', 1)[0]

            if full_url.lower().endswith('.pdf'):
                # PDFs are downloaded separately, so they are not queued for
                # HTML crawling.
                pdf_links.add(full_url)
            elif full_url.startswith(base_url):
                links.add(full_url)

        return links
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making a request: {e}")
        return set()

def get_visible_text_from_url(url, rp):
    """Download ``url`` and return its text with scripts and styles removed.

    Args:
        url: Page to download.
        rp: Loaded ``RobotFileParser`` used to check whether ``url`` may be
            fetched.

    Returns:
        str | None: the page's stripped text fragments joined by single
        spaces, or ``None`` when the page is disallowed, the status is not
        200, the response is not ``text/html``, or the request fails.
    """
    try:
        if not is_allowed(url, rp):
            print(f"Skipping {url} due to robots.txt")
            return None

        response = requests.get(url, headers={'User-Agent': 'MyWebScraper'}, timeout=10)
        # A response may omit Content-Type; .get avoids a KeyError that the
        # RequestException handler below would not catch.
        content_type = response.headers.get('Content-Type', '')
        if response.status_code != 200 or 'text/html' not in content_type:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove elements whose contents are code, not readable text.
        for script in soup(['script', 'style']):
            script.extract()
        texts = soup.stripped_strings
        return " ".join(texts)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making a request: {e}")
        return None

def download_pdfs(pdf_links):
    """Download every URL in ``pdf_links`` into the local ``pdfs`` directory.

    Each file is named after the last path segment of its URL. A failed
    download or write is reported and skipped so the remaining files are
    still attempted.

    Args:
        pdf_links: Iterable of absolute PDF URLs.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('pdfs', exist_ok=True)

    for pdf_url in pdf_links:
        try:
            response = requests.get(pdf_url, headers={'User-Agent': 'MyWebScraper'}, timeout=10)
            # Without this an HTML error page (404/500) would be saved as a .pdf.
            response.raise_for_status()
            pdf_name = pdf_url.split('/')[-1]
            with open(os.path.join('pdfs', pdf_name), 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {pdf_name}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {pdf_url}: {e}")
        except OSError as e:
            # A bad file name or disk error should not abort the other downloads.
            print(f"Failed to save {pdf_url}: {e}")

def main(base_url="https://kmdevantagens.com.br/", output_path="./data/doc_modules.txt"):
    """Crawl ``base_url``, save each page's visible text, then download PDFs.

    Pages are discovered by following in-site links starting from
    ``base_url``. Each page is visited once; its text is appended to
    ``output_path`` under a ``Data from <url>:`` header. PDF links collected
    along the way are downloaded after the crawl finishes.

    Args:
        base_url: Root URL of the site; also the prefix that limits the crawl.
        output_path: Text file that receives the scraped content (overwritten).
    """
    rp = RobotFileParser()
    rp.set_url(urljoin(base_url, "robots.txt"))
    rp.read()

    visited = set()
    pdf_links = set()
    to_visit = {base_url}

    # open(..., "w") does not create missing parent directories.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        while to_visit:
            current_url = to_visit.pop()
            # Pages link back to each other; skip anything already handled so
            # it is neither fetched nor written to the file a second time.
            if current_url in visited:
                continue
            visited.add(current_url)

            print(f"Visiting {current_url}...")
            links = get_all_links(current_url, base_url, pdf_links, rp)
            to_visit.update(links - visited)

            print(f"Scraping {current_url}...")
            text = get_visible_text_from_url(current_url, rp)
            if text:
                f.write(f"Data from {current_url}:\n")
                f.write(text)
                f.write("\n\n")

            time.sleep(1)  # Delay to avoid overloading the server

    print(f"Web scraping completed. Data saved to '{output_path}'.")

    print("Downloading PDFs...")
    download_pdfs(pdf_links)
    print("All PDFs downloaded.")

# Run the crawl when this code is executed as the main program.
if __name__ == "__main__":
    main()

Visiting https://kmdevantagens.com.br/...
Scraping https://kmdevantagens.com.br/...
Visiting https://kmdevantagens.com.br/ganhe-km...
Scraping https://kmdevantagens.com.br/ganhe-km...
Visiting https://kmdevantagens.com.br/regulamento...
Scraping https://kmdevantagens.com.br/regulamento...
Visiting https://kmdevantagens.com.br/fale-conosco...
Scraping https://kmdevantagens.com.br/fale-conosco...
Visiting https://kmdevantagens.com.br/promocoes...
Scraping https://kmdevantagens.com.br/promocoes...
Visiting https://kmdevantagens.com.br/abastece-ai...
Scraping https://kmdevantagens.com.br/abastece-ai...
Visiting https://kmdevantagens.com.br/categorias...
Scraping https://kmdevantagens.com.br/categorias...
Visiting https://kmdevantagens.com.br/ajuda...
Scraping https://kmdevantagens.com.br/ajuda...
Visiting https://kmdevantagens.com.br/institucional...
Scraping https://kmdevantagens.com.br/institucional...
Visiting https://kmdevantagens.com.br/categoria/140...
Scraping https://kmdevantagens.

In [7]:
# Load environment variables (python-dotenv) before the API key is read from the environment.
load_dotenv()

In [33]:
# Eden AI API key taken from the environment; os.getenv returns None when the variable is unset.
key = os.getenv("EDENAI_API_KEY")

In [34]:
# Embedding client used to build the vector store: Eden AI with the "openai" provider.
embeddings = EdenAiEmbeddings(provider="openai", edenai_api_key=key)

In [13]:
# Read the whole scraped-text file and keep it as a single-element list.
with open("./data/doc_modules.txt", mode="r", encoding="utf-8") as arquivo:
    content = [arquivo.read()]

In [18]:
# Loader targeting every .txt file under data/ (the "**/*.txt" glob also matches subdirectories).
loader_txt = DirectoryLoader('data/', glob="**/*.txt")
loader_txt

<langchain_community.document_loaders.directory.DirectoryLoader at 0x2436c6edb20>

In [20]:
# Load the files matched by the directory loader.
docs = loader_txt.load()

In [21]:
# Split the loaded documents into chunks of up to 2000 characters,
# with 100 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
)
splitData = text_splitter.split_documents(docs)

In [26]:
# Quick check of the container type returned by split_documents.
print(type(splitData))

<class 'list'>


In [38]:
# Settings passed to Chroma when the vector store is built: storage directory and collection name.
persist_directory = 'kmv_embedding'
# NOTE(review): the variable name is misspelled ("colletion") and mentions "coopercitrus"
# although the value is the KMV collection — rename together with every use.
colletion_vector_coopercitrus = 'kmv_collection'

In [41]:
# Build the Chroma vector store from the chunks, using the embedding client
# and the persistence settings defined earlier in the notebook.
db = Chroma.from_documents(
    splitData,
    embeddings,
    persist_directory=persist_directory,
    collection_name=colletion_vector_coopercitrus,
)
db

<langchain_community.vectorstores.chroma.Chroma at 0x2430b514e90>

In [42]:
# Explicitly persist the vector store.
# NOTE(review): this cell's output shows a deprecation warning — presumably newer Chroma
# versions persist automatically; confirm against the installed version before removing the call.
db.persist()

  warn_deprecated(


In [43]:
# Sanity check: query the vector store for the 5 chunks most similar to a sample question.
question = "o que é o kmv?"
v = db.similarity_search(query=question, k=5)
v

[Document(page_content='Data from https://kmdevantagens.com.br/categoria/140: Serviços Financeiros | Km de Vantagens Troque pontos KMV Ganhe pontos KMV Como funciona APP KMV Promoções Caminhoneiro Todas as Categorias / Serviços Financeiros Serviços Financeiros TROQUE SEUS PONTOS KMV POR SERVIÇOS EXCLUSIVOS! CONFIRA TODAS AS CATEGORIAS Mycon Mycon e KMV, o consórcio digital com a menor taxa do Brasil. 0 pontos KMV RESGATAR VOUCHER Sorte Online Aproveite 30% de desconto na Sorte Online 100 pontos KMV RESGATAR VOUCHER Sorte Online Troque pontos KMV por 50% de desconto na Quina de São João! 100 pontos KMV RESGATAR VOUCHER Simule como ganhar pontos KMV Como funciona Fale conosco Ajuda Proteção de Dados Pessoais Regulamento Institucional © 2024 - KMV, programa de fidelidade dos postos Ipiranga. Uma empresa do Grupo Ultra. Avenida Brigadeiro Luis Antonio, nº 1343, 2º andar, ala A - São Paulo, SP - 01317-001 Qual é a sua dúvida? Vamos conversar sobre isso!', metadata={'source': 'data\\doc_modu