In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

class WebScraperAgent:
    """Crawl one website and dump the visible text of each page to a file.

    Starting from ``base_url``, the agent follows links that stay on the same
    host, skips every URL whose path contains one of ``ignored_paths``, and
    appends the text of each fetched page to ``output_file``.
    """

    def __init__(self, base_url, timeout=10):
        """Initialise crawl state.

        Args:
            base_url: URL where the crawl starts; its host is the only host
                the crawl will follow links to.
            timeout: Seconds to wait on each HTTP request before giving up,
                so a single stalled server cannot hang the whole crawl.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.visited_urls = set()
        self.output_file = 'business_info.txt'
        # Matched as substrings of the lower-cased URL path, so 'press' also
        # excludes paths such as '/express'.
        self.ignored_paths = ['blog', 'news', 'newsletter', 'insights', 'press']

    def scrape_website(self):
        """Crawl from ``base_url`` and write the results to ``output_file``.

        The output file is opened in write mode, so any previous content is
        replaced.
        """
        with open(self.output_file, 'w', encoding='utf-8') as file:
            self.scrape_page(self.base_url, file)

    def scrape_page(self, url, file):
        """Scrape ``url`` and every same-domain page reachable from it.

        Pages are visited depth-first in document link order. An explicit
        stack is used instead of recursion so that sites with long link
        chains cannot exhaust Python's recursion limit.

        Args:
            url: Page to start from.
            file: Writable text stream that receives the scraped content.
        """
        pending = [url]
        while pending:
            # Fragments only address a position inside a page; dropping them
            # keeps '/about' and '/about#team' from being scraped twice.
            current = self._strip_fragment(pending.pop())
            if current in self.visited_urls or self.should_ignore_url(current):
                continue

            self.visited_urls.add(current)
            links = self._scrape_single_page(current, file)
            # Reversed so the first link on the page is popped (visited) first.
            pending.extend(reversed(links))

    def _scrape_single_page(self, url, file):
        """Fetch one page, write its text to ``file``, return links to follow.

        Returns an empty list when the page cannot be fetched or is not HTML.
        """
        try:
            response = requests.get(url, timeout=self.timeout)
            # Do not record the body of 4xx/5xx error pages as site content.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping {url}: {str(e)}")
            return []

        # Skip PDFs, images and other non-HTML payloads; responses without a
        # Content-Type header are still treated as HTML.
        content_type = response.headers.get('Content-Type', '')
        if content_type and 'html' not in content_type.lower():
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text content
        content = soup.get_text(separator=' ', strip=True)

        # Write to file
        file.write(f"URL: {url}\n")
        file.write(f"Content: {content}\n\n")
        file.write("-" * 80 + "\n\n")  # Separator between pages

        links = []
        for link in soup.find_all('a', href=True):
            try:
                next_url = self._strip_fragment(urljoin(url, link['href']))
                followable = self._is_followable(next_url)
            except ValueError:
                # A malformed href must not abort the rest of the page.
                continue
            if followable and next_url not in self.visited_urls:
                links.append(next_url)
        return links

    def _is_followable(self, url):
        """Return True for http(s) URLs on the crawl's host that are not ignored."""
        if urlparse(url).scheme not in ('http', 'https'):
            return False
        return self.is_same_domain(url) and not self.should_ignore_url(url)

    @staticmethod
    def _strip_fragment(url):
        """Return ``url`` without its ``#fragment`` part."""
        return url.partition('#')[0]

    def is_same_domain(self, url):
        """Return True when ``url`` has the same host (netloc) as ``base_url``.

        Host names are case-insensitive, so the comparison is done on
        lower-cased values.
        """
        return urlparse(url).netloc.lower() == urlparse(self.base_url).netloc.lower()

    def should_ignore_url(self, url):
        """Return True when the URL path contains any entry of ``ignored_paths``."""
        path = urlparse(url).path.lower()
        return any(ignored in path for ignored in self.ignored_paths)

# Usage: prompt for a site URL, crawl it, and report where the text was saved.
if __name__ == "__main__":
    # Strip stray whitespace so a pasted URL with a trailing space still works.
    business_url = input("Enter the business website URL: ").strip()
    if not business_url:
        # Avoid truncating the output file for a crawl that cannot start.
        print("No URL entered; nothing to scrape.")
    else:
        # A bare host such as "example.com" has no scheme, so requests would
        # reject it and same-domain matching would have no host to compare.
        if "://" not in business_url:
            business_url = "https://" + business_url
        scraper = WebScraperAgent(business_url)
        scraper.scrape_website()
        # Report the file actually written rather than a hard-coded name.
        print(f"Scraping completed. Data stored in {scraper.output_file}")

Scraping completed. Data stored in business_info.txt
