In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import pandas as pd
from multiprocessing import Process, Queue, Manager


class RecursiveWebScraper:
    def __init__(self, base_url, delay=1, max_depth=2, skip_domains=None, num_workers=4):
        self.base_url = base_url
        self.delay = delay
        self.max_depth = max_depth
        self.num_workers = num_workers
        self.domain = urlparse(base_url).netloc
        self.skip_domains = skip_domains or []
        self.url_file = "To_scrape.txt"
        self.url_done = "Scraped.txt"

    def scrape(self, url, depth, visited_urls, results, queue):
        if depth == 0:
            return
        if url in visited_urls:
            return

        print(f"Scraping: {url} | Depth: {self.max_depth - depth + 1}")
        visited_urls.append(url)

        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        results[url] = self.format_content_as_markdown(url, soup)

        # Save the URL to the completed list
        with open(self.url_done, "a+") as file:
            file.write(f"{url}\n")

        # Extract and enqueue links for further scraping
        for link_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, link_tag['href'])
            if self.is_valid_url(full_url, visited_urls):
                queue.put((full_url, depth - 1))
                time.sleep(self.delay)

    def is_valid_url(self, url, visited_urls):
        parsed_url = urlparse(url)
        if parsed_url.netloc != self.domain:
            return False
        for skip_domain in self.skip_domains:
            if url.startswith(skip_domain):
                return False
        return url not in set(visited_urls)

    def format_content_as_markdown(self, url, soup):
        markdown = [f"# {soup.title.string.strip() if soup.title else url}"]

        for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'li', 'div']):
            if tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                level = int(tag.name[1])
                markdown.append(f"{'#' * level} {self.process_inline_links(tag)}")
            elif tag.name == 'p':
                markdown.append(self.process_inline_links(tag))
            elif tag.name == 'ul':
                for li in tag.find_all('li'):
                    markdown.append(f"- {self.process_inline_links(li)}")
            elif tag.name == 'div' and 'highlight-solidity' in tag.get('class', []):
                pre_tag = tag.find('pre')
                if pre_tag:
                    code = pre_tag.get_text(strip=False)
                    markdown.append(f"```solidity\n{code}\n```")

        return "\n\n".join(markdown)

    def process_inline_links(self, tag):
        text = ""
        for content in tag.contents:
            if content.name == 'a' and content.get('href'):
                link_text = content.get_text(strip=True)
                href = content['href']
                full_url = urljoin(self.base_url, href)
                with open(self.url_file, "a+") as file:
                    file.write(f"{full_url}\n")
                text += f"[{link_text}]({full_url})"
            elif isinstance(content, str):
                text += content
        return text.strip()

    def start_scraping(self):
        manager = Manager()
        visited_urls = manager.list()  # Use a list instead of set
        results = manager.dict()
        queue = Queue()

        # Initialize the queue with the base URL
        queue.put((self.base_url, self.max_depth))

        # Create worker processes
        workers = [
            Process(target=self.worker, args=(visited_urls, results, queue))
            for _ in range(self.num_workers)
        ]

        # Start all workers
        for worker in workers:
            worker.start()

        # Wait for all workers to complete
        for worker in workers:
            worker.join()

        # Save results after scraping
        self.save_results(results)


    def worker(self, visited_urls, results, queue):
        while not queue.empty():
            url, depth = queue.get()
            self.scrape(url, depth, visited_urls, results, queue)

    def save_results(self, filename="scraped_data.txt"):
        """Save the scraped results to a Markdown file."""
        with open(filename, "w", encoding="utf-8") as file:
            for url, content in self.results.items():
                new_content = ""
                lines = content.split("\n")
                for  line in lines:
                    print(len(line.split(" ")), len(line), line)
                    if len(line.split(" ")) > 0 and len(line) > 0:
                        new_content += line + "\n"
                # print(len(new_content))
                file.write(f"URL: {url}\n\n{new_content}\n\n{'-' * 80}\n\n")
        print(f"Scraped data saved to {filename}")

    def results_df(self):
        """Return the scraped results as a Pandas DataFrame."""
        return pd.DataFrame(self.results.items(), columns=["URL", "Content"])


In [None]:
base_url = "https://docs.soliditylang.org/en/v0.8.28/"
skip_domains = [
    "https://soliditylang.org/blog",
    # "https://docs.soliditylang.org/en"
]

scraper = RecursiveWebScraper(base_url, delay=1, max_depth=4, skip_domains=skip_domains, num_workers=1500)
scraper.start_scraping()  #

Scraping: https://docs.soliditylang.org/en/v0.8.28/ | Depth: 1
Scraping: https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html | Depth: 2
Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html | Depth: 2
Scraping: https://docs.soliditylang.org/en/v0.8.28/index.html | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/installing-solidity.html | Depth: 2
Scraping: https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html#a-simple-smart-contract | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html#voting | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/layout-of-source-files.html | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/structure-of-a-contract.html | Depth: 2
Scraping: https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html#storage-example | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/installing-solidity.html#ve

KeyboardInterrupt: 

Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html#id3 | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/abi-spec.html | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/contracts.html#constant-and-immutable-state-variables | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/metadata.html | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/ir-breaking-changes.html#semantic-only-changes | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/security-considerations.html | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html#verifying-payments | Depth: 4Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html#verifying-payments | Depth: 3

Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html#verifying-payments | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/contracts.html#constant | Depth: 3
Scraping: https://docs.soliditylang.o