In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import pandas as pd


import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import pandas as pd


class RecursiveWebScraper:
    def __init__(self, base_url, delay=1, max_depth=2, skip_domains=None):
        self.base_url = base_url
        self.delay = delay
        self.max_depth = max_depth
        self.visited_urls = set()
        self.domain = urlparse(base_url).netloc
        self.skip_domains = skip_domains or []  # Domains to skip
        self.results = {}
        self.url_file = "To_scrape.txt"
        self.url_done = "Scraped.txt"

    def scrape(self, url, depth):
        if depth == 0:
            return  # Stop recursion when depth limit is reached
        
        if url in self.visited_urls:
            return  # Avoid redundant visits

        print(f"Scraping: {url} | Depth: {self.max_depth - depth + 1}")
        self.visited_urls.add(url)
        
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        self.results[url] = self.format_content_as_markdown(url, soup)
        with open(self.url_done, "a+") as file:
            file.write(f"{url}\n")

        # Extract and process links
        for link_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, link_tag['href'])
            if self.is_valid_url(full_url):
                time.sleep(self.delay)  # Respectful scraping
                self.scrape(full_url, depth - 1)  # Recurse with reduced depth

    def is_valid_url(self, url):
        """Check if the URL is valid, within the same domain, and not in the skip list."""
        parsed_url = urlparse(url)
        if parsed_url.netloc != self.domain:
            return False
        for skip_domain in self.skip_domains:
            if url.startswith(skip_domain):
                return False
        return url not in self.visited_urls

    def format_content_as_markdown(self, url, soup):
        """Extract content and format it as Markdown with inline links."""
        markdown = [f"# {soup.title.string.strip() if soup.title else url}"]

        # Traverse the content and replace links inline
        for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'li']):
            if tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                level = int(tag.name[1])
                markdown.append(f"{'#' * level} {self.process_inline_links(tag)}")
            elif tag.name == 'p':
                markdown.append(self.process_inline_links(tag))
            elif tag.name == 'ul':
                for li in tag.find_all('li'):
                    markdown.append(f"- {self.process_inline_links(li)}")

        return "\n\n".join(markdown)

    def process_inline_links(self, tag):
        """Replace links inline within a tag."""
        text = ""
        for content in tag.contents:
            if content.name == 'a' and content.get('href'):
                link_text = content.get_text(strip=True)
                href = content['href']
                full_url = urljoin(self.base_url, href)
                with open(self.url_file, "a+") as file:
                    file.write(f"{full_url}\n")
                text += f"[{link_text}]({full_url})"
            elif isinstance(content, str):
                text += content
        return text.strip()

    def save_results(self, filename="scraped_data.md"):
        """Save the scraped results to a Markdown file."""
        with open(filename, "w", encoding="utf-8") as file:
            for url, content in self.results.items():
                file.write(f"URL: {url}\n\n{content}\n\n{'-' * 80}\n\n")
        print(f"Scraped data saved to {filename}")

    def results_df(self):
        """Return the scraped results as a Pandas DataFrame."""
        return pd.DataFrame(self.results.items(), columns=["URL", "Content"])




In [38]:
# Usage
base_url = "https://soliditylang.org/about/"
skip_domains = [
    "https://soliditylang.org/blog",
    # "https://docs.soliditylang.org/en"
]

scraper = RecursiveWebScraper(base_url, delay=1, max_depth=2, skip_domains=skip_domains)
scraper.scrape(base_url, depth=2)  # Start scraping with depth limit of 2
scraper.save_results("scraped_data.md")

Scraping: https://soliditylang.org/about/ | Depth: 1


UnboundLocalError: cannot access local variable 'full_url' where it is not associated with a value

In [14]:
df = scraper.df_results()

In [15]:
len(df)

6