In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import pandas as pd





class RecursiveWebScraper:
    def __init__(self, base_url, delay=1, max_depth=2, skip_domains=None):
        self.base_url = base_url
        self.delay = delay
        self.max_depth = max_depth
        self.visited_urls = set()
        self.domain = urlparse(base_url).netloc
        self.skip_domains = skip_domains or []  # Domains to skip
        self.results = {}
        self.url_file = "To_scrape.txt"
        self.url_done = "Scraped.txt"

    def scrape(self, url, depth):
        if depth == 0:
            return  # Stop recursion when depth limit is reached
        
        if url in self.visited_urls:
            return  # Avoid redundant visits

        print(f"Scraping: {url} | Depth: {self.max_depth - depth + 1}")
        self.visited_urls.add(url)
        
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        self.results[url] = self.format_content_as_markdown(url, soup)
        with open(self.url_done, "a+") as file:
            file.write(f"{url}\n")

        # Extract and process links
        for link_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, link_tag['href'])
            if self.is_valid_url(full_url):
                time.sleep(self.delay)  # Respectful scraping
                self.scrape(full_url, depth - 1)  # Recurse with reduced depth

    def is_valid_url(self, url):
        """Check if the URL is valid, within the same domain, and not in the skip list."""
        parsed_url = urlparse(url)
        if parsed_url.netloc != self.domain:
            return False
        for skip_domain in self.skip_domains:
            if url.startswith(skip_domain):
                return False
        return url not in self.visited_urls

    def format_content_as_markdown(self, url, soup):
        """Extract content and format it as Markdown with inline links."""
        markdown = [f"# {soup.title.string.strip() if soup.title else url}"]

        # Traverse the content and replace links and Solidity code inline
        for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'li', 'div']):
            if tag.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                level = int(tag.name[1])
                markdown.append(f"{'#' * level} {self.process_inline_links(tag)}")
            elif tag.name == 'p':
                markdown.append(self.process_inline_links(tag))
            elif tag.name == 'ul':
                for li in tag.find_all('li'):
                    markdown.append(f"- {self.process_inline_links(li)}")
            elif tag.name == 'div' and 'highlight-solidity' in tag.get('class', []):
                pre_tag = tag.find('pre')
                if pre_tag:
                    code = pre_tag.get_text(strip=False)
                    markdown.append(f"
solidity\n{code}\n
")

        return "\n\n".join(markdown)

    def process_inline_links(self, tag):
        """Replace links inline within a tag."""
        text = ""
        for content in tag.contents:
            if content.name == 'a' and content.get('href'):
                link_text = content.get_text(strip=True)
                href = content['href']
                full_url = urljoin(self.base_url, href)
                with open(self.url_file, "a+") as file:
                    file.write(f"{full_url}\n")
                text += f"[{link_text}]({full_url})"
            elif isinstance(content, str):
                text += content
        return text.strip()

    def save_results(self, filename="scraped_data.md"):
        """Save the scraped results to a Markdown file."""
        with open(filename, "w", encoding="utf-8") as file:
            for url, content in self.results.items():
                new_content = ""
                lines = content.split("\n")
                for  line in lines:
                    print(len(line.split(" ")), len(line), line)
                    if len(line.split(" ")) > 0 and len(line) > 0:
                        new_content += line + "\n"
                print(len(new_content))
                file.write(f"URL: {url}\n\n{new_content}\n\n{'-' * 80}\n\n")
        print(f"Scraped data saved to {filename}")

    def results_df(self):
        """Return the scraped results as a Pandas DataFrame."""
        return pd.DataFrame(self.results.items(), columns=["URL", "Content"])

In [None]:
# Usage
base_url = "https://docs.soliditylang.org/en/v0.8.28/"
skip_domains = [
    "https://soliditylang.org/blog",
    # "https://docs.soliditylang.org/en"
]

scraper = RecursiveWebScraper(base_url, delay=1, max_depth=5, skip_domains=skip_domains)
scraper.scrape(base_url, depth=5)  # Start scraping with depth limit of 2


Scraping: https://docs.soliditylang.org/en/v0.8.28/ | Depth: 1
Scraping: https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html | Depth: 2
Scraping: https://docs.soliditylang.org/en/v0.8.28/index.html | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/installing-solidity.html | Depth: 2
Scraping: https://docs.soliditylang.org/en/v0.8.28/installing-solidity.html#versioning | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html#a-simple-smart-contract | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html#storage-example | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/solidity-by-example.html#voting | Depth: 5
Scraping: https://docs.soliditylang.org/en/v0.8.28/installing-solidity.html#remix | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/layout-of-source-

KeyboardInterrupt: 

Scraping: https://docs.soliditylang.org/en/v0.8.28/types.html#arrays | Depth: 5
Scraping: https://docs.soliditylang.org/en/v0.8.28/contracts.html#libraries | Depth: 5
Scraping: https://docs.soliditylang.org/en/v0.8.28/metadata.html#encoding-of-the-metadata-hash-in-the-bytecode | Depth: 5
Scraping: https://docs.soliditylang.org/en/v0.8.28/units-and-global-variables.html#special-variables-functions | Depth: 3
Scraping: https://docs.soliditylang.org/en/v0.8.28/contracts.html#modifier-overriding | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/contracts.html#constructors | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html#simple-smart-contract | Depth: 4
Scraping: https://docs.soliditylang.org/en/v0.8.28/types.html#bytes-and-string-as-arrays | Depth: 5
Scraping: https://docs.soliditylang.org/en/v0.8.28/types.html#the-functions-bytes-concat-and-string-concat | Depth: 5
Scraping: https://docs.soliditylang.org/en/v0.8.28/contracts.ht

In [31]:
scraper.save_results("scraped_data.md")

6 38 # Home | Solidity Programming Language
1 0 
2 12 # {Solidity}
1 0 
1 6 pragma
1 0 
1 8 contract
1 0 
1 8 function
1 0 
1 8 modifier
1 0 
1 5 event
1 0 
1 6 struct
1 0 
1 4 enum
1 0 
1 7 require
1 0 
1 7 address
1 0 
1 8 Solidity
1 0 
36 309 [Solidity 0.8.28](https://soliditylang.org/blog/2024/10/09/solidity-0.8.28-release-announcement)  Introducing the newest version of the compiler which brings full support for transient storage state variables of value types, improvements to speed up compilation via IR and lower RAM usage, bugfixes, and more!
1 0 
5 31 ## Solidity is evolving rapidly
1 0 
31 244 We aim for a regular (non-breaking) release every month, with approximately one breaking release per year. You can follow the implementation status of new features in the  [Solidity GitHub project](https://github.com/orgs/ethereum/projects/26).
1 0 
4 25 ## Contribute to Solidity
1 0 
23 143 Solidity continues to improve with help from our global community. Check out these ways to get in

In [14]:
df = scraper.df_results()

In [15]:
len(df)

6

## need to handle codeblocks of remix editors inplace

In [54]:
import requests
from bs4 import BeautifulSoup

# https://remix.ethereum.org/#language=solidity&version=soljson-v0.8.28+commit.7893614a.js&code=Ly8gU1BEWC1MaWNlbnNlLUlkZW50aWZpZXI6IEdQTC0zLjAKcHJhZ21hIHNvbGlkaXR5ID49MC40LjE2IDwwLjkuMDsKCmNvbnRyYWN0IFNpbXBsZVN0b3JhZ2UgewogICAgdWludCBzdG9yZWREYXRhOwoKICAgIGZ1bmN0aW9uIHNldCh1aW50IHgpIHB1YmxpYyB7CiAgICAgICAgc3RvcmVkRGF0YSA9IHg7CiAgICB9CgogICAgZnVuY3Rpb24gZ2V0KCkgcHVibGljIHZpZXcgcmV0dXJucyAodWludCkgewogICAgICAgIHJldHVybiBzdG9yZWREYXRhOwogICAgfQp9&lang=en&optimize=false&runs=200&evmVersion=null
url = "https://remix.ethereum.org/#language=solidity&version=soljson-v0.8.28+commit.7893614a.js&code=Ly8gU1BEWC1MaWNlbnNlLUlkZW50aWZpZXI6IEdQTC0zLjAKcHJhZ21hIHNvbGlkaXR5ID49MC40LjE2IDwwLjkuMDsKCmNvbnRyYWN0IFNpbXBsZVN0b3JhZ2UgewogICAgdWludCBzdG9yZWREYXRhOwoKICAgIGZ1bmN0aW9uIHNldCh1aW50IHgpIHB1YmxpYyB7CiAgICAgICAgc3RvcmVkRGF0YSA9IHg7CiAgICB9CgogICAgZnVuY3Rpb24gZ2V0KCkgcHVibGljIHZpZXcgcmV0dXJucyAodWludCkgewogICAgICAgIHJldHVybiBzdG9yZWREYXRhOwogICAgfQp9&lang=en&optimize=false&runs=200&evmVersion=null"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="overflow-hidden">
 <head>
  <base href="/"/>
  <meta charset="utf-8"/>
  <!--
		The MIT License (MIT)
		Copyright (c) 2014, 2015, the individual contributors
		Permission is hereby granted, free of charge, to any person obtaining a copy
		of this software and associated documentation files (the "Software"), to deal
		in the Software without restriction, including without limitation the rights
		to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
		copies of the Software, and to permit persons to whom the Software is
		furnished to do so, subject to the following conditions:
		The above copyright notice and this permission notice shall be included in
		all copies or substantial portions of the Software.
		THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
		AUTHO

In [66]:
import requests
from bs4 import BeautifulSoup

import requests
from bs4 import BeautifulSoup

def extract_solidity_code(url):
    try:
        # Fetch the content of the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all <pre> elements
        pre_elements = soup.find_all('pre')

        solidity_codes = []

        # Iterate through <pre> elements
        for pre in pre_elements:
            # Extract text content, ignoring any tags like <span>
            code = ''.join(pre.stripped_strings)
            if 'pragma solidity' in code or 'contract' in code:
                solidity_codes.append(code)

        return solidity_codes

    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

# Example usage
url = "https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html"
solidity_code_snippets = extract_solidity_code(url)

if solidity_code_snippets:
    print("Extracted Solidity Code:")
    for code in solidity_code_snippets:
        print(code)
else:
    print("No Solidity code found on the page.")


Extracted Solidity Code:
// SPDX-License-Identifier: GPL-3.0pragma solidity>=0.4.16<0.9.0;contractSimpleStorage{uintstoredData;functionset(uintx)public{storedData=x;}functionget()publicviewreturns(uint){returnstoredData;}}
// SPDX-License-Identifier: GPL-3.0pragma solidity^0.8.26;// This will only compile via IRcontractCoin{// The keyword "public" makes variables// accessible from other contractsaddresspublicminter;mapping(address=>uint)publicbalances;// Events allow clients to react to specific// contract changes you declareeventSent(addressfrom,addressto,uintamount);// Constructor code is only run when the contract// is createdconstructor(){minter=msg.sender;}// Sends an amount of newly created coins to an address// Can only be called by the contract creatorfunctionmint(addressreceiver,uintamount)public{require(msg.sender==minter);balances[receiver]+=amount;}// Errors allow you to provide information about// why an operation failed. They are returned// to the caller of the function.e

In [57]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.26.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.26.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloa

In [63]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

def fetch_all_elements_with_js(url):
    # Set up headless browser options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    
    # Initialize WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    
    # Get the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Close the driver
    driver.quit()
    
    # Pretty print all elements
    return soup.prettify()

# Example usage for JavaScript-rendered pages
url = "https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html"
all_elements_js = fetch_all_elements_with_js(url)
print(all_elements_js)

<html class="writer-html5" lang="en" style="--color-scheme: light">
 <head>
  <meta charset="utf-8"/>
  <meta content="Docutils 0.19: https://docutils.sourceforge.io/" name="generator"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Introduction to Smart Contracts — Solidity 0.8.28 documentation
  </title>
  <style type="text/css">
   :root, :host {
  --fa-font-solid: normal 900 1em/1 "Font Awesome 6 Free";
  --fa-font-regular: normal 400 1em/1 "Font Awesome 6 Free";
  --fa-font-light: normal 300 1em/1 "Font Awesome 6 Pro";
  --fa-font-thin: normal 100 1em/1 "Font Awesome 6 Pro";
  --fa-font-duotone: normal 900 1em/1 "Font Awesome 6 Duotone";
  --fa-font-brands: normal 400 1em/1 "Font Awesome 6 Brands";
  --fa-font-sharp-solid: normal 900 1em/1 "Font Awesome 6 Sharp";
  --fa-font-sharp-regular: normal 400 1em/1 "Font Awesome 6 Sharp";
  --fa-font-sharp-light: normal 300 1em/1 "Font Awesome 6 Sharp";
  --fa-font-sharp-thin: normal 100 1em/1 "Font

In [64]:
"""solidity""" in all_elements_js

True

In [70]:
import requests
from bs4 import BeautifulSoup


def fetch_and_parse_solidity_code(url):
    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad HTTP responses
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all targeted divs
        elements = soup.find_all('div', class_='highlight-solidity notranslate')
        
        # Extract and format the Solidity code
        for idx, element in enumerate(elements, 1):
            pre_tag = element.find('pre')  # Locate the <pre> tag containing the code
            if pre_tag:
                code = pre_tag.get_text()  # Extract the text from the <pre> tag
                print(f"Solidity Code Block {idx}:\n```solidity\n{code}\n```\n")
            else:
                print(f"Solidity Code Block {idx}:\nNo <pre> tag found.\n")
        
        print(f"\nTotal occurrences: {len(elements)}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")


# Example usage
url = "https://docs.soliditylang.org/en/v0.8.28/introduction-to-smart-contracts.html"
print(f"Occurrences: {fetch_and_parse_solidity_code(url)}")


Solidity Code Block 1:
```solidity
// SPDX-License-Identifier: GPL-3.0
pragma solidity >=0.4.16 <0.9.0;

contract SimpleStorage {
    uint storedData;

    function set(uint x) public {
        storedData = x;
    }

    function get() public view returns (uint) {
        return storedData;
    }
}

```

Solidity Code Block 2:
```solidity
// SPDX-License-Identifier: GPL-3.0
pragma solidity ^0.8.26;

// This will only compile via IR
contract Coin {
    // The keyword "public" makes variables
    // accessible from other contracts
    address public minter;
    mapping(address => uint) public balances;

    // Events allow clients to react to specific
    // contract changes you declare
    event Sent(address from, address to, uint amount);

    // Constructor code is only run when the contract
    // is created
    constructor() {
        minter = msg.sender;
    }

    // Sends an amount of newly created coins to an address
    // Can only be called by the contract creator
    function