In [2]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin

# Define the base URL and starting URL
BASE_URL = "https://scalingbitcoin.org" #BASE URL WHICH IS USED TO VALIDATE FOUND SUB-PAGE LINKS
START_URL = "https://scalingbitcoin.org/transcripts" #START OF THE URL WHICH WILL BE USED TO EXTRACT CONTENTS AND LEAD TO OTHER PAGES

def get_page_content(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
               "AppleWebKit/537.36 (KHTML, like Gecko) "
               "Chrome/120.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch {url}: {response.status_code}")
    return None

def extract_links(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        full_url = urljoin(base_url, href)
        # Filter to ensure we only follow links within our base domain
        if full_url.startswith(base_url):
            links.add(full_url)
    return list(links)

def extract_data(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Extract visible text
    text = soup.get_text(separator='\n', strip=True)
    
    # Extract hidden content (e.g., elements with display:none)
    hidden_content = []
    for element in soup.find_all(style=True):
        if "display:none" in element.get("style"):
            hidden_content.append(element.get_text(strip=True))
    
    return {"text": text, "hidden": hidden_content}

def scrape_site(start_url, max_urls=5):
    visited = set()
    data = {}
    to_visit = [start_url]
    
    while to_visit and len(visited) < max_urls:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        
        print(f"Scraping: {current_url}")
        html = get_page_content(current_url)
        if html:
            # Extract data from the page and store it with the URL as key
            data[current_url] = extract_data(html)
            # Extract and add new links to the queue
            new_links = extract_links(html, BASE_URL)
            for link in new_links:
                if link not in visited and link not in to_visit:
                    to_visit.append(link)
        visited.add(current_url)
        # Respectful delay between requests
        time.sleep(1)
    
    return data

if __name__ == "__main__":
    # Limit scraping to only 3 URLs (or input number)
    max_urls = 100
    scraped_data = scrape_site(START_URL, max_urls)
    # Save the scraped data in JSON format for later use
    with open("scraped_data.json", "w", encoding="utf-8") as f:
        json.dump(scraped_data, f, ensure_ascii=False, indent=4)


Scraping: https://scalingbitcoin.org/transcripts
Scraping: https://scalingbitcoin.org/transcript/hongkong2015/day-2-opening
Scraping: https://scalingbitcoin.org/transcript/telaviv2019/elastic-block-caps
Scraping: https://scalingbitcoin.org/presentations/#SB5-15
Scraping: https://scalingbitcoin.org/presentations/#SB1-31
Scraping: https://scalingbitcoin.org/transcript/hongkong2015/braiding-the-blockchain
Scraping: https://scalingbitcoin.org/transcript/montreal2015/security-of-diminishing-block-subsidy
Scraping: https://scalingbitcoin.org/transcript/telaviv2019/bitml
Scraping: https://scalingbitcoin.org/transcript/hongkong2015/security-assumptions
Scraping: https://scalingbitcoin.org/presentations/#SB1-28
Scraping: https://scalingbitcoin.org/presentations/#SB3-21
Scraping: https://scalingbitcoin.org/transcript/hongkong2015/segregated-witness-and-its-impact-on-scalability
Scraping: https://scalingbitcoin.org/transcript/telaviv2019/wip-vaults
Scraping: https://scalingbitcoin.org/presentatio

In [6]:
import json
import os
from pathlib import Path

# Create a new directory for the text files
output_dir = Path("extracted_texts")
output_dir.mkdir(exist_ok=True)


# Load the JSON data
with open("scraped_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract each URL and its text to a separate text file
for i, (url, content) in enumerate(data.items()):
    # Create a filename based on index and URL
    url_part = url.split('/')[-1]  # Get the last part of the URL
    filename = f"{i+1:03d}_{url_part}"
    
    # Clean the filename to make it suitable for the filesystem
    clean_filename = "".join(c if c.isalnum() or c in "._-" else "_" for c in filename)
    clean_filename = clean_filename[:50]  # Limit filename length
    
    # Extract text content
    text_content = content.get("text", "") if isinstance(content, dict) else ""
    
    # Write to file
    with open(output_dir / f"{clean_filename}.txt", "w", encoding="utf-8") as f:
        f.write(text_content)

print(f"Extracted {len(data)} items to {output_dir}")


Extracted 100 items to extracted_texts


In [None]:
!pip install firecrawl-py

Collecting firecrawl-py
  Downloading firecrawl_py-1.14.1-py3-none-any.whl.metadata (10 kB)


Downloading firecrawl_py-1.14.1-py3-none-any.whl (19 kB)
Installing collected packages: firecrawl-py
Successfully installed firecrawl-py-1.14.1


In [None]:
# Install with pip install firecrawl-py
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your api key')

crawl_result = app.crawl_url('https://scalingbitcoin.org/transcripts', params={
'limit': 2, #add limit accordingly to crawl for sublinks
'scrapeOptions': {
	'formats': [ 'markdown' ],
  }
})

print(crawl_result)

{'success': True, 'status': 'completed', 'completed': 2, 'total': 2, 'creditsUsed': 2, 'expiresAt': '2025-03-18T18:17:29.000Z', 'data': [{'markdown': 'Jump to\n\n1. [Confluence navigation](https://mifosforge.jira.com/wiki/spaces/projects/overview#AkTopNav)\n2. [Side navigation](https://mifosforge.jira.com/wiki/spaces/projects/overview#AkSideNavigation)\n3. [Page](https://mifosforge.jira.com/wiki/spaces/projects/overview#AkMainContent)\n\nAtlassian uses cookies to improve your browsing experience, perform analytics and research, and conduct advertising. Accept all cookies to indicate that you agree to our use of cookies on your device. [Atlassian cookies and tracking notice, (opens new window)](https://www.atlassian.com/legal/cookies)\n\nPreferencesOnly necessaryAccept all\n\nProjects\n\n# ![](https://mifosforge.jira.com/wiki/download/attachments/1736705/global.logo?version=2&modificationDate=1427914757501&cacheVersion=1&api=v2)Home\n\nThe Projects space is where we track and share with