In [1]:
import os
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import html2text
import hashlib
import shutil
import readability
import logging

# URLs already handed to scrape_site. That function checks and updates this
# module-level set so no page is fetched twice and link cycles cannot cause
# infinite recursion. Nothing in the visible code clears it, so a second crawl
# in the same session will skip every URL that was already seen.
visited_urls = set()

# Configure the root logger: INFO and above, formatted as
# "<timestamp> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def download_page(url, timeout=10):
    """
    Fetch a web page and return its body as text.

    Args:
        url (str): Address of the page to fetch.
        timeout (int): Seconds to wait before the request is abandoned.

    Returns:
        str: The decoded response body, or None when the request failed
        (any requests.RequestException, including a bad HTTP status).

    Note:
        requests.RequestException is caught here and logged as a warning;
        it is not propagated to the caller.
    """
    try:
        page = requests.get(url, timeout=timeout)
        # Turn bad status codes into an exception so they take the failure path
        page.raise_for_status()
        body = page.text
    except requests.RequestException as err:
        logging.warning(f"Error downloading {url}: {err}")
        return None
    return body

def extract_urls(html, base_url, ignored_extensions=None):
    """
    Extracts same-host links from the given HTML content.

    Every ``<a href=...>`` is resolved against ``base_url`` after its hash
    fragment has been removed. A link is kept only when its host (netloc)
    equals the host of ``base_url`` and it does not point at an ignored
    file type.

    Args:
        html (str): The HTML content to extract URLs from.
        base_url (str): The base URL used to resolve relative URLs.
        ignored_extensions (list): File extensions to ignore. Defaults to
            ['.txt', '.pdf', '.docx'] when None. Matching is
            case-insensitive and is applied both to the full URL and to the
            URL path, so 'file.PDF' and 'file.pdf?download=1' are skipped too.

    Returns:
        set: A set of absolute URLs extracted from the HTML content.
    """
    if ignored_extensions is None:
        ignored_extensions = ['.txt', '.pdf', '.docx']
    # str.endswith accepts a tuple, so all extensions are tested in one call
    skipped_suffixes = tuple(ext.lower() for ext in ignored_extensions)
    # Loop-invariant: the host every kept link must match
    base_netloc = urlparse(base_url).netloc

    soup = BeautifulSoup(html, 'html.parser')
    urls = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Resolve relative URLs and drop the hash fragment
        full_url = urljoin(base_url, href.split('#', 1)[0])
        parsed = urlparse(full_url)
        # Check the path as well as the whole URL: a query string after the
        # file name would otherwise hide the extension from the filter
        if (full_url.lower().endswith(skipped_suffixes)
                or parsed.path.lower().endswith(skipped_suffixes)):
            continue
        if parsed.netloc == base_netloc:
            urls.add(full_url)
    return urls

def html_to_markdown(html):
    """
    Turn an HTML page into Markdown, keeping only its main content.

    Parameters:
    html (str): The HTML content to be converted.

    Returns:
    str: Markdown produced from the readability summary of the page.
    """
    # readability reduces the page to its main content before conversion
    main_content = readability.Document(html).summary()

    md_converter = html2text.HTML2Text()
    # Keep hyperlinks in the generated Markdown
    md_converter.ignore_links = False
    markdown_text = md_converter.handle(main_content)
    return markdown_text

def save_markdown(markdown, folder, filename):
    """
    Write Markdown text to ``folder/filename`` as UTF-8.

    The folder (including missing parents) is created when needed. An
    existing file with the same name is overwritten.

    Args:
        markdown (str): The Markdown text to write.
        folder (str): Target directory. An empty string means the current
            working directory.
        filename (str): Name of the file to create inside ``folder``.

    Returns:
        None
    """
    # os.makedirs('') raises, so only create a directory when one is named.
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    if folder:
        os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(markdown)

def generate_filename(url, base_url):
    """
    Generate a Markdown filename from a URL, relative to a base URL.

    Only the path component of each URL is used. When the URL path lies
    under the base path, the base path is removed first. The remaining
    segments are joined with hyphens, lower-cased, and given an ".md"
    suffix. An empty remainder yields "index.md".

    Args:
        url (str): The URL from which the filename will be generated.
        base_url (str): The base URL whose path is removed from the URL path.

    Returns:
        str: The generated filename.
    """
    base_path = urlparse(base_url).path.strip('/')
    unique_path = urlparse(url).path.strip('/')

    # Strip the base path only on a whole-segment match. A plain startswith()
    # would also cut "docs" out of "docs-old/page" and leave "-old/page".
    is_under_base = (
        unique_path == base_path or unique_path.startswith(base_path + '/')
    )
    if base_path and is_under_base:
        unique_path = unique_path[len(base_path):].strip('/')

    if not unique_path:
        return "index.md"
    # Flatten the remaining path segments into a single file name
    return unique_path.replace('/', '-').lower() + ".md"

def scrape_site(url, base_url, base_folder='', depth_limit=3, current_depth=0):
    """
    Crawl a site depth-first and store each visited page as a markdown file.

    A page is processed only when its URL starts with ``base_url``, has the
    same host as ``base_url``, and is not yet in the module-level
    ``visited_urls`` set. Files are written to ``<base_folder>/<host>/``.

    Args:
        url (str): The page to scrape in this call.
        base_url (str): The base URL of the website.
        base_folder (str, optional): The base folder to save the markdown files. Defaults to ''.
        depth_limit (int, optional): Maximum recursion depth. Defaults to 3.
        current_depth (int, optional): Current recursion depth. Defaults to 0.

    Returns:
        None
    """
    # Guard: stay inside the base URL prefix
    if not url.startswith(base_url):
        return

    # Guard: skip pages already seen
    if url in visited_urls:
        return

    # Guard: skip pages on a different host
    page_host = urlparse(url).netloc
    site_host = urlparse(base_url).netloc
    if page_host != site_host:
        return

    visited_urls.add(url)

    logging.info(f"Scraping {url} (depth: {current_depth})")
    html = download_page(url)
    if not html:
        # Download failed or returned an empty body: nothing to save or follow
        return

    page_markdown = html_to_markdown(html)
    target_name = generate_filename(url, base_url)
    target_dir = os.path.join(base_folder, site_host)
    save_markdown(page_markdown, target_dir, target_name)

    # Stop descending once the depth limit is reached
    if current_depth >= depth_limit:
        return

    next_depth = current_depth + 1
    for child_url in extract_urls(html, url):
        scrape_site(child_url, base_url, base_folder, depth_limit, next_depth)

def clean_directory(folder):
    """
    Deletes all files and folders in the specified directory.

    The directory itself is kept. A path that does not exist is silently
    ignored. Deletion is best-effort: if an entry cannot be removed, the
    error is logged and the remaining entries are still processed.

    Args:
        folder (str): The path to the directory to be cleaned.

    Returns:
        None
    """
    if not os.path.exists(folder):
        return

    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            # Symlinks are unlinked rather than followed, so the link target
            # is never deleted
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Only filesystem errors are expected here; anything else is a
            # programming error and should surface
            logging.error(f'Failed to delete {file_path}. Reason: {e}')

# Example usage: crawl one site and store its pages as Markdown files
base_url = 'https://docs.alpaca.markets/'  # Site root; change this URL to your target
start_url = f'{base_url}'  # First page to fetch; here the crawl starts at the site root
base_folder = 'output'  # Output root; pages are written to <base_folder>/<host>/

# Empty the output folder first so files from an earlier run do not linger
clean_directory(base_folder)
# start_url is passed as both the starting page and the base URL, so only
# URLs that begin with it are scraped
scrape_site(start_url, start_url, base_folder, depth_limit=3)
logging.info("Scraping complete.")

2024-05-09 00:27:42,918 - INFO - Scraping https://docs.alpaca.markets/ (depth: 0)
2024-05-09 00:27:43,248 - INFO - Scraping https://docs.alpaca.markets/docs (depth: 1)
2024-05-09 00:27:43,588 - INFO - Scraping https://docs.alpaca.markets/docs/account-plans (depth: 2)
2024-05-09 00:27:44,633 - INFO - Scraping https://docs.alpaca.markets/docs/account-opening (depth: 3)
2024-05-09 00:27:45,127 - INFO - Scraping https://docs.alpaca.markets/docs/working-with-orders (depth: 3)
2024-05-09 00:27:45,659 - INFO - Scraping https://docs.alpaca.markets/docs/crypto-orders (depth: 3)
2024-05-09 00:27:46,155 - INFO - Scraping https://docs.alpaca.markets/docs/crypto-fees (depth: 3)
2024-05-09 00:27:46,707 - INFO - Scraping https://docs.alpaca.markets/docs/real-time-stock-pricing-data (depth: 3)
2024-05-09 00:27:47,398 - INFO - Scraping https://docs.alpaca.markets/docs/funding-via-journals (depth: 3)
2024-05-09 00:27:47,709 - INFO - Scraping https://docs.alpaca.markets/docs/user-protection (depth: 3)
20