In [None]:
# imports

import re
import os
import requests
import html2text
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
def scrape_paginated_pages(start_url, next_link_selector, next_link_attribute='href', timeout=30):
    """
    Collect the URLs of a paginated site by following its 'next page' links.

    Starting at ``start_url``, each page is fetched and parsed, its URL is
    recorded, and the first ``<a>`` tag matching ``next_link_selector`` is
    followed. The walk ends when any of the following happens:

    * the current URL contains ``'_autosummary/nukescripts'`` (that page is
      neither fetched nor recorded);
    * the current URL was already visited during this walk (link cycle);
    * a page has no matching link, or the link has no usable
      ``next_link_attribute`` value;
    * a request fails (the error is printed and the URLs gathered so far
      are still returned).

    Args:
        start_url (str): The URL of the first page to visit.
        next_link_selector (dict): Attributes identifying the 'next page'
                                   ``<a>`` tag, e.g., {'class': 'next-button'}.
                                   Passed to ``soup.find`` as keyword filters.
        next_link_attribute (str): The attribute of the link tag that holds the URL,
                                   e.g., 'href'.
        timeout (float | tuple): Timeout forwarded to ``requests.get`` so that
                                 an unresponsive server cannot block forever.

    Returns:
        list: The URLs of the pages that were fetched successfully, in visit order.
    """
    current_url = start_url
    all_scraped_data = []
    visited_urls = set()
    page_count = 1

    while current_url:

        if '_autosummary/nukescripts' in current_url:
            print("Stopping: Reached a nukescripts autosummary page.")
            break

        # A link that leads back to an earlier page would otherwise loop forever.
        if current_url in visited_urls:
            print(f"Stopping: {current_url} was already visited.")
            break
        visited_urls.add(current_url)

        print(f"Scraping page {page_count}: {current_url}")

        try:
            response = requests.get(
                current_url,
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=timeout,
            )
            response.raise_for_status()  # Turn HTTP error statuses into exceptions
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {current_url}: {e}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        next_link_tag = soup.find('a', **next_link_selector)

        all_scraped_data.append(current_url)

        if not next_link_tag:
            print("No 'next page' link found. Ending scraping.")
            break

        next_page_relative_url = next_link_tag.get(next_link_attribute)
        if not next_page_relative_url:
            # urljoin() returns the base URL unchanged for an empty/None target,
            # which would make the loop re-fetch the same page endlessly.
            print(f"'Next page' link has no '{next_link_attribute}' attribute. Ending scraping.")
            break

        # Resolve against the page the link was found on, not the start page.
        current_url = urljoin(current_url, next_page_relative_url)
        page_count += 1

    return all_scraped_data

In [None]:
# Crawl forwards from the guide's intro page, following the <a rel="next">
# link found on each page.
start_url = "https://learn.foundry.com/nuke/developers/16.0/pythondevguide/intro.html"
next_link_selector = dict(rel="next")

# URLs of every page visited by the forward crawl.
pages = scrape_paginated_pages(start_url, next_link_selector)

In [None]:
def scrape_nukescript_pages(start_url, next_link_selector, next_link_attribute='href', timeout=30):
    """
    Collect the URLs of the nukescripts autosummary pages by following a link chain.

    Starting at ``start_url``, each page is fetched and parsed, its URL is
    recorded, and the first ``<a>`` tag matching ``next_link_selector`` is
    followed. Only URLs containing ``'_autosummary/nukescripts'`` are visited.
    The walk ends when any of the following happens:

    * the current URL no longer contains ``'_autosummary/nukescripts'``;
    * the current URL was already visited during this walk (link cycle);
    * a page has no matching link, or the link has no usable
      ``next_link_attribute`` value;
    * a request fails (the error is printed and the URLs gathered so far
      are still returned).

    Special case: when the walk arrives at ``nukescripts.autoBackdrop.html`` it
    is redirected to a hard-coded ``nukescripts.anySelectedVertexInfo.html`` URL,
    so the autoBackdrop page itself is never fetched or recorded.

    Args:
        start_url (str): The URL of the first page to visit.
        next_link_selector (dict): Attributes identifying the link to follow,
                                   e.g., {'rel': 'prev'}. Passed to ``soup.find``
                                   as keyword filters.
        next_link_attribute (str): The attribute of the link tag that holds the URL,
                                   e.g., 'href'.
        timeout (float | tuple): Timeout forwarded to ``requests.get`` so that
                                 an unresponsive server cannot block forever.

    Returns:
        list: The URLs of the pages that were fetched successfully, in visit order.
    """
    current_url = start_url
    all_scraped_data = []
    visited_urls = set()
    page_count = 1

    while current_url:

        if '_autosummary/nukescripts' not in current_url:
            print("Stopping: Left the nukescripts autosummary pages.")
            break

        # NOTE(review): presumably a workaround for the link order on the site
        # around the autoBackdrop page -- confirm this jump is still required.
        if '_autosummary/nukescripts.autoBackdrop.html' in current_url:
            current_url = 'https://learn.foundry.com/nuke/developers/16.0/pythondevguide/_autosummary/nukescripts.anySelectedVertexInfo.html'

        # A link that leads back to an earlier page would otherwise loop forever.
        if current_url in visited_urls:
            print(f"Stopping: {current_url} was already visited.")
            break
        visited_urls.add(current_url)

        print(f"Scraping page {page_count}: {current_url}")

        try:
            response = requests.get(
                current_url,
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=timeout,
            )
            response.raise_for_status()  # Turn HTTP error statuses into exceptions
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {current_url}: {e}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        next_link_tag = soup.find('a', **next_link_selector)

        all_scraped_data.append(current_url)

        if not next_link_tag:
            print("No 'next page' link found. Ending scraping.")
            break

        next_page_relative_url = next_link_tag.get(next_link_attribute)
        if not next_page_relative_url:
            # urljoin() returns the base URL unchanged for an empty/None target,
            # which would make the loop re-fetch the same page endlessly.
            print(f"Link has no '{next_link_attribute}' attribute. Ending scraping.")
            break

        # Resolve against the page the link was found on, not the start page.
        current_url = urljoin(current_url, next_page_relative_url)
        page_count += 1

    return all_scraped_data

In [None]:
# Crawl backwards through the nukescripts autosummary pages: begin at the
# widgetgroup.Qt page and follow the <a rel="prev"> link found on each page.
start_url = "https://learn.foundry.com/nuke/developers/16.0/pythondevguide/_autosummary/nukescripts.widgetgroup.Qt.html"
prev_link_selector = dict(rel="prev")

# URLs of every nukescripts page visited by the backward crawl.
nukescript_pages = scrape_nukescript_pages(start_url, prev_link_selector)

In [None]:
# Sanity check: how many nukescripts page URLs are held in nukescript_pages.
print(f"{len(nukescript_pages)}")

In [None]:
def get_page_response(url, timeout=30):
    """
    Fetch ``url`` with an HTTP GET and return the response, or ``None`` on failure.

    The request sends the same 'Mozilla/5.0' User-Agent header as the
    pagination helpers in this notebook and is bounded by ``timeout`` so an
    unresponsive server cannot block forever.

    Args:
        url (str): The address to fetch.
        timeout (float | tuple): Timeout forwarded to ``requests.get``.

    Returns:
        requests.Response | None: The response when the request succeeded with
        a non-error status; ``None`` when any ``RequestException`` occurred
        (connection error, timeout, HTTP error status). The error is printed.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

    return response

In [None]:
def _markdown_filename(url):
    """Derive a flat '.md' file name from the last path segment of ``url``."""
    # Drop the fragment and query first so they cannot leak into the file name
    # (a '/' inside a fragment would otherwise be mistaken for a path separator).
    page_name = url.split('#', 1)[0].split('?', 1)[0].split('/')[-1]
    if not page_name:
        # URL ended with '/': there is no page segment to name the file after.
        page_name = 'index.html'

    raw_filename = page_name.replace(".html", ".md")
    base_name, extension = os.path.splitext(raw_filename)

    # Dots inside the stem (e.g. 'nukescripts.autoBackdrop') become underscores.
    return base_name.replace('.', '_') + extension


def scrape_data(urls, output):
    """
    Download each URL, convert its main content to Markdown, and save it under ``output``.

    For every page, elements with the 'highlight' class are replaced by a
    ``<pre>`` tag holding their text wrapped in a fenced ``python`` code block;
    the page's ``<main>`` element (or ``<body>`` when there is no ``<main>``)
    is then converted with html2text and written to
    ``<output>/<page name>.md``.

    Pages that cannot be fetched, or that contain neither ``<main>`` nor
    ``<body>``, are reported and skipped so one bad page does not abort the run.

    Args:
        urls (iterable of str): The page URLs to download.
        output (str): Directory for the Markdown files; created if missing.

    Returns:
        None
    """
    os.makedirs(output, exist_ok=True)

    for url in urls:

        response = get_page_response(url)
        if response is None:
            # get_page_response returns None after printing the request error.
            print(f"Skipping {url}: page could not be fetched.")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Isolate the main content area to avoid navbars, footers, etc.
        main_content = soup.find('main')
        if not main_content:
            main_content = soup.body
        if main_content is None:
            print(f"Skipping {url}: no <main> or <body> element found.")
            continue

        # Find all elements with the 'highlight' class within the main content
        code_blocks = main_content.find_all(class_='highlight')

        print(f"Found {len(code_blocks)} code blocks to format.")

        for block in code_blocks:
            # Extract the raw text from the code block
            code_text = block.get_text()

            language = 'python'

            # Create the formatted Markdown code block as a string
            markdown_code_block = f"```{language}\n{code_text.strip()}\n```"

            # Swap the original 'highlight' element for a <pre> tag that
            # carries the Markdown text.
            new_tag = soup.new_tag("pre")
            new_tag.string = markdown_code_block
            block.replace_with(new_tag)

        # Convert the MODIFIED main_content to Markdown. A fresh converter is
        # created per page so nothing carries over between documents.
        h = html2text.HTML2Text()
        h.body_width = 0  # 0 = do not hard-wrap the output lines
        markdown_output = h.handle(str(main_content))

        output_path = os.path.join(output, _markdown_filename(url))

        # Save the Markdown content to a file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(markdown_output)

        print(f"Successfully saved {url} to {output_path}")


In [None]:
# Markdown files go to ../documents/raw, relative to the current working directory.
output_dir = os.path.join(os.getcwd(), os.pardir, 'documents', 'raw')

# Convert every collected nukescripts page and save it into output_dir.
scrape_data(nukescript_pages, output_dir)
