In [None]:
import re
import requests
from bs4 import BeautifulSoup
import csv
import sqlite3
from datetime import datetime
import os  # Import the os module

# Preprocess text: Remove special characters and normalize spaces
def preprocess_text(text):
    return re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text)).lower()

def fetch_html(url):
    """
    Fetches the HTML content of a given URL.

    Args:
      url: The URL to fetch.

    Returns:
      The HTML content as a string, or None if an error occurs.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_content(html, element_type, parent_element=None):
    """
    Parses the HTML content to extract text from specified elements.

    Args:
      html: The HTML content to parse.
      element_type: The type of element to extract text from (e.g., 'div', 'p').
      parent_element: (Optional) The parent element to search within.

    Returns:
      The extracted text as a string, or None if an error occurs.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')

        if parent_element:
            parent = soup.find(parent_element)
            if not parent:
                print(f"No parent element <{parent_element}> found.")
                return None
            elements = parent.find_all(element_type)
        else:
            elements = soup.find_all(element_type)

        if elements:
            content = ""
            for element in elements:
                content += element.get_text(separator=' ', strip=True) + " "
            return content
        else:
            print(f"No elements of type <{element_type}> found.")
            return None

    except Exception as e:
        print(f"Error parsing HTML: {e}")
        return None

def save_to_csv(data, filename, element_type, parent_element):
    """
    Saves the extracted content to a CSV file.

    Args:
      data: The content to save.
      filename: The name of the CSV file.
      element_type: The type of element the content was extracted from.
      parent_element: The parent element of the extracted content.
    """
    try:
        # Check if the file exists
        file_exists = os.path.isfile(filename)

        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            if not file_exists:  # Write header only if file is new
                writer.writerow(["URL", "Content", "Timestamp", "Element Type", "Parent Element"])
            writer.writerow([target_url, data, datetime.now(), element_type, parent_element])
        print(f"Content saved to {filename}")
    except Exception as e:
        print(f"Error saving to CSV: {e}")

def save_to_db(data, db_name, element_type, parent_element):
    """
    Saves the extracted content to a SQLite database.

    Args:
      data: The content to save.
      db_name: The name of the database file.
      element_type: The type of element the content was extracted from.
      parent_element: The parent element of the extracted content.
    """
    try:
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS url_content
                                 (url TEXT, content TEXT, timestamp TEXT, element_type TEXT, parent_element TEXT)''')
        cursor.execute("INSERT INTO url_content VALUES (?, ?, ?, ?, ?)",
                         (target_url, data, datetime.now(), element_type, parent_element))
        conn.commit()
        conn.close()
        print(f"Content saved to {db_name}")
    except Exception as e:
        print(f"Error saving to database: {e}")

# @title Single Page (Text Only)
# @markdown **Step 1:** Select which elements to parse
element_type = ''  #@param ['section', 'article', 'div', 'p', 'span', 'main', 'header', 'footer', 'nav', 'aside'] {allow-input: true}
# @markdown &nbsp;&nbsp;&nbsp;&nbsp;<sub><sup>*(parent_element not required)*</sub></sup>
parent_element = ''  #@param ["main", "div", "body"] {allow-input: true}
# @markdown >
# @markdown **Step 2:** Add the URL to be scraped
target_url = "https://www.stonebranch.com/it-automation-solutions/workload-automation"  #@param {type:"string"}

# --- Main execution ---
if __name__ == "__main__":
    html = fetch_html(target_url)
    if html:
        content = parse_content(html, element_type, parent_element)
        if content:
            # Save content to CSV and database with timestamps
            save_to_csv(content, '/content/url_content.csv', element_type, parent_element)
            save_to_db(content, '/content/url_content.db', element_type, parent_element)
            print(f"Content from {target_url} saved successfully at {datetime.now()}")
        else:
            print(f"No content found in <{element_type}> elements of {target_url}")
    else:
        print(f"Failed to fetch HTML from {target_url}")

In [None]:
import re
import requests
from bs4 import BeautifulSoup
import csv
import sqlite3
from datetime import datetime
import os
import random
import time

# Preprocess text: Remove special characters and normalize spaces
def preprocess_text(text):
    return re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text)).lower()

# List of common User-Agent strings (desktop & mobile)
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 16_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
]

# Set up a session (helps with efficiency & maintains cookies)
session = requests.Session()

def fetch_html(url, max_retries=3):
    """
    Fetches the HTML content of a given URL while rotating User-Agents.

    Args:
      url (str): The URL to fetch.
      max_retries (int): Number of times to retry on failure.

    Returns:
      str: HTML content if successful, None otherwise.
    """
    for attempt in range(max_retries):
        headers = {
            "User-Agent": random.choice(USER_AGENTS)  # Rotate User-Agent
        }

        try:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an error for bad status codes

            # Introduce a random delay between requests (2-6 seconds)
            time.sleep(random.uniform(2, 6))

            return response.text  # Return the page content

        except requests.exceptions.HTTPError as http_err:
            print(f"[{attempt+1}/{max_retries}] HTTP error for {url}: {http_err}")
            if response.status_code == 403:
                print("403 Forbidden! Rotating User-Agent and retrying...")

        except requests.exceptions.RequestException as req_err:
            print(f"[{attempt+1}/{max_retries}] Request error for {url}: {req_err}")

        time.sleep(random.uniform(3, 7))  # Wait before retrying

    print(f"Failed to fetch {url} after {max_retries} retries.")
    return None  # Return None if all retries fail

def parse_content(html, element_type, parent_element):
    """
    Parses the HTML content to extract text from specified elements.

    Args:
      html: The HTML content to parse.
      element_type: The type of element to extract text from (e.g., 'div', 'p').
      parent_element: (Optional) The parent element to search within.

    Returns:
      The extracted text as a string, or None if an error occurs.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')

        if parent_element:
            parent = soup.find(parent_element)
            if not parent:
                print(f"No parent element <{parent_element}> found.")
                return None
            elements = parent.find_all(element_type)
        else:
            elements = soup.find_all(element_type)

        if elements:
            content = ""
            for element in elements:
                content += element.get_text(separator=' ', strip=True) + " "
            return content
        else:
            print(f"No elements of type <{element_type}> found.")
            return None

    except Exception as e:
        print(f"Error parsing HTML: {e}")
        return None

def save_to_csv(data, filename, element_type, parent_element):
    """
    Saves the extracted content to a CSV file.

    Args:
      data: The content to save.
      filename: The name of the CSV file.
      element_type: The type of element the content was extracted from.
      parent_element: The parent element of the extracted content.
    """
    try:
        # Check if the file exists
        file_exists = os.path.isfile(filename)

        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            if not file_exists:  # Write header only if file is new
                writer.writerow(["URL", "Content", "Timestamp", "Element Type", "Parent Element"])
            writer.writerow([target_url, data, datetime.now(), element_type, parent_element])
        print(f"Content saved to {filename}")
    except Exception as e:
        print(f"Error saving to CSV: {e}")

def save_to_db(data, db_name, element_type, parent_element):
    """
    Saves the extracted content to a SQLite database.

    Args:
      data: The content to save.
      db_name: The name of the database file.
      element_type: The type of element the content was extracted from.
      parent_element: The parent element of the extracted content.
    """
    try:
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS url_content
                                    (url TEXT, content TEXT, timestamp TEXT, element_type TEXT, parent_element TEXT)''')
        cursor.execute("INSERT INTO url_content VALUES (?, ?, ?, ?, ?)",
                       (target_url, data, datetime.now(), element_type, parent_element))
        conn.commit()
        conn.close()
        print(f"Content saved to {db_name}")
    except Exception as e:
        print(f"Error saving to database: {e}")

# @title Multi Page (Text Only)
# @markdown **Requirements**
# @markdown 1. Add `urls.txt` to `/content /` folder.
# @markdown 2. Select which elements to parse below:
element_type = 'body'  #@param ['section', 'body', 'article', 'div', 'p', 'span', 'main', 'header', 'footer', 'nav', 'aside'] {allow-input: true}
# @markdown &nbsp;&nbsp;&nbsp;&nbsp;<sub><sup>*(parent_element not required)*</sub></sup>
parent_element = ''  #@param ["main", "div", "body"] {allow-input: true}

# --- Main execution ---
if __name__ == "__main__":
    # Read URLs from a text file
    with open("urls.txt", "r") as url_file:
        urls = [url.strip() for url in url_file.readlines()]

    print(f"Starting scrape for {len(urls)} URLs...\n")

    # Process each URL
    for index, target_url in enumerate(urls, start=1):
        try:
            print(f"[{index}/{len(urls)}] Fetching: {target_url}")

            # Introduce a random delay before each request (2-5 seconds)
            time.sleep(random.uniform(2, 5))

            # Fetch HTML content
            html = fetch_html(target_url)

            if html:
                # Parse content
                content = parse_content(html, element_type, parent_element)

                if content:
                    # Save content to CSV and database with timestamps
                    save_to_csv(content, 'url_content.csv', element_type, parent_element)
                    save_to_db(content, 'url_content.db', element_type, parent_element)

                    print(f"✔ Content from {target_url} saved successfully at {datetime.now()}\n")
                else:
                    print(f"⚠ No content found in <{element_type}> elements of {target_url}\n")
            else:
                print(f"❌ Failed to fetch HTML from {target_url}\n")

        except Exception as e:
            print(f"❗ Unexpected error processing {target_url}: {e}\n")
            time.sleep(random.uniform(3, 6))  # Extra delay on errors to avoid bans

In [24]:
import re
import requests
from bs4 import BeautifulSoup
import csv
import sqlite3
from datetime import datetime
import os  # Import the os module

# Preprocess text: Remove special characters and normalize spaces
def preprocess_text(text):
    return re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', text)).lower()

def fetch_html(url):
    """
    Fetches the HTML content of a given URL.

    Args:
      url: The URL to fetch.

    Returns:
      The HTML content as a string, or None if an error occurs.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_content(html, element_type, parent_element=None):
    """
    Parses the HTML content to extract text and embedded hyperlinks from specified elements.

    Args:
        html: The HTML content to parse.
        element_type: The type of element to extract text and hyperlinks from (e.g., 'div', 'p').
        parent_element: (Optional) The parent element to search within.

    Returns:
        A string containing the extracted text with embedded hyperlinks, or None if an error occurs.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')

        if parent_element:
            parent = soup.find(parent_element)
            if not parent:
                print(f"No parent element <{parent_element}> found.")
                return None
            elements = parent.find_all(element_type)
        else:
            elements = soup.find_all(element_type)

        if elements:
            content = ""
            for element in elements:
                # Extract text with embedded hyperlinks
                for child in element.children:
                    if child.name == 'a':
                        href = child.get('href')
                        if href:
                            href = href.strip()
                            href = href.replace('\xa0', ' ')
                            content += f"<a href='{href}'>{child.text}</a>"
                    else:
                        content += str(child)
                content += " "  # Add space between elements
            return content
        else:
            print(f"No elements of type <{element_type}> found.")
            return None

    except Exception as e:
        print(f"Error parsing HTML: {e}")
        return None

def save_to_csv(data, filename, element_type, parent_element):
    """
    Saves the extracted content to a CSV file.

    Args:
      data: The content to save.
      filename: The name of the CSV file.
      element_type: The type of element the content was extracted from.
      parent_element: The parent element of the extracted content.
    """
    try:
        # Check if the file exists
        file_exists = os.path.isfile(filename)

        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            if not file_exists:  # Write header only if file is new
                writer.writerow(["URL", "Content", "Timestamp", "Element Type", "Parent Element"])
            writer.writerow([target_url, data, datetime.now(), element_type, parent_element])
        print(f"Content saved to {filename}")
    except Exception as e:
        print(f"Error saving to CSV: {e}")

def save_to_db(data, db_name, element_type, parent_element):
    """
    Saves the extracted content to a SQLite database.

    Args:
        data: The content to save (list of tuples).
        db_name: The name of the database file.
        element_type: The type of element the content was extracted from.
        parent_element: The parent element of the extracted content.
    """
    try:
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS url_content
                            (url TEXT, text_content TEXT, href TEXT, timestamp TEXT, element_type TEXT, parent_element TEXT)''')
        for text, href in data:  # Iterate over the list of tuples
            cursor.execute("INSERT INTO url_content VALUES (?, ?, ?, ?, ?, ?)",
                           (target_url, text, href, datetime.now(), element_type, parent_element))
        conn.commit()
        conn.close()
        print(f"Content saved to {db_name}")
    except Exception as e:
        print(f"Error saving to database: {e}")

# @title Single Page (Text + <a> href)
# @markdown **Step 1:** Select which elements to parse
element_type = 'section'  #@param ['section', 'article', 'div', 'p', 'span', 'main', 'header', 'footer', 'nav', 'aside'] {allow-input: true}
# @markdown &nbsp;&nbsp;&nbsp;&nbsp;<sub><sup>*(parent_element not required)*</sub></sup>
parent_element = 'main'  #@param ["main", "div", "body"] {allow-input: true}
# @markdown >
# @markdown **Step 2:** Add the URL to be scraped
target_url = "https://www.stonebranch.com/it-automation-solutions/workload-automation"  #@param {type:"string"}

# --- Main execution ---
if __name__ == "__main__":
    html = fetch_html(target_url)
    if html:
        content = parse_content(html, element_type, parent_element)
        if content:
            # Save content to CSV and database with timestamps
            save_to_csv(content, '/content/url_content.csv', element_type, parent_element)
            save_to_db(content, '/content/url_content.db', element_type, parent_element)
            print(f"Content from {target_url} saved successfully at {datetime.now()}")
        else:
            print(f"No content found in <{element_type}> elements of {target_url}")
    else:
        print(f"Failed to fetch HTML from {target_url}")

Content saved to /content/url_content.csv
Error saving to database: not enough values to unpack (expected 2, got 1)
Content from https://www.stonebranch.com/it-automation-solutions/workload-automation saved successfully at 2025-01-27 09:55:51.556619
