In [1]:
!pip install requests beautifulsoup4 pandas tqdm



In [16]:
import re
import requests
from bs4 import BeautifulSoup

def scrape_plugin(plugin_id: int):
    """
    Scrape one Tenable/Nessus plugin page (public) and return fields we care about.
    If page doesn't exist or we can't parse, return None.
    """
    url = f"https://www.tenable.com/plugins/nessus/{plugin_id}"

    try:
        resp = requests.get(
            url,
            timeout=15,
            headers={
                "User-Agent": "Mozilla/5.0 (compatible; TenableScraper/1.0)",
                "Accept-Language": "en-US,en;q=0.9",
            },
        )
    except requests.RequestException:
        return None

    # Some plugin IDs just don't exist or are restricted -> 404 or non-200
    if resp.status_code != 200:
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    # Normalise page text
    full_text = soup.get_text("\n", strip=True)
    full_text = full_text.replace("\r\n", "\n").replace("\r", "\n")

    # --- Title
    title_el = soup.select_one("h1")
    title = title_el.get_text(strip=True) if title_el else ""

    # Helper: generic "Label: value" grabber
    def grab_simple(label):
        """
        Look for 'Label: something' and return 'something'.
        Case-insensitive. Stops at newline.
        """
        pattern = rf"{re.escape(label)}\s*[:：]\s*([^\n]+)"
        m = re.search(pattern, full_text, flags=re.IGNORECASE)
        return m.group(1).strip() if m else ""

    # --- Straightforward label:value items from Plugin Details
    severity  = grab_simple("Severity")
    family    = grab_simple("Family")
    published = grab_simple("Published")

    # --- Exploit / Exploit Ease (Risk / Vulnerability Information)
    # Primary: Exploit Ease
    exploit = grab_simple("Exploit Ease")

    # Fallbacks: some plugins might still use other variants
    if not exploit:
        exploit = grab_simple("Exploit Available")
    if not exploit:
        exploit = grab_simple("Exploit")

    # --- CVSS v3 Base Score (Risk Information)
    cvss_score = ""
    # Try to isolate the CVSS v3 block first
    m_cvss_block = re.search(
        r"CVSS\s*v3(.*?)(?:CVSS\s*v2|Risk Information|Temporal Score|Vector|$)",
        full_text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if m_cvss_block:
        block = m_cvss_block.group(1)
        m_base = re.search(
            r"Base Score\s*[:：]\s*([0-9.]+)",
            block,
            flags=re.IGNORECASE,
        )
        if m_base:
            cvss_score = m_base.group(1).strip()

    # Fallback: search whole page if still empty
    if not cvss_score:
        m_base_any = re.search(
            r"CVSS\s*v3.*?Base Score\s*[:：]\s*([0-9.]+)",
            full_text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if m_base_any:
            cvss_score = m_base_any.group(1).strip()

    # --- Section extractor for Synopsis / Description / Solution
    def extract_section(label, next_labels):
        """
        Grab text that appears after `label` until the next of `next_labels`.
        Uses the plain-text version of the page.
        """
        label_esc = re.escape(label)
        next_esc  = [re.escape(nl) for nl in next_labels]

        # Example:
        # Synopsis\n...text...\n(Description|Solution|Plugin Details|Risk Information)
        pattern = (
            label_esc +
            r"\s*\n(.*?)\n(?:"
            + "|".join(next_esc) +
            r")\s*\n"
        )

        m = re.search(pattern, full_text, flags=re.DOTALL | re.IGNORECASE)
        if m:
            block = m.group(1).strip()
            # collapse multiple newlines / spaces
            block = re.sub(r"\n{2,}", "\n", block)
            block = re.sub(r"[ \t]{2,}", " ", block)
            return block
        return ""

    synopsis = extract_section(
        "Synopsis",
        ["Description", "Solution", "Plugin Details", "Risk Information"],
    )
    description = extract_section(
        "Description",
        ["Solution", "Plugin Details", "Risk Information"],
    )
    solution = extract_section(
        "Solution",
        ["Plugin Details", "Risk Information", "Severity:", "ID:"],
    )

    # --- Build final row
    row = {
        "Plugin ID": str(plugin_id),
        "Title": title.strip(),
        "Severity": severity,
        "Synopsis": synopsis,
        "Description": description,
        "Solution": solution,
        "CVSS Score": cvss_score,
        "Family": family,
        "Published": published,
        "Exploit": exploit,
        "Source URL": url,
    }

    # If title is empty, likely not a valid plugin page
    if not title:
        return None

    return row

In [18]:
test = scrape_plugin(213445)
test

{'Plugin ID': '213445',
 'Title': 'Debian dla-4007 : python-tornado-doc - security update',
 'Severity': 'Medium',
 'Synopsis': 'The remote Debian host is missing one or more security-related updates.',
 'Description': "The remote Debian 11 host has packages installed that are affected by multiple vulnerabilities as referenced in the dla-4007 advisory.\n------------------------------------------------------------------------- Debian LTS Advisory DLA-4007-1\n[email\xa0protected]\nhttps://www.debian.org/lts/security/ Daniel Leidert January 01, 2025 https://wiki.debian.org/LTS\n-------------------------------------------------------------------------\nPackage : python-tornado Version : 6.1.0-1+deb11u1 CVE ID : CVE-2023-28370 CVE-2024-52804 Debian Bug : 1036875 1088112\nTornado is a scalable, non-blocking Python web framework and asynchronous networking library.\nCVE-2023-28370\nAn open redirect vulnerability in Tornado versions 6.3.1 and earlier allows a remote unauthenticated attacker to

In [20]:
import pandas as pd
from tqdm import tqdm
import time

def scrape_range(start_id: int, end_id: int, pause=0.5, save_path: str = None):
    """
    Scrape plugins from start_id to end_id (inclusive) and return a DataFrame.
    Optionally saves to CSV if save_path is given.
    """
    rows = []
    for pid in tqdm(range(start_id, end_id + 1)):
        data = scrape_plugin(pid)
        if data:
            rows.append(data)
        time.sleep(pause)  # be polite, don't hammer Tenable

    # Full column list (matches scrape_plugin output)
    cols = [
        "Plugin ID",
        "Title",
        "Severity",
        "Synopsis",
        "Description",
        "Solution",
        "CVSS Score",
        "Family",
        "Published",
        "Exploit",
        "Source URL",
    ]

    if rows:
        df = pd.DataFrame(rows)
        # reorder columns in case dict order changes
        df = df[cols]
    else:
        df = pd.DataFrame(columns=cols)

    if save_path:
        df.to_csv(save_path, index=False)
        print(f"Saved {len(df)} rows to {save_path}")

    return df

# EXAMPLE: scrape a range of plugin IDs
START_ID = 213445
END_ID   = 272165

df_sample = scrape_range(START_ID, END_ID, pause=0.5,
                         save_path=f"tenable_plugins_{START_ID}_{END_ID}.csv")
df_sample.head()

100%|██████████████████████████████████| 58721/58721 [16:07:10<00:00,  1.01it/s]


Saved 57853 rows to tenable_plugins_213445_272165.csv


Unnamed: 0,Plugin ID,Title,Severity,Synopsis,Description,Solution,CVSS Score,Family,Published,Exploit,Source URL
0,213445,Debian dla-4007 : python-tornado-doc - securit...,Medium,The remote Debian host is missing one or more ...,The remote Debian 11 host has packages install...,Upgrade the python-tornado-doc packages.\nSee ...,6.4,Debian Local Security Checks,1/1/2025,No known exploits are available,https://www.tenable.com/plugins/nessus/213445
1,213446,Fedora 40 : libxml2 (2024-9f3765a04b),Critical,The remote Fedora host is missing one or more ...,The remote Fedora 40 host has a package instal...,Update the affected libxml2 package.\nSee Also...,7.5,Fedora Local Security Checks,1/1/2025,No known exploits are available,https://www.tenable.com/plugins/nessus/213446
2,213447,Photon OS 4.0: Squid PHSA-2024-4.0-0726,High,The remote PhotonOS host is missing multiple s...,An update of the squid package has been released.,Update the affected Linux packages.\nSee Also\...,7.8,PhotonOS Local Security Checks,1/1/2025,No known exploits are available,https://www.tenable.com/plugins/nessus/213447
3,213448,Photon OS 4.0: Cups PHSA-2024-4.0-0726,Critical,The remote PhotonOS host is missing multiple s...,An update of the cups package has been released.,Update the affected Linux packages.\nSee Also\...,10.0,PhotonOS Local Security Checks,1/1/2025,Exploits are available,https://www.tenable.com/plugins/nessus/213448
4,213449,Photon OS 5.0: Rubygem PHSA-2024-5.0-0432,High,The remote PhotonOS host is missing multiple s...,An update of the rubygem package has been rele...,Update the affected Linux packages.\nSee Also\...,7.8,PhotonOS Local Security Checks,1/1/2025,No known exploits are available,https://www.tenable.com/plugins/nessus/213449


In [23]:
output_csv = f"tenable_plugins_{START_ID}_{END_ID}.csv"
df_sample.to_csv(output_csv, index=False)
print("Saved:", output_csv, "with", len(df_sample), "rows")

Saved: tenable_plugins_213445_272165.csv with 57853 rows
