In [1]:
!pip install beautifulsoup4 requests



In [5]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.colab import drive
# Mount Google Drive at /content/drive so files under it can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
def fetch_cwe_details(url, timeout=30):
    """Fetch the detailed description from a CWE detail page.

    Args:
        url: URL of the CWE detail page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a single stalled page cannot hang the scrape.

    Returns:
        The stripped text of the ``div.detail`` element nested inside
        ``div#CWEDefinition``, or ``'No description available'`` when either
        element is missing from the page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    definition = soup.find('div', {'id': 'CWEDefinition'})
    # find() returns None when nothing matches, so both lookups must be
    # guarded: the outer container may exist without an inner detail block.
    detail = definition.find('div', {'class': 'detail'}) if definition is not None else None
    if detail is None:
        return 'No description available'
    return detail.text.strip()

def scrape_cwe_data(base_url, start_page, end_page, timeout=30):
    """Scrape CWE data across multiple listing pages.

    For every page in ``start_page..end_page`` (inclusive) the listing table is
    parsed, and for each row that has a link and at least two cells the linked
    detail page is fetched for its description.

    Args:
        base_url: URL of the listing endpoint; ``?page=N&action=list`` is
            appended to it for each page.
        start_page: First page number to fetch.
        end_page: Last page number to fetch (inclusive).
        timeout: Seconds to wait for each listing request before ``requests``
            raises a timeout error.

    Returns:
        A pandas DataFrame with the columns ``'CWE ID'``, ``'CWE Name'`` and
        ``'Description'``, one row per scraped entry.
    """
    all_cwes = []

    for page in range(start_page, end_page + 1):
        url = f"{base_url}?page={page}&action=list"
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            # Report failed pages explicitly instead of parsing an error page
            # and claiming it was processed; keep going so earlier rows survive.
            print(f"Skipping page {page}: HTTP status {response.status_code}")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        rows = soup.find_all('tr')[1:]  # Skip header row
        for row in rows:
            link = row.find('a')
            cells = row.find_all('td')
            # .get() avoids a KeyError on anchors that carry no href attribute.
            href = link.get('href') if link is not None else None
            if href and len(cells) > 1:
                # Resolve the link against the listing URL so relative,
                # root-relative and absolute hrefs all produce a valid URL.
                detail_url = urljoin(url, href)
                all_cwes.append({
                    'CWE ID': link.text.strip(),
                    'CWE Name': cells[1].text.strip(),
                    'Description': fetch_cwe_details(detail_url)
                })
            else:
                print(f"Skipping a row in page {page} due to missing data.")

        print(f"Processed page {page}")

    return pd.DataFrame(all_cwes)

# Parameters
BASE_URL = 'https://www.security-database.com/cwe.php'
START_PAGE = 1
END_PAGE = 5  # Adjust as necessary based on the total number of pages
PREVIEW_RECORDS = 5  # How many saved records to echo back as a sanity check

# Scrape data
cwe_df = scrape_cwe_data(BASE_URL, START_PAGE, END_PAGE)

# Save to Google Drive as JSON Lines (one JSON record per line)
file_path = '/content/drive/MyDrive/cwe_data.json'
cwe_df.to_json(file_path, orient='records', lines=True)

# Read the file back to confirm the write, but only show the first few
# records: printing the whole file floods the notebook output.
with open(file_path, 'r', encoding='utf-8') as file:
    data = file.read()
print('\n'.join(data.splitlines()[:PREVIEW_RECORDS]))

print(f"Data successfully saved to {file_path}")

Skipping a row in page 1 due to missing data.
Processed page 1
Skipping a row in page 2 due to missing data.
Processed page 2
Skipping a row in page 3 due to missing data.
Processed page 3
Skipping a row in page 4 due to missing data.
Processed page 4
Skipping a row in page 5 due to missing data.
Processed page 5
{"CWE ID":"CWE-1","CWE Name":"Location","Description":"Description SummaryWeaknesses in this category are organized based on which phase they are introduced during the software development and deployment process."}
{"CWE ID":"CWE-2","CWE Name":"Environment","Description":"Description SummaryWeaknesses in this category are typically introduced during unexpected environmental conditions."}
{"CWE ID":"CWE-3","CWE Name":"Technology-specific Environment Issues","Description":"Description SummaryWeaknesses in this category are typically introduced during unexpected environmental conditions in particular technologies."}
{"CWE ID":"CWE-4","CWE Name":"J2EE Environment Issues","Descript