# Overview
1. Add CVE descriptions to the Top 25, removing newlines, tabs, etc., and escaping as required.

In [3]:
from IPython.core.magic import register_cell_magic
from IPython.display import Markdown
import datetime
from datetime import date
import glob
import json
import jsonlines
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import warnings
import csv
import os



In [22]:
# Trimmed CWE data (selected keys removed): pretty-printed JSON and JSON Lines.
output_trimmed_file_json = './data_out/cwe_trimmed.json'
output_trimmed_file_jsonl = './data_out/cwe_trimmed.jsonl'
# Trimmed CWE data after the Top25Examples have been added: JSON and JSON Lines.
output_file_json = './data_out/cwe_trimmed_top25.json'
output_file_jsonl = './data_out/cwe_trimmed_top25.jsonl'
# Flat CSV of observed examples (one row per CWE / CVE pair).
output_file_csv = './data_out/observed_examples.csv'



In [13]:
def remove_keys(data, keys_to_remove):
    """Recursively delete the given keys from a nested dict/list structure.

    The structure is modified in place. Every dict found at any depth has
    each key in ``keys_to_remove`` dropped (missing keys are ignored); lists
    are only traversed. Any other value is left untouched.

    Args:
        data: A dict, a list, or any other value (scalars are ignored).
        keys_to_remove: Iterable of keys to drop from every dict encountered.

    Returns:
        None. ``data`` is mutated in place.
    """
    if isinstance(data, dict):
        for unwanted in keys_to_remove:
            data.pop(unwanted, None)
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Scalars (str, int, None, ...) have nothing to strip or descend into.
        return

    for child in children:
        remove_keys(child, keys_to_remove)

# Read the full CWE JSON export.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default (which is not UTF-8 on every system).
with open('./data_in/cwe.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Remove specified keys
#'RelatedAttackPatterns'
keys_to_remove = ['ContentHistory', 'Views', 'Categories', 'References', 'TaxonomyMappings']
remove_keys(json_data, keys_to_remove)

# Make sure the output folders exist so the writes below work on a fresh checkout.
for out_path in (output_trimmed_file_json, output_trimmed_file_jsonl):
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)

# Write the trimmed data as one pretty-printed JSON document
with open(output_trimmed_file_json, 'w', encoding='utf-8') as file:
    json.dump(json_data, file, indent=2)

# ...and as JSON Lines: one weakness object per line
with open(output_trimmed_file_jsonl, 'w', encoding='utf-8') as json_file:
    for weakness in json_data.get('Weaknesses', []):
        json.dump(weakness, json_file)
        json_file.write('\n')

print(f"The following keys have been removed: {', '.join(keys_to_remove)}. The result has been saved to cwe_trimmed.json")

The following keys have been removed: ContentHistory, Views, Categories, References, TaxonomyMappings. The result has been saved to cwe_trimmed.json


# Extract the ObservedExamples to a file


In [14]:
def extract_weakness_data(output_file, input_file=None):
    """Write each weakness's observed examples to a JSON file.

    For every entry under ``"Weaknesses"`` in the input JSON, the output holds
    an object with the ID prefixed by ``"CWE-"`` and a list of
    ``{"Reference", "Description"}`` pairs taken from ``"ObservedExamples"``
    (an empty list when the weakness has none).

    Args:
        output_file: Path of the JSON file to write.
        input_file: Path of the CWE JSON file to read. When omitted, the
            notebook-level ``output_file_json`` path is used, which is what
            this function always read before the parameter existed.

    Returns:
        None. The result is written to ``output_file`` and a status line is
        printed.

    Raises:
        KeyError: If a weakness has no ``"ID"``, or an observed example lacks
            ``"Reference"`` or ``"Description"``.
    """
    if input_file is None:
        # Backward-compatible default: fall back to the notebook-level path.
        input_file = output_file_json

    # Read the JSON data from the file
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Extract the required information for all weaknesses.
    # .get() mirrors how the rest of the notebook reads 'Weaknesses'.
    extracted_data = [
        {
            "ID": f"CWE-{weakness['ID']}",  # Prefix ID with "CWE-"
            "ObservedExamples": [
                {
                    "Reference": example["Reference"],
                    "Description": example["Description"],
                }
                for example in weakness.get("ObservedExamples", [])
            ],
        }
        for weakness in data.get("Weaknesses", [])
    ]

    # Write the extracted data to the output JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f, indent=2)

    print(f"Data extracted and saved to {output_file}")


# Usage
# NOTE(review): extract_weakness_data() reads its input from the path held in
# `output_file_json`; in this notebook that file is only written by a later
# cell, so confirm it already exists before running this on a fresh kernel.
output_file = './data_out/cwe_observed_examples.json'  # Replace with your desired output file name
extract_weakness_data(output_file)

Data extracted and saved to ./data_out/cwe_observed_examples.json


In [23]:
def extract_weakness_data(output_file, input_file=None):
    """Flatten the observed examples into a DataFrame and save it as CSV.

    Produces one row per (weakness, observed example) pair with the columns
    ``CWE-ID`` (the weakness ID prefixed by ``"CWE-"``), ``CVE-ID`` (the
    example's ``"Reference"``) and ``Description``.

    NOTE: this definition reuses the name of the JSON-writing function defined
    in an earlier cell and replaces it once this cell has run.

    Args:
        output_file: Path of the CSV file to write.
        input_file: Path of the CWE JSON file to read. When omitted, the
            notebook-level ``output_file_json`` path is used, which is what
            this function always read before the parameter existed.

    Returns:
        pandas.DataFrame: The flattened table that was written to
        ``output_file``. The three columns are always present, even when the
        input contains no observed examples.

    Raises:
        KeyError: If a weakness has no ``"ID"``, or an observed example lacks
            ``"Reference"`` or ``"Description"``.
    """
    if input_file is None:
        # Backward-compatible default: fall back to the notebook-level path.
        input_file = output_file_json

    # Read the JSON data from the file
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Prepare data for DataFrame: one record per observed example
    df_data = [
        {
            "CWE-ID": f"CWE-{weakness['ID']}",
            "CVE-ID": example["Reference"],
            "Description": example["Description"],
        }
        for weakness in data.get("Weaknesses", [])
        for example in weakness.get("ObservedExamples", [])
    ]

    # Fix the column set up front so an input without examples still yields a
    # CSV with a header row instead of an empty file.
    df = pd.DataFrame(df_data, columns=["CWE-ID", "CVE-ID", "Description"])

    # Save DataFrame to CSV
    df.to_csv(output_file, index=False)
    # Report the path that was actually written (the argument, not a global).
    print(f"Data extracted and saved to {output_file}")

    # Display first few rows of the DataFrame
    print(df.head())

    return df

# Write the observed-examples CSV and keep the returned frame; the bare `df`
# on the last line makes the notebook render it as the cell output.
df = extract_weakness_data(output_file_csv)
df

Data extracted and saved to ./data_out/observed_examples.csv
     CWE-ID          CVE-ID                                        Description
0  CWE-1004  CVE-2022-24045  Web application for a room automation system h...
1  CWE-1004   CVE-2014-3852  CMS written in Python does not include the HTT...
2  CWE-1004   CVE-2015-4138  Appliance for managing encrypted communication...
3  CWE-1007   CVE-2013-7236  web forum allows impersonation of users with h...
4  CWE-1007   CVE-2012-0584  Improper character restriction in URLs in web ...


Unnamed: 0,CWE-ID,CVE-ID,Description
0,CWE-1004,CVE-2022-24045,Web application for a room automation system h...
1,CWE-1004,CVE-2014-3852,CMS written in Python does not include the HTT...
2,CWE-1004,CVE-2015-4138,Appliance for managing encrypted communication...
3,CWE-1007,CVE-2013-7236,web forum allows impersonation of users with h...
4,CWE-1007,CVE-2012-0584,Improper character restriction in URLs in web ...
...,...,...,...
2977,CWE-98,CVE-2004-0127,Directory traversal vulnerability in PHP inclu...
2978,CWE-98,CVE-2005-1971,Directory traversal vulnerability in PHP inclu...
2979,CWE-98,CVE-2005-3335,"PHP file inclusion issue, both remote and loca..."
2980,CWE-98,CVE-2009-1936,chain: library file sends a redirect if it is ...


## Add the Top25 values from the CSV file to a JSON and JSONL file in a Top25Examples object

In [40]:
# Read the Top 25 mapping CSV into a list of row dicts
# (later cells use the 'CVE', 'New CWE' and 'Description' columns).
# newline='' hands newline handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are parsed correctly.
# NOTE(review): the file is assumed to be UTF-8 encoded — confirm against the
# notebook that produced it.
with open('data_out/top25-mitre-mapping-analysis-2023-public_with_cve_descriptions.csv', 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    top25_examples = list(csv_reader)

# Preview only the first few rows instead of dumping the whole list as output
top25_examples[:3]

[{'CVE': 'CVE-2021-0674',
  'New CWE': 'CWE-20',
  'Description': 'In alac decoder, there is a possible out of bounds read due to an incorrect bounds check. This could lead to local information disclosure with no additional execution privileges needed. User interaction is not needed for exploitation. Patch ID: ALPS06064258; Issue ID: ALPS06064237.'},
 {'CVE': 'CVE-2021-0674',
  'New CWE': 'CWE-125',
  'Description': 'In alac decoder, there is a possible out of bounds read due to an incorrect bounds check. This could lead to local information disclosure with no additional execution privileges needed. User interaction is not needed for exploitation. Patch ID: ALPS06064258; Issue ID: ALPS06064237.'},
 {'CVE': 'CVE-2021-0676',
  'New CWE': 'CWE-20',
  'Description': 'In geniezone driver, there is a possible out of bounds read due to an incorrect bounds check. This could lead to local information disclosure with System execution privileges needed. User interaction is not needed for exploita

In [41]:
# Group the CSV rows by bare CWE number (e.g. 'CWE-20' -> '20'), keeping the
# CSV order within each group.
examples_by_cwe = {}
for example in top25_examples:
    cwe_id = example['New CWE'].replace('CWE-', '')
    examples_by_cwe.setdefault(cwe_id, []).append(
        (example['CVE'], example['Description'])
    )

# Attach the grouped examples to every weakness with a matching ID.
# Top25Examples is assigned rather than appended to, so re-running this cell
# gives the same result instead of adding every example a second time, and a
# single pass over the weaknesses replaces the scan-per-CSV-row.
for weakness in json_data['Weaknesses']:
    matched = examples_by_cwe.get(weakness['ID'])
    if matched:
        # Build fresh dicts per weakness so no two entries share objects.
        weakness['Top25Examples'] = [
            {
                "Reference": cve,
                "Description": description
            }
            for cve, description in matched
        ]


In [42]:
# Serialise the enriched CWE data (2-space indent) and store it at the Top 25 JSON path
with open(output_file_json, 'w') as json_file:
    json_file.write(json.dumps(json_data, indent=2))

In [43]:
# JSON Lines export: each weakness is serialised onto its own line
with open(output_file_jsonl, 'w') as json_file:
    json_file.writelines(
        f"{json.dumps(entry)}\n" for entry in json_data.get('Weaknesses', [])
    )

## Split the JSONL into batches of 100 weaknesses (10 files for this dataset), saved with a .txt extension for import into NotebookLM

In [47]:
def save_jsonl_batches(json_data, output_dir, batch_size=100):
    """Write the weaknesses to numbered JSON Lines files of limited size.

    The entries under ``json_data['Weaknesses']`` are split into consecutive
    groups of at most ``batch_size``. Each group is written to
    ``<output_dir>/<n>.jsonl.txt`` (``n`` starting at 1), one JSON object per
    line. A line is printed for every file written, followed by the total.

    Args:
        json_data: Dict holding a ``'Weaknesses'`` list; a missing key is
            treated as an empty list.
        output_dir: Directory for the batch files; created if it is missing.
        batch_size: Maximum number of weaknesses per file. Must be >= 1.

    Returns:
        None.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step would make range() fail with an unrelated message and a
        # negative one would silently write nothing, so reject both up front.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    weaknesses = json_data.get('Weaknesses', [])

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    batches_saved = 0
    for batch_number, start in enumerate(range(0, len(weaknesses), batch_size), start=1):
        output_file = os.path.join(output_dir, f"{batch_number}.jsonl.txt")

        with open(output_file, 'w') as json_file:
            for weakness in weaknesses[start:start + batch_size]:
                json.dump(weakness, json_file)
                json_file.write('\n')

        print(f"Saved batch {batch_number} to {output_file}")
        batches_saved = batch_number

    # Report the number of files actually written.
    print(f"Total batches saved: {batches_saved}")

In [48]:
# Usage
# Write the batch files under ./data_out/output_jsonl using the default batch size.
output_dir = './data_out/output_jsonl'
save_jsonl_batches(json_data, output_dir)

Saved batch 1 to ./data_out/output_jsonl/1.jsonl.txt
Saved batch 2 to ./data_out/output_jsonl/2.jsonl.txt
Saved batch 3 to ./data_out/output_jsonl/3.jsonl.txt
Saved batch 4 to ./data_out/output_jsonl/4.jsonl.txt
Saved batch 5 to ./data_out/output_jsonl/5.jsonl.txt
Saved batch 6 to ./data_out/output_jsonl/6.jsonl.txt
Saved batch 7 to ./data_out/output_jsonl/7.jsonl.txt
Saved batch 8 to ./data_out/output_jsonl/8.jsonl.txt
Saved batch 9 to ./data_out/output_jsonl/9.jsonl.txt
Saved batch 10 to ./data_out/output_jsonl/10.jsonl.txt
Total batches saved: 10
