# Overview
1. Add CVE descriptions to the Top 25 examples, removing newlines, tabs, etc., and escaping as required

In [97]:
from IPython.core.magic import register_cell_magic
from IPython.display import Markdown
import datetime
from datetime import date
import glob
import json
import jsonlines
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import warnings
import csv
import os



In [98]:
# Trimmed CWE data (unwanted keys removed, 1003_view flag added), as JSON and JSONL.
output_trimmed_file_json = './data_out/cwe_trimmed.json'
output_trimmed_file_jsonl = './data_out/cwe_trimmed.jsonl'
# Trimmed CWE data after the Top25Examples have been attached, as JSON and JSONL.
output_file_json = './data_out/cwe_trimmed_top25.json'
output_file_jsonl = './data_out/cwe_trimmed_top25.jsonl'
# ObservedExamples flattened to one row per (CWE-ID, CVE-ID, Description).
output_file_csv = './data_out/observed_examples.csv'
# Markdown rendering of the Top 25-enriched CWE data.
output_file_md = './data_out/cwe_trimmed_top25.md'

# Input: CSV of the CWE-1003 view; only its "CWE-ID" column is read below.
cwe_1003_file = './data_in/1003.csv' #contains 130 CWE-IDs 1003 View



In [99]:
# Source: https://cwe.mitre.org/data/downloads.html (CWE Simplified Mapping, view 1003).
# Only the "CWE-ID" column is loaded; it is used later to flag which CWEs belong to the 1003 view.
df_cwe_1003 = pd.read_csv(cwe_1003_file, usecols=["CWE-ID"], index_col=False)
df_cwe_1003

Unnamed: 0,CWE-ID
0,20
1,22
2,59
3,74
4,77
...,...
125,1188
126,1236
127,1284
128,1321


In [100]:
def add_1003_view(data, cwe_1003_df):
    """Recursively tag every dict that has an 'ID' key with a '1003_view' flag.

    Args:
        data: Arbitrarily nested structure of dicts, lists and scalars.
            Dicts are modified in place; lists are rebuilt.
        cwe_1003_df: DataFrame whose "CWE-ID" column holds the IDs in the 1003 view.

    Returns:
        The same dict (mutated), a new list, or the scalar unchanged.
        Each tagged dict gets '1003_view' set to 'Yes' or 'No'.
    """
    # Lists: rebuild element by element.
    if isinstance(data, list):
        return [add_1003_view(element, cwe_1003_df) for element in data]

    # Scalars (strings, numbers, None, ...) pass through untouched.
    if not isinstance(data, dict):
        return data

    if 'ID' in data:
        # IDs are compared numerically against the view's CWE-ID column.
        in_view = int(data['ID']) in cwe_1003_df['CWE-ID'].values
        data['1003_view'] = 'Yes' if in_view else 'No'

    # Iterate over a snapshot of the keys so reassignment is safe.
    for field in list(data):
        data[field] = add_1003_view(data[field], cwe_1003_df)

    return data

In [101]:
# Remove CWE-IDs where "MappingNotes": {"Usage": "Prohibited"}
# Remove MappingNotes: Rationale, Comments, Reasons
# Remove various fields not useful for CWE mapping
# Keep BackgroundDetails
def remove_keys(data, keys_to_remove):
    """Recursively strip unwanted keys and drop entries whose mapping usage is prohibited.

    Args:
        data: Arbitrarily nested structure of dicts, lists and scalars.
            Dicts are modified in place; lists are rebuilt.
        keys_to_remove: Iterable of key names to delete from every dict.

    Returns:
        The cleaned structure, or None when `data` is a dict whose
        MappingNotes.Usage is 'Prohibited'. None results are dropped from
        their parent list or dict.
    """
    if isinstance(data, list):
        survivors = []
        for item in data:
            cleaned = remove_keys(item, keys_to_remove)
            if cleaned is not None:
                survivors.append(cleaned)
        return survivors

    if not isinstance(data, dict):
        return data

    if 'MappingNotes' in data:
        notes = data['MappingNotes']
        # A prohibited mapping removes the whole entry.
        if notes.get('Usage') == 'Prohibited':
            return None
        # Otherwise keep MappingNotes but drop its verbose sub-fields.
        for verbose_field in ('Rationale', 'Comments', 'Reasons'):
            notes.pop(verbose_field, None)

    for unwanted in keys_to_remove:
        data.pop(unwanted, None)

    # Walk a snapshot of the keys because entries may be deleted during the loop.
    for field in list(data):
        cleaned = remove_keys(data[field], keys_to_remove)
        if cleaned is None:
            del data[field]
        else:
            data[field] = cleaned

    return data

# Load the full CWE corpus.
with open('./data_in/cwe.json', 'r') as file:
    json_data = json.load(file)

# Fields stripped from every entry because they are not useful for CWE mapping.
# 'Abstraction' is deliberately not listed, so it is kept.
keys_to_remove = [
    'ContentHistory', 'Views', 'Categories', 'References', 'TaxonomyMappings',
    'Status', 'LikelihoodOfExploit', 'WeaknessOrdinalities', 'ApplicablePlatforms',
    'ModesOfIntroduction', 'DetectionMethods', 'PotentialMitigations',
    'DemonstrativeExamples', 'RelatedAttackPatterns', 'Structure',
    'CommonConsequences', 'RelatedWeaknesses', 'Notes',
]

# Trim first, then flag membership of the CWE-1003 view on what remains.
json_data = add_1003_view(remove_keys(json_data, keys_to_remove), df_cwe_1003)

# Persist the trimmed data as one indented JSON document ...
with open(output_trimmed_file_json, 'w') as file:
    json.dump(json_data, file, indent=2)

# ... and as JSONL, one weakness per line.
with open(output_trimmed_file_jsonl, 'w') as json_file:
    json_file.writelines(
        json.dumps(weakness) + '\n' for weakness in json_data.get('Weaknesses', [])
    )

print(f"The following keys have been removed: {', '.join(keys_to_remove)}. \nThe result has been saved to cwe_trimmed.json")

The following keys have been removed: ContentHistory, Views, Categories, References, TaxonomyMappings, Status, LikelihoodOfExploit, WeaknessOrdinalities, ApplicablePlatforms, ModesOfIntroduction, DetectionMethods, PotentialMitigations, DemonstrativeExamples, RelatedAttackPatterns, Structure, CommonConsequences, RelatedWeaknesses, Notes. 
The result has been saved to cwe_trimmed.json


# Extract the ObservedExamples to a file


In [102]:
def extract_weakness_data(output_file, input_file=None):
    """Flatten every weakness's ObservedExamples into a CSV file and a DataFrame.

    Args:
        output_file: Path of the CSV file to write.
        input_file: Path of the CWE JSON file to read. Defaults to the trimmed
            JSON (`output_trimmed_file_json`), which is written before this
            cell runs. The Top 25 JSON is only written later in the notebook,
            so reading it here would break a fresh Restart & Run All; the
            ObservedExamples are the same in both files.

    Returns:
        DataFrame with columns "CWE-ID", "CVE-ID" and "Description", one row
        per observed example.

    Raises:
        KeyError: If an observed example lacks "Reference" or "Description".
    """
    if input_file is None:
        input_file = output_trimmed_file_json

    # Read the JSON data from the file
    with open(input_file, 'r') as file:
        data = json.load(file)

    # One record per (weakness, observed example) pair.
    df_data = [
        {
            "CWE-ID": f"CWE-{weakness['ID']}",
            "CVE-ID": example["Reference"],
            "Description": example["Description"],
        }
        for weakness in data.get("Weaknesses", [])
        for example in weakness.get("ObservedExamples", [])
    ]

    # Explicit columns keep the CSV header stable even when there are no examples.
    df = pd.DataFrame(df_data, columns=["CWE-ID", "CVE-ID", "Description"])

    # Save DataFrame to CSV
    df.to_csv(output_file, index=False)
    # Report the path actually written, not a global that may differ from it.
    print(f"Data extracted and saved to {output_file}")

    # Display first few rows of the DataFrame
    print(df.head())

    return df

# Write the observed examples to the CSV path and keep the returned DataFrame;
# the bare `df` on the last line shows it as the cell output.
df = extract_weakness_data(output_file_csv)
df

Data extracted and saved to ./data_out/observed_examples.csv
     CWE-ID          CVE-ID                                        Description
0  CWE-1004  CVE-2022-24045  Web application for a room automation system h...
1  CWE-1004   CVE-2014-3852  CMS written in Python does not include the HTT...
2  CWE-1004   CVE-2015-4138  Appliance for managing encrypted communication...
3  CWE-1007   CVE-2013-7236  web forum allows impersonation of users with h...
4  CWE-1007   CVE-2012-0584  Improper character restriction in URLs in web ...


Unnamed: 0,CWE-ID,CVE-ID,Description
0,CWE-1004,CVE-2022-24045,Web application for a room automation system h...
1,CWE-1004,CVE-2014-3852,CMS written in Python does not include the HTT...
2,CWE-1004,CVE-2015-4138,Appliance for managing encrypted communication...
3,CWE-1007,CVE-2013-7236,web forum allows impersonation of users with h...
4,CWE-1007,CVE-2012-0584,Improper character restriction in URLs in web ...
...,...,...,...
2976,CWE-98,CVE-2004-0127,Directory traversal vulnerability in PHP inclu...
2977,CWE-98,CVE-2005-1971,Directory traversal vulnerability in PHP inclu...
2978,CWE-98,CVE-2005-3335,"PHP file inclusion issue, both remote and loca..."
2979,CWE-98,CVE-2009-1936,chain: library file sends a redirect if it is ...


## Add the Top25 values from the CSV file to a JSON and JSONL file in a Top25Examples object

In [103]:
# Read the Top 25 mapping CSV (one row per CVE with its CWE and description).
# newline='' lets the csv module handle line endings itself, so newlines
# embedded inside quoted fields are parsed correctly.
with open('data_out/top25-mitre-mapping-analysis-2023-public_with_cve_descriptions.csv', 'r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    top25_examples = list(csv_reader)

# Preview only the first few rows rather than dumping the whole list into the output.
top25_examples[:3]

[{'CVE': 'CVE-2021-0674',
  'CWE': 'CWE-20',
  'Description': 'In alac decoder, there is a possible out of bounds read due to an incorrect bounds check. This could lead to local information disclosure with no additional execution privileges needed. User interaction is not needed for exploitation. Patch ID: ALPS06064258; Issue ID: ALPS06064237.',
  'Chains': 'CWE-20->CWE-125',
  'Weakness_Description': 'out of bounds read due to an incorrect bounds check'},
 {'CVE': 'CVE-2021-0676',
  'CWE': 'CWE-20',
  'Description': 'In geniezone driver, there is a possible out of bounds read due to an incorrect bounds check. This could lead to local information disclosure with System execution privileges needed. User interaction is not needed for exploitation. Patch ID: ALPS05863009; Issue ID: ALPS05863009.',
  'Chains': 'CWE-20->CWE-125',
  'Weakness_Description': 'out of bounds read due to an incorrect bounds check'},
 {'CVE': 'CVE-2021-0677',
  'CWE': 'CWE-190',
  'Description': 'In ccu driver, th

In [104]:
# Attach every Top 25 CSV row to its CWE entry as an item of 'Top25Examples'.
weaknesses = json_data.get('Weaknesses', [])

# Start from a clean slate so re-running this cell does not append duplicates.
for weakness in weaknesses:
    weakness.pop('Top25Examples', None)

# Index the weaknesses by ID once instead of rescanning the list for every CSV row.
weaknesses_by_id = {}
for weakness in weaknesses:
    weaknesses_by_id.setdefault(weakness['ID'], []).append(weakness)

# Process each row in the CSV
for example in top25_examples:
    # The CSV stores IDs as 'CWE-20' while the JSON stores '20'.
    cwe_id = example['CWE'].strip().replace('CWE-', '')
    cve = example['CVE']
    description = example['Description']

    # Rows whose CWE is not present in the trimmed JSON are skipped.
    for weakness in weaknesses_by_id.get(cwe_id, []):
        # Create Top25Examples on first use, then add the new example.
        weakness.setdefault('Top25Examples', []).append({
            "Reference": cve,
            "Description": description
        })


In [105]:
# Persist the CWE data, now carrying Top25Examples, as one indented JSON document.
with open(output_file_json, 'w') as json_file:
    json_file.write(json.dumps(json_data, indent=2))

In [106]:
# Same data as JSONL: one weakness serialized per line.
with open(output_file_jsonl, 'w') as json_file:
    json_file.writelines(
        json.dumps(weakness) + '\n' for weakness in json_data.get('Weaknesses', [])
    )

## Split JSONL into batches of 100 weaknesses, each saved with a .txt extension for import to NotebookLM

In [107]:
def save_jsonl_batches(json_data, output_dir, batch_size=100):
    """Write the weaknesses as numbered JSONL batch files.

    Files are named "<batch_number>.jsonl.txt" (numbered from 1) and hold at
    most `batch_size` weaknesses each, one JSON object per line.

    Args:
        json_data: Dict with an optional 'Weaknesses' list.
        output_dir: Directory for the batch files; created if missing,
            including parent directories.
        batch_size: Maximum number of weaknesses per file. Must be positive.

    Raises:
        ValueError: If `batch_size` is not a positive number.
    """
    # A non-positive size would otherwise write nothing (negative) or fail
    # with an opaque range() error (zero).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    weaknesses = json_data.get('Weaknesses', [])
    total_entries = len(weaknesses)

    # exist_ok avoids the check-then-create race and also covers nested paths.
    os.makedirs(output_dir, exist_ok=True)

    batches_saved = 0
    for batch_number, start in enumerate(range(0, total_entries, batch_size), start=1):
        output_file = os.path.join(output_dir, f"{batch_number}.jsonl.txt")

        with open(output_file, 'w') as json_file:
            for weakness in weaknesses[start:start + batch_size]:
                json.dump(weakness, json_file)
                json_file.write('\n')

        print(f"Saved batch {batch_number} to {output_file}")
        batches_saved = batch_number

    print(f"Total batches saved: {batches_saved}")

In [108]:
# Write the weaknesses as numbered .jsonl.txt batch files under output_dir,
# using the function's default batch_size of 100.
output_dir = './data_out/output_jsonl'
save_jsonl_batches(json_data, output_dir)

Saved batch 1 to ./data_out/output_jsonl/1.jsonl.txt
Saved batch 2 to ./data_out/output_jsonl/2.jsonl.txt
Saved batch 3 to ./data_out/output_jsonl/3.jsonl.txt
Saved batch 4 to ./data_out/output_jsonl/4.jsonl.txt
Saved batch 5 to ./data_out/output_jsonl/5.jsonl.txt
Saved batch 6 to ./data_out/output_jsonl/6.jsonl.txt
Saved batch 7 to ./data_out/output_jsonl/7.jsonl.txt
Saved batch 8 to ./data_out/output_jsonl/8.jsonl.txt
Saved batch 9 to ./data_out/output_jsonl/9.jsonl.txt
Total batches saved: 9


# Create Markdown file

In [109]:
def create_markdown(data, output_file):
    """Render every weakness in `data` as a Markdown section and write it to `output_file`.

    Args:
        data: Dict with an optional 'Weaknesses' list; each weakness needs
            'ID' and 'Name', all other fields are optional.
        output_file: Path of the UTF-8 Markdown file to (over)write.
    """
    with open(output_file, 'w', encoding='utf-8') as md_file:
        for entry in data.get('Weaknesses', []):
            md_file.writelines(_weakness_markdown_chunks(entry))


def _weakness_markdown_chunks(weakness):
    """Yield, in document order, the Markdown fragments describing one weakness."""
    # H1 title followed by the description block.
    yield f"# CWE-{weakness['ID']} {weakness['Name']}\n\n"
    yield "## Description\n\n"

    if 'Description' in weakness:
        yield f"{weakness['Description']}\n\n"

    if 'ExtendedDescription' in weakness:
        yield f"{weakness['ExtendedDescription'].strip()}\n\n"

    if 'BackgroundDetails' in weakness:
        background = weakness['BackgroundDetails']
        # A list is joined one item per line; anything else is written as-is.
        if isinstance(background, list):
            yield '\n'.join(background) + '\n\n'
        else:
            yield f"{background}\n\n"

    # Both example sections share the same "<Reference>: <Description>" layout.
    example_sections = (
        ('ObservedExamples', "## Observed Examples\n\n"),
        ('Top25Examples', "## Top 25 Examples\n\n"),
    )
    for field, heading in example_sections:
        if field in weakness:
            yield heading
            for example in weakness[field]:
                yield f"{example['Reference']}: {example['Description']}\n\n"

    # Mapping summary; missing values are reported as N/A.
    yield "## Mapping\n\n"
    if 'MappingNotes' in weakness:
        usage = weakness['MappingNotes'].get('Usage', 'N/A')
    else:
        usage = 'N/A'
    yield f"- Usage: {usage}\n\n"
    yield f"- Abstraction: {weakness.get('Abstraction', 'N/A')}\n\n"
    yield f"- 1003 view: {weakness.get('1003_view', 'N/A')}\n\n"
            

In [110]:
# Reload the Top 25-enriched JSON from disk (the file written to output_file_json
# earlier in the notebook); this rebinds json_data to the parsed file contents.
with open(output_file_json, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Render the reloaded data to the Markdown output file.
create_markdown(json_data, output_file_md)