In [2]:
import sys
from pathlib import Path

# Put the parent of the current working directory (the "Thesis-Edvin" project
# root) on sys.path so modules that live there can be imported by later cells.
# This depends on the directory the kernel was started in.
sys.path.append(str(Path.cwd().parent))

In [3]:
from utils import *
from rag import *

In [4]:
import pandas as pd
import json

# Load the CWE view-1000 export into `data`. JSON text is UTF-8 by
# specification, so the encoding is pinned rather than left to the platform
# default (which is not UTF-8 on every system).
with open("tmp/view_CWE-1000_all_weaknesses.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [5]:
from datasets import load_dataset

# Load the "train" split of the raw CWE view-1000 dataset and convert it to a
# pandas DataFrame for the column-wise processing in the following cells.
cwes = load_dataset("Eathus/cwe_view1000_raw_list", split="train")
cwes_df = cwes.to_pandas()

Using the latest cached version of the dataset since Eathus/cwe_view1000_raw_list couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/edvinn/.cache/huggingface/datasets/Eathus___cwe_view1000_raw_list/default/0.0.0/2c8d79ed3c8c5b99885a627f055d60b440763da5 (last modified on Wed May 28 16:05:27 2025).


In [6]:
# Inspect which CWE fields the raw dataset exposes as columns.
cwes_df.columns

Index(['ID', 'Name', 'Abstraction', 'Structure', 'Status', 'Description',
       'ExtendedDescription', 'ApplicablePlatforms', 'AlternateTerms',
       'ModesOfIntroduction', 'CommonConsequences', 'PotentialMitigations',
       'ObservedExamples', 'AffectedResources', 'TaxonomyMappings',
       'RelatedAttackPatterns', 'References', 'Notes', 'ContentHistory',
       'MappingNotes_Usage', 'MappingNotes_Rationale', 'MappingNotes_Comments',
       'MappingNotes_Reasons', 'MappingNotes_Suggestions', 'RelatedWeaknesses',
       'WeaknessOrdinalities', 'DetectionMethods', 'DemonstrativeExamples',
       'FunctionalAreas', 'Diagram', 'LikelihoodOfExploit',
       'BackgroundDetails', 'NumPaths', 'Paths', 'Children'],
      dtype='object')

In [7]:
from IPython.display import display, Markdown, Latex
import pickle
from pathlib import Path
import statistics as stat

file_path = "tmp/light_doc_cont_count.pkl"

# Lookup of raw weakness records by ID (kept as a notebook-level name).
cwe_dict = {cwe["ID"]: cwe for cwe in data["Weaknesses"]}

# Render every CWE document exactly once and reuse the result for both the
# per-row "Summary" column and the token statistics below.
# NOTE(review): assumes create_ordered_cwe_document returns the same content
# for the same weakness on every call - confirm against its implementation.
documents = [
    create_ordered_cwe_document(weakness, old=True) for weakness in data["Weaknesses"]
]
# If an ID occurs more than once the last record wins, as in `cwe_dict`.
summary_by_id = {
    weakness["ID"]: document.page_content
    for weakness, document in zip(data["Weaknesses"], documents)
}
# Plain indexing (not .map) so an ID missing from the JSON still raises KeyError.
cwes_df["Summary"] = cwes_df["ID"].apply(lambda cwe_id: summary_by_id[cwe_id])

# Token counts are cached on disk; the cache is only trusted when it has one
# entry per document, otherwise it is recomputed and overwritten.
path = Path(file_path)
content_token_counts = None
if path.exists():
    # NOTE: pickle.load can execute arbitrary code - only load cache files that
    # this notebook wrote itself.
    with open(path, "rb") as file:
        content_token_counts = pickle.load(file)
    if len(content_token_counts) != len(documents):
        content_token_counts = None  # stale cache from a different document set

if content_token_counts is None:
    content_token_counts = [
        count_tokens(doc.page_content, "text-embedding-3-large") for doc in documents
    ]
    with open(path, "wb") as file:
        pickle.dump(content_token_counts, file)

print(content_token_counts)
print("Mean token count:\t", stat.mean(content_token_counts))
print("Median token count:\t", stat.median(content_token_counts))
print("Max token count:\t", max(content_token_counts))
print("Min token count:\t", min(content_token_counts))

[262, 499, 743, 579, 577, 565, 676, 340, 367, 747, 736, 690, 223, 122, 289, 874, 1436, 661, 349, 426, 431, 303, 962, 783, 197, 690, 439, 345, 971, 358, 166, 433, 507, 416, 341, 423, 139, 232, 377, 78, 553, 186, 78, 233, 260, 76, 295, 38, 297, 89, 232, 40, 232, 134, 129, 53, 333, 162, 121, 172, 528, 426, 344, 264, 107, 242, 44, 143, 49, 95, 104, 181, 52, 299, 314, 28, 34, 44, 43, 45, 184, 226, 516, 109, 119, 36, 431, 292, 380, 110, 514, 376, 373, 212, 478, 441, 471, 50, 82, 367, 182, 347, 224, 232, 91, 130, 242, 409, 552, 174, 192, 198, 54, 130, 338, 212, 471, 118, 383, 141, 52, 62, 45, 160, 320, 229, 173, 228, 206, 152, 203, 127, 223, 40, 34, 289, 214, 201, 427, 183, 259, 505, 404, 221, 221, 269, 200, 251, 297, 592, 216, 293, 181, 191, 238, 310, 310, 124, 146, 303, 346, 303, 34, 294, 61, 56, 559, 188, 54, 52, 470, 547, 326, 58, 157, 668, 37, 107, 767, 167, 287, 119, 45, 125, 49, 329, 290, 305, 331, 442, 97, 39, 373, 370, 154, 161, 180, 407, 329, 339, 223, 275, 218, 165, 242, 273, 298, 

In [8]:
# Rows that already carry both an extended description and demonstrative examples.
complete_rows = cwes_df.dropna(subset=["ExtendedDescription", "DemonstrativeExamples"])
print("Number of complete descriptions:\t", len(complete_rows))

# Pick three showcase entries whose mapping usage is not "Prohibited". The seed
# is fixed so the same entries are shown on every Restart & Run All.
sample = complete_rows[complete_rows["MappingNotes_Usage"] != "Prohibited"].sample(
    3, random_state=42
)
display(sample[["Summary"]])

Number of complete descriptions:	 436


Unnamed: 0,Summary
192,# CWE-786: Access of Memory Location Before St...
872,# CWE-107: Struts: Unused Validation Form\n\n#...
290,# CWE-15: External Control of System or Config...


In [9]:
# Show the first sampled summary twice: as raw Markdown source, then rendered.
first_summary = sample["Summary"].iloc[0]
print(first_summary)
display(Markdown(first_summary))

# CWE-786: Access of Memory Location Before Start of Buffer

## Description
The product reads or writes to a buffer using an index or pointer that references a memory location prior to the beginning of the buffer.

## Extended Description
This typically occurs when a pointer or its index is decremented to a position before the buffer, when pointer arithmetic results in a position before the beginning of the valid memory location, or when a negative index is used.

## Demonstrative Scenario

### Scenario

In the following C/C++ example, a utility function is used to trim trailing whitespace from a character string. The function copies the input string to a local character string and uses a while statement to remove the trailing whitespace by moving backward through the string and overwriting whitespace with a NUL character.

### Vulnerable C Code
```c
char* trimTrailingWhitespace(char *strMessage, int length) {
			char *retMessage;
			char *message = malloc(sizeof(char)*(length+1));

//

# CWE-786: Access of Memory Location Before Start of Buffer

## Description
The product reads or writes to a buffer using an index or pointer that references a memory location prior to the beginning of the buffer.

## Extended Description
This typically occurs when a pointer or its index is decremented to a position before the buffer, when pointer arithmetic results in a position before the beginning of the valid memory location, or when a negative index is used.

## Demonstrative Scenario

### Scenario

In the following C/C++ example, a utility function is used to trim trailing whitespace from a character string. The function copies the input string to a local character string and uses a while statement to remove the trailing whitespace by moving backward through the string and overwriting whitespace with a NUL character.

### Vulnerable C Code
```c
char* trimTrailingWhitespace(char *strMessage, int length) {
			char *retMessage;
			char *message = malloc(sizeof(char)*(length+1));

// copy input string to a temporary string* 
			char message[length+1];
			int index;
			for (index = 0; index < length; index++) {
			
				message[index] = strMessage[index];
			}
			message[index] = '\0';

// trim trailing whitespace* 
			int len = index-1;
			while (isspace(message[len])) {
			
				message[len] = '\0';
				len--;
			}

// return string without trailing whitespace* 
			retMessage = message;
			return retMessage;}
```

### Analysis

However, this function can cause a buffer underwrite if the input character string contains all whitespace. On some systems the while statement will move backwards past the beginning of a character string and will call the isspace() function on an address outside of the bounds of the local buffer.

In [20]:
from utils import *
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field


# Schema handed to `llm.with_structured_output` further down in this cell: the
# reply consists of a single string field holding the CWE description text.
class ReplySchema(BaseModel):
    # The `description` text below is runtime data attached to the field.
    gpt_cwe_description: str = Field(
        description="complete CWE vulnerability description"
    )


# Chat model used to complete CWE descriptions.
# NOTE(review): OPENAI_API_KEY_KTH is not defined in this cell - presumably it
# is provided by `from utils import *`; confirm.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
    max_retries=2,
    api_key=OPENAI_API_KEY_KTH,
    max_tokens=10000,  # upper bound on the length of a single reply
)

# Load the prompt texts from the sibling `utils/prompts` directory, resolved
# relative to the kernel's current working directory.
# NOTE(review): `os` is not imported in this cell - presumably it comes from
# the star import or from a cell executed earlier; confirm on a fresh kernel.
prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")

# Few-shot chat prompt: the system instructions, two worked human -> ai example
# pairs (an incomplete CWE entry followed by its completed form), and finally
# the entry to complete, injected through the single template variable `msg`.
# Literal braces inside the examples are doubled ("{{" / "}}") so that the
# template does not treat them as variables.
# NOTE(review): the example strings are reproduced unchanged, including the
# source indentation that sits inside the triple quotes.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", prompts_dict["vul_desc_comp_final"]),
        (
            "human",
            """
            # CWE-1422: Exposure of Sensitive Information caused by Incorrect Data Forwarding during Transient Execution\n\n## Description\nA processor event or prediction may allow incorrect or stale data to\n\t\t be forwarded to transient operations, potentially exposing data over a\n\t\t covert channel.
            """,
        ),
        (
            "ai",
            """
            # CWE-1422: Exposure of Sensitive Information caused by Incorrect Data Forwarding during Transient Execution\n\n## Description\nA processor event or prediction may allow incorrect or stale data to\n\t\t be forwarded to transient operations, potentially exposing data over a\n\t\t covert channel.\n\n## Extended Description\n\n\nSoftware may use a variety of techniques to preserve the confidentiality of private data that is accessible within the current processor context. For example, the memory safety and type safety properties of some high-level programming languages help to prevent software written in those languages from exposing private data. As a second example, software sandboxes may co-locate multiple users' software within a single process. The processor's Instruction Set Architecture (ISA) may permit one user's software to access another user's data (because the software shares the same address space), but the sandbox prevents these accesses by using software techniques such as bounds checking.\n\n\nIf incorrect or stale data can be forwarded (for example, from a cache) to transient operations, then the operations' microarchitectural side effects may correspond to the data. If an attacker can trigger these transient operations and observe their side effects through a covert channel, then the attacker may be able to infer the data. For example, an attacker process may induce transient execution in a victim process that causes the victim to inadvertently access and then expose its private data via a covert channel. In the software sandbox example, an attacker sandbox may induce transient execution in its own code, allowing it to transiently access and expose data in a victim sandbox that shares the same address space.\n\n\nConsequently, weaknesses that arise from incorrect/stale data forwarding might violate users' expectations of software-based memory safety and isolation techniques. 
If the data forwarding behavior is not properly documented by the hardware vendor, this might violate the software vendor's expectation of how the hardware should behave.\n\n\n## Demonstrative Scenario\n\n### Scenario\n\nFaulting loads in a victim domain may trigger incorrect transient forwarding, which leaves secret-dependent traces in the microarchitectural state. Consider this code sequence example from [REF-1391].\n\n### Vulnerable C Code\n```c\nvoid call_victim(size_t untrusted_arg) {{\n\n\n\t *arg_copy = untrusted_arg;\n\t array[**trusted_ptr * 4096];\n }}\n```\n\n### Analysis\n\nA processor with this weakness will store the value of untrusted_arg (which may be provided by an attacker) to the stack, which is trusted memory. Additionally, this store operation will save this value in some microarchitectural buffer, for example, the store buffer.\n\n\nIn this code sequence, trusted_ptr is dereferenced while the attacker forces a page fault. The faulting load causes the processor to mis-speculate by forwarding untrusted_arg as the (transient) load result. The processor then uses untrusted_arg for the pointer dereference. After the fault has been handled and the load has been re-issued with the correct argument, secret-dependent information stored at the address of trusted_ptr remains in microarchitectural state and can be extracted by an attacker using a vulnerable code sequence.
            """,
        ),
        (
           "human",
           """
           # CWE-291: Reliance on IP Address for Authentication\n\n## Description\nThe product uses an IP address for authentication.\n\n## Extended Description\nIP addresses can be easily spoofed. Attackers can forge the source IP address of the packets they send, but response packets will return to the forged IP address. To see the response packets, the attacker has to sniff the traffic between the victim machine and the forged IP address. In order to accomplish the required sniffing, attackers typically attempt to locate themselves on the same subnet as the victim machine. Attackers may be able to circumvent this requirement by using source routing, but source routing is disabled across much of the Internet today. In summary, IP address verification can be a useful part of an authentication scheme, but it should not be the single factor required for authentication.
           """,
        ),
        (
           "ai",
           """
           # CWE-291: Reliance on IP Address for Authentication\n\n## Description\nThe product uses an IP address for authentication.\n\n## Extended Description\nIP addresses can be easily spoofed. Attackers can forge the source IP address of the packets they send, but response packets will return to the forged IP address. To see the response packets, the attacker has to sniff the traffic between the victim machine and the forged IP address. In order to accomplish the required sniffing, attackers typically attempt to locate themselves on the same subnet as the victim machine. Attackers may be able to circumvent this requirement by using source routing, but source routing is disabled across much of the Internet today. In summary, IP address verification can be a useful part of an authentication scheme, but it should not be the single factor required for authentication.\n\n## Demonstrative Scenario\n\n### Scenario\n\nBoth of these examples check if a request is from a trusted address before responding to the request.\n\n### Vulnerable Java Code\n```java\nwhile(true) {{\n\t\t\tDatagramPacket rp=new DatagramPacket(rData,rData.length);\n\t\t\toutSock.receive(rp);\n\t\t\tString in = new String(p.getData(),0, rp.getLength());\n\t\t\tInetAddress clientIPAddress = rp.getAddress();\n\t\t\tint port = rp.getPort();\n\t\t\tif (isTrustedAddress(clientIPAddress) & secretKey.equals(in)) {{\n\t\t\t\tout = secret.getBytes();\n\t\t\t\tDatagramPacket sp =new DatagramPacket(out,out.length, IPAddress, port); outSock.send(sp);\n\t\t\t}}\n\t}}\n```\n\n### Analysis\n\nThe code only verifies the address as stored in the request packet. An attacker can spoof this address, thus impersonating a trusted client.
                   """,
        ),
        ("human", "{msg}"),
    ]
)


def parser(message: ReplySchema):
    """Serialize the structured model reply to a JSON string.

    Args:
        message: The ``ReplySchema`` instance returned by the structured-output
            LLM step of the chain.

    Returns:
        The result of ``message.model_dump_json()``.
    """
    serialized = message.model_dump_json()
    return serialized


# Rebind `llm` to its structured-output variant for ReplySchema, then compose
# the pipeline: prompt -> structured LLM -> JSON string (via `parser`).
# Because `llm` is rebound in place, this line relies on the ChatOpenAI
# construction earlier in this cell having just run.
llm = llm.with_structured_output(ReplySchema)
chain = prompt | llm | parser

In [15]:
from utils import *
from pandarallel import pandarallel
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from openai import OpenAIError, RateLimitError  # Explicitly import errors

# Start the pandarallel workers that back `parallel_apply`, with progress bars.
pandarallel.initialize(progress_bar=True, nb_workers=15)

# Retry-with-backoff via tenacity is NOT applied to `_gpt_classify`. The
# configuration is kept here as a real comment for reference (a bare string
# literal in this position is a no-op expression that reads like live code):
# @retry(
#     stop=stop_after_attempt(5),  # Retry up to 5 times
#     wait=wait_exponential(multiplier=2, min=0.25, max=10),  # Exponential backoff
#     retry=retry_if_exception_type(RateLimitError),  # Retry only on rate limit errors
# )


def _gpt_classify(msg):
    if not msg or not isinstance(msg, str):  # Check for empty/invalid messages
        return None
    return chain.invoke({"msg": msg})  # Adjusted for OpenAI API format


def gpt_classify(msg):
    try:
        return _gpt_classify(msg)
    except OpenAIError as e:  # Catch all OpenAI-specific errors
        print(f"OpenAI API error: {e}")
    except Exception as e:
        print(f"General error processing message: {e}")
    return None

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [16]:
import pickle
import os

file_path = "tmp/gpt_cwe_desc_few_df.pkl"

# Resume from the checkpoint of a previous run when one exists; otherwise
# initialise the target column from the existing summaries.
if os.path.exists(file_path):
    # NOTE: pickle.load can execute arbitrary code - only load checkpoint files
    # that this notebook wrote itself.
    with open(file_path, "rb") as file:
        cwes_df = pickle.load(file)
    print("Pickle file loaded successfully!")
else:
    print(
        f"The file at {file_path} does not exist. "
        "Initialising gpt_cwe_description from Summary (None for incomplete entries)"
    )
    # Start from the existing summary for every row ...
    cwes_df["gpt_cwe_description"] = cwes_df.Summary
    # ... then blank out the rows that lack an extended description or
    # demonstrative examples, so they show up as missing (NaN/None).
    needs_completion = (
        cwes_df.ExtendedDescription.isna() | cwes_df.DemonstrativeExamples.isna()
    )
    cwes_df.loc[needs_completion, "gpt_cwe_description"] = None

The file at tmp/gpt_cwe_desc_few_df.pkl does not exist. Setting gpt_response to 'None'


In [12]:
# Check the schema again; the recorded output now also lists 'Summary' and
# 'gpt_cwe_description'.
cwes_df.columns

Index(['ID', 'Name', 'Abstraction', 'Structure', 'Status', 'Description',
       'ExtendedDescription', 'ApplicablePlatforms', 'AlternateTerms',
       'ModesOfIntroduction', 'CommonConsequences', 'PotentialMitigations',
       'ObservedExamples', 'AffectedResources', 'TaxonomyMappings',
       'RelatedAttackPatterns', 'References', 'Notes', 'ContentHistory',
       'MappingNotes_Usage', 'MappingNotes_Rationale', 'MappingNotes_Comments',
       'MappingNotes_Reasons', 'MappingNotes_Suggestions', 'RelatedWeaknesses',
       'WeaknessOrdinalities', 'DetectionMethods', 'DemonstrativeExamples',
       'FunctionalAreas', 'Diagram', 'LikelihoodOfExploit',
       'BackgroundDetails', 'NumPaths', 'Paths', 'Children', 'Summary',
       'gpt_cwe_description'],
      dtype='object')

In [21]:
import pickle
import time

file_path = "tmp/gpt_cwe_desc_few_df.pkl"
max_retries = 10
DELAY = 1
retries = 0

# Keep sending the rows whose description is still missing to the model until
# none are left or the pass budget is used up. Rows for which gpt_classify
# returned None stay missing and are picked up again on the next pass.
while retries < max_retries:
    na_indices = cwes_df.loc[cwes_df.gpt_cwe_description.isna()].index
    if len(na_indices) == 0:
        break

    # Only the missing rows are processed; results are written back by index.
    completed = cwes_df.loc[na_indices, "Summary"].parallel_apply(gpt_classify)
    cwes_df.loc[na_indices, "gpt_cwe_description"] = completed

    # Checkpoint the whole frame after every pass.
    with open(file_path, "wb") as file:
        pickle.dump(cwes_df, file)

    retries += 1
    print(f"Retry {retries}: Processed {len(na_indices)} rows")
    time.sleep(DELAY)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))),))

OpenAI API error: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=10000, prompt_tokens=1615, total_tokens=11615, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=1408))
Retry 1: Processed 1 rows


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))),))

OpenAI API error: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=10000, prompt_tokens=1615, total_tokens=11615, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=1408))
Retry 2: Processed 1 rows


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))),))

Retry 3: Processed 1 rows


In [22]:
import json

# The chain's `parser` step returns each reply as a JSON string. Unwrap those
# cells so the column holds the plain description text; cells that do not start
# with the JSON wrapper (including missing values) are left untouched.
mask = cwes_df["gpt_cwe_description"].str.contains(
    r"^\{\"gpt_cwe_description\":", na=False
)


def _unwrap_description(raw):
    """Return the ``gpt_cwe_description`` value of a JSON-encoded reply."""
    return json.loads(raw)["gpt_cwe_description"]


wrapped_cells = cwes_df.loc[mask, "gpt_cwe_description"]
cwes_df.loc[mask, "gpt_cwe_description"] = wrapped_cells.apply(_unwrap_description)

In [20]:
from IPython.display import display, Markdown

# Render one entry whose description differs from the original summary: select
# those rows and show the one at position 10 as Markdown.
changed_descriptions = cwes_df.loc[
    cwes_df.gpt_cwe_description != cwes_df.Summary, "gpt_cwe_description"
]
display(Markdown(changed_descriptions.iloc[10]))

# CWE-623: Unsafe ActiveX Control Marked Safe For Scripting

## Description
An ActiveX control is intended for restricted use, but it has been marked as safe-for-scripting.

## Extended Description
This might allow attackers to use dangerous functionality via a web page that accesses the control, which can lead to different resultant vulnerabilities, depending on the control's behavior.

## Demonstrative Scenario

### Scenario
An attacker creates a malicious web page that utilizes an ActiveX control marked as safe-for-scripting. When a user visits this page, the control executes potentially harmful actions on the user's system without their consent.

### Vulnerable HTML Code
```html
<html>
<head>
<title>Malicious Page</title>
</head>
<body>
<script>
  var obj = new ActiveXObject('UnsafeControl');
  obj.dangerousMethod(); // This method could perform harmful actions
</script>
</body>
</html>
```

### Analysis
In this scenario, the ActiveX control 'UnsafeControl' is marked as safe-for-scripting, allowing it to be instantiated and used in a web page. The method 'dangerousMethod()' could perform actions such as accessing the file system, modifying system settings, or executing arbitrary code. Since the control is intended for restricted use, marking it as safe-for-scripting exposes users to significant security risks, as attackers can exploit this functionality to execute malicious actions on the user's machine.

In [36]:
from datasets import load_dataset
import pandas as pd

# Load the "train" split of the dataset that carries the 'Summary' and
# 'gpt_cwe_description' columns, as a DataFrame.
gpt_cwes = load_dataset("Eathus/cwe_view1000_list_gpt_few_cwe_desc", split="train")
gpt_cwes_df = gpt_cwes.to_pandas()

# Load the raw CWE view-1000 dataset the same way.
cwes_raw = load_dataset("Eathus/cwe_view1000_raw_list", split="train")
cwes_raw_df = cwes_raw.to_pandas()

# Print both schemas for comparison.
print(gpt_cwes_df.columns)
print(cwes_raw_df.columns)

Using the latest cached version of the dataset since Eathus/cwe_view1000_list_gpt_few_cwe_desc couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/edvinn/.cache/huggingface/datasets/Eathus___cwe_view1000_list_gpt_few_cwe_desc/default/0.0.0/f514e2a15842a43c203e9e39aeab346e1d9050fd (last modified on Wed May 28 16:33:41 2025).
Using the latest cached version of the dataset since Eathus/cwe_view1000_raw_list couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/edvinn/.cache/huggingface/datasets/Eathus___cwe_view1000_raw_list/default/0.0.0/2c8d79ed3c8c5b99885a627f055d60b440763da5 (last modified on Wed May 28 16:05:27 2025).


Index(['ID', 'Name', 'Abstraction', 'Structure', 'Status', 'Description',
       'ExtendedDescription', 'ApplicablePlatforms', 'AlternateTerms',
       'ModesOfIntroduction', 'CommonConsequences', 'PotentialMitigations',
       'ObservedExamples', 'AffectedResources', 'TaxonomyMappings',
       'RelatedAttackPatterns', 'References', 'Notes', 'ContentHistory',
       'MappingNotes_Usage', 'MappingNotes_Rationale', 'MappingNotes_Comments',
       'MappingNotes_Reasons', 'MappingNotes_Suggestions', 'RelatedWeaknesses',
       'WeaknessOrdinalities', 'DetectionMethods', 'DemonstrativeExamples',
       'FunctionalAreas', 'Diagram', 'LikelihoodOfExploit',
       'BackgroundDetails', 'NumPaths', 'Paths', 'Children', 'Summary',
       'gpt_cwe_description'],
      dtype='object')
Index(['ID', 'Name', 'Abstraction', 'Structure', 'Status', 'Description',
       'ExtendedDescription', 'ApplicablePlatforms', 'AlternateTerms',
       'ModesOfIntroduction', 'CommonConsequences', 'PotentialMitigation

In [None]:
# Row counts of both frames (GPT-completed first, then raw).
for frame in (gpt_cwes_df, cwes_raw_df):
    print(len(frame))

In [38]:
import pandas as pd

# Attach the 'Summary' and 'gpt_cwe_description' columns to the raw CWE table
# by ID, using the default join type, then report the merged size and schema.
gpt_columns = gpt_cwes_df[["ID", "Summary", "gpt_cwe_description"]]
cwes_df = cwes_raw_df.merge(gpt_columns, on="ID")
print(len(cwes_df))
print(cwes_df.columns)

940
Index(['ID', 'Name', 'Abstraction', 'Structure', 'Status', 'Description',
       'ExtendedDescription', 'ApplicablePlatforms', 'AlternateTerms',
       'ModesOfIntroduction', 'CommonConsequences', 'PotentialMitigations',
       'ObservedExamples', 'AffectedResources', 'TaxonomyMappings',
       'RelatedAttackPatterns', 'References', 'Notes', 'ContentHistory',
       'MappingNotes_Usage', 'MappingNotes_Rationale', 'MappingNotes_Comments',
       'MappingNotes_Reasons', 'MappingNotes_Suggestions', 'RelatedWeaknesses',
       'WeaknessOrdinalities', 'DetectionMethods', 'DemonstrativeExamples',
       'FunctionalAreas', 'Diagram', 'LikelihoodOfExploit',
       'BackgroundDetails', 'NumPaths', 'Paths', 'Children', 'Summary',
       'gpt_cwe_description'],
      dtype='object')


In [57]:
# Final schema check of the merged frame before it is uploaded.
cwes_df.columns

Index(['ID', 'Name', 'Abstraction', 'Structure', 'Status', 'Description',
       'ExtendedDescription', 'ApplicablePlatforms', 'AlternateTerms',
       'ModesOfIntroduction', 'CommonConsequences', 'PotentialMitigations',
       'ObservedExamples', 'AffectedResources', 'TaxonomyMappings',
       'RelatedAttackPatterns', 'References', 'Notes', 'ContentHistory',
       'MappingNotes_Usage', 'MappingNotes_Rationale', 'MappingNotes_Comments',
       'MappingNotes_Reasons', 'MappingNotes_Suggestions', 'RelatedWeaknesses',
       'WeaknessOrdinalities', 'DetectionMethods', 'DemonstrativeExamples',
       'FunctionalAreas', 'Diagram', 'LikelihoodOfExploit',
       'BackgroundDetails', 'NumPaths', 'Paths', 'Children', 'Summary',
       'gpt_cwe_description'],
      dtype='object')

In [41]:
from datasets import Dataset

# Convert the merged frame to a `Dataset` and upload it to the Hub repository
# named below.
ds_view1000_complete = Dataset.from_pandas(cwes_df)
ds_view1000_complete.push_to_hub("Eathus/cwe_view1000_list_gpt_few_cwe_desc_final")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/Eathus/cwe_view1000_list_gpt_few_cwe_desc_final/commit/85e1ad6c587331a8c6290de59eb065656c8803c5', commit_message='Upload dataset', commit_description='', oid='85e1ad6c587331a8c6290de59eb065656c8803c5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Eathus/cwe_view1000_list_gpt_few_cwe_desc_final', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Eathus/cwe_view1000_list_gpt_few_cwe_desc_final'), pr_revision=None, pr_num=None)