In [1]:
import sys
from pathlib import Path
# Make the parent directory of the notebook's working directory (Thesis-Edvin)
# importable; presumably this is what lets the project-local `utils` package resolve.
_project_root = str(Path.cwd().parent)
# Only add the entry once: re-running this cell would otherwise append the same
# path to sys.path on every execution.
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
import pandas as pd
import json

# Read the exported CWE-1000 view from disk.
with open("tmp/view_CWE-1000_all_weaknesses.json", "r") as file:
    data = json.load(file)

# Report the catalogue size before and after removing the entries whose
# mapping usage is 'Prohibited'.
cwes = data['Weaknesses']
print(len(cwes))
cwes = [
    weakness
    for weakness in cwes
    if weakness['MappingNotes']['Usage'] != 'Prohibited'
]
print(len(cwes))


940
881


In [3]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from utils import *
# Structured reply requested from the model: the chosen CWE-ID plus a
# self-reported confidence score.
# NOTE(review): documented with `#` comments on purpose. A class docstring on a
# Pydantic model is typically exported as the schema description, which would
# change the schema sent to the model; confirm before turning this into a docstring.
class ReplySchema(BaseModel):
    # CWE-ID picked by the model, carried as a string; the description asks the
    # model to write None when no entry fits.
    gpt_cwe: str = Field(
        description="The CWE-ID (number) of the CWE entry that best fits the vulnerability description if any; otherwise, write None"
    )
    # Confidence on a 1-5 scale. NOTE(review): the range is only stated in the
    # description text; nothing in this class validates it.
    gpt_cwe_confidence: int = Field(
        description="An integer from 1 to 5 indicating your level of confidence  (1 = very low, 2 = low, 3 = medium, 4 = high, 5 = very high)."
    )

# Chat model used for the CWE classification requests (temperature 0.0 requested).
llm = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0.0,
    # OPENAI_API_KEY_KTH is not defined in this notebook; presumably it is
    # provided by the wildcard `from utils import *` — TODO confirm.
    api_key=OPENAI_API_KEY_KTH  # explicit key: overrides the client's default key lookup
) # TODO: maybe set max_tokens to 14000

# Prompt templates are loaded from the sibling utils/prompts directory, resolved
# relative to the current working directory.
prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")

# System message comes from the "baseline_system_setup" template (other cells
# format it with a `cwe_entries` value); the human message is the raw `{desc}`.
prompt = ChatPromptTemplate.from_messages(
    [("system", prompts_dict["baseline_system_setup"]), ("human", "{desc}")],
)


def parser(message: ReplySchema):
    """Serialise a structured model reply to a JSON string.

    Used as the final stage of ``chain``.

    Args:
        message: The ``ReplySchema`` instance produced by the structured-output model.

    Returns:
        The result of ``message.model_dump_json()``.
    """
    serialized_reply = message.model_dump_json()
    return serialized_reply


# Rebind `llm` to a structured-output variant bound to ReplySchema.
# NOTE(review): this shadows the plain ChatOpenAI client created earlier in this
# cell; running this line twice without re-creating `llm` would wrap the wrapper.
llm = llm.with_structured_output(ReplySchema)
# Pipeline: fill the prompt template -> query the model -> serialise the reply to JSON.
chain = prompt | llm | parser

In [4]:
def cwe_list_to_json(cwes) :
    """Wrap a list of CWE entries in a ``{"Weaknesses": [...]}`` JSON document.

    Args:
        cwes: JSON-serialisable list of CWE entries.

    Returns:
        The document as a string, pretty-printed with a 4-space indent.
    """
    payload = {"Weaknesses": cwes}
    return json.dumps(payload, indent=4)

In [5]:
import tiktoken


def count_tokens(text: str, model: str="gpt-4", echo=False) -> int:
    """Return the number of tiktoken tokens in ``text``.

    Args:
        text: The string to tokenise.
        model: Model name used to look up the tiktoken encoding.
        echo: When true, print a confirmation after the model's encoding was found.

    Returns:
        Length of the encoded token sequence.

    If tiktoken does not recognise ``model`` (``KeyError``), a message is printed
    and the ``cl100k_base`` encoding is used instead.

    NOTE(review): the default model here is "gpt-4" while the notebook's LLM is
    "gpt-4.1-mini"; confirm both resolve to the same encoding before relying on
    these counts as a request-size estimate.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print('invalid input model:', model + '.', "Defaulting to cl100k_base")
        tokenizer = tiktoken.get_encoding("cl100k_base")  # fallback encoding
    else:
        if echo:
            print("Finished encoding using model:", model)

    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [6]:
import statistics as stat
# Token cost of every CWE entry when serialised on its own, plus 2 tokens per entry
# (presumably an allowance for the separator between list items — TODO confirm).
per_entry_json = (json.dumps(cwe, indent=4) for cwe in cwes)
cwe_token_counts = [count_tokens(entry_json) + 2 for entry_json in per_entry_json]
print(cwe_token_counts)

# Summary statistics over the per-entry counts.
summary_rows = [
    ("Mean token count:\t\t", stat.mean(cwe_token_counts)),
    ("Median token count:\t\t", stat.median(cwe_token_counts)),
    ("Max token count:\t\t", max(cwe_token_counts)),
    ("Min token count:\t\t", min(cwe_token_counts)),
    ("Total token count:\t\t", sum(cwe_token_counts)),
]
for label, value in summary_rows:
    print(label, value)

# Reference figure: token count of the whole catalogue serialised as one document.
print("Total packaged token count:\t", count_tokens(cwe_list_to_json(cwes)))

[4737, 5705, 3561, 1596, 1860, 3281, 2473, 2149, 1636, 2307, 2259, 4172, 3826, 1327, 2799, 2229, 2850, 1906, 2076, 2465, 1607, 1461, 2451, 2022, 1218, 2103, 1994, 1832, 1919, 1517, 1608, 1624, 2055, 3196, 1543, 1552, 1100, 5818, 8131, 2736, 2523, 1233, 1156, 1636, 3391, 2017, 2461, 2337, 3157, 3739, 3023, 1891, 2167, 1985, 2006, 1781, 8077, 677, 1832, 1632, 3487, 2197, 1585, 3396, 1832, 2161, 1349, 1374, 1491, 1203, 1479, 1603, 1108, 10124, 2431, 3761, 1398, 1303, 1939, 1461, 2852, 7058, 1832, 3334, 2265, 1668, 2359, 6271, 3646, 1346, 3242, 2128, 2122, 1471, 3240, 2413, 2901, 1489, 1121, 8522, 1583, 3543, 1614, 2098, 1028, 2731, 2149, 8835, 6391, 3928, 2743, 2714, 2299, 3062, 2292, 2002, 4053, 2039, 2303, 2490, 1338, 1806, 1405, 4772, 2198, 2441, 4628, 3786, 2634, 2538, 1998, 1346, 2440, 1620, 1434, 1794, 2037, 2219, 2773, 1306, 5434, 3703, 5037, 2201, 3187, 2083, 1970, 7561, 3135, 2769, 2138, 1609, 3926, 2198, 2548, 1807, 4096, 1831, 1972, 2563, 2387, 3220, 1293, 2587, 1631, 2082, 181

In [7]:
# Serialise an 80-entry slice of the catalogue and display its token count.
chunk_bounds = slice(130, 210)
cwe_chunk_json = cwe_list_to_json(cwes[chunk_bounds])
count_tokens(cwe_chunk_json)

251702

In [8]:
from datasets import load_dataset

# Fetch the "test" split of the few-shot detection results and convert it to pandas.
test_few = load_dataset("Eathus/github-issues-vul-detection-gpt-few-vul-desc-results", split="test")
test_few_df = test_few.to_pandas()

In [9]:
# Row masks: issues flagged as relevant, and issues that carry a CVE id.
flagged_relevant = test_few_df.gpt_is_relevant
has_cve = test_few_df.cve_id.notna()

# Flagged rows with a CVE id, flagged rows without one, and all flagged rows.
true_pos_few = test_few_df[flagged_relevant & has_cve]
false_pos_few = test_few_df[flagged_relevant & ~has_cve]
all_true_few = test_few_df[flagged_relevant]

In [10]:
# Show the generated description and the primary CWE of the first true positive.
first_true_pos = true_pos_few.iloc[0]
print(first_true_pos.gpt_description)
print(first_true_pos.cve_primary_cwe)

The issue describes a double free vulnerability in the LibreDWG library, specifically in the function `dwg_free_MATERIAL_private` at line 7662 of `dwg.spec`. The AddressSanitizer output indicates that the program attempts to free a memory region that has already been freed, which can lead to undefined behavior, crashes, or potential exploitation by an attacker.
415


In [11]:
import tiktoken

def count_chat_tokens(messages, model="gpt-4", echo=False):
    """Estimate the token count of a list of chat messages.

    Every message costs a fixed 3 tokens plus the encoded length of each of its
    values; a message that has a "name" key costs 1 extra token. A further
    3 tokens are added once for priming.

    Args:
        messages: Iterable of dicts whose values are strings (e.g. "role", "content").
        model: Model name used to look up the tiktoken encoding.
        echo: When true, print a confirmation after the model's encoding was found.

    Returns:
        The estimated total as an int.

    If tiktoken does not recognise ``model`` (``KeyError``), a message is printed
    and the ``cl100k_base`` encoding is used instead.

    NOTE(review): the default model here is "gpt-4" while the notebook's LLM is
    "gpt-4.1-mini"; confirm both resolve to the same encoding.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print('invalid input model:', model + '.', "Defaulting to cl100k_base")
        tokenizer = tiktoken.get_encoding("cl100k_base")
    else:
        if echo:
            print("Finished encoding using model:", model)

    per_message_overhead = 3
    name_surcharge = 1
    reply_priming = 3

    def message_cost(message):
        # Fixed overhead + encoded size of every field (+1 when a name is present).
        field_tokens = sum(len(tokenizer.encode(value)) for value in message.values())
        extra = name_surcharge if "name" in message else 0
        return per_message_overhead + field_tokens + extra

    return sum(message_cost(message) for message in messages) + reply_priming


In [12]:


# Measure the chat-token size of the prompt twice: first with the serialised CWE
# chunk embedded, then with no entries at all (the fixed prompt overhead).
first_description = true_pos_few.iloc[0].gpt_description
for entries in (cwe_chunk_json, ''):
    formatted_system_prompt = prompts_dict["baseline_system_setup"].format(cwe_entries=entries)
    messages = [
        {"role": "system", "content": formatted_system_prompt},
        {"role": "user", "content": first_description}
    ]
    print(count_chat_tokens(messages))

# For comparison: the token count of the serialised chunk on its own.
print(count_tokens(cwe_chunk_json))

251995
293
251702


In [None]:
def find_cwe_list(high, low, cwes, max_request_size, cwe_tc_list, msg_tc, overhead=11):
    """Pick the longest prefix of ``cwes`` whose estimated request size fits the budget.

    The estimated size of a request carrying the first ``k`` entries is
    ``sum(cwe_tc_list[:k]) + msg_tc + overhead``. The search is a binary search
    over ``k`` and therefore assumes the per-entry counts are non-negative, so
    that the estimate never shrinks as ``k`` grows.

    Args:
        high: Largest prefix length to consider (the notebook passes ``len(cwes)``).
        low: Smallest prefix length to consider (the notebook passes ``0``). It is
            returned unchanged when no larger prefix fits, even if ``low`` itself
            is over budget.
        cwes: Candidate CWE entries, in the order they would be sent.
        max_request_size: Token budget for the whole request.
        cwe_tc_list: Per-entry token counts, aligned index-by-index with ``cwes``.
        msg_tc: Token count of the chat messages without any CWE entries.
        overhead: Fixed token allowance added to every estimate (presumably the
            JSON wrapper around the entry list; 11 is the value previously
            hard-coded here).

    Returns:
        ``(cwes[:k], k, estimated_tokens)`` for the largest ``k`` in ``[low, high]``
        whose estimate does not exceed ``max_request_size``.
    """
    def request_size(count):
        # Estimated tokens for a request carrying the first `count` entries.
        return sum(cwe_tc_list[:count]) + msg_tc + overhead

    # The upper bound itself must be tested: bisection below only ever probes
    # midpoints strictly under `high`, which previously caused the last entry
    # to be dropped even when the full list fitted.
    if high > low:
        full_size = request_size(high)
        if full_size <= max_request_size:
            return cwes[:high], high, full_size

    # Invariant: `low` is the best known fitting length, `high` is known to be
    # over budget. Narrow until they are adjacent.
    while high - low > 1:
        mid = low + (high - low) // 2
        if request_size(mid) <= max_request_size:
            low = mid
        else:
            high = mid

    return cwes[:low], low, request_size(low)
    
def get_gpt_cwe(cwes, desc, max_request_size) :
    """Placeholder: not implemented yet.

    The arguments are accepted but ignored, and the function always returns ``None``.
    """
    # TODO: implement.
    return None

In [23]:

# Token cost of the prompt with no CWE entries embedded.
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(cwe_entries='')
messages = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description}
]
msg_tc = count_chat_tokens(messages)

# Search the catalogue from index 100 onwards for the longest run of entries
# whose estimated request size stays within 1,000,000 tokens.
candidate_cwes = cwes[100:]
candidate_token_counts = cwe_token_counts[100:]
cwe_chunk, end_ind, size = find_cwe_list(
    len(candidate_cwes), 0, candidate_cwes, 1000000, candidate_token_counts, msg_tc
)

In [24]:
# Re-measure the chunk chosen by the search: build the real prompt and compare its
# token count ("true size") with the estimate returned by find_cwe_list ("size").
selected_chunk_json = cwe_list_to_json(cwe_chunk)  # serialise once and reuse below
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(cwe_entries=selected_chunk_json)
msgs = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description}
]
print('end_ind:\t', end_ind)
print('true size:\t', count_chat_tokens(msgs))
print('size:\t', size)
print('len(cwe_chunk):\t', len(cwe_chunk))

# Print only the head of the serialised chunk: the full document is several
# megabytes and would flood the cell output and bloat the saved notebook.
preview_chars = 1000
chunk_preview = selected_chunk_json[:preview_chars]
if len(selected_chunk_json) > preview_chars:
    chunk_preview += '\n... [truncated, ' + str(len(selected_chunk_json)) + ' characters in total]'
print('cwe_chunk:\t', chunk_preview)


end_ind:	 342
true size:	 998493
size:	 998493
len(cwe_chunk):	 342
cwe_chunk:	 {
    "Weaknesses": [
        {
            "ID": "1390",
            "Name": "Weak Authentication",
            "Abstraction": "Class",
            "Structure": "Simple",
            "Status": "Incomplete",
            "Description": "The product uses an authentication mechanism to restrict access to specific users or identities, but the mechanism does not sufficiently prove that the claimed identity is correct.",
            "ExtendedDescription": "\n\nAttackers may be able to bypass weak authentication faster and/or with less effort than expected.\n",
            "RelatedWeaknesses": [
                {
                    "Nature": "ChildOf",
                    "CweID": "287",
                    "ViewID": "1000",
                    "Ordinal": "Primary"
                }
            ],
            "ApplicablePlatforms": [
                {
                    "Type": "Language",
                    "C

In [22]:
# Send the selected CWE chunk together with the first true-positive description.
# NOTE(review): the saved output of this cell is a 400 context_length_exceeded
# error. Its execution count (22) is lower than that of the cells that compute
# `cwe_chunk` (23, 24), so it may have run against a stale chunk — re-run the
# notebook top to bottom. Also confirm that the token estimates (default model
# "gpt-4") use the same encoding as the deployed "gpt-4.1-mini".
request_inputs = {
    'cwe_entries': cwe_list_to_json(cwe_chunk),
    'desc': true_pos_few.iloc[0].gpt_description,
}
chain.invoke(request_inputs)

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 1047576 tokens. However, your messages resulted in 1495357 tokens (including 127 in the response_format schemas.). Please reduce the length of the messages or schemas.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}