In [1]:
import sys
from pathlib import Path

# Get parent directory (Thesis-Edvin)
sys.path.append(str(Path.cwd().parent))

In [2]:
with open("tmp/cwe_output.json", "r") as file:
    file_id = file.fileno()
    print(file_id)

66


In [3]:
import pandas as pd
import json

with open("tmp/view_CWE-1000_all_weaknesses.json", "r") as file:
    data = json.load(file)

cwes = data["Weaknesses"]
print(len(cwes))
cwes = [w for w in cwes if w["MappingNotes"]["Usage"] != "Prohibited"]
print(len(cwes))

940
881


In [4]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from utils import *


class ReplySchema(BaseModel):
    gpt_cwe: str = Field(
        description="The CWE-ID (number) of the CWE entry that best fits the vulnerability description if any; otherwise, write None"
    )
    gpt_cwe_confidence: int = Field(
        description="An integer from 1 to 5 indicating your level of confidence  (1 = very low, 2 = low, 3 = medium, 4 = high, 5 = very high)."
    )


llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
    api_key=OPENAI_API_KEY_KTH,  # <- this overrides the default
)  # maybe set max_token to 14000

prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")

prompt = ChatPromptTemplate.from_messages(
    [("system", prompts_dict["baseline_system_setup"]), ("human", "{desc}")],
)


def parser(message: ReplySchema):
    return message.model_dump_json()


llm = llm.with_structured_output(ReplySchema)
chain = prompt | llm | parser

In [5]:
from utils import *
from pandarallel import pandarallel
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from openai import OpenAIError, RateLimitError  # Explicitly import errors


@retry(
    stop=stop_after_attempt(5),  # Retry up to 5 times
    wait=wait_exponential(multiplier=2, min=1, max=60),  # Exponential backoff
    retry=retry_if_exception_type(RateLimitError),  # Retry only on rate limit errors
)
def _gpt_classify(desc, cwe_entries):
    if (
        not desc
        or not isinstance(desc, str)
        or not cwe_entries
        or not isinstance(cwe_entries, str)
    ):  # Check for empty/invalid messages
        return None
    return chain.invoke(
        {
            "cwe_entries": cwe_entries,
            "desc": desc,
        }
    )  # Adjusted for OpenAI API format


def gpt_classify(desc, cwe_entries):
    try:
        return _gpt_classify(desc, cwe_entries)
    except OpenAIError as e:  # Catch all OpenAI-specific errors
        print(f"OpenAI API error: {e}")
    except Exception as e:
        print(f"General error processing message: {e}")
    return None

In [6]:
def cwe_list_to_json(cwes, indent=4):
    return json.dumps({"Weaknesses": cwes}, indent=indent)

In [7]:
import tiktoken


def count_tokens(text: str, model: str = "gpt-4", echo=False) -> int:
    """Count tokens for OpenAI models using tiktoken"""
    try:
        encoding = tiktoken.encoding_for_model(model)
        if echo:
            print("Finished encoding using model:", model)
    except KeyError:
        print("invalid input model:", model + ".", "Defaulting to cl100k_base")
        encoding = tiktoken.get_encoding("cl100k_base")  # Fallback for most models
    return len(encoding.encode(text))

In [8]:
import statistics as stat

cwe_token_counts = [count_tokens(json.dumps(cwe, indent=4)) + 2 for cwe in cwes]
print(cwe_token_counts)
print("Mean token count:\t\t", stat.mean(cwe_token_counts))
print("Median token count:\t\t", stat.median(cwe_token_counts))
print("Max token count:\t\t", max(cwe_token_counts))
print("Min token count:\t\t", min(cwe_token_counts))
print("Total token count:\t\t", sum(cwe_token_counts))
print("Total packaged token count:\t", count_tokens(cwe_list_to_json(cwes)))

[4737, 5705, 3561, 1596, 1860, 3281, 2473, 2149, 1636, 2307, 2259, 4172, 3826, 1327, 2799, 2229, 2850, 1906, 2076, 2465, 1607, 1461, 2451, 2022, 1218, 2103, 1994, 1832, 1919, 1517, 1608, 1624, 2055, 3196, 1543, 1552, 1100, 5818, 8131, 2736, 2523, 1233, 1156, 1636, 3391, 2017, 2461, 2337, 3157, 3739, 3023, 1891, 2167, 1985, 2006, 1781, 8077, 677, 1832, 1632, 3487, 2197, 1585, 3396, 1832, 2161, 1349, 1374, 1491, 1203, 1479, 1603, 1108, 10124, 2431, 3761, 1398, 1303, 1939, 1461, 2852, 7058, 1832, 3334, 2265, 1668, 2359, 6271, 3646, 1346, 3242, 2128, 2122, 1471, 3240, 2413, 2901, 1489, 1121, 8522, 1583, 3543, 1614, 2098, 1028, 2731, 2149, 8835, 6391, 3928, 2743, 2714, 2299, 3062, 2292, 2002, 4053, 2039, 2303, 2490, 1338, 1806, 1405, 4772, 2198, 2441, 4628, 3786, 2634, 2538, 1998, 1346, 2440, 1620, 1434, 1794, 2037, 2219, 2773, 1306, 5434, 3703, 5037, 2201, 3187, 2083, 1970, 7561, 3135, 2769, 2138, 1609, 3926, 2198, 2548, 1807, 4096, 1831, 1972, 2563, 2387, 3220, 1293, 2587, 1631, 2082, 181

In [9]:
cwe_chunk_json = cwe_list_to_json(cwes[130:210])
count_tokens(cwe_chunk_json)

251702

In [10]:
from datasets import load_dataset

test_few = load_dataset(
    "Eathus/github-issues-vul-detection-gpt-few-strict-vul-desc-results", split="test"
)
test_few_df = test_few.to_pandas()
print("all cves:\t\t\t", len(test_few_df))
test_few_df = test_few_df[~test_few_df.duplicated(subset="issue_github_id", keep=False)]
print("non duplicate issues cves:\t", len(test_few_df))

all cves:			 1778
non duplicate issues cves:	 1763


In [11]:
true_pos_few = test_few_df[test_few_df.gpt_is_relevant & ~test_few_df.cve_id.isna()]
false_pos_few = test_few_df[test_few_df.gpt_is_relevant & test_few_df.cve_id.isna()]
all_true_few = test_few_df[test_few_df.gpt_is_relevant]

print("true pssetive count:\t", len(true_pos_few))
print("false posetive count:\t", len(false_pos_few))
print("TP + FP count:\t\t", len(all_true_few))

true pssetive count:	 291
false posetive count:	 310
TP + FP count:		 601


In [12]:
print(len(test_few_df))
print(len(test_few_df.drop_duplicates(subset="issue_github_id")))
print(len(test_few_df.drop_duplicates(subset=["issue_github_id", "cve_id"])))

print(len(true_pos_few))
print(len(true_pos_few.drop_duplicates(subset="issue_github_id")))
print(len(true_pos_few.drop_duplicates(subset=["issue_github_id", "cve_id"])))

dupes = test_few_df[
    test_few_df.duplicated(subset="issue_github_id", keep=False)
].sort_values("issue_github_id")
print(len(dupes))
display(dupes[["cve_id", "issue_github_id", "issue_number", "cve_primary_cwe"]])

1763
1763
1763
291
291
291
0


Unnamed: 0,cve_id,issue_github_id,issue_number,cve_primary_cwe


In [13]:
print(true_pos_few.iloc[0].gpt_description)
print(true_pos_few.iloc[0].cve_primary_cwe)

The issue describes a double free vulnerability in the LibreDWG library, specifically in the function `dwg_free_MATERIAL_private` at line 7662 of `dwg.spec`. The AddressSanitizer output indicates that the program attempts to free the same memory address twice, which can lead to undefined behavior, including potential exploitation by attackers.
415


In [14]:
import tiktoken


def count_chat_tokens(messages, model="gpt-4", echo=False):
    try:
        encoding = tiktoken.encoding_for_model(model)
        if echo:
            print("Finished encoding using model:", model)

    except KeyError:
        print("invalid input model:", model + ".", "Defaulting to cl100k_base")
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens_per_message = 3
    tokens_per_name = 1

    total_tokens = 0
    for msg in messages:
        total_tokens += tokens_per_message
        for key, value in msg.items():
            total_tokens += len(encoding.encode(value))
            if key == "name":
                total_tokens += tokens_per_name
    total_tokens += 3  # priming
    return total_tokens

In [15]:
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
    cwe_entries=cwe_chunk_json
)
messages = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description},
]

print(count_chat_tokens(messages))
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(cwe_entries="")
messages = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description},
]
print(count_chat_tokens(messages))
print(count_tokens(cwe_chunk_json))

252024
322
251702


In [16]:
true_pos_few.columns

Index(['cve_id', 'cve_published', 'cve_descriptions', 'cve_metrics',
       'cve_references', 'cve_configurations', 'cve_primary_cwe', 'cve_tags',
       'issue_owner_repo', 'issue_body', 'issue_title', 'issue_comments_url',
       'issue_comments_count', 'issue_created_at', 'issue_updated_at',
       'issue_html_url', 'issue_github_id', 'issue_number', 'label',
       'issue_msg', 'issue_msg_n_tokens', 'issue_embedding',
       '__index_level_0__', 'gpt_description', 'gpt_vulnerability',
       'gpt_confidence', 'gpt_is_relevant'],
      dtype='object')

In [17]:
import json
import os
import pickle
import pandas as pd
from tqdm.notebook import tqdm
from utils import *
import time
from sklearn.metrics import accuracy_score, classification_report


def find_cwe_list(high, low, cwes, max_request_size, cwe_tc_list, msg_tc):
    # Recursive binary search function
    mid = low + (high - low) // 2
    token_count = sum(cwe_tc_list[:mid]) + msg_tc + 11

    if low == high:
        return [], 0, token_count

    if token_count == max_request_size:
        return cwes[:mid], mid, token_count
    if high - low == 1:
        max_tc = token_count + cwe_tc_list[mid]
        if max_tc < max_request_size:
            return cwes[:high], high, max_tc
        return cwes[:mid], mid, token_count

    if token_count < max_request_size:
        return find_cwe_list(high, mid, cwes, max_request_size, cwe_tc_list, msg_tc)
    else:
        return find_cwe_list(mid, low, cwes, max_request_size, cwe_tc_list, msg_tc)


def get_prompt_preamble_tc(desc, prompts_dict, ignore_user=False):
    formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
        cwe_entries=""
    )
    messages = [
        {"role": "system", "content": formatted_system_prompt},
        {"role": "user", "content": ("" if ignore_user else desc)},
    ]
    return count_chat_tokens(messages)


def filter_possible_cwes(cwes):
    possible_cwes = [resp for resp in cwes if resp["gpt_cwe"] != "None"]

    if possible_cwes == []:
        return [min(cwes, key=(lambda x: x["gpt_cwe_confidence"]))] if cwes else [] 

    max_conf = max(possible_cwes, key=(lambda x: x["gpt_cwe_confidence"]))[
        "gpt_cwe_confidence"
    ]
    possible_cwes = [
        resp for resp in possible_cwes if resp["gpt_cwe_confidence"] == max_conf
    ]
    possible_cwes = list({p["gpt_cwe"]: p for p in possible_cwes}.values())
    return possible_cwes


def get_gpt_cwe(cwes, desc, max_request_size, echo=False):
    prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")
    cwe_dict = {cwe["ID"]: cwe for cwe in cwes}
    msg_tc = get_prompt_preamble_tc(desc, prompts_dict)

    while True:
        i = 0
        responses = []
        # print("cwes:\n", len(cwes))
        inner_pbar = tqdm(
            total=len(cwes),
            desc="Processing CWE chunks",
            leave=False,
            disable=not echo,
        )

        while i < len(cwes):
            # print("i:", i)
            cwe_chunk, next_i, _ = find_cwe_list(
                len(cwes[i:]),
                0,
                cwes[i:],
                max_request_size,
                cwe_token_counts[i:],
                msg_tc,
            )
            # print(any(cwe['ID'] == '415' for cwe in cwe_chunk))
            i += next_i

            # response = '{"gpt_cwe":"415","gpt_cwe_confidence":5}'
            # time.sleep(0.5)

            response = gpt_classify(desc, cwe_list_to_json(cwe_chunk))

            response = json.loads(response)
            # print(response)
            responses.append(response)

            inner_pbar.update(next_i)

        inner_pbar.close()

        poss_cwes = filter_possible_cwes(responses)
        if len(poss_cwes) == 1:
            return poss_cwes[0]
        if not poss_cwes: 
            return {"gpt_cwe": '0', 'gpt_cwe_confidence': 5}

        cwes = [cwe_dict[cwe["gpt_cwe"]] for cwe in poss_cwes]


def classify_issues(cwes, data_df, max_request_size, echo=False):
    issue_dict = {
        dat.issue_github_id: ([], dat.gpt_description)
        for (_, dat) in data_df.iterrows()
    }
    cwe_dict = {cwe["ID"]: cwe for cwe in cwes}

    prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")
    msg_tc = get_prompt_preamble_tc("", prompts_dict, True)

    i = 0
    req_number = 0
    chunks = []
    while i < len(cwes):
        # print("i:", i)
        cwe_chunk, next_i, _ = find_cwe_list(
            len(cwes[i:]),
            0,
            cwes[i:],
            max_request_size,
            cwe_token_counts[i:],
            msg_tc,
        )
        i += next_i

        chunks.append(cwe_chunk)
        req_number += 1

    chunk_iter = tqdm(chunks, desc="Processing CWE chunks", disable=not echo)
    for chunk in chunk_iter:
        data_iter = tqdm(
            data_df.iterrows(),
            total=len(data_df),
            desc="Classifying issues",
            leave=False,
            disable=not echo,
        )
        for _, dat in data_iter:
            # response = gpt_classify(dat.gpt_description, cwe_list_to_json(chunk))
            # issue_dict[dat.issue_github_id][0].append(response)

            time.sleep(0.001)
            issue_dict[dat.issue_github_id][0].extend(
                [
                    {"gpt_cwe": "415", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "664", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "666", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "416", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "1341", "gpt_cwe_confidence": 5},
                ]
            )

    data_iter = tqdm(
        issue_dict.items(),
        desc="Re-classifying issues",
        disable=not echo,
        leave=False,
    )
    for key, val in data_iter:
        poss_cwes = filter_possible_cwes(val[0])
        if len(poss_cwes) == 1:
            issue_dict[key] = poss_cwes[0]
        else:
            remaining_cwes = [cwe_dict[cwe["gpt_cwe"]] for cwe in poss_cwes]
            issue_dict[key] = get_gpt_cwe(
                remaining_cwes, val[1], max_request_size, echo
            )

    return issue_dict


def batch_cwe_request(
    cwes, desc, git_issue_id, max_request_size, model="gpt-4o-mini", echo=True
):
    prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")
    msg_tc = get_prompt_preamble_tc(desc, prompts_dict)

    i = 0
    req_number = 0
    requests = []

    inner_pbar = tqdm(
        total=len(cwes),
        desc=f"Gathering CWE requests for issue {git_issue_id}",
        leave=False,
        disable=not echo,
    )
    while i < len(cwes):
        # print("i:", i)
        cwe_chunk, next_i, _ = find_cwe_list(
            len(cwes[i:]),
            0,
            cwes[i:],
            max_request_size,
            cwe_token_counts[i:],
            msg_tc,
        )
        i += next_i

        formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
            cwe_entries=cwe_list_to_json(cwe_chunk)
        )
        request = {
            "custom_id": str(git_issue_id) + "-" + str(req_number),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [
                    {"role": "system", "content": formatted_system_prompt},
                    {"role": "user", "content": desc},
                ],
                "max_tokens": 1000,
            },
        }
        requests.append(request)
        req_number += 1

        inner_pbar.update(next_i)

    inner_pbar.close()

    return requests


def generate_batch_post_files(
    cwes,
    data_df,
    max_request_size,
    post_folder_path="tmp",
    max_size_mb=150,
    output_dir="batches",
    output_prefix="batch",
    echo=True,
):
    output_dir = os.path.join(post_folder_path, output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # requests = []
    batch_info_dict = {}

    max_size_bytes = max_size_mb * 1024 * 1024
    part_num = 1
    current_size = 0

    output_path = os.path.join(output_dir, f"{output_prefix}_{part_num}.jsonl")
    output_file = open(output_path, "w", encoding="utf-8")

    data_iter = tqdm(
        data_df.iterrows(),
        total=len(data_df),
        desc="Writing batch files",
        disable=not echo,
    )
    for _, row in data_iter:
        requests = batch_cwe_request(
            cwes, row.gpt_description, row.issue_github_id, max_request_size
        )
        req_iter = tqdm(
            requests,
            desc=f"Writing issue {row.issue_github_id} batch files",
            leave=False,
            disable=not echo,
        )
        for req in req_iter:
            req_line = json.dumps(req) + "\n"
            line_size = len(req_line.encode("utf-8"))
            if current_size + line_size > max_size_bytes:
                output_file.close()
                part_num += 1
                output_path = os.path.join(
                    output_dir, f"{output_prefix}_{part_num}.jsonl"
                )
                output_file = open(output_path, "w", encoding="utf-8")
                current_size = 0
            output_file.write(req_line)
            current_size += line_size
        del requests

    output_file.close()
    batch_info_dict["batch_count"] = part_num
    batch_info_dict["batch_prefix"] = output_prefix
    batch_info_dict["batch_size"] = max_size_mb
    batch_info_dict["batch_issues"] = data_df["issue_github_id"].tolist()
    batch_info_dict["batch_dict"] = {}

    pickle_file = os.path.join(output_dir, "batch_info.pkl")
    with open(pickle_file, "wb") as file:
        pickle.dump(batch_info_dict, file)


def post_batches(post_folder_path="tmp", batches_dir="batches", safe=False):
    batches_dir = os.path.join(post_folder_path, batches_dir)
    pickle_file = os.path.join(batches_dir, "batch_info.pkl")
    with open(pickle_file, "rb") as file:
        batch_info_dict = pickle.load(file)

    if safe:
        pickle_backup = os.path.join(batches_dir, "batch_info_backup.pkl")
        with open(pickle_backup, "wb") as file:
            pickle.dump(batch_info_dict, file)
        print("Backup of previous batch info successfully saved.")

    # files = []
    # batches = {}
    for i in tqdm(
        range(1, batch_info_dict["batch_count"] + 1),
        desc=f"Posting batches in {batches_dir}",
    ):
        batch_name = f"{batch_info_dict['batch_prefix']}_{i}"
        if (
            batch_name in batch_info_dict["batch_dict"]
            and batch_info_dict["batch_dict"][batch_name].status == "completed"
        ):
            print(
                "Batch:",
                batch_name,
                "id:",
                batch_info_dict["batch_dict"][batch_name].id,
                "already completed, skipping.",
            )
            continue
        path = os.path.join(batches_dir, batch_name + ".jsonl")
        file = client.files.create(file=open(path, "rb"), purpose="batch")
        # files.append(file)
        batch = client.batches.create(
            input_file_id=file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"batch_name": batch_name},
        )
        batch_info_dict["batch_dict"][batch_name] = batch
        print("Batch:", batch_name, "id:", batch.id, "successfully posted.")

    # batch_info_dict["batch_ids"] = list(zip(batches, ["in_progress" for _ in range(len(batches))]))
    with open(pickle_file, "wb") as file:
        pickle.dump(batch_info_dict, file)

    print("All new batch info successfully saved.")


def retrieve_batch_res(
    post_folder_path="tmp", batches_dir="batches", output_dir="responses", safe=False
):
    batches_dir = os.path.join(post_folder_path, batches_dir)
    output_dir = os.path.join(batches_dir, output_dir)

    pickle_file = os.path.join(batches_dir, "batch_info.pkl")
    with open(pickle_file, "rb") as file:
        batch_info_dict = pickle.load(file)

    if safe:
        pickle_backup = os.path.join(batches_dir, "batch_info_backup.pkl")
        with open(pickle_backup, "wb") as file:
            pickle.dump(batch_info_dict, file)
        print("Backup of previous batch info successfully saved.")

    for batch in tqdm(
        batch_info_dict["batch_dict"].values(),
        desc=f"Retrieving results from batches in {batches_dir}",
    ):
        status = batch.status
        if status == "completed":
            continue
        batch = client.batches.retrieve(batch.id)
        status = batch.status
        if status == "completed":
            file_response = client.files.content(batch.output_file_id)
            output_file = os.path.join(
                output_dir, batch.metadata["batch_name"] + "_response.jsonl"
            )

            if not os.path.exists(output_dir):
                os.makedirs(output_dir, exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(file_response.text)
            print(
                "batch,",
                batch.metadata["batch_name"],
                "successfully completed, retrieved and saved in",
                output_dir,
            )
        else:
            print("batch,", batch.metadata["batch_name"], "status:\t", status)
            # Pretty print errors if available
            if hasattr(batch, "errors") and hasattr(batch.errors, "data"):
                print("\nErrors:")
                for error in batch.errors.data:
                    print(
                        f"- Line {error.line}: [{error.code}] {error.message} (param: {error.param})"
                    )
            else:
                print("No detailed errors found.")

        batch_info_dict["batch_dict"][batch.metadata["batch_name"]] = batch

    with open(pickle_file, "wb") as file:
        pickle.dump(batch_info_dict, file)

    print("All new batch info successfully saved.")

    return batch_info_dict


def retrieve_req_res(
    data_df,
    post_folder_path="tmp",
    batches_dir="batches",
    output_dir="responses",
    clean=False,
):
    batches_dir = os.path.join(post_folder_path, batches_dir)
    output_dir = os.path.join(batches_dir, output_dir)

    pickle_file = os.path.join(batches_dir, "batch_info.pkl")
    with open(pickle_file, "rb") as file:
        batch_info_dict = pickle.load(file)

    res_dict = {}
    for i in tqdm(
        range(1, batch_info_dict["batch_count"] + 1),
        desc=f"Retrieving gpt responses from {output_dir}",
    ):
        path = os.path.join(
            output_dir, f"{batch_info_dict['batch_prefix']}_{i}_response.jsonl"
        )
        data = []
        with open(path, "r") as f:
            for line in f:
                data.append(json.loads(line))

        for d in data:
            git_id = d["custom_id"].split("-")[0]
            resp = d["response"]["body"]["choices"][0]["message"]["content"]

            first_brace = resp.find("{")
            last_brace = resp.rfind("}")

            if first_brace == -1 or last_brace == -1:
                raise ValueError("No valid JSON found in the string")

            try:
                json_resp = json.loads(resp[first_brace : last_brace + 1])
            except:
                print(
                    f"""Something went wrong when doing `json.loads`, skipping.\n
                    GPT-JSON-response-string:\n{resp[first_brace : last_brace + 1]}"""
                )
                continue

            if clean:
                json_resp["gpt_cwe_confidence"] = json_resp.pop("gpt_confidence")

            if git_id not in res_dict:
                res_dict[git_id] = [json_resp]
            else:
                res_dict[git_id].append(json_resp)

    res_dict = {int(key): filter_possible_cwes(res) for key, res in res_dict.items()}
    res_df = pd.DataFrame(
        {"issue_github_id": res_dict.keys(), "cwe_candidates": res_dict.values()}
    )
    ret_df = pd.merge(data_df, res_df, on="issue_github_id")

    return ret_df

    # get_gpt_cwe


def get_final_res(
    data_df, cwes, max_retries=10, delay=1, file_path="tmp/gpt_response_few_cwe_df.pkl"
):
    cwe_dict = {cwe["ID"]: cwe for cwe in cwes}
    ret_df = data_df.copy()
    if os.path.exists(file_path):
        with open(file_path, "rb") as file:
            ret_df = pickle.load(file)
        print("Pickle file loaded successfully!")
    else:
        print(f"The file at {file_path} does not exist. Setting gpt_response to 'None'")
        ret_df["gpt_response"] = None
    retries = 0
    # test_df['gpt_response'] = None
    while (not ret_df[ret_df.gpt_response.isna()].empty) and retries < max_retries:
        # Get indices of rows needing processing
        na_indices = ret_df[
            ret_df.gpt_response.isna() & ~ret_df.cwe_candidates.isna()
        ].index

        if len(na_indices) == 0:
            break

        # Process ONLY those rows and assign directly to original DF
        ret_df.loc[na_indices, "gpt_response"] = ret_df.loc[
            na_indices, ["gpt_description", "cwe_candidates"]
        ].parallel_apply(
            lambda x: get_gpt_cwe(
                [
                    cwe_dict[cwe["gpt_cwe"]]
                    for cwe in x.cwe_candidates
                    if cwe["gpt_cwe"] in cwe_dict
                ],
                x.gpt_description,
                100000,
            ),
            axis=1,
        )

        with open(file_path, "wb") as file:  # 'wb' mode writes in binary format
            pickle.dump(ret_df, file)
        retries += 1
        print(f"Retry {retries}: Processed {len(na_indices)} rows")
        time.sleep(delay)

    return ret_df


def cancel_batches(post_folder_path="tmp", batches_dir="batches"):
    batches_dir = os.path.join(post_folder_path, batches_dir)

    pickle_file = os.path.join(batches_dir, "batch_info.pkl")
    with open(pickle_file, "rb") as file:
        batch_info_dict = pickle.load(file)
    for name, batch in batch_info_dict["batch_dict"].items():
        client.batches.cancel(batch.id)
        print("Batch:", name, "id:", batch.id, "canceled.")


def evaluate_baseline(df, col):
    y_test = df.cve_primary_cwe
    y_pred = df[col]

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # print(len(df[df.cve_primary_cwe == df[col]]) / len(df))

    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

In [18]:
type(true_pos_few["issue_github_id"].tolist())

list

In [21]:
rag_res = load_dataset("Eathus/github-issues-vul-label-rag-results", split="test")
rag_res_df = rag_res.to_pandas()

README.md:   0%|          | 0.00/7.43k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/6.50M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/291 [00:00<?, ? examples/s]

In [27]:
rag_res_df["cwe_candidates"] = rag_res_df.rag_candidates.map(lambda x: [{'gpt_cwe': str(cwe), 'gpt_cwe_confidence': 5} for cwe in x])

In [29]:
pandarallel.initialize(progress_bar=True, nb_workers=15)

final_df = get_final_res(rag_res_df, cwes, file_path="tmp/rag_predictions_df.pkl")

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The file at tmp/rag_predictions_df.pkl does not exist. Setting gpt_response to 'None'


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20), Label(value='0 / 20'))), HBox…

Retry 1: Processed 291 rows


In [30]:
test_df = final_df.copy()
tmp = pd.json_normalize(test_df["gpt_response"])
test_df = pd.concat([test_df.drop(columns=["gpt_response"]), tmp], axis=1)
evaluate_baseline(test_df, "gpt_cwe")

Accuracy: 0.5051546391752577
Classification Report:
              precision    recall  f1-score   support

        1050       0.00      0.00      0.00         1
         116       0.00      0.00      0.00         1
        1176       0.00      0.00      0.00         0
         119       0.00      0.00      0.00         7
         120       0.44      0.29      0.35        14
         121       0.00      0.00      0.00         0
         122       0.03      1.00      0.05         1
         125       0.75      0.18      0.29        17
        1333       0.50      1.00      0.67         1
        1335       0.00      0.00      0.00         0
        1392       0.00      0.00      0.00         0
         150       1.00      1.00      1.00         1
         190       0.75      0.75      0.75         4
          20       0.25      0.33      0.29         3
         200       0.00      0.00      0.00         1
         212       0.00      0.00      0.00         1
          22       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Entire Baseline

In [None]:
generate_batch_post_files(cwes, true_pos_few, 100000, max_size_mb=100)
# classify_issues(cwes, true_pos_few, 100000, True)

In [44]:
post_batches()

Posting batches in tmp/batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batch: batch_1 id: batch_684bf022756c81908e2ee98bfdb30552 successfully posted.
Batch: batch_2 id: batch_684bf0c68f3081908fb3d86d0384ac22 successfully posted.
Batch: batch_3 id: batch_684bf1599e288190a7dbb5f8b2364976 successfully posted.
Batch: batch_4 id: batch_684bf1ea7d108190973fc1b25d029a28 successfully posted.
Batch: batch_5 id: batch_684bf1fd5e7081909237ae343bd66515 successfully posted.
Batch: batch_6 id: batch_684bf20fc6648190b81d8a73fcd0e6ec successfully posted.
Batch: batch_7 id: batch_684bf23e43ec8190acc0d57ffb657e93 successfully posted.
Batch: batch_8 id: batch_684bf2500cfc819083f0f531c077b366 successfully posted.
Batch: batch_9 id: batch_684bf2600b08819088d2b85e3a8726db successfully posted.
Batch: batch_10 id: batch_684bf279085c8190a0a94d58fbbc0aeb successfully posted.
Batch: batch_11 id: batch_684bf28b8f748190b9ef9470b21fc985 successfully posted.
Batch: batch_12 id: batch_684bf3567cc481909ad31cc17aa00855 successfully posted.
Batch: batch_13 id: batch_684bf3f1d69c8190bcaf29c

In [51]:
batch_info = retrieve_batch_res()

Retrieving results from batches in tmp/batches:   0%|          | 0/40 [00:00<?, ?it/s]

batch, batch_37 successfully completed, retrieved and saved in tmp/batches/responses
All new batch info successfully saved.


In [18]:
df = retrieve_req_res(true_pos_few)

Retrieving gpt responses from tmp/batches/responses:   0%|          | 0/40 [00:00<?, ?it/s]

Something went wrong when doing `json.loads`, skipping.

                    GPT-JSON-response-string:
{
    "gpt_cwe": "';
    "gpt_cwe_confidence": 5
}


In [None]:
df.cwe_candidates

Index(['cve_id', 'cve_published', 'cve_descriptions', 'cve_metrics',
       'cve_references', 'cve_configurations', 'cve_primary_cwe', 'cve_tags',
       'issue_owner_repo', 'issue_body', 'issue_title', 'issue_comments_url',
       'issue_comments_count', 'issue_created_at', 'issue_updated_at',
       'issue_html_url', 'issue_github_id', 'issue_number', 'label',
       'issue_msg', 'issue_msg_n_tokens', 'issue_embedding',
       '__index_level_0__', 'gpt_description', 'gpt_vulnerability',
       'gpt_confidence', 'gpt_is_relevant', 'cwe_candidates'],
      dtype='object')

In [76]:
pandarallel.initialize(progress_bar=True, nb_workers=15)

final_df = get_final_res(df, cwes, file_path="tmp/baseline_predictions_df.pkl")

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Pickle file loaded successfully!


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))),))

Retry 1: Processed 1 rows


In [73]:
final_df.gpt_response.iloc[0]['gpt_cwe']

'415'

In [74]:
test_df.gpt_cwe.iloc[0]

'415'

In [77]:
test_df = final_df.copy()
tmp = pd.json_normalize(test_df["gpt_response"])
test_df = pd.concat([test_df.drop(columns=["gpt_response"]), tmp], axis=1)
evaluate_baseline(test_df, "gpt_cwe")

Accuracy: 0.4639175257731959
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
        1050       0.00      0.00      0.00         1
         116       0.00      0.00      0.00         1
         117       0.00      0.00      0.00         0
         119       0.00      0.00      0.00         7
         120       0.60      0.43      0.50        14
         121       0.00      0.00      0.00         0
         122       0.02      1.00      0.05         1
         125       0.67      0.12      0.20        17
        1284       0.00      0.00      0.00         0
         129       0.00      0.00      0.00         0
        1333       1.00      1.00      1.00         1
        1392       0.00      0.00      0.00         0
         150       0.00      0.00      0.00         1
         190       0.60      0.75      0.67         4
          20       0.14      0.33      0.20         3
         200       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Tests

### Normal Test

In [None]:
generate_batch_post_files(
    cwes, true_pos_few[0:30], 100000, max_size_mb=100, output_dir="test_batches"
)
# classify_issues(cwes, true_pos_few, 100000, True)

In [29]:
post_batches(batches_dir="test_batches")

Batch: batch_1 id: batch_684b4f0014248190952540fdd22ae275 successfully posted.
Batch: batch_2 id: batch_684b4f11a3e08190b2a0ed470e2f2c77 successfully posted.
Batch: batch_3 id: batch_684b4f22adfc819088af7b850ec626f7 successfully posted.
Batch: batch_4 id: batch_684b4f31a2cc8190b4e7b42ca821e4ea successfully posted.
Batch: batch_5 id: batch_684b4f339d94819092704b543873dc63 successfully posted.
All batch info successfully saved.


In [18]:
batch_info = retrieve_batch_res(batches_dir="test_batches")
print("")

Retrieving results from batches in tmp/test_batches:   0%|          | 0/5 [00:00<?, ?it/s]

All new batch info successfully saved.



In [19]:
df = retrieve_req_res(true_pos_few[0:30], batches_dir="test_batches")

Retrieving gpt responses from tmp/test_batches/responses:   0%|          | 0/5 [00:00<?, ?it/s]

In [20]:
pandarallel.initialize(progress_bar=True, nb_workers=15)

final_df = get_final_res(df, cwes, file_path="tmp/baseline_test_predictions_df.pkl")

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The file at tmp/baseline_test_predictions_df.pkl does not exist. Setting gpt_response to 'None'


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

Retry 1: Processed 30 rows


In [70]:
final_df.gpt_response

0       {'gpt_cwe': '415', 'gpt_cwe_confidence': 5}
1      {'gpt_cwe': '1392', 'gpt_cwe_confidence': 5}
2       {'gpt_cwe': '122', 'gpt_cwe_confidence': 5}
3        {'gpt_cwe': '89', 'gpt_cwe_confidence': 5}
4        {'gpt_cwe': '95', 'gpt_cwe_confidence': 5}
                           ...                     
286     {'gpt_cwe': '122', 'gpt_cwe_confidence': 5}
287     {'gpt_cwe': '404', 'gpt_cwe_confidence': 4}
288      {'gpt_cwe': '20', 'gpt_cwe_confidence': 5}
289     {'gpt_cwe': '863', 'gpt_cwe_confidence': 5}
290      {'gpt_cwe': '79', 'gpt_cwe_confidence': 5}
Name: gpt_response, Length: 291, dtype: object

In [21]:
test_df = final_df.copy()
tmp = pd.json_normalize(test_df["gpt_response"])
test_df = pd.concat([test_df.drop(columns=["gpt_response"]), tmp], axis=1)

In [22]:
evaluate_baseline(test_df, "gpt_cwe")

Accuracy: 0.43333333333333335
Classification Report:
              precision    recall  f1-score   support

         119       0.00      0.00      0.00         1
         120       0.00      0.00      0.00         0
         121       0.00      0.00      0.00         0
         122       0.00      0.00      0.00         0
         125       0.00      0.00      0.00         1
        1392       0.00      0.00      0.00         0
         190       1.00      1.00      1.00         2
          20       0.00      0.00      0.00         0
         200       0.00      0.00      0.00         0
         400       0.00      0.00      0.00         1
         401       1.00      1.00      1.00         2
         415       1.00      1.00      1.00         1
         416       1.00      0.50      0.67         2
         476       0.67      1.00      0.80         4
         532       1.00      1.00      1.00         1
         617       0.00      0.00      0.00         2
          74       0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
batches_dir = os.path.join("tmp", "test_batches")
pickle_file = os.path.join(batches_dir, "batch_info.pkl")
with open(pickle_file, "rb") as file:
    batch_info_dict = pickle.load(file)

print("")

### Small test

In [None]:
generate_batch_post_files(
    cwes, true_pos_few[0:3], 100000, max_size_mb=190, output_dir="test_batches_small"
)

In [49]:
post_batches(batches_dir="test_batches_small")

In [19]:
retrieve_batch_res(batches_dir="test_batches_small")

batch, batch_1 successfully completed, retrieved and saved


{'batch_count': 1,
 'batch_prefix': 'batch',
 'batch_size': 190,
 'batch_issues': [670848928, 2581924154, 1110212655],
 'batch_dict': {'batch_1': Batch(id='batch_6847eadb3f888190a12fdebbbd98681a', completion_window='24h', created_at=1749543643, endpoint='/v1/chat/completions', input_file_id='file-1Qx3fGTEijL7sVs3MMrwYS', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1749596983, error_file_id=None, errors=None, expired_at=None, expires_at=1749630043, failed_at=None, finalizing_at=1749596973, in_progress_at=1749543646, metadata={'batch_name': 'batch_1'}, output_file_id='file-3jVrRWHAGM66fk1ZbEmBHr', request_counts=BatchRequestCounts(completed=75, failed=0, total=75))}}

In [22]:
df = retrieve_req_res(true_pos_few[0:3], batches_dir="test_batches_small")

In [19]:
display(df.cwe_candidates)

0    [{'gpt_cwe': '399', 'gpt_cwe_confidence': 5}, ...
1    [{'gpt_cwe': '284', 'gpt_cwe_confidence': 5}, ...
2    [{'gpt_cwe': '122', 'gpt_cwe_confidence': 5}, ...
3    [{'gpt_cwe': '89', 'gpt_cwe_confidence': 5}, {...
4    [{'gpt_cwe': '787', 'gpt_cwe_confidence': 5}, ...
Name: cwe_candidates, dtype: object

In [24]:
pandarallel.initialize(progress_bar=True, nb_workers=15)

final_df = get_final_res(
    df, cwes, file_path="tmp/baseline_small_test_predictions_df.pkl"
)

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The file at tmp/baseline_small_test_predictions_df.pkl does not exist. Setting gpt_response to 'None'


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

Retry 1: Processed 3 rows


In [25]:
test_df = final_df.copy()
tmp = pd.json_normalize(test_df["gpt_response"])
test_df = pd.concat([test_df.drop(columns=["gpt_response"]), tmp], axis=1)
# test_df.columns

In [26]:
evaluate_baseline(test_df, "gpt_cwe")

Accuracy: 0.3333333333333333
Classification Report:
              precision    recall  f1-score   support

         122       0.00      0.00      0.00         0
        1392       0.00      0.00      0.00         0
         415       1.00      1.00      1.00         1
         416       0.00      0.00      0.00         1
         798       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.20      0.20      0.20         3
weighted avg       0.33      0.33      0.33         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Backup

In [None]:
cancel_batches()

In [67]:
client.batches.cancel("batch_6846d63671f481909f1dfe6860ba11e7")

Batch(id='batch_6846d63671f481909f1dfe6860ba11e7', completion_window='24h', created_at=1749472822, endpoint='/v1/chat/completions', input_file_id='file-Jgh11BRXqDzafZfXvsoDUt', object='batch', status='cancelling', cancelled_at=None, cancelling_at=1749499802, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1749559222, failed_at=None, finalizing_at=None, in_progress_at=1749472825, metadata={'batch_name': 'batch_1'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=125))

In [85]:
json_string = """
{\n
    "name": "John Doe",
    "age": 25,
    "is_student": false,
    "courses": ["Mathematics", "Computer Science", "Physics"],
    "address": {
        "street": "123 Main St",
        "city": "Stockholm",
        "postal_code": "12345"
    }\n
}
"""
json.loads(json_string)

{'name': 'John Doe',
 'age': 25,
 'is_student': False,
 'courses': ['Mathematics', 'Computer Science', 'Physics'],
 'address': {'street': '123 Main St',
  'city': 'Stockholm',
  'postal_code': '12345'}}

## Miscellaneous

In [None]:
{
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-3.5-turbo-0125",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello world!"},
        ],
        "max_tokens": 1000,
    },
}
{
    "custom_id": "request-2",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-3.5-turbo-0125",
        "messages": [
            {"role": "system", "content": "You are an unhelpful assistant."},
            {"role": "user", "content": "Hello world!"},
        ],
        "max_tokens": 1000,
    },
}

In [None]:
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(cwe_entries="")
messages = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": ""},
]
msg_tc = count_chat_tokens(messages)
cwe_chunk, end_ind, size = find_cwe_list(
    len(cwes[186:]), 0, cwes[186:], 100000, cwe_token_counts[186:], msg_tc
)

In [None]:
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
    cwe_entries=cwe_list_to_json(cwe_chunk)
)
msgs = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description},
]
print("end_ind:\t", end_ind)
print("true size:\t", count_chat_tokens(msgs))
print("size:\t", size)
print("len(cwe_chunk):\t", len(cwe_chunk))
print("cwe_chunk:\t", cwe_list_to_json(cwe_chunk))
# display(cwe_list_to_json(cwe_chunk, 4))

formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
    cwe_entries=cwe_list_to_json(cwe_chunk)
)
display(formatted_system_prompt)

In [None]:
len(cwes)

In [None]:
maxr_size = 100000
resp = get_gpt_cwe(cwes, true_pos_few.iloc[0].gpt_description, maxr_size, echo=True)

In [None]:
print(type(resp))
print(resp)