In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable (the original
# note names that directory "Thesis-Edvin"). Presumably this is what lets the
# later `from utils import *` cells resolve -- TODO confirm.
sys.path.append(str(Path.cwd().parent))

In [None]:
# Scratch check: open the cached CWE output file and print the file
# descriptor number of the handle. The file contents are not read here.
with open("tmp/cwe_output.json", "r") as file:
    file_id = file.fileno()
    print(file_id)

In [None]:
import pandas as pd
import json

# Load the JSON export and keep its "Weaknesses" list.
with open("tmp/view_CWE-1000_all_weaknesses.json", "r") as file:
    data = json.load(file)

cwes = data["Weaknesses"]
print(len(cwes))  # number of entries before filtering
# Drop every entry whose mapping notes mark its usage as "Prohibited".
cwes = [w for w in cwes if w["MappingNotes"]["Usage"] != "Prohibited"]
print(len(cwes))  # number of entries after filtering

In [None]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from utils import *


# Structured reply requested from the model for every classification call.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- a docstring may be picked up as the schema description that is
# sent to the model; confirm before adding one.
class ReplySchema(BaseModel):
    # Best-fitting CWE-ID as a string, or the literal text "None" when no
    # entry fits (downstream code compares against the string "None").
    gpt_cwe: str = Field(
        description="The CWE-ID (number) of the CWE entry that best fits the vulnerability description if any; otherwise, write None"
    )
    # Self-reported confidence on a 1-5 scale.
    gpt_cwe_confidence: int = Field(
        description="An integer from 1 to 5 indicating your level of confidence  (1 = very low, 2 = low, 3 = medium, 4 = high, 5 = very high)."
    )


# Chat model used for CWE classification.
# The identifier is "gpt-4o-mini" (letter "o"). The previous value
# "gpt-40-mini" (digit zero) is not an OpenAI model name, so every request
# built from this client would be rejected by the API.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
    api_key=OPENAI_API_KEY_KTH,  # <- this overrides the default
)  # maybe set max_token to 14000

# Load the prompt templates from ../utils/prompts relative to the current
# working directory. `load_prompts` is not defined in this notebook;
# presumably it returns a mapping from prompt name to template text -- it is
# indexed by "baseline_system_setup" below.
prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")

# Two-message chat prompt: the system template (formatted elsewhere in this
# notebook with a `cwe_entries` value) followed by a human message holding
# the vulnerability description (`desc`).
prompt = ChatPromptTemplate.from_messages(
    [("system", prompts_dict["baseline_system_setup"]), ("human", "{desc}")],
)


def parser(message: ReplySchema):
    """Serialize a structured model reply into its JSON string form."""
    as_json = message.model_dump_json()
    return as_json


# Rebind `llm` so that replies are produced against the ReplySchema
# structure, then pipe prompt -> model -> parser. `parser` returns the
# reply's JSON dump, so that is what the end of the chain yields.
llm = llm.with_structured_output(ReplySchema)
chain = prompt | llm | parser

In [None]:
from utils import *
from pandarallel import pandarallel
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from openai import OpenAIError, RateLimitError  # Explicitly import errors


@retry(
    retry=retry_if_exception_type(RateLimitError),  # only rate-limit errors are retried
    wait=wait_exponential(multiplier=2, min=1, max=60),  # exponential backoff between attempts
    stop=stop_after_attempt(5),  # give up after five attempts
)
def _gpt_classify(desc, cwe_entries):
    """Run the classification chain for one description / CWE-list pair.

    Returns None without calling the chain when either argument is empty
    or not a string; otherwise returns whatever ``chain.invoke`` returns.
    """
    inputs_usable = bool(
        desc
        and isinstance(desc, str)
        and cwe_entries
        and isinstance(cwe_entries, str)
    )
    if not inputs_usable:
        return None

    payload = {
        "cwe_entries": cwe_entries,
        "desc": desc,
    }
    return chain.invoke(payload)


def gpt_classify(desc, cwe_entries):
    """Best-effort wrapper around ``_gpt_classify``.

    Any exception is reported on stdout and turned into a ``None`` result
    instead of propagating to the caller.
    """
    outcome = None
    try:
        outcome = _gpt_classify(desc, cwe_entries)
    except OpenAIError as err:  # any OpenAI-specific failure
        print(f"OpenAI API error: {err}")
    except Exception as err:  # everything else
        print(f"General error processing message: {err}")
    return outcome

In [None]:
def cwe_list_to_json(cwes, indent=4):
    """Wrap a list of CWE entries under a "Weaknesses" key and dump it as JSON text."""
    wrapped = {"Weaknesses": cwes}
    return json.dumps(wrapped, indent=indent)

In [None]:
import tiktoken


def count_tokens(text: str, model: str = "gpt-4", echo=False) -> int:
    """Return the number of tiktoken tokens in ``text``.

    The encoding is looked up by ``model``; if tiktoken does not know the
    model (KeyError), a notice is printed and ``cl100k_base`` is used.
    With ``echo`` set, a confirmation line is printed after a successful lookup.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("invalid input model:", model + ".", "Defaulting to cl100k_base")
        tokenizer = tiktoken.get_encoding("cl100k_base")
    else:
        if echo:
            print("Finished encoding using model:", model)

    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [None]:
import statistics as stat

# Per-entry token count of each CWE serialized with indent=4, plus 2.
# NOTE(review): the reason for the extra 2 tokens per entry is not shown
# here -- presumably an allowance for the separators between list entries
# once they are packaged together; confirm.
# This list is index-aligned with `cwes` and is read as a global by the
# chunking helpers defined later in the notebook.
cwe_token_counts = [count_tokens(json.dumps(cwe, indent=4)) + 2 for cwe in cwes]
print(cwe_token_counts)
print("Mean token count:\t\t", stat.mean(cwe_token_counts))
print("Median token count:\t\t", stat.median(cwe_token_counts))
print("Max token count:\t\t", max(cwe_token_counts))
print("Min token count:\t\t", min(cwe_token_counts))
print("Total token count:\t\t", sum(cwe_token_counts))
# Token count of all entries serialized as one document, for comparison
# with the sum of the per-entry estimates above.
print("Total packaged token count:\t", count_tokens(cwe_list_to_json(cwes)))

In [None]:
# Serialize an arbitrary slice of 80 entries (indices 130-209) and show its
# token count; `cwe_chunk_json` is reused by a later prompt-size cell.
cwe_chunk_json = cwe_list_to_json(cwes[130:210])
count_tokens(cwe_chunk_json)

In [None]:
from datasets import load_dataset

# Load the "test" split of the results dataset and convert it to a DataFrame.
test_few = load_dataset(
    "Eathus/github-issues-vul-detection-gpt-few-vul-desc-results", split="test"
)
test_few_df = test_few.to_pandas()
print("all cves:\t\t\t", len(test_few_df))
# keep=False flags every member of a duplicate group, so an issue id that
# occurs more than once is removed entirely rather than reduced to one row.
test_few_df = test_few_df[~test_few_df.duplicated(subset="issue_github_id", keep=False)]
print("non duplicate issues cves:\t", len(test_few_df))

In [None]:
# Rows flagged as relevant that have a cve_id.
true_pos_few = test_few_df[test_few_df.gpt_is_relevant & ~test_few_df.cve_id.isna()]
# Rows flagged as relevant that have no cve_id.
false_pos_few = test_few_df[test_few_df.gpt_is_relevant & test_few_df.cve_id.isna()]
# Every row flagged as relevant (union of the two frames above).
all_true_few = test_few_df[test_few_df.gpt_is_relevant]

print("true pssetive count:\t", len(true_pos_few))
print("false posetive count:\t", len(false_pos_few))
print("TP + FP count:\t\t", len(all_true_few))

In [None]:
# Row counts before and after de-duplicating on the issue id alone and on
# the (issue id, cve id) pair, for the full frame ...
print(len(test_few_df))
print(len(test_few_df.drop_duplicates(subset="issue_github_id")))
print(len(test_few_df.drop_duplicates(subset=["issue_github_id", "cve_id"])))

# ... and for the true positives.
print(len(true_pos_few))
print(len(true_pos_few.drop_duplicates(subset="issue_github_id")))
print(len(true_pos_few.drop_duplicates(subset=["issue_github_id", "cve_id"])))

# All rows that share an issue id with another row. If the loading cell that
# drops duplicated issue ids with keep=False has already run, this frame is
# expected to be empty.
dupes = test_few_df[
    test_few_df.duplicated(subset="issue_github_id", keep=False)
].sort_values("issue_github_id")
print(len(dupes))
display(dupes[["cve_id", "issue_github_id", "issue_number", "cve_primary_cwe"]])

In [None]:
# Show the generated description and the recorded primary CWE of the first
# true-positive row.
print(true_pos_few.iloc[0].gpt_description)
print(true_pos_few.iloc[0].cve_primary_cwe)

In [None]:
import tiktoken


def count_chat_tokens(messages, model="gpt-4", echo=False):
    """Estimate the token count of a list of chat messages.

    Each message is a dict of string values. Every message costs a fixed
    3 tokens plus the encoded length of each of its values; a "name" key
    adds 1 more. A further 3 tokens are added once for reply priming.
    Falls back to ``cl100k_base`` (with a printed notice) when tiktoken
    does not know ``model``.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("invalid input model:", model + ".", "Defaulting to cl100k_base")
        tokenizer = tiktoken.get_encoding("cl100k_base")
    else:
        if echo:
            print("Finished encoding using model:", model)

    per_message_overhead = 3
    name_overhead = 1
    priming_overhead = 3

    running_total = 0
    for message in messages:
        running_total += per_message_overhead
        running_total += sum(len(tokenizer.encode(text)) for text in message.values())
        if "name" in message:
            running_total += name_overhead
    return running_total + priming_overhead

In [None]:
# Size of a complete request: system prompt filled with the sample CWE
# chunk, plus the first true-positive description as the user message.
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
    cwe_entries=cwe_chunk_json
)
messages = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description},
]

print(count_chat_tokens(messages))
# Same request with an empty CWE list, i.e. the fixed part of the prompt.
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(cwe_entries="")
messages = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description},
]
print(count_chat_tokens(messages))
# Token count of the CWE chunk on its own, for comparison with the
# difference between the two totals above.
print(count_tokens(cwe_chunk_json))

In [None]:
# List the columns available on the true-positive frame.
true_pos_few.columns

In [None]:
import json
import os
import pickle
import pandas as pd
from tqdm.notebook import tqdm
from utils import *
import time


def find_cwe_list(high, low, cwes, max_request_size, cwe_tc_list, msg_tc):
    """Binary-search the longest prefix of ``cwes`` that fits a token budget.

    The cost of a prefix of length ``k`` is
    ``sum(cwe_tc_list[:k]) + msg_tc + 11``; a prefix fits when its cost is
    at most ``max_request_size``.

    Args:
        high: Upper bound (inclusive) on the prefix length to consider.
        low: Lower bound on the prefix length; this length is never checked
            against the budget and is what is returned when nothing longer fits.
        cwes: CWE entries to take the prefix from.
        max_request_size: Token budget for one request.
        cwe_tc_list: Token count per entry, index-aligned with ``cwes``.
        msg_tc: Token count of the fixed part of the prompt.

    Returns:
        ``(prefix, prefix_length, cost_of_prefix)``. When ``low == high`` the
        result is ``([], 0, cost)``. A returned length of 0 means that not
        even the first entry fits; callers that advance by the returned
        length must treat that as an error, otherwise they will not progress.
    """
    # NOTE(review): the derivation of the constant 11 is not shown in this
    # notebook -- presumably the overhead of the surrounding JSON wrapper; confirm.
    fixed_cost = msg_tc + 11

    def prefix_cost(length):
        return sum(cwe_tc_list[:length]) + fixed_cost

    if low == high:
        return [], 0, prefix_cost(low)

    # Narrow [low, high] until the two bounds are adjacent.
    while high - low > 1:
        mid = low + (high - low) // 2
        token_count = prefix_cost(mid)
        if token_count == max_request_size:
            return cwes[:mid], mid, token_count
        if token_count < max_request_size:
            low = mid
        else:
            high = mid

    # Adjacent bounds: prefer the longer prefix. "<=" (not "<") so that a
    # prefix that exactly fills the budget is accepted here just as it is
    # when the search happens to land on it at a midpoint.
    upper_cost = prefix_cost(high)
    if upper_cost <= max_request_size:
        return cwes[:high], high, upper_cost
    return cwes[:low], low, prefix_cost(low)


def get_prompt_preamble_tc(desc, prompts_dict, ignore_user=False):
    """Token count of the chat request when the CWE list is left empty.

    The system message is the "baseline_system_setup" template formatted
    with an empty ``cwe_entries``; the user message is ``desc``, or an empty
    string when ``ignore_user`` is true.
    """
    user_content = "" if ignore_user else desc
    system_content = prompts_dict["baseline_system_setup"].format(cwe_entries="")
    return count_chat_tokens(
        [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content},
        ]
    )


def filter_possible_cwes(cwes):
    """Reduce a list of model replies to the top-confidence candidates.

    Replies whose ``gpt_cwe`` is the string "None" are ignored. If nothing
    remains, the single lowest-confidence reply of the input is returned in
    a list. Otherwise the replies sharing the highest confidence are
    returned, de-duplicated by ``gpt_cwe`` (the last reply for an id wins,
    ordered by the id's first appearance).
    """
    candidates = [reply for reply in cwes if reply["gpt_cwe"] != "None"]

    if not candidates:
        return [min(cwes, key=lambda reply: reply["gpt_cwe_confidence"])]

    top_confidence = max(reply["gpt_cwe_confidence"] for reply in candidates)

    unique_by_id = {}
    for reply in candidates:
        if reply["gpt_cwe_confidence"] == top_confidence:
            unique_by_id[reply["gpt_cwe"]] = reply
    return list(unique_by_id.values())


def get_gpt_cwe(cwes, desc, max_request_size, echo=False):
    """Narrow a list of CWE entries down to one reply for ``desc``.

    The candidate list is split into chunks that fit ``max_request_size``
    tokens, one reply is collected per chunk, and the replies are reduced
    with ``filter_possible_cwes``. If more than one candidate survives, the
    surviving entries become the new candidate list and the process repeats.

    NOTE: the model call is currently stubbed out -- every chunk yields the
    fixed reply ``{"gpt_cwe": "415", "gpt_cwe_confidence": 5}`` after a
    0.5 s sleep. The real call is kept as a comment inside the loop.

    Args:
        cwes: CWE entry dicts, each with an "ID" key.
        desc: Vulnerability description used as the user message.
        max_request_size: Token budget per request.
        echo: Show a progress bar when true.

    Returns:
        The single surviving reply dict.

    Raises:
        ValueError: If one CWE entry alone does not fit the token budget
            (previously this case looped forever).
    """
    prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")
    cwe_dict = {cwe["ID"]: cwe for cwe in cwes}
    msg_tc = get_prompt_preamble_tc(desc, prompts_dict)

    # Token counts for exactly the entries passed in, using the same estimate
    # as the notebook-level statistics. The global `cwe_token_counts` is
    # aligned with the global `cwes` only, so slicing it is wrong as soon as
    # this function works on a subset (any later round, or a subset passed
    # in by the caller).
    token_counts = [count_tokens(json.dumps(cwe, indent=4)) + 2 for cwe in cwes]
    token_count_by_id = {cwe["ID"]: tc for cwe, tc in zip(cwes, token_counts)}

    while True:
        i = 0
        responses = []
        inner_pbar = tqdm(
            total=len(cwes),
            desc="Processing CWE chunks",
            leave=False,
            disable=not echo,
        )

        while i < len(cwes):
            cwe_chunk, next_i, _ = find_cwe_list(
                len(cwes) - i,
                0,
                cwes[i:],
                max_request_size,
                token_counts[i:],
                msg_tc,
            )
            if next_i == 0:
                # No entry fits: advancing by 0 would never terminate.
                inner_pbar.close()
                raise ValueError(
                    f"CWE entry at position {i} does not fit into a request of "
                    f"{max_request_size} tokens"
                )
            i += next_i

            response = '{"gpt_cwe":"415","gpt_cwe_confidence":5}'
            time.sleep(0.5)
            # response = gpt_classify(desc, cwe_list_to_json(cwe_chunk))

            response = json.loads(response)
            responses.append(response)

            inner_pbar.update(next_i)

        inner_pbar.close()

        poss_cwes = filter_possible_cwes(responses)
        if len(poss_cwes) == 1:
            return poss_cwes[0]

        # Next round: only the surviving candidates, with matching counts.
        cwes = [cwe_dict[cwe["gpt_cwe"]] for cwe in poss_cwes]
        token_counts = [token_count_by_id[cwe["ID"]] for cwe in cwes]


def classify_issues(cwes, data_df, max_request_size, echo=False):
    """Classify every issue in ``data_df`` against the CWE list.

    The CWE list is chunked once (the user message is ignored when sizing
    chunks, so the same chunks are shared by all issues), replies are
    collected per issue and chunk, and each issue's replies are reduced with
    ``filter_possible_cwes``. Issues left with several candidates are
    resolved through ``get_gpt_cwe``.

    NOTE: the model call is currently stubbed out -- for every (chunk, issue)
    pair a fixed list of five replies is appended after a 1 ms sleep. The
    real call is kept as a comment inside the loop.

    Chunk sizing reads the notebook-level ``cwe_token_counts``.
    NOTE(review): this assumes ``cwes`` is the same list, in the same order,
    that ``cwe_token_counts`` was computed from -- confirm at call sites.

    Args:
        cwes: CWE entry dicts, each with an "ID" key.
        data_df: DataFrame with ``issue_github_id`` and ``gpt_description``.
        max_request_size: Token budget per request.
        echo: Show progress bars when true.

    Returns:
        Dict mapping each issue id to its final reply dict.

    Raises:
        ValueError: If one CWE entry alone does not fit the token budget
            (previously this case looped forever).
    """
    issue_dict = {
        dat.issue_github_id: ([], dat.gpt_description)
        for (_, dat) in data_df.iterrows()
    }
    cwe_dict = {cwe["ID"]: cwe for cwe in cwes}

    prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")
    msg_tc = get_prompt_preamble_tc("", prompts_dict, True)

    # Split the CWE list into budget-sized chunks.
    i = 0
    chunks = []
    while i < len(cwes):
        cwe_chunk, next_i, _ = find_cwe_list(
            len(cwes) - i,
            0,
            cwes[i:],
            max_request_size,
            cwe_token_counts[i:],
            msg_tc,
        )
        if next_i == 0:
            # No entry fits: advancing by 0 would never terminate.
            raise ValueError(
                f"CWE entry at position {i} does not fit into a request of "
                f"{max_request_size} tokens"
            )
        i += next_i
        chunks.append(cwe_chunk)

    chunk_iter = tqdm(chunks, desc="Processing CWE chunks", disable=not echo)
    for chunk in chunk_iter:
        data_iter = tqdm(
            data_df.iterrows(),
            total=len(data_df),
            desc="Classifying issues",
            leave=False,
            disable=not echo,
        )
        for _, dat in data_iter:
            # response = gpt_classify(dat.gpt_description, cwe_list_to_json(chunk))
            # issue_dict[dat.issue_github_id][0].append(response)

            time.sleep(0.001)
            issue_dict[dat.issue_github_id][0].extend(
                [
                    {"gpt_cwe": "415", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "664", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "666", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "416", "gpt_cwe_confidence": 5},
                    {"gpt_cwe": "1341", "gpt_cwe_confidence": 5},
                ]
            )

    data_iter = tqdm(
        issue_dict.items(),
        desc="Re-classifying issues",
        disable=not echo,
        leave=False,
    )
    # Values are replaced in place; keys are not added or removed, so
    # iterating over the dict while assigning is safe.
    for key, val in data_iter:
        poss_cwes = filter_possible_cwes(val[0])
        if len(poss_cwes) == 1:
            issue_dict[key] = poss_cwes[0]
        else:
            remaining_cwes = [cwe_dict[cwe["gpt_cwe"]] for cwe in poss_cwes]
            issue_dict[key] = get_gpt_cwe(
                remaining_cwes, val[1], max_request_size, echo
            )

    return issue_dict


def batch_cwe_request(cwes, desc, git_issue_id, max_request_size, model="gpt-4o-mini", echo=True):
    """Build the batch request dicts for one issue.

    The CWE list is split into chunks that fit ``max_request_size`` tokens
    together with the fixed prompt and ``desc``; one chat-completion request
    is produced per chunk.

    Chunk sizing reads the notebook-level ``cwe_token_counts``.
    NOTE(review): this assumes ``cwes`` is the same list, in the same order,
    that ``cwe_token_counts`` was computed from -- confirm at call sites.

    Args:
        cwes: CWE entry dicts.
        desc: Vulnerability description used as the user message.
        git_issue_id: Issue id; combined with a running number to form each
            request's ``custom_id`` (``"<issue id>-<n>"``).
        max_request_size: Token budget per request.
        model: Model name written into each request body. The default is
            "gpt-4o-mini" (letter "o"); the former default "gpt-40-mini"
            (digit zero) is not an OpenAI model name.
        echo: Show a progress bar when true.

    Returns:
        List of request dicts, one per chunk.

    Raises:
        ValueError: If one CWE entry alone does not fit the token budget
            (previously this case looped forever).
    """
    prompts_dict = load_prompts(os.getcwd() + "/../utils/prompts")
    msg_tc = get_prompt_preamble_tc(desc, prompts_dict)

    i = 0
    req_number = 0
    requests = []

    inner_pbar = tqdm(
        total=len(cwes),
        desc=f"Gathering CWE requests for issue {git_issue_id}",
        leave=False,
        disable=not echo,
    )
    while i < len(cwes):
        cwe_chunk, next_i, _ = find_cwe_list(
            len(cwes) - i,
            0,
            cwes[i:],
            max_request_size,
            cwe_token_counts[i:],
            msg_tc,
        )
        if next_i == 0:
            # No entry fits: advancing by 0 would never terminate.
            inner_pbar.close()
            raise ValueError(
                f"CWE entry at position {i} does not fit into a request of "
                f"{max_request_size} tokens"
            )
        i += next_i

        formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
            cwe_entries=cwe_list_to_json(cwe_chunk)
        )
        request = {
            "custom_id": str(git_issue_id) + "-" + str(req_number),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [
                    {"role": "system", "content": formatted_system_prompt},
                    {"role": "user", "content": desc},
                ],
                "max_tokens": 1000,
            },
        }
        requests.append(request)
        req_number += 1

        inner_pbar.update(next_i)

    inner_pbar.close()

    return requests


def generate_batch_post_files(
    cwes,
    data_df,
    max_request_size,
    post_folder_path='tmp',
    max_size_mb=150,
    output_dir="batches",
    output_prefix="batch",
    echo=True
):
    """Write the batch requests for every issue to size-limited JSONL files.

    Requests are produced per row with ``batch_cwe_request`` and appended,
    one JSON document per line, to ``<post_folder_path>/<output_dir>/
    <output_prefix>_<n>.jsonl``. A new file is started whenever the next line
    would push the current file past ``max_size_mb``. A ``batch_info.pkl``
    file with the file count, prefix, size limit and issue ids is written
    next to the batch files.

    Args:
        cwes: CWE entry dicts passed through to ``batch_cwe_request``.
        data_df: DataFrame with ``gpt_description`` and ``issue_github_id``.
        max_request_size: Token budget per request.
        post_folder_path: Base folder for all output.
        max_size_mb: Size limit per batch file in MiB.
        output_dir: Sub-folder of ``post_folder_path`` for the batch files.
        output_prefix: File name prefix of the batch files.
        echo: Show progress bars when true (also forwarded to
            ``batch_cwe_request``, which previously always showed its bar).
    """
    output_dir = os.path.join(post_folder_path, output_dir)
    os.makedirs(output_dir, exist_ok=True)

    max_size_bytes = max_size_mb * 1024 * 1024
    part_num = 1
    current_size = 0

    output_path = os.path.join(output_dir, f"{output_prefix}_{part_num}.jsonl")
    output_file = open(output_path, 'w', encoding='utf-8')
    # try/finally so the currently open part is closed even when building or
    # writing a request fails part-way through.
    try:
        data_iter = tqdm(
            data_df.iterrows(),
            total=len(data_df),
            desc="Writing batch files",
            disable=not echo,
        )
        for _, row in data_iter:
            requests = batch_cwe_request(
                cwes,
                row.gpt_description,
                row.issue_github_id,
                max_request_size,
                echo=echo,
            )
            req_iter = tqdm(
                requests,
                desc=f"Writing issue {row.issue_github_id} batch files",
                leave=False,
                disable=not echo,
            )
            for req in req_iter:
                req_line = json.dumps(req) + "\n"
                line_size = len(req_line.encode('utf-8'))
                # Roll over only when the current part already holds data;
                # otherwise a single oversized line would leave an empty file.
                if current_size and current_size + line_size > max_size_bytes:
                    output_file.close()
                    part_num += 1
                    output_path = os.path.join(output_dir, f"{output_prefix}_{part_num}.jsonl")
                    output_file = open(output_path, 'w', encoding='utf-8')
                    current_size = 0
                output_file.write(req_line)
                current_size += line_size
    finally:
        output_file.close()

    batch_info_dict = {
        'batch_count': part_num,
        'batch_prefix': output_prefix,
        'batch_size': max_size_mb,
        'batch_issues': data_df['issue_github_id'].tolist(),
    }

    pickle_file = os.path.join(output_dir, 'batch_info.pkl')
    with open(pickle_file, 'wb') as file:
        pickle.dump(batch_info_dict, file)

def post_batches(
    post_folder_path='tmp',
    batches_dir="batches",
) :
    """Upload every generated batch file and create one batch job per file.

    Reads ``batch_info.pkl`` from ``<post_folder_path>/<batches_dir>``,
    uploads ``<prefix>_<n>.jsonl`` for n = 1..batch_count, creates a batch
    for each upload and stores the resulting batch ids back into
    ``batch_info.pkl`` under ``'batch_ids'``.

    NOTE(review): ``client`` is not defined in the visible notebook cells;
    presumably an OpenAI client object made available by ``from utils
    import *`` -- confirm.
    """
    batches_dir = os.path.join(post_folder_path, batches_dir)
    pickle_file = os.path.join(batches_dir, 'batch_info.pkl')
    # SECURITY: pickle.load executes arbitrary code from the file. Only load
    # a batch_info.pkl that this notebook wrote itself.
    with open(pickle_file, 'rb') as file:
        batch_info_dict = pickle.load(file)

    batches = []
    for i in range(1, batch_info_dict['batch_count'] + 1) :
        batch_name = f"{batch_info_dict['batch_prefix']}_{i}"
        path = os.path.join(batches_dir, f"{batch_name}.jsonl")
        # Context manager so the upload handle is closed after the request
        # (it was left open before).
        with open(path, "rb") as batch_file:
            uploaded = client.files.create(
                file=batch_file,
                purpose="batch"
            )
        batch = client.batches.create(
            input_file_id=uploaded.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "batch_name": batch_name
            }
        )
        # Attribute access, consistent with `uploaded.id` above; the returned
        # object is not guaranteed to support `batch['id']`.
        batches.append(batch.id)

    batch_info_dict['batch_ids'] = batches
    with open(pickle_file, 'wb') as file:
        pickle.dump(batch_info_dict, file)

def retrieve_batch_res(
    post_folder_path='tmp',
    batches_dir="batches",
    output_dir='responses'

) :
    """Poll the posted batches and download the output of completed ones.

    Reads the batch ids from ``batch_info.pkl`` in
    ``<post_folder_path>/<batches_dir>``. For every completed batch the
    output file is written to ``<post_folder_path>/<output_dir>/
    <batch_name>response``; for every other batch its status is printed.

    NOTE(review): ``client`` is not defined in the visible notebook cells;
    presumably an OpenAI client object made available by ``from utils
    import *`` -- confirm.

    Returns:
        Dict mapping batch name to the status reported for it.
    """
    batches_dir = os.path.join(post_folder_path, batches_dir)
    output_dir = os.path.join(post_folder_path, output_dir)
    # The response folder is not created anywhere else; without this the
    # first write fails with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    pickle_file = os.path.join(batches_dir, 'batch_info.pkl')
    # SECURITY: pickle.load executes arbitrary code from the file. Only load
    # a batch_info.pkl that this notebook wrote itself.
    with open(pickle_file, 'rb') as file:
        batch_info_dict = pickle.load(file)

    statuses = {}
    # Iterate over a snapshot: ids are removed from the original list inside
    # the loop, and removing from a list while iterating it skips elements.
    for batch_id in list(batch_info_dict['batch_ids']) :
        batch = client.batches.retrieve(batch_id)
        status = batch.status
        batch_name = batch.metadata['batch_name']
        statuses[batch_name] = status
        if status == 'completed' :
            if batch.output_file_id is None :
                # Completed without an output file; nothing to download.
                print('batch,', batch_name, 'completed without an output file')
                continue
            # The results live in the batch's output file; the batch id
            # itself is not a file id.
            file_response = client.files.content(batch.output_file_id)
            output_file = os.path.join(output_dir, batch_name + 'response')
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(file_response.text)
            # NOTE(review): the pruned list is not written back to
            # batch_info.pkl, so this has no effect beyond this call --
            # confirm whether it should be persisted.
            batch_info_dict['batch_ids'].remove(batch_id)
        else :
            print('batch,', batch_name, 'status:\t', status)

    return statuses

In [None]:
# Scratch check: the container type produced by `.tolist()` on the issue id
# column (this is what gets stored under 'batch_issues' in batch_info.pkl).
type(true_pos_few['issue_github_id'].tolist())

In [None]:
# Write the batch request files for the true-positive issues: 100000 tokens
# per request, batch files capped at 190 MiB each.
generate_batch_post_files(cwes, true_pos_few, 100000, max_size_mb=190)
# classify_issues(cwes, true_pos_few, 100000, True)

In [None]:
# Reference only: two example batch request entries showing the layout that
# `batch_cwe_request` produces (custom_id / method / url / body). These are
# bare expressions; nothing is assigned or sent.
{
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-3.5-turbo-0125",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello world!"},
        ],
        "max_tokens": 1000,
    },
}
{
    "custom_id": "request-2",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-3.5-turbo-0125",
        "messages": [
            {"role": "system", "content": "You are an unhelpful assistant."},
            {"role": "user", "content": "Hello world!"},
        ],
        "max_tokens": 1000,
    },
}

In [None]:
# Fixed prompt size with an empty CWE list and an empty user message.
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(cwe_entries="")
messages = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": ""},
]
msg_tc = count_chat_tokens(messages)
# Largest chunk starting at entry 186 that fits a 100000-token request;
# `size` is the estimated token count of that request.
cwe_chunk, end_ind, size = find_cwe_list(
    len(cwes[186:]), 0, cwes[186:], 100000, cwe_token_counts[186:], msg_tc
)

In [None]:
# Compare the estimated request size from the chunk search with the size of
# the actual request built from that chunk.
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
    cwe_entries=cwe_list_to_json(cwe_chunk)
)
msgs = [
    {"role": "system", "content": formatted_system_prompt},
    {"role": "user", "content": true_pos_few.iloc[0].gpt_description},
]
print("end_ind:\t", end_ind)
# "true size" includes a real user message, whereas `size` was estimated
# with an empty one, so the two are not expected to match exactly.
print("true size:\t", count_chat_tokens(msgs))
print("size:\t", size)
print("len(cwe_chunk):\t", len(cwe_chunk))
print("cwe_chunk:\t", cwe_list_to_json(cwe_chunk))
# display(cwe_list_to_json(cwe_chunk, 4))

# Rebuilt with the same inputs as at the top of this cell, then displayed.
formatted_system_prompt = prompts_dict["baseline_system_setup"].format(
    cwe_entries=cwe_list_to_json(cwe_chunk)
)
display(formatted_system_prompt)

In [None]:
# Number of CWE entries currently held in `cwes`.
len(cwes)

In [None]:
# Resolve a single CWE reply for the first true-positive description, with a
# 100000-token budget per request and the progress bar enabled.
maxr_size = 100000
resp = get_gpt_cwe(cwes, true_pos_few.iloc[0].gpt_description, maxr_size, echo=True)

In [None]:
# Inspect the type and content of the reply returned by `get_gpt_cwe`.
print(type(resp))
print(resp)