In [18]:
import sys, os, random
import csv
import json
import numpy as np
from collections import defaultdict, OrderedDict

from datasets import load_dataset
import tiktoken
import mistune

In [22]:
# Request settings shared by every request built in this notebook.

# Model being evaluated; the commented-out line is the alternative model name.
#model = "gpt-4o-mini-2024-07-18"
model = "ft:gpt-4o-mini-2024-07-18:uedin:semantic-inequiv-bob-run-0-round-6-generation-0:AyS2gJFd"

# Sampling temperature sent with each request.
temperature = 0.6

# Number of completions requested per example (9 is the commented-out alternative).
n = 1
#n = 9

# Completion budget, and what remains for the request out of a 128000-token total
# (presumably the model's context window -- confirm).
max_tokens = 2048
max_input_tokens = 128_000 - max_tokens

In [23]:
# Path prefix for this run's files; the request and response JSONL filenames
# are derived from it by appending a suffix.
# The commented-out value is an alternative prefix (presumably the base-model run -- confirm).
#filename_base = "./gpt-4o-mini"
filename_base = "./sinq-gpt-4o-mini"

In [24]:
# Requests are written to the first file; responses are read back from the second.
filename_to_openai = f"{filename_base}_to_openai.jsonl"
filename_from_openai = f"{filename_base}_from_openai.jsonl"

In [25]:
# Only the "train" split is loaded (there is no "test" split).
# The result is materialised as a list because later cells store extra keys
# (request, response, prediction) on each example.
dataset = list(load_dataset("sunlab/PySecDB", split="train"))

In [26]:
# System prompt sent with every request. It asks for a level-1 "Analysis" section
# followed by a level-1 "Output" section holding a fenced json block with a
# "vulnerability_fix" boolean; the response-parsing cell relies on that layout.
#
# NOTE(review): the second line of the prompt ("Write your output the following JSON
# format:") reads like a leftover, and "writing your program" does not match the
# classification task. The text is deliberately left untouched here: the serialised
# request (which embeds this prompt) is the key used to look up stored responses, so
# any edit to the prompt would orphan an existing responses file and change the experiment.
system_message = \
"""You are an expert computer scientist. Your task is to take a diff patch for a Python program and determine whether it contains a fix to a security vulnerability.
Write your output the following JSON format:
Think step by step before writing your program. Use the following Markdown format, making sure that the following sections are delimited by level 1 headings, since they will have to be automatically parsed:
# Analysis
step by step analysis. This section can include sub-headings and code blocks
# Output
```json
{
  "vulnerability_fix": true or false
}
```
"""

def user_message_fn_0(example):
    """Build the user-turn text for one example.

    Args:
        example: mapping with a ``"content"`` entry (the diff to classify).

    Returns:
        A string consisting of a ``Diff patch:`` header, a blank line, and the
        content wrapped in a plain fenced block, ending with a newline.
    """
    diff_text = example["content"]
    return f"Diff patch:\n\n```\n{diff_text}\n```\n"

In [27]:
def example_to_openai_api_format(example):
    """Serialise the request for one example as a JSON string.

    The payload carries the module-level ``model``, ``temperature``,
    ``max_tokens`` and ``n`` settings plus a two-message conversation:
    the shared system prompt followed by the user message built from
    ``example``.

    Args:
        example: a dataset example accepted by ``user_message_fn_0``.

    Returns:
        The request encoded with ``json.dumps`` (key order: model, messages,
        temperature, max_tokens, n).
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": user_message_fn_0(example)}
    return json.dumps({
        "model": model,
        "messages": [system_turn, user_turn],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "n": n,
        #"response_format": { "type": "json_object" }
    })

In [28]:
# Resolve the tokenizer once: it does not depend on the example, and doing it
# before opening the output file means a failed lookup does not truncate the file.
encoding = tiktoken.encoding_for_model("gpt-4o-mini-2024-07-18")

# Write one JSON request per line, skipping examples whose request is too long.
with open(filename_to_openai, "w") as out_fs:
    for example in dataset:
        request = example_to_openai_api_format(example)
        # The count is taken over the whole serialised request string
        # (prompt text plus the JSON wrapping), not over the messages alone.
        token_count = len(encoding.encode(request))
        example["llm_request_token_count"] = token_count
        if token_count > max_input_tokens:
            # Over budget: the example is not sent. Later cells use the absence
            # of "llm_request" to exclude it, so drop any value left by an earlier run.
            example.pop("llm_request", None)
        else:
            example["llm_request"] = request
            print(request, file=out_fs)

In [29]:
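# NOTE(review): no visible cell sends filename_to_openai to the API or creates filename_from_openai; presumably that step runs outside the notebook before the next cell -- confirm.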

In [30]:
def _first_child_raw(element):
    """Return the ``raw`` text of an AST element's first child, or None if it has none."""
    children = element.get("children") or []
    if not children:
        return None
    return children[0].get("raw")


def _extract_vulnerability_fix(parsed_markdown):
    """Read the ``vulnerability_fix`` value out of a parsed model answer.

    Finds the first heading whose first child has the raw text ``Output``, then
    the first code block tagged ``json`` at or after that heading, and decodes it.

    Args:
        parsed_markdown: list of AST elements produced by the markdown parser.

    Returns:
        The value stored under ``"vulnerability_fix"`` in the decoded JSON.

    Raises:
        ValueError: the heading or the json block is missing, or the JSON is invalid.
        KeyError, TypeError: the decoded JSON has no ``"vulnerability_fix"`` entry.
    """
    # Headings whose first child carries no "raw" text (or that have no children)
    # are simply not a match; they must not abort the search for a later heading.
    output_i = next(
        (i for i, e in enumerate(parsed_markdown)
         if e["type"] == "heading" and _first_child_raw(e) == "Output"),
        None,
    )
    if output_i is None:
        raise ValueError('no "Output" heading found')
    output_code_block = next(
        (e["raw"] for e in parsed_markdown[output_i:]
         if e["type"] == "block_code" and (e.get("attrs") or {}).get("info") == "json"),
        None,
    )
    if output_code_block is None:
        raise ValueError('no json code block found after the "Output" heading')
    return json.loads(output_code_block)["vulnerability_fix"]


# Load the stored responses. Each non-blank line holds a [request, response] pair.
responses_from_openai = dict()
with open(filename_from_openai, "r") as in_fs:
    for line in in_fs:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        response_k, response_v = json.loads(line)
        # Re-serialise the request so it can be compared with example["llm_request"],
        # which is itself a JSON string.
        response_k_str = json.dumps(response_k)
        responses_from_openai[response_k_str] = response_v

markdown = mistune.create_markdown(renderer='ast')
for example in dataset:
    # Examples are mutated in place, so clear results from an earlier run of this
    # cell; otherwise a stale prediction could survive a changed responses file.
    example.pop("llm_response", None)
    example.pop("llm_prediction", None)

    request = example.get("llm_request")
    if request is None or request not in responses_from_openai:
        continue

    example["llm_response"] = responses_from_openai[request]
    try:
        # NOTE: only the first choice is read, even if several were requested.
        content = example["llm_response"]["choices"][0]["message"]["content"]
        example["llm_prediction"] = _extract_vulnerability_fix(markdown(content))
    except Exception:
        # Best effort: a malformed or unparsable answer leaves the example without
        # "llm_prediction". Unlike a bare except, this lets KeyboardInterrupt and
        # SystemExit propagate.
        print("Invalid response")

Invalid response


In [31]:
# Accuracy over every example for which a request was built. Examples whose
# request was dropped for exceeding the token budget have no "llm_request" and
# are excluded; examples with a request but no usable prediction stay in the
# denominator and therefore count as incorrect.
num_examples = 0
num_correct = 0

for example in dataset:
    if "llm_request" not in example:
        continue
    num_examples += 1

    if "llm_prediction" not in example:
        # No response was found, or it could not be parsed: counted as incorrect.
        continue

    try:
        # mean() makes this work for a single boolean and for a sequence of
        # booleans alike; ties (mean == 0.5) count as a positive prediction.
        majority_prediction = (np.array(example["llm_prediction"]).mean() >= 0.5)
        is_correct = (example["label"] == "security") == majority_prediction
    except (KeyError, TypeError, ValueError):
        # Non-boolean prediction value or missing label: counted as incorrect.
        continue

    if is_correct:
        num_correct += 1

# Guard against an empty evaluation set instead of raising ZeroDivisionError.
accuracy = num_correct / num_examples if num_examples else float("nan")
print(f"Model: {model}, n: {n}, temperature: {temperature}, evaluated examples: {num_examples}, num. correct: {num_correct}, accuracy: {accuracy}")

Model: ft:gpt-4o-mini-2024-07-18:uedin:semantic-inequiv-bob-run-0-round-6-generation-0:AyS2gJFd, n: 1, temperature: 0.6, evaluated examples: 4041, num. correct: 2950, accuracy: 0.7300173224449393


In [32]:
# Model: gpt-4o-mini-2024-07-18, n: 1, temperature: 0.6, evaluated examples: 4041, num. correct: 2972, accuracy: 0.7354615194258847
# Model: ft:gpt-4o-mini-2024-07-18:uedin:semantic-inequiv-bob-run-0-round-6-generation-0:AyS2gJFd, n: 1, temperature: 0.6, evaluated examples: 4041, num. correct: 2950, accuracy: 0.7300173224449393