In [1]:
import sys, os, random
import csv
import json
import numpy as np
from collections import defaultdict, OrderedDict

from datasets import load_dataset
import tiktoken

In [26]:
# Sampling settings placed into every request.
# Active run: n = 9 completions per example at temperature 1.0.
# Alternative run kept for reference: temperature = 0.0 with n = 1.
temperature, n = 1.0, 9

# Active model id is the "ft:"-prefixed one below; the plain
# "gpt-4o-mini-2024-07-18" id is the alternative kept for reference.
model = "ft:gpt-4o-mini-2024-07-18:uedin:semantic-inequiv-bob-run-0-round-6-generation-0:AyS2gJFd"

# max_tokens is sent with each request; max_input_tokens is the threshold
# above which a request is not written to the request file.
max_tokens, max_input_tokens = 256, 128_000

In [27]:
# Shared path prefix from which the request and response JSONL filenames are
# derived. Alternative prefix kept for reference: "./gpt-4o-mini".
filename_base = "./sinq-gpt-4o-mini"

In [28]:
# Requests are written to the first file; responses are read back from the second.
filename_to_openai = f"{filename_base}_to_openai.jsonl"
filename_from_openai = f"{filename_base}_from_openai.jsonl"

In [29]:
# Only the "train" split is loaded (the dataset has no "test" split). The
# result is materialized as a list; later cells add keys to each example.
dataset = list(load_dataset("sunlab/PySecDB", split="train"))

In [30]:
# System prompt sent with every request. The text is part of the request
# payload, so its wording is reproduced exactly.
system_message = (
    "You are an expert computer scientist. Your task is to take a diff patch for a Python program and determine whether it contains a fix to a security vulnerability.\n"
    "Write your output the following JSON format:\n"
    "\n"
    "```json\n"
    "{\n"
    '  "vulnerability_fix": true or false\n'
    "}\n"
    "```\n"
    "\n"
    "Do not write anything else.\n"
)

def user_message_fn_0(example):
    """Build the user-turn text for one example.

    Reads ``example["content"]`` and returns it under a "Diff patch:" heading,
    wrapped in a triple-backtick fence, with a trailing newline.
    """
    diff_text = example["content"]
    fence = "```"
    return f"Diff patch:\n\n{fence}\n{diff_text}\n{fence}\n"

In [31]:
def example_to_openai_api_format(example):
    """Serialize one example into a JSON request string.

    The request carries a system turn (``system_message``) and a user turn
    built by ``user_message_fn_0(example)``, together with the notebook-level
    ``model``, ``temperature``, ``max_tokens`` and ``n`` settings and a
    ``json_object`` response format.

    Returns:
        The request as a ``json.dumps`` string.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": user_message_fn_0(example)}
    # Key order is kept fixed: the serialized string is later used as the
    # lookup key when matching responses to examples.
    request = dict(
        model=model,
        messages=[system_turn, user_turn],
        temperature=temperature,
        max_tokens=max_tokens,
        n=n,
        response_format={"type": "json_object"},
    )
    return json.dumps(request)

In [32]:
# Build the tokenizer once: the lookup does not depend on the example, so it
# does not belong inside the per-example loop.
encoding = tiktoken.encoding_for_model("gpt-4o-mini-2024-07-18")

# Write one JSON request per line, skipping requests that are too long.
with open(filename_to_openai, "w") as out_fs:
    for example in dataset:
        request = example_to_openai_api_format(example)
        # The count is taken over the whole serialized JSON request string
        # (settings and JSON syntax included), not over the message text alone.
        token_count = len(encoding.encode(request))
        example["llm_request_token_count"] = token_count
        if token_count > max_input_tokens:
            # Over the limit: the example keeps no "llm_request" key, which is
            # how later cells recognize that it was not sent.
            example.pop("llm_request", None)
        else:
            example["llm_request"] = request
            print(request, file=out_fs)

In [33]:
#

In [34]:
# Map each serialized request string to the response recorded for it.
responses_from_openai = dict()
with open(filename_from_openai, "r") as in_fs:
    for line in in_fs:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Each line is a JSON pair: [request, response].
        response_k, response_v = json.loads(line)
        # Re-serialize the request so it can be compared with the
        # json.dumps string stored in example["llm_request"].
        response_k_str = json.dumps(response_k)
        responses_from_openai[response_k_str] = response_v

# Attach the response and the parsed per-choice predictions to each example.
num_unparsed_choices = 0
for example in dataset:
    if ("llm_request" in example) and (example["llm_request"] in responses_from_openai):
        example["llm_response"] = responses_from_openai[example["llm_request"]]
        example["llm_prediction"] = []
        for choice in example["llm_response"]["choices"]:
            try:
                parsed = json.loads(choice["message"]["content"])
                example["llm_prediction"].append(parsed["vulnerability_fix"])
            except (KeyError, TypeError, ValueError):
                # Best effort: a choice with missing content, invalid JSON or
                # no "vulnerability_fix" key contributes no prediction, but is
                # counted so the loss is visible instead of silent.
                num_unparsed_choices += 1

if num_unparsed_choices:
    print(f"Skipped {num_unparsed_choices} response choice(s) that could not be parsed.")

In [35]:
# Majority-vote accuracy over every example for which a request was written.
num_examples = 0
num_correct = 0

for example in dataset:
    if "llm_request" not in example:
        continue
    num_examples += 1
    try:
        votes = np.array(example["llm_prediction"])
        # Ties count as a positive prediction (mean >= 0.5).
        # NOTE(review): an example with zero parsed predictions is scored as a
        # negative ("not security") prediction, matching the previous
        # behaviour where the mean of an empty array was NaN and
        # NaN >= 0.5 is False. Confirm whether it should count as incorrect.
        majority_prediction = bool(votes.size > 0 and votes.mean() >= 0.5)
        if (example["label"] == "security") == majority_prediction:
            num_correct += 1
    except Exception:
        # Best effort: an example with no "llm_prediction" (no response was
        # matched) or with non-numeric predictions stays counted as incorrect.
        # Catching Exception rather than everything lets KeyboardInterrupt
        # and SystemExit propagate.
        continue

# Avoid ZeroDivisionError when no example was evaluated.
accuracy = float(num_correct) / num_examples if num_examples else float("nan")
print(f"Model: {model}, n: {n}, temperature: {temperature}, evaluated examples: {num_examples}, num. correct: {num_correct}, accuracy: {accuracy}")

Model: ft:gpt-4o-mini-2024-07-18:uedin:semantic-inequiv-bob-run-0-round-6-generation-0:AyS2gJFd, n: 9, temperature: 1.0, evaluated examples: 4042, num. correct: 3347, accuracy: 0.8280554181098466


In [36]:
# Model: gpt-4o-mini-2024-07-18, n: 1, temperature: 0.0, evaluated examples: 4042, num. correct: 3332, accuracy: 0.8243443839683325
# Model: ft:gpt-4o-mini-2024-07-18:uedin:semantic-inequiv-bob-run-0-round-6-generation-0:AyS2gJFd, n: 1, temperature: 0.0, evaluated examples: 4042, num. correct: 3335, accuracy: 0.8250865907966354

# Model: gpt-4o-mini-2024-07-18, n: 9, temperature: 1.0, evaluated examples: 4042, num. correct: 3334, accuracy: 0.8248391885205344
# Model: ft:gpt-4o-mini-2024-07-18:uedin:semantic-inequiv-bob-run-0-round-6-generation-0:AyS2gJFd, n: 9, temperature: 1.0, evaluated examples: 4042, num. correct: 3347, accuracy: 0.8280554181098466