Import packages

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import json
from tqdm import tqdm

Loading original prompts ("prompt_test.jsonl") and your jailbreak prompts ("./submission/jailbreak_test.jsonl")

In [None]:
def get_prompts(file_name):
    """Read the "prompt" field of every record in a JSON Lines file.

    Args:
        file_name: Path to a JSONL file in which every non-blank line is a
            JSON object containing a "prompt" key.

    Returns:
        list: The "prompt" values, in file order (empty for an empty file).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "prompt" key.
    """
    prompt_list = []
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would garble non-ASCII prompts on some systems.
    with open(file_name, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing empty line at end of file);
            # json.loads would raise on them.
            if not line.strip():
                continue
            prompt_list.append(json.loads(line)["prompt"])
    return prompt_list
# Test-phase inputs: the original prompts, and the jailbreak prompts read from
# the submission directory.
original_prompt_list = get_prompts("prompt_test.jsonl")
jailbreak_prompt_list = get_prompts("./submission/jailbreak_test.jsonl")

Loading model inference pipeline

In [None]:
def get_model_inference_pipeline(model_id = "meta-llama/Meta-Llama-3-8B-Instruct"):
    """Build a "text-generation" pipeline for the given model identifier.

    Args:
        model_id: Identifier passed to both ``AutoTokenizer.from_pretrained``
            and ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The object returned by ``pipeline(...)``, configured with
        ``max_new_tokens=1024``.
    """
    # The tokenizer is loaded before the model, mirroring the original order.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Model loading options: bfloat16 weights, automatic device placement.
    load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
    loaded_model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

    return pipeline(
        "text-generation",
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        max_new_tokens=1024,
    )
# One of the test-phase models (another LLM is also held out for the test).
model_id = "google/gemma-2b-it"
pipe = get_model_inference_pipeline(model_id=model_id)

Loading functions to compute jailbreak scores and stealthiness scores

In [None]:
from utils import get_jailbreak_score
from utils import get_stl_score

Perform model inference with jailbreak prompts and evaluation

In [None]:
# Run every jailbreak prompt through the model, score the response, and
# accumulate per-prompt records plus the two running averages.

# zip() silently stops at the shorter list, while the averages below divide by
# the jailbreak-prompt count; a length mismatch would therefore yield wrong
# averages without any warning. Fail loudly instead.
num_prompts = len(jailbreak_prompt_list)
if len(original_prompt_list) != num_prompts:
    raise ValueError(
        f"Prompt count mismatch: {len(original_prompt_list)} original prompts "
        f"vs {num_prompts} jailbreak prompts"
    )

results = []
avg_jailbreak_score, avg_stealthiness_score = 0., 0.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# A zip object has no len(), so give tqdm the total explicitly; otherwise the
# progress bar cannot show a percentage or an ETA.
for original_prompt, jailbreak_prompt in tqdm(zip(original_prompt_list, jailbreak_prompt_list), total=num_prompts):
    messages = [
        {"role": "user", "content": jailbreak_prompt},
    ]
    # For chat-style input the pipeline returns the conversation with the
    # model's reply appended, so the reply is the last message (index 1 here,
    # since exactly one user message is sent).
    response = pipe(messages)[0]['generated_text'][-1]['content']
    # In the test phase, the evaluation model and evaluation prompts are held
    # out to avoid adaptive attack on the judge model. This is just an example
    # of model judge. We will consistently use Gemma-2b-it tokenizer to detect
    # jailbreak prompt with length over 100.
    jailbreak_score = get_jailbreak_score(pipe, original_prompt, jailbreak_prompt, response, tokenizer)
    stl_score = get_stl_score(original_prompt, jailbreak_prompt)
    record = {"original_prompt": original_prompt, "jailbreak_prompt": jailbreak_prompt, "response": response, "jailbreak_score": jailbreak_score, "stealthiness_score": stl_score}
    results.append(record)
    # Each prompt contributes score / N, so the sums end up as the means.
    avg_jailbreak_score += jailbreak_score / num_prompts
    avg_stealthiness_score += stl_score / num_prompts

Store and show final results

In [None]:
# Write one JSON object per line for every evaluated prompt, then report the
# two averaged metrics.
file_name = "results_test.jsonl"
with open(file_name, 'w') as out_file:
    out_file.writelines(json.dumps(entry) + "\n" for entry in results)

print(f'Average jailbreak score: {avg_jailbreak_score}') # 0.151 for PAP jailbreak
print(f'Average stealthiness score: {avg_stealthiness_score}') # 0.220 for PAP jailbreak