# Benchmark models

Evaluates saved jailbreak-detection models on the Qualifire prompt-injection benchmark and on a custom dataset.

In [41]:
import sys
import pathlib
import joblib

import pandas as pd
import numpy as np
from datasets import load_dataset

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Add the parent of the working directory to `sys.path` so the `app` package can be imported

In [8]:
# Put the parent of the current working directory at the front of the import
# path so that project packages living there can be imported below.
root = pathlib.Path.cwd().resolve().parent
sys.path.insert(0, str(root))

Import vectorizer

In [9]:
from app.rag.vectorizer import Vectorizer

## Load data

Qualifire dataset

In [56]:
# Qualifire prompt-injection benchmark, read from the notebook's working directory.
df = pd.read_csv("prompt-injections-benchmark.csv")

# Encode the string labels as integers with an explicit mapping.
# `Series.replace` with a str -> int dict relies on silent downcasting of the
# object column, which recent pandas deprecates (FutureWarning). `map` followed
# by `astype` makes the conversion explicit, and the assert catches any label
# that is not covered by the mapping instead of letting it pass through.
label_map = {'jailbreak': 1, 'benign': 0}
encoded = df['label'].map(label_map)
assert encoded.notna().all(), "unexpected label values in prompt-injections-benchmark.csv"
df['label'] = encoded.astype('int64')

X_qualifire = df['text']
y_qualifire = df['label']

# Class balance of the benchmark (shown as the cell output).
y_qualifire.value_counts(normalize=True)

  df['label'] = df['label'].replace({'jailbreak': 1, 'benign': 0})


label
0    0.6002
1    0.3998
Name: proportion, dtype: float64

Custom dataset

In [43]:
# Benign prompts: 'instruction' field of the helpful-instructions train split (label 0).
benign = load_dataset("HuggingFaceH4/helpful-instructions", split='train')
# Jailbreak prompts: 'Prompt' column of a local CSV (label 1).
jailbreak = pd.read_csv('jailbreak_prompts.csv')

df_b = pd.DataFrame({'text': benign['instruction'], 'label': 0})
df_j = pd.DataFrame({'text': jailbreak['Prompt'], 'label': 1})
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both parts keep their own index, so row labels would be duplicated and any
# label-based selection or alignment on X_custom / y_custom would be ambiguous.
df = pd.concat([df_b, df_j], ignore_index=True)

X_custom = df['text']
y_custom = df['label']

# Class balance of the custom dataset (shown as the cell output).
y_custom.value_counts(normalize=True)

label
0    0.986173
1    0.013827
Name: proportion, dtype: float64

## Score function

In [44]:
def score(y_test: np.ndarray, y_pred: np.ndarray) -> None:
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, one per entry of ``y_test``.

    Returns:
        None. Each metric is printed on its own line as ``<name> <value>``.
        All four metrics are computed with the scikit-learn default arguments.
    """
    # Compute every metric before printing so that nothing is printed if one
    # of the metric calls raises.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1 score': f1_score(y_test, y_pred),
    }

    for label, value in metrics.items():
        print(label, value)

## Multilingual SVM

In [45]:
# Embed the custom-dataset prompts using the
# 'paraphrase-multilingual-MiniLM-L12-v2' model name.
vectorizer = Vectorizer(model_name='paraphrase-multilingual-MiniLM-L12-v2')
X_emb = vectorizer.generate_embeddings(X_custom.tolist())

In [46]:
# Load the serialized model for this section from the working directory.
# joblib.load unpickles the file, so only load model artefacts you trust.
model = joblib.load(filename='svc_model_multi.joblib')

In [47]:
# Predict on the custom-dataset embeddings and print the metrics.
y_pred = model.predict(X_emb)
score(y_test=y_custom, y_pred=y_pred)

accuracy 0.9752365182905253
precision 0.3149570718481699
recall 0.6731047802993723
f1 score 0.42912113283053716


## English SVM

In [48]:
# Re-embed the custom-dataset prompts with the Vectorizer's default settings.
# This rebinds `vectorizer` and `X_emb`.
vectorizer = Vectorizer()
X_emb = vectorizer.generate_embeddings(X_custom.tolist())

In [49]:
# Load the serialized model for this section from the working directory.
# joblib.load unpickles the file, so only load model artefacts you trust.
model = joblib.load(filename='svc_model_en.joblib')

In [50]:
# Predict on the custom-dataset embeddings and print the metrics.
y_pred = model.predict(X_emb)
score(y_test=y_custom, y_pred=y_pred)

accuracy 0.6271790728883607
precision 0.03445947115967376
recall 0.9608884596813134
f1 score 0.0665329321297225


## Custom XGBoost

In [58]:
# Embed the Qualifire prompts with the Vectorizer's default settings.
# This rebinds `vectorizer` and `X_emb`.
vectorizer = Vectorizer()
X_emb = vectorizer.generate_embeddings(X_qualifire.tolist())

In [61]:
# Load the serialized model for this section from the working directory.
# joblib.load unpickles the file, so only load model artefacts you trust.
model = joblib.load(filename='xgboost_custom.joblib')

In [63]:
# Predict on the Qualifire embeddings and print the metrics.
y_pred = model.predict(X_emb)
score(y_test=y_qualifire, y_pred=y_pred)

accuracy 0.702
precision 0.6413103831204886
recall 0.5777888944472236
f1 score 0.6078947368421053


## Custom SVM

In [None]:
# Load the serialized model for this section from the working directory.
# joblib.load unpickles the file, so only load model artefacts you trust.
# NOTE(review): this cell must run before the evaluation cell below, otherwise
# `model` still refers to the model loaded in the previous section. Confirm
# with a Restart & Run All.
model = joblib.load(filename='custom.joblib')

In [65]:
# Reuses `X_emb` from the "Custom XGBoost" section (embeddings of the
# Qualifire prompts), so that section must have been run first.
y_pred = model.predict(X_emb)
score(y_test=y_qualifire, y_pred=y_pred)

accuracy 0.7388
precision 0.6431226765799256
recall 0.7788894447223612
f1 score 0.7045248868778281
