In [1]:
import sys
import pathlib
import joblib

import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Set cwd

In [2]:
root = (
    pathlib.Path().resolve().parent
)  # or parent.parent, depending on notebook location
sys.path.insert(0, str(root))

Import app vectorizer

In [3]:
from app.rag.vectorizer import Vectorizer

  from .autonotebook import tqdm as notebook_tqdm


## Loading the data

We are combining two datasets:
- Qualifire `prompt-infections-benchmark` dataset (https://huggingface.co/datasets/qualifire/prompt-injections-benchmark) translated to french
- Jayavibhav `prompt-injection-safety` dataset (https://huggingface.co/datasets/jayavibhav/prompt-injection-safety) for english

In [4]:
df_fr = pd.read_csv("prompt-injections-benchmark-fr.csv")
df_fr["language"] = "fr"
df_fr.shape

(5000, 3)

In [5]:
splits = {
    "train": "data/train-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
}
df_en = pd.read_parquet(
    "hf://datasets/jayavibhav/prompt-injection-safety/" + splits["train"]
)
df_en = df_en[df_en["label"] != 2]
df_en["language"] = "en"

# Sample 5000 rows
sample_size = min(5000, len(df_en))
df_en = df_en.sample(n=sample_size, random_state=1).reset_index(drop=True)

df_en.shape

(5000, 3)

In [189]:
print("df_fr", df_fr.columns)
print("df_en", df_en.columns)

df_fr Index(['text', 'label', 'language'], dtype='object')
df_en Index(['text', 'label', 'language'], dtype='object')


Uniformize and concat into `df`

In [6]:
df = pd.concat([df_fr, df_en], ignore_index=True)
df["label"] = df["label"].replace({"benign": 0, "jailbreak": 1})
df["label"].value_counts(normalize=True)

  df['label'] = df['label'].replace({'benign': 0, 'jailbreak': 1})


label
0    0.5612
1    0.4388
Name: proportion, dtype: float64

Keep a ratio of 80% safe prompt and 20% injections

In [None]:
# target_ratio = 0.20

# mask_pos = df['label'] == 1
# n_pos = mask_pos.sum()
# n_neg = len(df) - n_pos

# # number of positive rows needed
# target_pos = int((target_ratio / (1 - target_ratio)) * n_neg)
# target_pos = min(target_pos, n_pos)  # can’t keep more than we have

# n_drop = n_pos - target_pos

# drop_idx = df.loc[mask_pos].sample(n=n_drop, random_state=0).index
# df = df.drop(index=drop_idx)

In [None]:
print(df.shape)
print(df["label"].value_counts(normalize=True))

(7015, 3)
label
0    0.8
1    0.2
Name: proportion, dtype: float64


Train / Test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=1, stratify=df["label"]
)

## Vectorizing

Vectorize text using app Vectorizer class and multi-language `paraphrase-multilingual-MiniLM-L12-v2` model

In [8]:
vectorizer = Vectorizer(model_name="paraphrase-multilingual-MiniLM-L12-v2")
X_train_emb = vectorizer.generate_embeddings(X_train.to_list())
X_test_emb = vectorizer.generate_embeddings(X_test.to_list())

## Scoring function

In [9]:
def score(y_test: np.ndarray, y_pred: np.ndarray) -> None:
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("accuracy", accuracy)
    print("precision", precision)
    print("recall", recall)
    print("f1 score", f1)

## Logistic regression

In [10]:
clf = LogisticRegression(random_state=1).fit(X_train_emb, y_train)

Evaluate model on test

In [11]:
y_pred = clf.predict(X_test_emb)

In [12]:
score(y_test, y_pred)

accuracy 0.83
precision 0.8008948545861297
recall 0.8154897494305239
f1 score 0.8081264108352144


## Random forest

In [13]:
clf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=1)
clf.fit(X_train_emb, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [14]:
y_pred = clf.predict(X_test_emb)
score(y_test, y_pred)

accuracy 0.7755
precision 0.7926330150068213
recall 0.6617312072892938
f1 score 0.7212911235257604


#### Grid Search

In [111]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

In [None]:
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring="f1",  # change to 'accuracy' or others if you prefer
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_emb, y_train)

In [None]:
best_model = grid_search.best_estimator_
best_model

In [175]:
y_pred = best_model.predict(X_test_emb)

In [None]:
score(y_test, y_pred)

## XGBoost

In [15]:
clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    max_features="sqrt",
    random_state=1,
)

clf.fit(X_train_emb, y_train)

0,1,2
,"loss  loss: {'log_loss', 'exponential'}, default='log_loss' The loss function to be optimized. 'log_loss' refers to binomial and multinomial deviance, the same as used in logistic regression. It is a good choice for classification with probabilistic outputs. For loss 'exponential', gradient boosting recovers the AdaBoost algorithm.",'log_loss'
,"learning_rate  learning_rate: float, default=0.1 Learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators. Values must be in the range `[0.0, inf)`. For an example of the effects of this parameter and its interaction with ``subsample``, see :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_regularization.py`.",0.05
,"n_estimators  n_estimators: int, default=100 The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance. Values must be in the range `[1, inf)`.",200
,"subsample  subsample: float, default=1.0 The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. `subsample` interacts with the parameter `n_estimators`. Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias. Values must be in the range `(0.0, 1.0]`.",0.8
,"criterion  criterion: {'friedman_mse', 'squared_error'}, default='friedman_mse' The function to measure the quality of a split. Supported criteria are 'friedman_mse' for the mean squared error with improvement score by Friedman, 'squared_error' for mean squared error. The default value of 'friedman_mse' is generally the best as it can provide a better approximation in some cases. .. versionadded:: 0.18",'friedman_mse'
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, values must be in the range `[2, inf)`. - If float, values must be in the range `(0.0, 1.0]` and `min_samples_split`  will be `ceil(min_samples_split * n_samples)`. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, values must be in the range `[1, inf)`. - If float, values must be in the range `(0.0, 1.0)` and `min_samples_leaf`  will be `ceil(min_samples_leaf * n_samples)`. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided. Values must be in the range `[0.0, 0.5]`.",0.0
,"max_depth  max_depth: int or None, default=3 Maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. If int, values must be in the range `[1, inf)`.",3
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. Values must be in the range `[0.0, inf)`. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [16]:
y_pred = clf.predict(X_test_emb)
score(y_test, y_pred)

accuracy 0.775
precision 0.7431818181818182
recall 0.744874715261959
f1 score 0.7440273037542662


#### Grid Search

In [184]:
clf = GradientBoostingClassifier(random_state=1)

In [185]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "subsample": [0.6, 0.8, 1.0],
    "max_features": [None, "sqrt"],
}

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring="f1",  # change to 'accuracy', 'precision', etc., as needed
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
grid_search.fit(X_train_emb, y_train)

## SVM

In [17]:
clf = SVC(kernel="rbf", C=1.0, probability=False)

clf.fit(X_train_emb, y_train)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'rbf'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [18]:
y_pred = clf.predict(X_test_emb)

In [19]:
score(y_test, y_pred)

accuracy 0.86
precision 0.8314855875831486
recall 0.8542141230068337
f1 score 0.8426966292134831


In [24]:
joblib.dump(clf, "svc_model_multi.joblib")

['svc_model_multi.joblib']

#### Grid Search

In [20]:
pipeline = Pipeline([("scaler", StandardScaler()), ("svm", SVC())])

In [21]:
param_grid = {
    "svm__kernel": ["rbf", "linear"],
    "svm__C": [0.1, 1, 10],
    "svm__gamma": ["scale", "auto"],
}

In [22]:
grid_search = GridSearchCV(
    estimator=pipeline, param_grid=param_grid, cv=5, scoring="f1", n_jobs=-1, verbose=2
)

In [23]:
grid_search.fit(X_train_emb, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...svm__C=0.1, svm__gamma=scale, svm__kernel=linear; total time=  14.2s
[CV] END ...svm__C=0.1, svm__gamma=scale, svm__kernel=linear; total time=  14.0s
[CV] END ...svm__C=0.1, svm__gamma=scale, svm__kernel=linear; total time=  14.2s
[CV] END ...svm__C=0.1, svm__gamma=scale, svm__kernel=linear; total time=  14.5s
[CV] END ...svm__C=0.1, svm__gamma=scale, svm__kernel=linear; total time=  14.4s
[CV] END ......svm__C=0.1, svm__gamma=scale, svm__kernel=rbf; total time=  14.9s
[CV] END ......svm__C=0.1, svm__gamma=scale, svm__kernel=rbf; total time=  15.0s
[CV] END ......svm__C=0.1, svm__gamma=scale, svm__kernel=rbf; total time=  14.9s
[CV] END ......svm__C=0.1, svm__gamma=scale, svm__kernel=rbf; total time=  15.0s
[CV] END ......svm__C=0.1, svm__gamma=scale, svm__kernel=rbf; total time=  15.0s
[CV] END ....svm__C=0.1, svm__gamma=auto, svm__kernel=linear; total time=  13.3s
[CV] END ....svm__C=0.1, svm__gamma=auto, svm__k

KeyboardInterrupt: 

In [None]:
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_model

In [151]:
y_pred = best_model.predict(X_test_emb)
score(y_test, y_pred)

accuracy 0.8715
precision 0.8648960739030023
recall 0.84251968503937
f1 score 0.8535612535612536


#### Improve precision to minimize false positive

In [182]:
updated_model = clone(best_model)
updated_model.named_steps["svm"].set_params(class_weight={0: 10, 1: 1})
updated_model.fit(X_train_emb, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...), ('svm', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",10
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'rbf'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.","{0: 10, 1: 1}"


In [183]:
y_pred = updated_model.predict(X_test_emb)
score(y_test, y_pred)

accuracy 0.869
precision 0.8556581986143187
recall 0.8439635535307517
f1 score 0.8497706422018348


## Conclusion

The best model is a SVM with a f1 score of 85%. The precision is at 86%, so some false positive can be predicted