# Spam/Jailbreak Classification

**Fine-Tuning a Base BERT Model**

---

## Dependencies

### Modules

In [None]:
%pip install fastai

In [None]:
%pip install torch

In [None]:
%pip install --upgrade transformers

In [None]:
%pip install datasets

In [None]:
%pip install tokenizers

In [None]:
%pip install scikit-learn

In [None]:
%pip install matplotlib

In [None]:
%pip install spacy

In [None]:
%pip install evaluate

In [None]:
%pip install accelerate

In [None]:
%pip install kaggle

### Imports

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
from datasets import Dataset, DatasetDict
import numpy as np
import evaluate
import os
from sklearn.metrics import precision_recall_fscore_support

---

## Data

190K+ Spam | Ham Email Dataset for Classification: https://www.kaggle.com/datasets/meruvulikith/190k-spam-ham-email-dataset-for-classification

Emails for spam or ham classification (Trec 2007): https://www.kaggle.com/datasets/bayes2003/emails-for-spam-or-ham-classification-trec-2007?select=email_text.csv

### Filtering

In [5]:
# Make sure the download folder exists and report whether it had to be created.
directory = "data"
try:
    os.mkdir(directory)
except FileExistsError:
    print(f"Directory '{directory}' already exists.")
else:
    print(f"Directory '{directory}' created successfully.")

Directory 'data' already exists.


Download the two datasets into the `data` folder.

In [14]:
!kaggle datasets download -d meruvulikith/190k-spam-ham-email-dataset-for-classification -p ./data --unzip

Dataset URL: https://www.kaggle.com/datasets/meruvulikith/190k-spam-ham-email-dataset-for-classification
License(s): MIT
Downloading 190k-spam-ham-email-dataset-for-classification.zip to ./data
  0%|                                                | 0.00/107M [00:00<?, ?B/s]
100%|████████████████████████████████████████| 107M/107M [00:00<00:00, 3.55GB/s]


In [None]:
!kaggle datasets download -d bayes2003/emails-for-spam-or-ham-classification-trec-2007 -p ./data --unzip


Dataset URL: https://www.kaggle.com/datasets/bayes2003/emails-for-spam-or-ham-classification-trec-2007
License(s): ODbL-1.0
Downloading emails-for-spam-or-ham-classification-trec-2007.zip to ./data
 57%|██████████████████████▊                 | 276M/483M [00:00<00:00, 2.88GB/s]
100%|████████████████████████████████████████| 483M/483M [00:00<00:00, 2.97GB/s]


Delete the downloaded files that are not needed.

In [None]:
# Remove downloaded files that the rest of the notebook does not read.
# The existence check keeps this cell safe to re-run (e.g. Restart & Run All)
# once the files are already gone; a bare os.remove would raise
# FileNotFoundError in that case.
for leftover in ("data/email_origin.csv", "data/trec07p.tgz"):
    if os.path.exists(leftover):
        os.remove(leftover)

In [19]:
# email_text.csv is the file selected in the TREC 2007 Kaggle link listed in the Data section.
df_a = pd.read_csv("data/email_text.csv")
# presumably the CSV extracted from the 190K Spam/Ham dataset — TODO confirm
df_b = pd.read_csv("data/spam_Emails_data.csv")

In [20]:
# Preview the first frame (pandas truncates the display of long frames).
df_a

Unnamed: 0,label,text
0,1,do you feel the pressure to perform and not ri...
1,0,hi i've just updated from the gulus and i chec...
2,1,mega authenticv i a g r a discount pricec i a ...
3,1,hey billy it was really fun going out the othe...
4,1,system of the home it will have the capabiliti...
...,...,...
53663,1,versuchen sie unser produkt und sie werden fuh...
53664,1,while we may have high expectations of our ass...
53665,0,for those who are interested i just cook a lit...
53666,0,hello as i wrote i call sqlfetch channel t stu...


In [21]:
# Encode the string labels as integers (Spam -> 1, Ham -> 0).
# Values that are not "Spam"/"Ham" pass through unchanged, so re-running this
# cell on the already-encoded column is a no-op; a plain `.map(dict)` would
# turn every label into NaN on the second run.
label_to_id = {"Spam": 1, "Ham": 0}
df_b['label'] = df_b['label'].map(lambda raw_label: label_to_id.get(raw_label, raw_label))

# Fail loudly if the column holds anything besides the two expected classes.
assert df_b['label'].isin([0, 1]).all(), "df_b['label'] contains values other than Spam/Ham"

In [22]:
# Sanity check: only the integer codes 1 and 0 should remain after the mapping.
df_b['label'].unique()

array([1, 0])

In [23]:
# Stack both sources, keeping only the two shared columns, under a fresh 0..n-1 index.
label_text_cols = ['label', 'text']
merged = pd.concat(
    [df_a[label_text_cols], df_b[label_text_cols]],
    ignore_index=True,
)
merged

Unnamed: 0,label,text
0,1,do you feel the pressure to perform and not ri...
1,0,hi i've just updated from the gulus and i chec...
2,1,mega authenticv i a g r a discount pricec i a ...
3,1,hey billy it was really fun going out the othe...
4,1,system of the home it will have the capabiliti...
...,...,...
247515,0,on escapenumber escapenumber escapenumber rob ...
247516,1,we have everything you need escapelong cialesc...
247517,0,hi quick question say i have a date variable i...
247518,1,thank you for your loan request which we recie...


In [None]:
# Class balance of the combined corpus.
merged['label'].value_counts()

In [None]:
# Distinct label values present after merging both sources.
merged['label'].unique()

In [None]:
# Number of distinct text values; compare with the row count to see how many duplicates exist.
merged['text'].unique().shape

In [None]:
# Drop rows that repeat an earlier row in every column, then renumber the index.
merged = merged.drop_duplicates().reset_index(drop=True)

In [None]:
# Remove rows without text, then keep only the first row of every distinct text.
merged = (
    merged
    .dropna(subset=['text'])
    .reset_index(drop=True)
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [None]:
# Persist the cleaned corpus; the index is not written to the file.
merged.to_csv("data/merged_spam_ham.csv", index=False)

### Splitting

In [None]:
# Fixed seed so the train/val/test partition is identical on every run. Without
# it each execution reshuffles the rows, and the saved CSVs and every metric
# reported further down are not reproducible.
SPLIT_SEED = 42

# 90% train+val / 10% test, stratified on the label.
train_val, test = train_test_split(
    merged,
    train_size=0.9,
    stratify=merged['label'],
    shuffle=True,
    random_state=SPLIT_SEED,
)
# 80% / 20% of the remaining 90%, again stratified on the label.
train, val = train_test_split(
    train_val,
    train_size=0.8,
    stratify=train_val['label'],
    shuffle=True,
    random_state=SPLIT_SEED,
)

train = train.reset_index(drop=True) #72%
val = val.reset_index(drop=True) #18%
test = test.reset_index(drop=True) #10%


In [None]:
# Make sure the folder for the split CSVs exists and report whether it had to be created.
directory = "filtered_data"
try:
    os.mkdir(directory)
except FileExistsError:
    print(f"Directory '{directory}' already exists.")
else:
    print(f"Directory '{directory}' created successfully.")

In [None]:
# Write each split to its own CSV (index omitted), in train / val / test order.
split_targets = {
    "filtered_data/spam_ham_train.csv": train,
    "filtered_data/spam_ham_val.csv": val,
    "filtered_data/spam_ham_test.csv": test,
}
for csv_path, split_frame in split_targets.items():
    split_frame.to_csv(csv_path, index=False)

---

In [None]:
# Reload the three splits from disk so the notebook can be resumed from this point.
train, val, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        "filtered_data/spam_ham_train.csv",
        "filtered_data/spam_ham_val.csv",
        "filtered_data/spam_ham_test.csv",
    )
]

## Classifier

In [None]:
model_name = "bert-base-uncased" #test cased/uncased
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store human-readable class names in the model config (0 = ham, 1 = spam, the
# same encoding used for the `label` column). They are saved with every
# checkpoint, so a text-classification pipeline built from one reports
# "spam"/"ham" instead of the generic "LABEL_0"/"LABEL_1".
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
#freeze all parameters (weights)
for name, param in model.base_model.named_parameters():
    param.requires_grad = False
#unfreezzing pooler layer's parameters, 
#fine-tuning during training while keeping the rest of the model fixed.
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [None]:
# Plain aliases of the full splits: nothing is subsampled in this cell.
reduced_train = train
reduced_val = val
reduced_test = test

In [None]:
# Convert each pandas split into a `datasets.Dataset`.
train_ds = Dataset.from_pandas(reduced_train)
val_ds = Dataset.from_pandas(reduced_val)
test_ds = Dataset.from_pandas(reduced_test)

In [None]:
# Group the three splits under the keys "train", "val" and "test".
dataset_dict = DatasetDict({"train": train_ds, "val": val_ds, "test": test_ds})

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch with the notebook's tokenizer.

    Truncation is enabled; no padding is requested here.

    Args:
        examples: mapping that provides the raw strings under the key "text".

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize every split in batches with preprocess_function (which does not pad).
tokenized_data = dataset_dict.map(preprocess_function, batched=True)
# Collator built around the same tokenizer; presumably pads each batch at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric loaded through the `evaluate` library; compute_metrics calls accuracy.compute(...).
accuracy = evaluate.load("accuracy")

Precision measures how many of the emails predicted as spam (positive class) are actually spam. In other words, it tells us how accurate the model's predictions for spam are.

$Precision= \frac{TP} {TP + FP}$

Recall measures how many of the actual spam emails are correctly identified by the model as spam. It tells us how well the model is able to catch spam emails.

$Recall= \frac{TP} {TP + FN}$

The F1 score is the harmonic mean of precision and recall. It provides a balance between precision and recall, especially when there's a trade-off between the two metrics.

$F_1 = 2 \cdot \frac{Precision \cdot Recall} {Precision + Recall}$

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass: accuracy plus precision/recall/F1 for class 1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each row is
            the arg-max of its logits along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    # Binary averaging with pos_label=1: the three scores describe class 1 only.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, pos_label=1, average='binary'
    )
    acc = accuracy.compute(predictions=predicted, references=gold)["accuracy"]

    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f_score}

In [None]:
# Hyper-parameters of the main training run.
lr = 1e-4
batch_sz = 32
epoch = 3
wd = 0.01

training_args = TrainingArguments(
    # checkpoints are written below this folder
    output_dir="bert-spam-ham-classifier-full_dataset",
    per_device_train_batch_size=batch_sz,
    per_device_eval_batch_size=batch_sz,
    num_train_epochs=epoch,
    # evaluate, log and save a checkpoint once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # reload the best checkpoint once training has finished
    load_best_model_at_end=True,

    learning_rate= lr,
    weight_decay= wd,
    # no external experiment tracker
    report_to="none"
)

In [None]:
# `processing_class` is the current name of the Trainer argument that receives
# the tokenizer. The older `tokenizer=` keyword is deprecated and is rejected
# by the newest transformers releases, which this notebook installs with
# `--upgrade`. The Trainer also saves the object passed here with each
# checkpoint, which is what lets the tokenizer be reloaded from a checkpoint folder.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

In [None]:
!pip3 install tqdm==4.62.1

### Testing

In [None]:
# Run the trained model on the held-out test split.
test_results = trainer.predict(tokenized_data["test"])

In [None]:
# Metrics of the test-split prediction (the sweep cells read "test_loss" and "test_accuracy" from here).
test_results.metrics

Choose the path of the checkpoint where the trained model was saved.

In [27]:
# NOTE(review): hard-coded checkpoint folder; the step number in its name
# presumably belongs to one particular training run — confirm the folder exists
# (or update the path) before running the cells below.
model_testing_path = "bert-spam-ham-classifier-full_dataset/checkpoint-13086"

In [None]:
# Load the fine-tuned model and its tokenizer from the same checkpoint folder.
model_test = AutoModelForSequenceClassification.from_pretrained(model_testing_path)
tokenizer_testing = AutoTokenizer.from_pretrained(model_testing_path)

In [None]:
# Wrap the loaded model and tokenizer in a text-classification pipeline for ad-hoc predictions.
clf = pipeline("text-classification", model=model_test, tokenizer=tokenizer_testing)

In [None]:
# Smoke test on a short, hand-written message.
text = "Your session expired. Click here to sign in again."
prediction = clf(text)

prediction

In [None]:
# Adjacent string literals are joined by the parser, and each line ends with an
# explicit "\n" so the sentences stay separated. A trailing backslash inside a
# single literal drops the line break without inserting anything, which glued
# the sentences together (e.g. "away.To claim").
text = (
    "Congratulations! You are the lucky winner of a $1,000 gift card to Amazon! This incredible prize is just a click away.\n"
    "To claim your prize, all you need to do is follow the instructions below:\n"
    "Click on the link to claim your prize: [Claim My $1,000 Gift Card]\n"
    "Fill out your details on the next page.\n"
    "Complete a quick survey to verify your information.\n"
    "But hurry! This offer is available for a limited time only. You must claim your prize within the next 24 hours to avoid missing out.\n"
    "If you have any questions, feel free to reach out to our support team. Don’t miss out on this exclusive opportunity!\n"
    "Warm regards,\n"
    "Your Amazon Prize Team"
)
prediction = clf(text)

prediction

In [None]:
# Adjacent string literals are joined by the parser, and each line ends with an
# explicit "\n" so the sentences stay separated. The original single literal
# used trailing backslashes (which glue lines together without a separator) and
# the invalid escape "\ " on its last line, which left a stray backslash in the text.
text = (
    "Dear [Recipient],\n"
    "We hope you’re enjoying the benefits of your Premium Subscription with us. This is a quick reminder that your subscription will automatically renew in the next 7 days.\n"
    "To ensure uninterrupted service, the renewal payment of $49.99 will be processed on [Renewal Date]. If you’d like to update your payment information or make any changes to your subscription, please visit your account page here:\n"
    "[Update My Subscription]\n"
    "If you no longer wish to continue with your subscription, you can cancel it at any time before the renewal date, and no further charges will apply. Simply go to your account settings, and follow the instructions to cancel.\n"
    "If you have any questions or need assistance, our support team is here to help. Feel free to contact us at [support@company.com].\n"
    "Thank you for choosing [Company Name], and we look forward to continuing to serve you!\n"
    "Best regards,\n"
    "[Company Name] Team Customer Support"
)
prediction = clf(text)

prediction

#### Testing learning rate

In [None]:
results_lr = []

batch_sz = 32
epoch = 3
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5, 1e-4]


def init_lr_sweep_model():
    """Return a freshly loaded classifier whose encoder is frozen except for the pooler."""
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    for param_name, weight in fresh_model.base_model.named_parameters():
        weight.requires_grad = "pooler" in param_name
    return fresh_model


for lr in learning_rates:
    print(f"Testing learning rate: {lr}")

    training_args = TrainingArguments(
        # One folder per run, so checkpoints of different runs do not overwrite each other.
        output_dir=f"bert-spam-ham-classifier-testing-learning-rate/lr-{lr}",
        per_device_train_batch_size=batch_sz,
        per_device_eval_batch_size=batch_sz,
        num_train_epochs=epoch,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,

        learning_rate= lr,
        report_to="none"
    )

    trainer = Trainer(
        # `model_init` makes every run start from a new model instance. Passing the
        # shared `model` object instead would keep training the weights left behind
        # by the previous run, so the learning rates would not be comparable.
        model_init=init_lr_sweep_model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["val"],
        # current name of the deprecated `tokenizer=` keyword
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # Rank the candidates on the validation split (the trainer's eval_dataset);
    # choosing hyper-parameters on the test split would bias the final test score.
    val_metrics = trainer.evaluate()

    results_lr.append({"learning_rate": lr,
                       "loss": val_metrics["eval_loss"],
                       "accuracy": val_metrics["eval_accuracy"]})



print("----------------------")
results_lr = sorted(results_lr, key=lambda x: x["accuracy"], reverse=True)
print(results_lr)
print("----------------------")

#### Testing weight decay

In [None]:
results_wd = []
lr = 1e-4
batch_sz = 32
epoch = 3
weight_decay_values = [0.0, 0.01, 0.05, 0.1, 0.2]


def init_wd_sweep_model():
    """Return a freshly loaded classifier whose encoder is frozen except for the pooler."""
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    for param_name, weight in fresh_model.base_model.named_parameters():
        weight.requires_grad = "pooler" in param_name
    return fresh_model


for wd in weight_decay_values:
    print(f"Testing weight decay: {wd}")

    training_args = TrainingArguments(
        # Own folder for this sweep (the old name was copied from the learning-rate
        # sweep) and one sub-folder per run, so checkpoints never overwrite each other.
        output_dir=f"bert-spam-ham-classifier-testing-weight-decay/wd-{wd}",
        per_device_train_batch_size=batch_sz,
        per_device_eval_batch_size=batch_sz,
        num_train_epochs=epoch,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,

        learning_rate= lr,
        weight_decay=wd,
        report_to="none"
    )

    trainer = Trainer(
        # `model_init` makes every run start from a new model instance. Passing the
        # shared `model` object instead would keep training the weights left behind
        # by the previous run, so the weight-decay values would not be comparable.
        model_init=init_wd_sweep_model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["val"],
        # current name of the deprecated `tokenizer=` keyword
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # Rank the candidates on the validation split (the trainer's eval_dataset);
    # choosing hyper-parameters on the test split would bias the final test score.
    val_metrics = trainer.evaluate()

    results_wd.append({"weight_decay": wd,
                       "loss": val_metrics["eval_loss"],
                       "accuracy": val_metrics["eval_accuracy"]})

In [None]:
# Show the weight-decay runs from best to worst accuracy, one run per line.
divider = "----------------------"
print(divider)
results_wd = sorted(results_wd, key=lambda run: run["accuracy"], reverse=True)
for run_summary in results_wd:
    print(run_summary)
print(divider)

---