# Spam/Jailbreak Classification

---

## Dependencies

### Modules

In [None]:
%pip install fastai

In [None]:
%pip install torch

In [None]:
%pip install transformers

In [None]:
%pip install datasets

In [None]:
%pip install tokenizers

In [None]:
%pip install scikit-learn

In [None]:
%pip install matplotlib

In [None]:
%pip install spacy

In [None]:
%pip install evaluate

In [None]:
%pip install accelerate

### Imports

In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from fastai.text.all import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import numpy as np
import evaluate
import os

---

## Data

190K+ Spam | Ham Email Dataset for Classification: https://www.kaggle.com/datasets/meruvulikith/190k-spam-ham-email-dataset-for-classification

Emails for spam or ham classification (Trec 2007): https://www.kaggle.com/datasets/bayes2003/emails-for-spam-or-ham-classification-trec-2007?select=email_text.csv

### Filtering

In [None]:
# Make sure the folder that holds the raw/merged CSV files exists.
directory = "data"
try:
    os.mkdir(directory)
except FileExistsError:
    # A previous run already created it; nothing to do.
    print(f"Directory '{directory}' already exists.")
else:
    print(f"Directory '{directory}' created successfully.")

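Download the two datasets linked above into the `data` folder. (On Kaggle, attach both datasets to the notebook instead; they are then read from `/kaggle/input`.)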

In [None]:
def _first_existing_path(*candidates):
    """Return the first candidate path that exists on disk.

    When none of the candidates exists the first one is returned unchanged, so
    the subsequent ``pd.read_csv`` call fails with an error naming that path.
    """
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return candidates[0]


# The Kaggle input mount is tried first; outside Kaggle the files are expected
# in the local "data" folder, so no lines need to be (un)commented by hand.
df_a = pd.read_csv(_first_existing_path(
    "/kaggle/input/emails-for-spam-or-ham-classification-trec-2007/email_text.csv",
    "data/email_text.csv",
))
df_b = pd.read_csv(_first_existing_path(
    "/kaggle/input/190k-spam-ham-email-dataset-for-classification/spam_Emails_data.csv",
    "data/spam_Emails_data.csv",
))

In [None]:
# Encode df_b's string labels as integers: "Spam" -> 1, "Ham" -> 0.
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe all
# labels. The dtype guard makes the cell safe to re-run.
# NOTE(review): presumably df_a already uses the same 1/0 encoding - confirm.
if not pd.api.types.is_numeric_dtype(df_b['label']):
    df_b['label'] = df_b['label'].map({"Spam": 1, "Ham": 0})

In [None]:
df_b['label'].unique()

In [None]:
# Stack the two sources on their shared columns and renumber the rows.
merged = pd.concat(
    [frame[['label', 'text']] for frame in (df_a, df_b)],
    ignore_index=True,
)
merged

In [None]:
merged['label'].unique()

In [None]:
merged['text'].unique().shape

In [None]:
merged = merged.drop_duplicates().reset_index(drop=True)

In [None]:
# Drop rows without text, then keep only the first occurrence of each text.
merged = (
    merged
    .dropna(subset=['text'])
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [None]:
merged.to_csv("data/merged_spam_ham.csv", index=False)

### Splitting

In [61]:
# Hold out 10% of the rows as the test set, stratified on the label.
train_val, test = train_test_split(
    merged, train_size=0.9, random_state=42, shuffle=True, stratify=merged['label']
)
# Split the remaining 90% into 80% train / 20% validation, again stratified.
train, val = train_test_split(
    train_val, train_size=0.8, random_state=42, shuffle=True, stratify=train_val['label']
)

# Resulting shares of the full data: train 72%, val 18%, test 10%.
# Renumber each split from zero after shuffling.
train, val, test = (part.reset_index(drop=True) for part in (train, val, test))


In [None]:
# Make sure the folder that receives the train/val/test CSV files exists.
directory = "filtered_data"
try:
    os.mkdir(directory)
except FileExistsError:
    # A previous run already created it; nothing to do.
    print(f"Directory '{directory}' already exists.")
else:
    print(f"Directory '{directory}' created successfully.")

In [None]:
# Persist every split (without the index column) so later sections can start from disk.
for csv_path, split_frame in (
    ("filtered_data/spam_ham_train.csv", train),
    ("filtered_data/spam_ham_val.csv", val),
    ("filtered_data/spam_ham_test.csv", test),
):
    split_frame.to_csv(csv_path, index=False)

---

In [None]:
# Reload the saved splits so the classifier section can run without redoing the split.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "filtered_data/spam_ham_train.csv",
        "filtered_data/spam_ham_val.csv",
        "filtered_data/spam_ham_test.csv",
    )
)

## Classifier

In [None]:
# Checkpoint used for both the tokenizer and the classifier.
# TODO: compare the cased and uncased variants.
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Two output labels: spam (1) and ham (0).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [None]:
# Freeze the base model except for its pooler parameters. Parameters that live
# outside `model.base_model` are not touched by this loop.
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

In [64]:
# Work on fixed-seed subsamples to keep training time manageable.
# DataFrame.sample raises a ValueError when n exceeds the number of rows
# (sampling without replacement), so every request is capped at the split size.
reduced_train = train.sample(n=min(20000, len(train)), random_state=42)
reduced_val = val.sample(n=min(20000, len(val)), random_state=42)
reduced_test = test.sample(n=min(19000, len(test)), random_state=42)

In [65]:
# Convert each sampled DataFrame into a Hugging Face Dataset.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame)
    for frame in (reduced_train, reduced_val, reduced_test)
)

In [66]:
dataset_dict = DatasetDict({"train": train_ds, "val": val_ds, "test": test_ds})

In [67]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch with truncation enabled.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# The collator pads each batch when it is assembled, so the tokenization step
# below only truncates and does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize all three splits, processing the examples in batches.
tokenized_data = dataset_dict.map(
    preprocess_function,
    batched=True,
)

In [69]:
accuracy = evaluate.load("accuracy")

In [70]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of ``logits`` along axis 1.

    Returns:
        ``{"accuracy": value}`` as reported by the module-level ``accuracy`` metric.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    scores = accuracy.compute(predictions=predicted_classes, references=labels)
    return {"accuracy": scores["accuracy"]}

In [74]:
# Hyperparameters of the baseline run.
lr, wd = 1e-4, 0.01
batch_sz, epoch = 32, 3

training_args = TrainingArguments(
    output_dir="bert-spam-ham-classifier-20000",
    learning_rate=lr,
    weight_decay=wd,
    num_train_epochs=epoch,
    per_device_train_batch_size=batch_sz,
    per_device_eval_batch_size=batch_sz,
    # Evaluate, log and checkpoint on the same per-epoch schedule.
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

In [75]:
# Baseline trainer: fits on the tokenized train split and evaluates on the
# validation split with the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    # `tokenizer=` is deprecated for Trainer and triggers a FutureWarning;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [76]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2311,0.189207,0.9269
2,0.2172,0.180918,0.9289
3,0.2099,0.177861,0.92965


TrainOutput(global_step=1875, training_loss=0.21939038899739582, metrics={'train_runtime': 1942.2148, 'train_samples_per_second': 30.893, 'train_steps_per_second': 0.965, 'total_flos': 1.578460776648e+16, 'train_loss': 0.21939038899739582, 'epoch': 3.0})

In [None]:
!pip3 install tqdm==4.62.1

### Testing

In [77]:
test_results = trainer.predict(tokenized_data["test"])

In [78]:
test_results.metrics

{'test_loss': 0.18524383008480072,
 'test_accuracy': 0.9298421052631579,
 'test_runtime': 298.5916,
 'test_samples_per_second': 63.632,
 'test_steps_per_second': 1.989}

### Testing learning rate

In [None]:
def _init_model_for_lr_sweep():
    """Load a fresh classifier whose base model is frozen except for the pooler.

    Passed to ``Trainer(model_init=...)`` so that every learning rate starts
    from the same pretrained weights instead of continuing from the weights
    left behind by the previous run.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    for param_name, param in fresh_model.base_model.named_parameters():
        param.requires_grad = "pooler" in param_name
    return fresh_model


results = []

batch_sz = 32
epoch = 3
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5, 1e-4]
for lr in learning_rates:
    print(f"Testing learning rate: {lr}")

    training_args = TrainingArguments(
        # One sub-folder per run so checkpoints of different runs do not overwrite each other.
        output_dir=f"bert-spam-ham-classifier-testing-learning-rate/lr-{lr}",
        per_device_train_batch_size=batch_sz,
        per_device_eval_batch_size=batch_sz,
        num_train_epochs=epoch,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,

        learning_rate=lr,
        report_to="none"
    )

    trainer = Trainer(
        # A new model per run keeps the learning rates comparable.
        model_init=_init_model_for_lr_sweep,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["val"],
        # `processing_class=` replaces the deprecated `tokenizer=` argument.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    test_results = trainer.predict(tokenized_data["test"])

    results.append({"learning_rate": lr,
                    "loss": test_results.metrics["test_loss"],
                    "accuracy": test_results.metrics["test_accuracy"]})


# NOTE(review): the ranking below uses test-set accuracy; choosing a
# hyperparameter from it leaks the test set into model selection. Consider
# ranking by validation accuracy instead.
print("----------------------")
results = sorted(results, key=lambda x: x["accuracy"], reverse=True)
print(results)
print("----------------------")

### Testing weight decay

In [None]:
def _init_model_for_wd_sweep():
    """Load a fresh classifier whose base model is frozen except for the pooler.

    Passed to ``Trainer(model_init=...)`` so that every weight-decay value
    starts from the same pretrained weights instead of continuing from the
    weights left behind by the previous run.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    for param_name, param in fresh_model.base_model.named_parameters():
        param.requires_grad = "pooler" in param_name
    return fresh_model


results = []
lr = 1e-4
batch_sz = 32
epoch = 3
weight_decay_values = [0.0, 0.01, 0.05, 0.1, 0.2]
for wd in weight_decay_values:
    print(f"Testing weight decay: {wd}")

    training_args = TrainingArguments(
        # Own folder for this sweep (the original reused the learning-rate
        # folder) with one sub-folder per run.
        output_dir=f"bert-spam-ham-classifier-testing-weight-decay/wd-{wd}",
        per_device_train_batch_size=batch_sz,
        per_device_eval_batch_size=batch_sz,
        num_train_epochs=epoch,
        eval_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,

        learning_rate=lr,
        weight_decay=wd,
        report_to="none"
    )

    trainer = Trainer(
        # A new model per run keeps the weight-decay values comparable.
        model_init=_init_model_for_wd_sweep,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["val"],
        # `processing_class=` replaces the deprecated `tokenizer=` argument.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    test_results = trainer.predict(tokenized_data["test"])

    results.append({"weight_decay": wd,
                    "loss": test_results.metrics["test_loss"],
                    "accuracy": test_results.metrics["test_accuracy"]})

In [None]:
# Show the sweep results, highest accuracy first, one entry per line.
print("----------------------")
results = sorted(
    results,
    key=lambda entry: entry["accuracy"],
    reverse=True,
)
for entry in results:
    print(entry)
print("----------------------")