# Text Analysis Module in D-Fence Paper Code

This Jupyter Notebook implements the text classification module described in the D-Fence paper.

### The text classification module in the D-Fence paper follows these key steps:

1. **Text Extraction**: Extract text from both text/plain and text/html sections.
2. **Language Detection**: Detect the language (only English is processed).
3. **Text Preprocessing**: Remove non-text elements like URLs, email addresses, and special characters.
4. **BERT Tokenization and Embedding**:
    - Tokenize the cleaned text using BERT’s tokenizer.
    - Generate embeddings using a BERT-base model (12 layers, 768 hidden units).
    - Aggregate token embeddings by averaging them.
5. **Classification**: Use RandomForest or XGBoost as the final classification model.

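Below is the Python code implementing this pipeline. It follows the methodology described in the paper, using transformers for BERT embeddings and sklearn/XGBoost for classification. Note that the code shown here reads already-extracted email bodies from a CSV file, so it covers steps 3–5; text extraction and language detection (steps 1–2) are not performed by these cells.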

In [None]:
# Install the packages required by this notebook
%pip install numpy
%pip install torch
%pip install transformers
%pip install scikit-learn
%pip install pandas
%pip install matplotlib
%pip install numpy==2.1.0
%pip install shap
%pip install xgboost
%pip install lime


In [None]:
import re
import numpy as np
import torch
import sklearn
import pandas as pd
import shap
import matplotlib.pyplot as plt
import xgboost as xgb

# Load the BERT model and tokenizer
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
### when you do it for the first time for SpamAssassin ###

# Read the SpamAssassin CSV from disk
file_path = "Data/SpamAssasin.csv"  # location of the dataset file
print("Loading dataset...")
# Keep only the email text ('body') and its class ('label'); rows missing either value are dropped
df = pd.read_csv(file_path).loc[:, ['body', 'label']].dropna()
print(f"Dataset loaded: {len(df)} emails\n")

# Fetch the pretrained BERT tokenizer and encoder
print("Loading BERT model...")
MODEL_NAME = "bert-base-uncased"  # uncased checkpoint: letter case of the input is ignored
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
bert_model = BertModel.from_pretrained(MODEL_NAME)
print("BERT model loaded successfully!\n")

# Function to preprocess text
def preprocess_text(text):
    """Strip non-linguistic noise from an email body and return it lowercased.

    The substitutions run in a fixed order: URLs, digit runs, email
    addresses, every remaining character that is not an ASCII letter, digit
    or whitespace, and finally whitespace runs are collapsed to one space.
    """
    # (pattern, replacement, regex flags) applied top to bottom
    cleanup_steps = (
        (r"http\S+|www\S+|https\S+", "", re.MULTILINE),  # URLs
        (r"\d+", "", 0),  # numbers
        (r"\S+@\S+\.\S+", "", 0),  # email addresses
        (r"[^A-Za-z0-9\s]", "", 0),  # special characters
        (r"\s+", " ", 0),  # whitespace runs -> single space
    )
    cleaned = text
    for pattern, replacement, flags in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip().lower()

# Function to extract BERT embeddings
def extract_bert_embeddings(text, index, total):
    """Return one mean-pooled BERT embedding for a single email.

    Args:
        text: Cleaned email body to embed.
        index: Zero-based position of this email; used only for the progress line.
        total: Total number of emails; used only for the progress line.

    Returns:
        1-D NumPy array holding the average of the last-layer token vectors
        (768 values for the bert-base model).
    """
    print(f"Processing email {index+1}/{total}...", end="\r")

    # A single sequence needs no padding. Padding to max_length would add [PAD]
    # positions whose hidden states get averaged into the result, diluting the
    # embedding of short emails (and wasting compute on 512 tokens every time).
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Last hidden state with the batch dimension removed: (num_tokens × 768)
    hidden_states = outputs.last_hidden_state.squeeze(0)

    # Average across the real tokens to get a single fixed-size vector
    feature_vector = torch.mean(hidden_states, dim=0).numpy()

    return feature_vector

# Apply text preprocessing
print("Preprocessing email content...")
print(f"Selected {len(df)} emails from the dataset.")
df['body'] = df['body'].apply(preprocess_text)
print("Text preprocessing complete!\n")

# Extract embeddings for all emails (one row per email, in DataFrame order)
print("Extracting BERT embeddings for each email...")
features = np.array([extract_bert_embeddings(email, i, len(df)) for i, email in enumerate(df['body'])])
print("\nBERT embeddings extraction complete!\n")

# Save features to a file.
# The file name must match the one loaded by the "already have bert_features.npy" cell
# and the one reported in the message below.
features_path = "bert_features.npy"
np.save(features_path, features)
print(f"Features saved to {features_path}\n")

labels = df['label'].values  # Target labels (0 = benign, 1 = phishing)

# Split into train and test sets
print("Splitting dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
print(f"Training set: {len(X_train)} emails, Test set: {len(X_test)} emails\n")

# Train an XGBoost classifier
print("Training XGBoost classifier...")
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train, y_train)
print("Training complete!\n")

# Evaluate the model
print("Evaluating XGBoost model performance...\n")
y_pred_xgb = xgb_model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred_xgb))

print("\n✅ Phishing detection pipeline using XGBoost complete!")

In [None]:
### when you already have bert_features.npy for SpamAssassin ###

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

# Load the saved BERT features
print("Loading BERT features from file...")
features = np.load("bert_features.npy")

# Reload the labels from the original dataset, filtered the same way as when the
# features were extracted, so that row i of `features` pairs with `labels[i]`
file_path = "Data/SpamAssasin.csv"  # Adjust as needed
df = pd.read_csv(file_path)
df = df[['body', 'label']].dropna()
labels = df['label'].values  # Load labels from the original dataset

# Split into train and test sets
print("Splitting dataset into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
print(f"Training set: {len(X_train)} emails, Test set: {len(X_test)} emails\n")

# Train an XGBoost classifier
print("Training XGBoost classifier...")
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train, y_train)
print("Training complete!\n")

# Evaluate the model
print("Evaluating XGBoost model performance...\n")
y_pred_xgb = xgb_model.predict(X_test)

# Print the classification report (threshold-based metrics use the hard predictions)
print(classification_report(y_test, y_pred_xgb))

# Curve-based metrics need a continuous score, not 0/1 predictions: with hard labels
# the PR and ROC curves collapse to a single operating point.
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_score_xgb = xgb_model.predict_proba(X_test)[:, 1]

# AUPRC calculation
precision, recall, _ = precision_recall_curve(y_test, y_score_xgb)
auprc = auc(recall, precision)
print(f"AUPRC (Area Under Precision-Recall Curve): {auprc:.4f}")

# Recall at fixed FPR
desired_fpr = 0.01  # FPR budget (0.01 means 1%)
fpr, tpr, _ = roc_curve(y_test, y_score_xgb)  # FPR/TPR pairs along the ROC curve; fpr is non-decreasing and starts at 0
# Pick the last operating point whose FPR does not exceed the budget, so the
# reported recall is never obtained at a higher false-positive rate than requested
closest_index = np.searchsorted(fpr, desired_fpr, side="right") - 1
selected_tpr = tpr[closest_index]  # TPR at that operating point
print(f"TPR (Recall) at FPR={desired_fpr}: {selected_tpr:.6f}")  # Print the TPR (Recall) at the fixed FPR


print("\n✅ Phishing detection pipeline using XGBoost complete!")



In [None]:
# Imported here so this cell does not depend on the XGBoost cell having been run first
from sklearn.metrics import auc, precision_recall_curve, roc_curve

# Train a classifier (Random Forest) on the existing train/test split
print("Training Random Forest classifier...")
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print("Training complete!\n")

# Evaluate the model (threshold-based metrics use the hard predictions)
print("Evaluating model performance...\n")
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Curve-based metrics need a continuous score, not 0/1 predictions: with hard labels
# the PR and ROC curves collapse to a single operating point.
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_score = clf.predict_proba(X_test)[:, 1]

# AUPRC calculation
precision, recall, _ = precision_recall_curve(y_test, y_score)
auprc = auc(recall, precision)
print(f"AUPRC (Area Under Precision-Recall Curve): {auprc:.4f}")

# Recall at fixed FPR
desired_fpr = 0.01  # FPR budget (0.01 means 1%)
fpr, tpr, _ = roc_curve(y_test, y_score)  # FPR/TPR pairs along the ROC curve; fpr is non-decreasing and starts at 0
# Pick the last operating point whose FPR does not exceed the budget, so the
# reported recall is never obtained at a higher false-positive rate than requested
closest_index = np.searchsorted(fpr, desired_fpr, side="right") - 1
selected_tpr = tpr[closest_index]  # TPR at that operating point
print(f"TPR (Recall) at FPR={desired_fpr}: {selected_tpr:.6f}")  # Print the TPR (Recall) at the fixed FPR


print("\n✅ Phishing detection pipeline complete!")

With this kind of pipeline, SHAP cannot produce word-level explanations: the input to XGBoost is the precomputed BERT embedding, so SHAP can only attribute importance to embedding dimensions, not to raw words or tokens. We need to modify our pipeline so that SHAP can work directly on the raw text before it is converted into BERT embeddings.

Instead of training XGBoost on precomputed BERT embeddings, you will fine-tune a BERT classifier and use SHAP’s DeepExplainer or GradientExplainer on the BERT model to explain feature importance at the token level.