To systematically test different hyperparameter combinations, we will implement grid search using the specified values. The different hyperparameters are based on the recommended values from section fine tuning bert. We use resources from Grid5000 to get a GPU.

1. Iterate through different combinations of:
    - Batch size (16, 32)
    - Learning rate (2e-5, 3e-5, 5e-5)
    - Epochs (2, 3, 4)
2.  Use Adam optimizer with learning rate warmup over 10,000 steps.
3.  Apply dropout rate of 0.1 in the BERT model.
4.  Use cross-entropy loss function.
5.  Save only the best model based on accuracy.


# Total Training Time Estimate

Assuming:
- 30,000 samples
- Single GPU (e.g., A100 40GB)
- BERT-Base Uncased
- Dataset fully fits in memory
- No major bottlenecks in data loading

![Alt Text](runtime.png)

Total of 288 minutes = 4.8 h ~ 5h



In [4]:
import pandas as pd
import numpy as np
import re
import torch
import shap
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score, classification_report

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

# Load dataset
file_path1 = "Data/SpamAssasin.csv"
file_path2 = "Data/Enron.csv"
file_path3 = "Data/Nazario.csv"

print("Loading datasets...")
df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)
df3 = pd.read_csv(file_path3)

# Combine datasets
df_combined = pd.concat([df1, df2, df3], ignore_index=True)
df = df_combined[['body', 'label']].dropna()
df = df.sample(n=100, random_state=42)  # Select only 100 emails for faster training
print(df["label"].value_counts())
print("Datasets loaded!\n")

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"\S+@\S+\.\S+", "", text)  # Remove email addresses
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Normalize spaces
    return text.lower()

# Apply text preprocessing
print("Preprocessing email content...")
df['body'] = df['body'].apply(preprocess_text)
print("Text preprocessing complete!\n")

# Load BERT tokenizer
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['body'], df['label'], test_size=0.2, random_state=42)
print(f"Training set: {len(X_train)} emails, Test set: {len(X_test)} emails\n")

# Define PyTorch Dataset class
class SpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Define hyperparameter search space
batch_sizes = [16, 32]
learning_rates = [2e-5, 3e-5, 5e-5]
epochs_list = [2, 3, 4]

best_auprc = 0.0
best_model = None
best_hyperparams = {}

# Iterate through hyperparameter combinations
for batch_size in batch_sizes:
    for lr in learning_rates:
        for epochs in epochs_list:
            print(f"\nTraining with batch_size={batch_size}, lr={lr}, epochs={epochs}")

            # Create DataLoaders
            train_dataset = SpamDataset(X_train, y_train, tokenizer)
            test_dataset = SpamDataset(X_test, y_test, tokenizer)

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

            # Initialize BERT model with dropout 0.1
            bert_model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
            bert_model.to(device)

            # Define optimizer with learning rate warmup over 10,000 steps
            optimizer = AdamW(bert_model.parameters(), lr=lr)
            total_steps = len(train_loader) * epochs
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10000, num_training_steps=total_steps)

            criterion = torch.nn.CrossEntropyLoss()

            # Training loop
            bert_model.train()
            for epoch in range(epochs):
                total_loss = 0
                for batch_idx, batch in enumerate(train_loader):
                    optimizer.zero_grad()
                    
                    input_ids = batch["input_ids"].to(device)
                    attention_mask = batch["attention_mask"].to(device)
                    labels = batch["labels"].to(device)

                    outputs = bert_model(input_ids, attention_mask=attention_mask)
                    loss = criterion(outputs.logits, labels)
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    total_loss += loss.item()

                avg_loss = total_loss / len(train_loader)
                print(f"Epoch {epoch+1}/{epochs} - Avg Loss: {avg_loss:.4f}")

            # Evaluate model using AUPRC
            bert_model.eval()
            y_preds = []
            y_probs = []
            y_true = []

            with torch.no_grad():
                for batch in test_loader:
                    input_ids = batch["input_ids"].to(device)
                    attention_mask = batch["attention_mask"].to(device)
                    labels = batch["labels"].cpu().numpy()

                    outputs = bert_model(input_ids, attention_mask=attention_mask)
                    probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()[:, 1]  # Get positive class probs
                    preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

                    y_probs.extend(probs)  # Probabilities for AUPRC
                    y_preds.extend(preds)
                    y_true.extend(labels)

            # Calculate AUPRC
            auprc = average_precision_score(y_true, y_probs)
            print(f"AUPRC: {auprc:.4f}")

            # Print classification report
            print("\nFinal Model Performance:")
            print(classification_report(y_true, y_preds))

            # Save the best model
            if auprc > best_auprc:
                best_auprc = auprc
                best_model = bert_model
                best_hyperparams = {"batch_size": batch_size, "learning_rate": lr, "epochs": epochs}

# Save the best model
MODEL_SAVE_PATH = "best_bert_spam_classifier_auprc.pth"
print(f"\nBest model found with AUPRC {best_auprc:.4f}: {best_hyperparams}")
torch.save(best_model.state_dict(), MODEL_SAVE_PATH)
print(f"Best model saved to {MODEL_SAVE_PATH}")

Using device: cpu

Loading datasets...
label
0    67
1    33
Name: count, dtype: int64
Datasets loaded!

Preprocessing email content...
Text preprocessing complete!

Training set: 80 emails, Test set: 20 emails


Training with batch_size=16, lr=2e-05, epochs=2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2 - Avg Loss: 0.6593
Epoch 2/2 - Avg Loss: 0.6467
AUPRC: 0.4382

Final Model Performance:
              precision    recall  f1-score   support

           0       0.58      0.92      0.71        12
           1       0.00      0.00      0.00         8

    accuracy                           0.55        20
   macro avg       0.29      0.46      0.35        20
weighted avg       0.35      0.55      0.43        20


Training with batch_size=16, lr=2e-05, epochs=3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


: 