# Spam Email Detector with Neural Network and CountVectorizer
## with ~98% Accuracy on 164K email sample

In [None]:
# Import necessary Libraries
import pandas as pd  
import numpy as np  
import torch  
import torch.nn as nn  
import torch.optim as optim  
from sklearn.model_selection import train_test_split  
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the dataset  
data = pd.read_csv("Data/FullDataset.csv") 
# See first 5 rows 
print(data.head())

                                                text  label
0  Subject: naturally irresistible your corporate...      1
1  Subject: the stock trading gunslinger  fanny i...      1
2  Subject: unbelievable new homes made easy  im ...      1
3  Subject: 4 color printing special  request add...      1
4  Subject: do not have money , get software cds ...      1


In [3]:
# Preprocess the dataset  

# Split the data into features and labels  
X = data["text"].values  
y = data["label"].values
# Split the data into train and test sets  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [4]:
# Text vectorization using CountVectorizer with limited features 
# Convert text data to numerical data using CountVectorizer (Count each word from whole vocabulary in each sample) 
vectorizer = CountVectorizer(max_features=10000)  # Limit to top 10,000 features for impact effect on memory
# Fit train data to catch vocabulary of whole words AND transform them to numerical data
X_train_counts = vectorizer.fit_transform(X_train) 
# transform test data to numerical data based on train data vocabulary
X_test_counts = vectorizer.transform(X_test) 

In [5]:
# Convert the sparse matrix to a PyTorch sparse tensor  
X_train_tensor = torch.sparse.FloatTensor(torch.LongTensor(X_train_counts.nonzero()),  # get the integer indices of the non-zero elements as LongTensor
                                           torch.FloatTensor(X_train_counts.data),  # get real data of indices of the non-zero elements as FloatTensor
                                           torch.Size(X_train_counts.shape))  # Set shape size of sparse vector as tensor size
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)  # we need to convert y data (0 and 1) to float32, to compatible with train tensor
X_test_tensor = torch.sparse.FloatTensor(torch.LongTensor(X_test_counts.nonzero()),   
                                          torch.FloatTensor(X_test_counts.data),   
                                          torch.Size(X_test_counts.shape))  
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

  X_train_tensor = torch.sparse.FloatTensor(torch.LongTensor(X_train_counts.nonzero()),  # get the integer indices of the non-zero elements as LongTensor
  X_train_tensor = torch.sparse.FloatTensor(torch.LongTensor(X_train_counts.nonzero()),  # get the integer indices of the non-zero elements as LongTensor


In [6]:
# Define the neural network class  
class SpamDetectorNNByCV(nn.Module):  
    def __init__(self, input_size):  
        super().__init__()
        self.layer1 = nn.Linear(input_size, 64)  
        self.layer2 = nn.Linear(64, 32)  
        self.layer3 = nn.Linear(32, 1)  
        self.sigmoid = nn.Sigmoid() # Scale output between 0 and 1

    def forward(self, x):  
        # # use ReLU to replace negative values with 0
        # x = torch.relu(self.fc1(x))  
        # x = torch.relu(self.fc2(x))  
        # x = self.sigmoid(self.fc3(x))  
        # return x 
        
        # Optimized Retrun
        return self.sigmoid(self.layer3(torch.relu(self.layer2(torch.relu(self.layer1(x))))))

In [15]:
# Create the model  
input_size = X_train_tensor.size(1)  # feature size of train data (0 -> number of samples, 1 -> feature size)
model = SpamDetectorNNByCV(input_size) 

In [16]:
# loss function and optimizer  
criterion = nn.BCELoss()  # for Binary Classification (1 -> spam, 0 -> ham)
optimizer = optim.Adam(model.parameters(), lr=0.001) 

In [17]:
# Train the model  
for epoch in range(40):  
    model.train()  
    optimizer.zero_grad()  
    
    # Need to convert sparse tensor to dense tensor for training  
    outputs = model(X_train_tensor.to_dense())  
    
    loss = criterion(outputs.squeeze(), y_train_tensor)  
    loss.backward()  
    optimizer.step()  
    
    print(f'Epoch [{epoch+1}/{40}], Loss: {loss.item():.4f}')

Epoch [1/40], Loss: 0.6935
Epoch [2/40], Loss: 0.6667
Epoch [3/40], Loss: 0.6401
Epoch [4/40], Loss: 0.6107
Epoch [5/40], Loss: 0.5785
Epoch [6/40], Loss: 0.5467
Epoch [7/40], Loss: 0.5156
Epoch [8/40], Loss: 0.4847
Epoch [9/40], Loss: 0.4549
Epoch [10/40], Loss: 0.4271
Epoch [11/40], Loss: 0.4010
Epoch [12/40], Loss: 0.3773
Epoch [13/40], Loss: 0.3551
Epoch [14/40], Loss: 0.3345
Epoch [15/40], Loss: 0.3153
Epoch [16/40], Loss: 0.2974
Epoch [17/40], Loss: 0.2813
Epoch [18/40], Loss: 0.2660
Epoch [19/40], Loss: 0.2518
Epoch [20/40], Loss: 0.2386
Epoch [21/40], Loss: 0.2263
Epoch [22/40], Loss: 0.2149
Epoch [23/40], Loss: 0.2042
Epoch [24/40], Loss: 0.1943
Epoch [25/40], Loss: 0.1857
Epoch [26/40], Loss: 0.1770
Epoch [27/40], Loss: 0.1689
Epoch [28/40], Loss: 0.1613
Epoch [29/40], Loss: 0.1542
Epoch [30/40], Loss: 0.1475
Epoch [31/40], Loss: 0.1413
Epoch [32/40], Loss: 0.1354
Epoch [33/40], Loss: 0.1298
Epoch [34/40], Loss: 0.1246
Epoch [35/40], Loss: 0.1197
Epoch [36/40], Loss: 0.1150
E

In [18]:
# Test the model  
model.eval()  
with torch.inference_mode():  
    test_outputs = model(X_test_tensor.to_dense())  
    predicted = (test_outputs.squeeze() > 0.5).float()

In [19]:
# Evaluate the model  
print("Accuracy:", accuracy_score(y_test, predicted.numpy()))  
print(classification_report(y_test, predicted.numpy()))

Accuracy: 0.9789012859879002
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     16395
           1       0.98      0.98      0.98     16498

    accuracy                           0.98     32893
   macro avg       0.98      0.98      0.98     32893
weighted avg       0.98      0.98      0.98     32893



In [22]:
# Save model
from pathlib import Path

Path("Saved_model").mkdir(exist_ok=True)
# Save model's weight
torch.save(model.state_dict(), "Saved_model/Spam_Email_Detector_NN_ByCV_PyTorch_Weight.pt")
# Save whole model with structure
torch.save(model, "Saved_model/Spam_Email_Detector_NN_ByCV_PyTorch_WithStructure.pt")

in below cells, you can load saved model and predict with it

In [None]:
# Create model Structure and load saved weights
class SpamDetectorNNByCV(nn.Module):  
    def __init__(self, input_size):  
        super().__init__()
        self.layer1 = nn.Linear(input_size, 64)  
        self.layer2 = nn.Linear(64, 32)  
        self.layer3 = nn.Linear(32, 1)  
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):  
        return self.sigmoid(self.layer3(torch.relu(self.layer2(torch.relu(self.layer1(x))))))
    
model = SpamDetectorNNByCV(input_size) # Replace with you input_size data
model.load_state_dict(torch.load("Saved_model/Spam_Email_Detector_NN_ByCV_PyTorch_Weight.pt", weights_only=True))

In [None]:
# Load whole model with structure
model = torch.load("Saved_model/Spam_Email_Detector_NN_ByCV_PyTorch_WithStructure.pt", weights_only=False)

In [None]:
# Predict With model
model.eval()  
with torch.inference_mode():  
    test_outputs = model(X_test_tensor.to_dense())  # Replace with your X_test_tensor data
    predicted = (test_outputs.squeeze() > 0.5).float()