In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [31]:
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|[^a-z\s]", "", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [32]:
path = 'Spam.csv'
df = pd.read_csv(path)

df['SMS'] = df['SMS'].apply(preprocess_text)

X = df['SMS'].values
y = df['CLASS'].apply(lambda x: 1 if x == 'spam' else 0).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

In [33]:
vectorizer = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.95)
vectorizer_save_path = 'tfidf_vectorizer.pkl'

X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

joblib.dump(vectorizer, vectorizer_save_path)

['tfidf_vectorizer.pkl']

In [34]:
class SpamDataset(Dataset):
    def __init__(self, X, y, vectorizer):
        self.X = X
        self.y = y
        self.vectorizer = vectorizer
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        text = self.X[index]
        label = self.y[index]

        text_vector = self.vectorizer.transform([text]).toarray().astype(np.float32)

        input_tensor = torch.tensor(text_vector).squeeze(0)
        label_tensor = torch.tensor(label).long()

        return input_tensor, label_tensor

In [35]:
class SpamDetector(nn.Module):
    def __init__(self, input_dim):
        super(SpamDetector, self).__init__()

        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = torch.relu(x)
        x = self.dropout(x)

        x = self.fc3(x)

        return x

In [36]:
train_dataset = SpamDataset(X_train, y_train, vectorizer)
test_dataset = SpamDataset(X_test, y_test, vectorizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

input_dim = X_train_vector.shape[1]
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
print(f"Device = {device}")

model = SpamDetector(input_dim).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

Device = cuda


In [37]:
def save_model(model, optimizer, epoch, loss, model_filepath):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        'input_dim': model.fc1.in_features
    }, model_filepath)

def load_model(model_filepath, vectorizer_filepath, device):
    checkpoint = torch.load(model_filepath, map_location=device)
    model = SpamDetector(checkpoint['input_dim']).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    vectorizer = joblib.load(vectorizer_filepath)

    return model, vectorizer

In [38]:
num_epochs = 100
model_save_path = 'spam_detector_model.pth'

for epoch in range(num_epochs):
    model.train()
    train_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs.squeeze(), targets.float())

        # Backward pass
        loss.backward()
        optimizer.step()

        train_loss += loss.item()            

    if (epoch + 1) % 10 == 0:
        avg_loss = train_loss/len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
        
        # Quick accuracy check
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                predicted = (torch.sigmoid(outputs.squeeze()) > 0.5).float()
                total += targets.size(0)
                correct += (predicted == targets.float()).sum().item()
            print(f"Training Accuracy: {100 * correct / total:.2f}%")

final_loss = train_loss / len(train_loader)
save_model(model, optimizer, num_epochs, final_loss, model_save_path)

Epoch [10/100], Loss: 0.0143
Training Accuracy: 99.83%
Epoch [20/100], Loss: 0.0077
Training Accuracy: 99.83%
Epoch [30/100], Loss: 0.0053
Training Accuracy: 99.83%
Epoch [40/100], Loss: 0.0043
Training Accuracy: 99.83%
Epoch [50/100], Loss: 0.0047
Training Accuracy: 99.75%
Epoch [60/100], Loss: 0.0040
Training Accuracy: 99.92%
Epoch [70/100], Loss: 0.0034
Training Accuracy: 99.92%
Epoch [80/100], Loss: 0.0046
Training Accuracy: 99.75%
Epoch [90/100], Loss: 0.0048
Training Accuracy: 99.75%
Epoch [100/100], Loss: 0.0052
Training Accuracy: 99.75%


In [39]:
def evaluate_model(model, test_loader, device):
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            predicted = (outputs.squeeze() > 0.5).long()

            y_true.append(targets.cpu())
            y_pred.append(predicted.cpu())

    y_true = torch.cat(y_true)
    y_pred = torch.cat(y_pred)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

evaluate_model(model, test_loader, device)

Accuracy: 0.9367
Precision: 0.9595
Recall: 0.9161
F1 Score: 0.9373


In [42]:
def predict_spam(text, model, vectorizer, device):
    processed_text = preprocess_text(text)
    text_vector = vectorizer.transform([processed_text]).toarray().astype(np.float32)
    input_tensor = torch.tensor(text_vector).squeeze(0).to(device)

    with torch.no_grad():
        output = model(input_tensor)
        probability = torch.sigmoid(output).item()
        prediction = "Spam" if probability > 0.5 else "Not Spam"
    
    return prediction, probability

In [47]:
new_samples = [
    "Hey, are we still meeting for dinner tonight?",  # Expected: HAM
    "CONGRATULATIONS! You've won £1000! Call 09876543210 NOW to claim your prize!",  # Expected: SPAM
    "Can you pick up some milk on your way home please?",  # Expected: HAM
    "FREE RINGTONES! Text STOP to 85858. Only £3/week",  # Expected: SPAM
    "The meeting has been moved to 3pm tomorrow",  # Expected: HAM
    "URGENT! Your account will be closed. Click here: bit.ly/fake123",  # Expected: SPAM
    "Happy birthday! Hope you have a wonderful day",  # Expected: HAM
    "You have WON! £2000 cash prize. Call 09012345678 before midnight",  # Expected: SPAM
    "Don't forget we have the dentist appointment at 2pm",  # Expected: HAM
    "FINAL NOTICE: Pay now or face legal action. Call 08001234567",  # Expected: SPAM
    "Thanks for helping me move house yesterday",  # Expected: HAM
    "Get rich quick! Earn £500 per day working from home!",  # Expected: SPAM
    "Mom called, she wants you to call her back",  # Expected: HAM
    "Limited time offer! 90% discount on designer watches!",  # Expected: SPAM
    "Can you send me the report by end of day?",  # Expected: HAM
]

for sms in new_samples:
    prediction, probability = predict_spam(sms, model, vectorizer, device)

    if prediction == "Spam":
        confidence_percent = probability * 100
    else:
        confidence_percent = (1 - probability) * 100

    print(f"SMS = {sms}")
    print(f"Prediction = {prediction}")
    print(f"Confidence = {confidence_percent:.2f}%")
    print("-" * 80)

SMS = Hey, are we still meeting for dinner tonight?
Prediction = Not Spam
Confidence = 99.99%
--------------------------------------------------------------------------------
SMS = CONGRATULATIONS! You've won £1000! Call 09876543210 NOW to claim your prize!
Prediction = Spam
Confidence = 100.00%
--------------------------------------------------------------------------------
SMS = Can you pick up some milk on your way home please?
Prediction = Not Spam
Confidence = 100.00%
--------------------------------------------------------------------------------
SMS = FREE RINGTONES! Text STOP to 85858. Only £3/week
Prediction = Spam
Confidence = 100.00%
--------------------------------------------------------------------------------
SMS = The meeting has been moved to 3pm tomorrow
Prediction = Spam
Confidence = 94.29%
--------------------------------------------------------------------------------
SMS = URGENT! Your account will be closed. Click here: bit.ly/fake123
Prediction = Not Spam
Confid