In [1]:
import pandas as pd
import numpy as np

# [SMS Spam Collection Dataset](https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/code)

In [2]:
data = pd.read_csv('data.csv', encoding="ISO-8859-1")

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [3]:
data.shape

(5572, 5)

In [4]:
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [5]:
# print lines where 2, 3 or 4 are not NaN
weird_lines = data[data['Unnamed: 2'].notnull() | data['Unnamed: 3'].notnull() | data['Unnamed: 4'].notnull()]
weird_lines.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
95,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
444,ham,\HEY HEY WERETHE MONKEESPEOPLE SAY WE MONKEYAR...,HOWU DOIN? FOUNDURSELF A JOBYET SAUSAGE?LOVE ...,,
671,spam,SMS. ac sun0819 posts HELLO:\You seem cool,"wanted to say hi. HI!!!\"" Stop? Send STOP to ...",,
710,ham,Height of Confidence: All the Aeronautics prof...,"this wont even start........ Datz confidence..""",,


In [6]:
X = data['v2']
Y = data['v1'] == 'spam'

# [Super SMS Dataset](https://github.com/smspamresearch/spstudy)

In [8]:
data = pd.read_csv('super_sms_dataset.csv', encoding="ISO-8859-1")

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67010 entries, 0 to 67009
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SMSes   67009 non-null  object 
 1   Labels  67008 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.0+ MB


In [13]:
# remove all values that are nan in either column
data = data.dropna()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67008 entries, 0 to 67009
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SMSes   67008 non-null  object 
 1   Labels  67008 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.5+ MB


In [14]:
X = data['SMSes']
Y = data['Labels'].astype(int)

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
def eval(model):
    Y_pred = model.predict(X_test_tfidf)
    
    # Accuracy, Precision, Recall, F1
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    accuracy = accuracy_score(Y_test, Y_pred)
    precision = precision_score(Y_test, Y_pred)
    recall = recall_score(Y_test, Y_pred)
    f1 = f1_score(Y_test, Y_pred)
    
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1: {f1}')

In [18]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, Y_train)

eval(clf)

Accuracy: 0.9744316768641497
Precision: 0.9735176333807002
Recall: 0.9606118546845124
F1: 0.9670216861285769


In [19]:
percent_spam = np.mean(Y_train)
print('Percentage of spam in training set:', percent_spam)

clf = MultinomialNB()
# Sample weight should be proportional to the percentage of the class in the dataset
sample_weight = np.where(Y_train, 1/percent_spam, 1/(1-percent_spam))
clf.fit(X_train_tfidf, Y_train, sample_weight=sample_weight)

eval(clf)

Percentage of spam in training set: 0.3908538535337384
Accuracy: 0.9707506342336965
Precision: 0.9532792004996877
Recall: 0.9727214786488209
F1: 0.9629022082018928


In [20]:
# Lets resample from the training set until we have 50% spam
X_train_resampled = X_train.copy().values
Y_train_resampled = Y_train.copy().values

total_samples = len(Y_train_resampled)
spam_samples = np.sum(Y_train_resampled)
ham_samples = total_samples - spam_samples

spam_samples_to_add = int(ham_samples - spam_samples)
spam_indices = np.where(Y_train_resampled)[0]

while spam_samples_to_add > 0:
    samples_to_add = min(spam_samples_to_add, spam_samples)
    indices_to_add = np.random.choice(spam_indices, samples_to_add)
    X_train_resampled = np.concatenate([X_train_resampled, X_train_resampled[indices_to_add]])
    Y_train_resampled = np.concatenate([Y_train_resampled, Y_train_resampled[indices_to_add]])
    spam_samples_to_add -= samples_to_add
    spam_samples += samples_to_add
    
assert np.sum(Y_train_resampled) == len(Y_train_resampled) / 2
    
X_train_tfidf_resampled = vectorizer.transform(X_train_resampled)

clf = MultinomialNB()
clf.fit(X_train_tfidf_resampled, Y_train_resampled)

eval(clf)

Accuracy: 0.9701537084017311
Precision: 0.951176983435048
Recall: 0.9734862970044614
F1: 0.9622023434547058


# Spam detection using LSTM

In [21]:
from collections import Counter


def remove_all_non_alphanumeric(s: str) -> str:
    return ''.join(c.lower() for c in s if c.isalnum() or c.isspace())

words = Counter(remove_all_non_alphanumeric(' '.join(X_train)).split())

print(len(words), len([w for w in words if words[w] > 2]))

UNKNOWN = '<UNK>'
dictionary = {w: i for i, w in enumerate([w for w in words if words[w] > 2])} 
dictionary[UNKNOWN] = len(dictionary)

60381 13135


In [22]:
# LSTM model
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class LSTM(nn.Module):
    def __init__(self, hidden_size, output_size, vocab_size, embedding_dim):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size) # , num_layers=2, dropout=0.2)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        
    def forward(self, input):
        assert len(input) > 0
        embedded = self.embedding(input)
        lstm_out, _ = self.lstm(embedded.view(len(input), 1, -1))
        output = self.hidden2out(lstm_out.sum(dim=0))
        return F.sigmoid(output.view(1))
    
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix else to_ix[UNKNOWN] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_label(label):
    return torch.tensor([label], dtype=torch.float)

def eval_lstm(model, X_test, Y_test, dictionary):
    correct = 0
    for sentence, label in zip(X_test, Y_test):
        sentence_in = prepare_sequence(remove_all_non_alphanumeric(sentence).split(), dictionary)
        target = prepare_label(label)
        
        output = model(sentence_in)
        if round(output.item()) == round(target.item()):
            correct += 1
    print(f'Accuracy: {correct / len(Y_test)}')
    
def train(model, X_train, Y_train, X_test, Y_test, dictionary, epochs=10):
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.0001)
    
    for epoch in range(epochs):
        total_loss = 0
        outputs = []
        for sentence, label in zip(X_train, Y_train):
            model.zero_grad()
            input_sentence = remove_all_non_alphanumeric(sentence).split()
            if not input_sentence:
                continue
            sentence_in = prepare_sequence(input_sentence, dictionary)
            target = prepare_label(label)
            
            output = model(sentence_in)
            loss = F.binary_cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            
            outputs.append(output.item())
            
            total_loss += loss.item()
        print(f'Epoch {epoch}, loss: {total_loss}, mean output: {sum(outputs) / len(outputs)}, min output: {min(outputs)}, max output: {max(outputs)}')
        eval_lstm(model, X_test, Y_test, dictionary)


In [None]:
model = LSTM(4, 1, len(dictionary), 8)

train(model, X_train_resampled, Y_train_resampled, X_test, Y_test, dictionary, epochs=10)




Epoch 0, loss: 2591.0987372331583, mean output: 0.5187632818799145, min output: 0.0006405205349437892, max output: 0.9998452663421631
Accuracy: 0.20992822966507177
Epoch 1, loss: 2502.0210675670605, mean output: 0.584843852839379, min output: 0.0005746837123297155, max output: 0.9999998807907104
Accuracy: 0.3038277511961722
Epoch 2, loss: 1909.8660983637255, mean output: 0.5648492125957781, min output: 4.600250395014882e-05, max output: 0.9999997615814209
Accuracy: 0.39712918660287083
Epoch 3, loss: 1629.6353338320441, mean output: 0.550631104983269, min output: 2.7559319732972654e-06, max output: 1.0
Accuracy: 0.4748803827751196
Epoch 4, loss: 1384.987472933573, mean output: 0.5389697502289086, min output: 1.432171785609171e-07, max output: 1.0
Accuracy: 0.5424641148325359
Epoch 5, loss: 1204.5053858667084, mean output: 0.5295956917771394, min output: 6.8041008383090684e-09, max output: 1.0
Accuracy: 0.6076555023923444
Epoch 6, loss: 1069.234269154125, mean output: 0.5249345883714878,

Based on athe results from the paper [Investigating Evasive Techniques in SMS Spam Filtering: A Comparative Analysis of Machine Learning Models](https://ieeexplore.ieee.org/document/10431737):

![Results](https://ieeexplore.ieee.org/ielx7/6287639/10380310/10431737/graphical_abstract/access-gagraphic-3364671.jpg)

LSTMs simply don't seem to perform well on this dataset.