The dataset we're working with this time is less structured. We only have 2 columns in total. One indicating whether or not it is SPAM, and one containing the message. Let's do a bit more exploration on the dataset this time.  

What do I want to know?
1. Number of unique words
2. List of the most common words
3. How many email examples do we have in total?
4. SPAM/HAM ratio of the emails
5. Maximum, Minimum, and Average length of emails
6. List of most common words in emails labeled SPAM
7. List of most common words in emails labeled HAM

In [1]:
# importing relevant modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from spam_classification_utils import *
import re
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [2]:
# Read the dataset once and keep an untouched snapshot (df_orig) next to the
# working frame (df), so the raw data is still available after df is modified.
df_orig = pd.read_csv('Datasets/spam_data.csv', encoding='utf-8')
df = df_orig.copy()

Let's start by cleaning up the data a little, then going through the things we want to know one by one.

In [None]:
from unidecode import unidecode
import re

# Work on a copy: a plain assignment would only alias df_orig, so cleaning the
# 'Message' column below would also overwrite the snapshot of the raw data.
df = df_orig.copy()
# clean_message is not defined in this notebook; presumably it comes from the
# star import of spam_classification_utils -- TODO confirm what it normalises.
df['Message'] = df['Message'].apply(clean_message)

In [None]:
# Word-frequency table: one row per distinct whitespace-separated token,
# ordered from most to least frequent.
unique_words = df['Message'].str.split(expand=True).stack().value_counts()
unique_words = unique_words.reset_index()
unique_words.columns = ['word', 'count']

# Look the class sizes up by label. value_counts() orders its result by
# frequency, so reading positions 0 and 1 would report the majority class as
# SPAM regardless of which class it actually is.
category_counts = df['Category'].value_counts()
spam_count = category_counts.get('spam', 0)
ham_count = category_counts.get('ham', 0)

# Number of whitespace-separated tokens per message, computed once and reused.
message_lengths = df['Message'].str.split().str.len()

print(f'There is a total of {df.shape[0]} emails in the dataset')
print()
print(f'There is a split of {spam_count} SPAM emails and {ham_count} HAM emails')
print()
print(
    f'The longest email has a length of {message_lengths.max()}'
    f' and the shortest email has length of {message_lengths.min()}.'
    f' The average length is {int(message_lengths.mean())}'
)
print()
print(f'There are a total of {len(unique_words)} unique words in the dataset')
print()
print('The most commonly appearing words are:')
display(unique_words[0:10])

As one would expect, pretty much all of the most common words are stop words like 'to', 'you', 'I', 'my', 'is', 'and', etc. These appear in almost every email, so it makes sense for them to be the most common, but we are also not getting anything meaningful from them, so it's ideal to remove them when we try to build our model. Let's remove the stop words, then take a closer look specifically at the SPAM and HAM emails. 

In [None]:
# Load the stop-word list (column 0 of the JSON file) as a plain Python list.
stop_words = pd.read_json('stopwords-en.json')[0].tolist()

# Keep only the rows of the frequency table whose word is not a stop word,
# then show the ten most frequent survivors.
is_stop_word = unique_words['word'].isin(stop_words)
filtered_unique_words = unique_words[~is_stop_word]
display(filtered_unique_words.head(10))

In [None]:
# Split the messages by class label.
spam_emails = df[df['Category'] == 'spam']
ham_emails = df[df['Category'] == 'ham']

# Per-class word-frequency tables (most frequent first), with named columns.
spam_unique_words = (
    spam_emails['Message'].str.split(expand=True).stack().value_counts().reset_index()
)
spam_unique_words.columns = ['word', 'count']

ham_unique_words = (
    ham_emails['Message'].str.split(expand=True).stack().value_counts().reset_index()
)
ham_unique_words.columns = ['word', 'count']

# Drop stop words from both tables.
spam_is_stop_word = spam_unique_words['word'].isin(stop_words)
ham_is_stop_word = ham_unique_words['word'].isin(stop_words)
filtered_spam_unique_words = spam_unique_words[~spam_is_stop_word]
filtered_ham_unique_words = ham_unique_words[~ham_is_stop_word]

print('The most commonly appearing words in SPAM emails are:')
display(filtered_spam_unique_words.head(10))
print()
print('The most commonly appearing words in HAM emails are:')
display(filtered_ham_unique_words.head(10))


Already we can see the most commonly appearing words between spam and ham emails differ greatly. Now that we have a general idea of what's in the dataset, we can try to learn a basic model using our previous, somewhat naive method that breaks down the emails into the common words. 

In [None]:
# Non-stop-word vocabulary, most frequent first.
unique_words_list = filtered_unique_words['word'].tolist()
# Keep at most the 3000 most frequent words as features for the dense model.
unique_words_truncated = unique_words_list[:3000]


In [None]:
# Bag-of-words matrix: one row per message, one column per retained word.
word_counts = np.zeros((len(df), len(unique_words_truncated)), dtype=int)

for i, word in enumerate(unique_words_truncated):
    # count_word_in_message is not defined in this notebook; presumably it
    # comes from spam_classification_utils and returns an integer count --
    # TODO confirm.
    word_counts[:, i] = df['Message'].apply(lambda msg: count_word_in_message(word, msg))
    # Progress indicator on every 10th word only. Printing an empty string on
    # the other iterations flooded the output with blank lines.
    if i % 10 == 0:
        print(i)

# Share df's index so the column-wise concat below lines rows up by label even
# if df no longer has a default 0..n-1 index.
word_counts_df = pd.DataFrame(word_counts, columns=unique_words_truncated, index=df.index)

# Concatenate the word counts DataFrame with the original DataFrame
result_df = pd.concat([word_counts_df, df[['Category']]], axis=1)

# Display the updated DataFrame
display(result_df)

In [None]:
label_encoder = LabelEncoder()

# All columns except the last hold word counts; the last one is the label.
word_columns = result_df.columns[0:-1]
label_column = result_df.columns[-1]

# Min-max scale every word column to the [0, 1] range.
column_min = result_df[word_columns].min()
column_range = result_df[word_columns].max() - column_min
# A column with the same count in every message has a range of 0, and dividing
# by it would fill the column with NaN. Dividing by 1 instead leaves such a
# column at 0 after the minimum is subtracted.
column_range = column_range.replace(0, 1)
result_df[word_columns] = (result_df[word_columns] - column_min) / column_range

result_df['Category'] = label_encoder.fit_transform(result_df['Category'])
# check our mapping to see if the encoder correctly labels ham 0 and spam 1
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

In [None]:
# Hold out 20% of the bag-of-words rows for testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    result_df[word_columns],
    result_df[label_column],
    test_size=0.2,
    random_state=42,
)

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.metrics import accuracy_score

# Small fully connected binary classifier over the bag-of-words features.
# The input width is read from the training data instead of being hard-coded
# to 3000, so the model still builds when fewer words survive the filtering.
n_features = X_train.shape[1]

linear_model = Sequential()
linear_model.add(Dense(128, activation='relu', input_shape=(n_features,)))
linear_model.add(Dense(64, activation='relu'))
# Single sigmoid unit: the output is the predicted probability of class 1.
linear_model.add(Dense(1, activation='sigmoid'))

linear_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# 10% of the training rows are set aside by Keras for per-epoch validation.
linear_model.fit(X_train, y_train, batch_size=10,
          epochs=10, validation_split=0.1)

y_pred_prob = linear_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy:.4f}")

Honestly, this is already pretty good; maybe I should just stop here, lmao. 
OK, that was a joke. Let's build an RNN using the original message text instead of the word counts.

In [None]:
# Replace the text labels in df with the integer codes assigned by a fresh
# LabelEncoder, then show the resulting frame.
label_encoder = LabelEncoder()
encoded_categories = label_encoder.fit_transform(df['Category'])
df['Category'] = encoded_categories
display(df)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Cap the tokenizer vocabulary at the size of the stop-word-filtered word list,
# fit it on the cleaned messages, then encode each message as a list of
# integer word indices.
messages = df['Message']
tokenizer = Tokenizer(num_words=len(unique_words_list))
tokenizer.fit_on_texts(messages)
sequences = tokenizer.texts_to_sequences(messages)

In [None]:
# Pad every sequence to the length of the longest one (a fixed length could be
# chosen instead).
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [None]:
# Inspect the padded matrix and confirm its dimensions.
display(padded_sequences)
print(np.shape(padded_sequences))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    df['Category'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convert all four splits to NumPy arrays before handing them to Keras.
X_train, X_val = np.array(X_train), np.array(X_val)
y_train, y_val = np.array(y_train), np.array(y_val)

In [None]:
# Show the dimensions of the training inputs, then of the training labels.
for training_array in (X_train, y_train):
    display(training_array.shape)

In [None]:
# Eyeball the training labels after conversion to a NumPy array.
display(y_train)

In [None]:
from keras import layers, models
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, f1_score, recall_score, precision_score
from tensorflow.keras.metrics import Recall, Precision
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

# Same size that was passed to the Tokenizer as num_words, so every token index
# the tokenizer can emit fits inside the embedding table.
vocab_size = len(unique_words_list)
# Take the padded sequence length from the data rather than a hard-coded 171,
# which silently overwrote the value computed during padding.
max_length = X_train.shape[1]
METRICS = ['accuracy', 'Precision', 'Recall']

# Embedding -> bidirectional LSTM -> single sigmoid unit.
simple_RNN_model = models.Sequential()
simple_RNN_model.add(layers.Embedding(input_dim=vocab_size, output_dim=64))
simple_RNN_model.add(layers.Bidirectional(layers.LSTM(128, recurrent_dropout=0.2)))
simple_RNN_model.add(layers.Dense(1, activation='sigmoid'))  # Use 'softmax' if you have multiple classes

simple_RNN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)  # Adjust loss based on the number of classes

In [None]:
# Define the log directory for TensorBoard
file_name = 'test3'
tensorboard = TensorBoard(log_dir="logs\\{}".format(file_name))

In [None]:
# Train for 10 epochs, validating on the held-out split after each epoch and
# passing the TensorBoard callback to fit().
simple_RNN_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[tensorboard],
)

In [None]:
# Score the validation split with the bidirectional LSTM model.
y_pred_prob = simple_RNN_model.predict(X_val)

# Only probabilities strictly above 0.99 are labelled 1.
# NOTE(review): this cut-off is much stricter than a conventional 0.5 --
# confirm it is intentional.
above_threshold = y_pred_prob > 0.99
y_pred = above_threshold.astype(int)

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy of the model: {accuracy:.4f}")

We have very slightly improved our performance! 98.92% accuracy on our test set compared to 98.02% is pretty good for our relatively basic RNN model. 

In [None]:
# Hyper-parameter constants.
# NOTE(review): none of these names is referenced by the cells visible in this
# notebook -- confirm whether they are meant for a later GloVe-based model.
SEQUENCE_LENGTH = 50 # the length of all sequences (number of words per sample)
EMBEDDING_SIZE = 100  # Using 100-Dimensional GloVe embedding vectors
TEST_SIZE = 0.25 # ratio of testing set

BATCH_SIZE = 64
EPOCHS = 20 # number of epochs

In [None]:
from keras import layers, models

# Same size that was passed to the Tokenizer as num_words, so every token index
# the tokenizer can emit fits inside the embedding table.
vocab_size = len(unique_words_list)
# Take the padded sequence length from the data rather than a hard-coded 171.
max_length = X_train.shape[1]

# Embedding -> LSTM(128) -> BatchNorm -> LSTM(64) -> BatchNorm -> sigmoid unit.
adv_RNN_model = models.Sequential()
adv_RNN_model.add(layers.Embedding(input_dim=vocab_size, output_dim=64))
# return_sequences=True hands the full sequence of outputs to the next LSTM.
adv_RNN_model.add(layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
adv_RNN_model.add(layers.BatchNormalization())

# The last LSTM returns only its final output, which feeds the Dense layer.
adv_RNN_model.add(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
adv_RNN_model.add(layers.BatchNormalization())

adv_RNN_model.add(layers.Dense(1, activation='sigmoid'))  # For binary classification

adv_RNN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

adv_RNN_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

adv_RNN_model.summary()

In [None]:
# Confusion matrix
y_pred = (adv_RNN_model.predict(X_test) > 0.78).astype("int32")

plot_confusion_matrix(confusion_matrix(y_test, y_pred), class_names=['Ham','Spam'])
plt.title('Test data confusion matrix with classification threshold at 0.77')
plt.savefig('images/confusion77.png',bbox_inches='tight',dpi=400, pad_inches=0.1)
plt.show()

NameError: name 'adv_RNN_model' is not defined

In [None]:
# Score the validation split with the stacked LSTM model.
y_pred_prob = adv_RNN_model.predict(X_val)

# Only probabilities strictly above 0.99 are labelled 1.
# NOTE(review): this cut-off is much stricter than a conventional 0.5 --
# confirm it is intentional.
above_threshold = y_pred_prob > 0.99
y_pred = above_threshold.astype(int)

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy of the model: {accuracy:.4f}")