# Email Spam Detection using RNN model

## 1. Import all used packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import nltk
import ssl
import re
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download("stopwords")
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences 
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional
from keras.models import Model
import tensorflow as tf

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidanggawijaya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-01-28 02:55:02.439843: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. importing email spam dataset

In [2]:
df = pd.read_csv("spam_ham_dataset.csv")
df.drop(['Unnamed: 0', 'label'], axis=1, inplace=True)
df = df.rename(columns = {"label_num":"spam"})
df.head()

Unnamed: 0,text,spam
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


## 3. cleaning the raw data

In [3]:
# remove duplicate
df.drop_duplicates(inplace = True)

# clean each raw data lower case, no punctuation, no stop words
def clean_text(text):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    text = re.sub(emoj, '', text)
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words("english")]
    return " ".join(clean_words)

df["text"] = df["text"].map(clean_text)
df.head()

Unnamed: 0,text,spam
0,Subject enron methanol meter 988291 follow not...,0
1,Subject hpl nom january 9 2001 see attached fi...,0
2,Subject neon retreat ho ho ho around wonderful...,0
3,Subject photoshop windows office cheap main tr...,1
4,Subject indian springs deal book teco pvr reve...,0


## 4. split the dataset

In [4]:
emails_train, emails_test, target_train, target_test = train_test_split(df["text"],df["spam"],test_size = 0.2, random_state= 0) 

## 5. tokenizing the dataset

In [5]:
## some config values 
embed_size = 100 # how big is each word vector
max_feature = 50000 # how many unique words to use (i.e num rows in embedding vector)
max_len = 2000 # max number of words in a question to use

In [6]:
tokenizer = Tokenizer(num_words=max_feature)

tokenizer.fit_on_texts(emails_train)

emails_train_tokens = np.array(tokenizer.texts_to_sequences(emails_train))
emails_test_tokens = np.array(tokenizer.texts_to_sequences(emails_test))
print(emails_train_tokens[0])

[2, 1256, 4334, 1546, 118, 2596, 10141, 10142, 1294, 1146, 62, 20043, 1238, 396, 381, 135, 569, 517, 4833, 1295, 13157, 133, 118, 2917, 411, 510, 57, 425, 133, 4833, 1295, 252, 2597, 50, 3158, 269, 1321, 293, 76, 792, 188, 547, 2262, 20044, 20045, 20046, 20047, 20048, 13158, 8480, 10143, 20049, 10144, 20050, 20051, 13159, 10145, 13160, 6050, 13161]


  emails_train_tokens = np.array(tokenizer.texts_to_sequences(emails_train))
  emails_test_tokens = np.array(tokenizer.texts_to_sequences(emails_test))


In [7]:
# normalised the array
emails_train_tokens = pad_sequences(emails_train_tokens,maxlen=max_len)
emails_test_tokens = pad_sequences(emails_test_tokens,maxlen=max_len)
emails_train_tokens[0]

array([    0,     0,     0, ..., 13160,  6050, 13161], dtype=int32)

## 6. Creating RNN model

In [8]:

embedding_vecor_length = 32

model = tf.keras.Sequential()
model.add(Embedding(max_feature, embedding_vecor_length, input_length=max_len))
model.add(Bidirectional(tf.keras.layers.LSTM(64)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

2023-01-28 02:56:02.120217: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2000, 32)          1600000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              49664     
 l)                                                              
                                                                 
 dense (Dense)               (None, 16)                2064      
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,651,745
Trainable params: 1,651,745
Non-trainable params: 0
______________________________________________

## 7. train the model

In [None]:
#training stage
history = model.fit(emails_train_tokens, target_train, batch_size=512, epochs=20, validation_data=(emails_test_tokens, target_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

In [None]:
from  matplotlib import pyplot as plt
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.grid()
plt.show() 

## 8. analyst the training result

In [None]:
ax= plt.subplot()
spam_predict  = [1 if o>0.5 else 0 for o in model.predict(emails_test_tokens)]
cf_matrix =confusion_matrix(target_test,spam_predict)
sns.heatmap(cf_matrix, annot=True, ax = ax,cmap='Blues',fmt=''); #annot=True to annotate cells

# labels, title and ticks
ax.set_xlabel('Predicted labels');
ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 
ax.xaxis.set_ticklabels(['Not Spam', 'Spam']); ax.yaxis.set_ticklabels(['Not Spam', 'Spam']);

In [None]:
print(classification_report(target_test,spam_predict))
print()
print("Accuracy: ", accuracy_score(target_test,spam_predict))

## 9. testing stage using real life gmail emails

In [None]:

df2 = pd.read_csv("label_spam.csv")
df2 = df2.rename(columns = {"Email Text":"text"})
df2["text"] = df2['Subject'] + " " + df2['text']
df2.drop(['From', 'Subject'], axis=1, inplace=True)
df2["spam"] = 1
df2["text"] = df2["text"].map(clean_text)

In [None]:
real_emails_tokens = np.array(tokenizer.texts_to_sequences(df2["text"]))
real_emails_tokens = pad_sequences(real_emails_tokens,maxlen=max_len)

In [None]:
prediction  = [1 if o>0.5 else 0 for o in model.predict(real_emails_tokens)]
df2["prediction"] = prediction
df2

In [None]:
print("Accuracy of predicting real life gmail: ", accuracy_score(prediction,df2["spam"]))

## Conclusion: 

1. overfitting the dataset fail to perform good in predicting a different dataset in this case the gmail's spam label
2. dataset shift problem, The dataset that is used in the experiment is generated from few years ago, while the new dataset may no longer follow the same pattern due to socioeconomic factors such as spammers or attacker behaviour