# SPAM DETECTION SYSTEM

This project aims to build a spam detection system using Natural Language Processing (NLP) techniques in order to classify emails as spam or not spam based on the content of the email.
The steps involved in this project are:
1. Train a classifier to identify spam emails.
2. Find out the principal topics of the spam emails.
3. Compute the semantic similarity between the spam emails, to verify the etheroegeneity of the spam emails.
4. Extract from non-spam emails the Organisations mentioned in the emails.

In [2]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config Completer.use_jedi=False

The dataset is available in data, and it is called `spam_dataset.csv`. It contains the following columns:

In [13]:
spam_df = pd.read_csv('../data/spam_dataset.csv', index_col=0)
spam_df.head(5)
print('------------------')
spam_df.shape
print('------------------')
spam_df.isnull().sum()
print('------------------')
spam_df['text'][605]
print('------------------')
spam_df.value_counts('label')

Unnamed: 0,label,text,label_num
605,ham,Subject: enron methanol ; meter # : 988291\nth...,0
2349,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0
3624,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0
4685,spam,"Subject: photoshop , windows , office . cheap ...",1
2030,ham,Subject: re : indian springs\nthis deal is to ...,0


------------------


(5171, 3)

------------------


label        0
text         0
label_num    0
dtype: int64

------------------


"Subject: enron methanol ; meter # : 988291\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary\nflow data provided by daren } .\nplease override pop ' s daily volume { presently zero } to reflect daily\nactivity you can obtain from gas control .\nthis change is needed asap for economics purposes ."

------------------


label
ham     3672
spam    1499
Name: count, dtype: int64

In [4]:
spam_df.reset_index(drop=True, inplace=True)
spam_df.head(3)

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\nth...,0
1,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0
2,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0


In [5]:
import nltk
import spacy
import os

try:
    nlp = spacy.load('en_core_web_sm')
except:
    os.system('python -m spacy download en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')
try:
    eng_stopwords = nltk.corpus.stopwords.words('english')
except:
    nltk.download('stopwords')
    eng_stopwords = nltk.corpus.stopwords.words('english')
try:
    _ = nltk.tokenize.word_tokenize('test')
except:
    nltk.download('punkt')


In [6]:
np.argmax([len(spam_df['text'][i]) for i in range(spam_df.shape[0])])

949

In [7]:
emails = spam_df['text'].values

In [12]:
def preprocess_text_and_store(text, doc_store=None, store=False):
    preprocessed_text = np.empty(len(text), dtype=object)
    if doc_store is None or doc_store not in os.listdir('../data'):
        counter = 0
        for i in range(len(text)):
            process_words = []
            text[i] = text[i].replace('\d', ' ')
            for word in nltk.word_tokenize(nlp(text[i].lower()).text):
                if word.isalpha() and word not in eng_stopwords and len(str(word)) >= 3:
                    process_words.append(word)
            preprocessed_text[counter] = ' '.join(process_words)
            counter += 1
    else:
        preprocessed_text = np.array(pd.read_csv('../data/'+doc_store)['comment_text'])
    if store:
        pd.DataFrame(data = {"comment_text":preprocessed_text}).to_csv('../data/'+doc_store)

    return preprocessed_text

In [11]:
preprocessed_text= preprocess_text_and_store(emails, doc_store="preprocessed_spam_df.csv", store=True)

In [23]:
from sklearn.model_selection import train_test_split

def train_test_val_split(data, labels=None, test_size=0.2, val_size=0.1, random_state=42):
    if labels is None:
        data, labels = data
    corpus, X_test, train_labels, y_test = train_test_split(data, labels, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(corpus, train_labels, test_size=val_size/(1-test_size), random_state=random_state)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    y_val = np.array(y_val)

    return X_train.astype(str).tolist(), X_test.astype(str).tolist(), X_val.astype(str).tolist(), y_train, y_test, y_val

X_train, X_test, X_val, y_train, y_test, y_val = train_test_val_split(preprocessed_text, labels=spam_df['label_num'], test_size=0.2, val_size=0.2)


X_train[0:3]
y_train[0:3]

['subject record spot deal notes deal sheet bob wants call tape would great difference results call bob try put bed leave allocated term deal daren farmer rebecca griffin enron enron gary lamphier hou ect ect ilene erskine hou azurix azurix subject volume mmbtu flowed noms record spot deal day gary currently volume allocated term agreement price difference day significant hsc mid record spot deal allocation stand rebecca griffin enron daren farmer hou ect ect subject daren handling trying resolve issue february production shows mmbtu february deal able find anything thanks help rebecca forwarded rebecca griffin enron katherine herrera rebecca griffin enron enron subject forwarded katherine herrera corp enron gary lamphier ect daren farmer hou ect ect katherine herrera corp enron enron subject verified gas flowed billed term deal one place gas flowed invoice term let know put janet wallis gary lamphier hou ect ect katherine herrera corp enron enron subject bob says billed purchase made 

array([0, 0, 1])

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=1500)
tokenizer.fit_on_texts(X_train)

tokenizer.word_counts

OrderedDict([('subject', 4761),
             ('record', 47),
             ('spot', 148),
             ('deal', 1700),
             ('notes', 54),
             ('sheet', 24),
             ('bob', 412),
             ('wants', 57),
             ('call', 438),
             ('tape', 7),
             ('would', 631),
             ('great', 154),
             ('difference', 36),
             ('results', 113),
             ('try', 89),
             ('put', 121),
             ('bed', 34),
             ('leave', 37),
             ('allocated', 121),
             ('term', 149),
             ('daren', 1121),
             ('farmer', 671),
             ('rebecca', 38),
             ('griffin', 21),
             ('enron', 3672),
             ('gary', 332),
             ('lamphier', 64),
             ('hou', 4174),
             ('ect', 7964),
             ('ilene', 1),
             ('erskine', 3),
             ('azurix', 3),
             ('volume', 559),
             ('mmbtu', 733),
             ('flow

In [25]:
X_train_padded = tokenizer.texts_to_sequences(X_train)
X_test_padded = tokenizer.texts_to_sequences(X_test)
X_val_padded = tokenizer.texts_to_sequences(X_val)

vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size: ", vocab_size)

Vocabulary size:  34301


In [26]:
maxlen = len(max(X_train_padded, key=len))
X_train_padded = pad_sequences(X_train_padded, padding='post', maxlen=maxlen)
X_test_padded = pad_sequences(X_test_padded, padding='post', maxlen=maxlen)
X_val_padded = pad_sequences(X_val_padded, padding='post', maxlen=maxlen)

In [27]:
X_train_padded[0:5]
X_train_padded.shape
y_train.shape

array([[   2,  986,  261, ...,    0,    0,    0],
       [   2,    4,   10, ...,    0,    0,    0],
       [   2,  346, 1186, ...,    0,    0,    0],
       [   2,  132,  534, ...,    0,    0,    0],
       [   2,  785,   38, ...,    0,    0,    0]], dtype=int32)

(3102, 1533)

(3102,)

In [35]:
import tensorflow as tf
class EarlyStopping(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if logs['val_accuracy'] > 0.97 and epoch + 1 > 3:
            self.model.stop_training = True
            print('\nStop training at epoch:', epoch+1)

early_stopping = EarlyStopping()

In [28]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
def CNN_model(vocab_size, maxlen, activation='softmax'):
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, output_dim=128, input_length=maxlen))
    model.add(Dropout(0.8))
    model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(units=64))
    model.add(Dropout(0.65))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation=activation))
    return model


In [29]:
cnn_model = CNN_model(vocab_size=vocab_size, maxlen=maxlen, activation='sigmoid')

In [30]:
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1533, 128)         4390528   
                                                                 
 dropout (Dropout)           (None, 1533, 128)         0         
                                                                 
 conv1d (Conv1D)             (None, 1533, 64)          24640     
                                                                 
 max_pooling1d (MaxPooling1  (None, 766, 64)           0         
 D)                                                              
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                        

In [None]:
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = cnn_model.fit(X_train_padded, 
                        y_train, 
                        epochs=10, 
                        validation_data=(X_val_padded, y_val), 
                        batch_size=64, 
                        callbacks=[early_stopping])