## Importing Libraries for use in the Project

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping

## Loading dataset from Google Drive

In [None]:
# Google Drive location of the raw spam/ham CSV -- replace with the path to your own copy
path = "/content/drive/MyDrive/SCHOOL/ICS/Year IV/Semester  I/Spam Email raw text for NLP.csv"
# Read the whole CSV into a DataFrame used by every later cell
data = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the loaded DataFrame (first and last rows are shown in the output below)
data

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0


## Brief Exploratory Data Analysis

In [None]:
# (rows, columns) of the dataset: 5796 samples, 3 columns
data.shape

(5796, 3)

In [None]:
# Every column reports 5796 non-null entries in the Non-Null Count column of the output below,
# so no imputation of missing values is needed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CATEGORY   5796 non-null   int64 
 1   MESSAGE    5796 non-null   object
 2   FILE_NAME  5796 non-null   object
dtypes: int64(1), object(2)
memory usage: 136.0+ KB


## NLP Text Preprocessing
* Dropping irrelevant features
* Tokenization
* Sequencing
* Splitting dataset into train and test sets

In [None]:
def get_sequences(texts, tokenizer, train=True, max_sequence_length=None):
  """Encode texts as token-id sequences and zero-pad them to one common length.

  Args:
    texts: Iterable of strings to encode.
    tokenizer: Already-fitted tokenizer exposing ``texts_to_sequences``.
    train: When truthy, the padding length is taken from the longest encoded
      sequence in ``texts`` and any ``max_sequence_length`` passed in is ignored.
    max_sequence_length: Padding length used when ``train`` is falsy. Pass the
      training width here so the train and test matrices share one shape.
      If left as None, ``pad_sequences`` picks the length itself.

  Returns:
    The result of ``pad_sequences``: the encoded texts, padded with zeros at
    the end ('post') up to the chosen length.
  """
  # Turn every text into a list of integer token ids
  sequences = tokenizer.texts_to_sequences(texts)

  # The training set defines the width; default=0 keeps empty input from raising
  if train:
    max_sequence_length = max(map(len, sequences), default=0)

  # Pad whichever set was passed in (train or test) with trailing zeros
  return pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

In [None]:
# Function to preprocess data
def preprocessing(data, test_size=0.15, num_words=30000, random_state=1):
  """Split the emails into train/test sets and encode them as padded token-id sequences.

  Args:
    data: DataFrame with a 'MESSAGE' text column and a 'CATEGORY' label column.
      It is only read, never modified; other columns (e.g. FILE_NAME) are ignored.
    test_size: Fraction of the rows held out as the test set.
    num_words: Vocabulary cap passed to the Tokenizer. The Embedding layer's
      ``input_dim`` must be set to the same value.
    random_state: Seed for the shuffled train/test split.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``. ``X_train`` and ``X_test``
    are the padded sequences from ``get_sequences`` (both padded to the training
    width); ``y_train`` and ``y_test`` are the matching 'CATEGORY' labels.
  """
  # Only the text and the label are used as features/target
  X = data['MESSAGE']
  y = data['CATEGORY']

  # Splitting the dataset into training and testing data
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, shuffle=True, random_state=random_state
  )

  # The tokenizer is fitted on the training texts only, so the vocabulary
  # is not influenced by the test set
  tokenizer = Tokenizer(num_words=num_words)
  tokenizer.fit_on_texts(X_train)

  # Convert emails to sequences; the test set reuses the training width
  X_train = get_sequences(X_train, tokenizer, train=True)
  X_test = get_sequences(X_test, tokenizer, train=False, max_sequence_length=X_train.shape[1])
  return X_train, X_test, y_train, y_test

In [None]:
 # Build the padded train/test sequence matrices and their labels from the raw DataFrame
 X_train, X_test, y_train, y_test = preprocessing(data)

In [None]:
# Width of the padded training matrix, i.e. the length of the longest tokenized training email
X_train.shape[1]

14805

In [None]:
# Class balance of the training labels (1 = spam, 0 = not spam): spam emails are the minority class
y_train.value_counts()

0    3331
1    1595
Name: CATEGORY, dtype: int64

## Model Creation and Training

In [None]:
# Input layer: one integer token id per position of a padded email
inputs = Input(shape=(X_train.shape[1],))
# Embedding layer: maps every token id to a 64-dimensional vector
embedding = Embedding(
    input_dim=30000, # Must equal the num_words cap given to the Tokenizer
    output_dim=64
)(inputs)
# Flatten layer: collapses the (sequence_length, 64) embeddings into one vector per email
flatten = Flatten()(embedding)
# Output layer: a single sigmoid unit because this is a binary classification problem
outputs = Dense(1, activation='sigmoid')(flatten)

# Creating model
model = Model(inputs=inputs, outputs=outputs)

# The compile() API documents `metrics` as a list; a bare string is not accepted by every Keras release
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 14805)]           0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 14805, 64)         1920000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 947520)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 947521    
Total params: 2,867,521
Trainable params: 2,867,521
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Halt training once val_loss has gone 3 epochs without improving,
# then roll the model back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit for up to 100 epochs, holding out 20% of the training data for validation
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


## Model Evaluation

In [None]:
# Score the held-out test set; the cell below reads results[0] as the loss and results[1] as the accuracy
results = model.evaluate(X_test,y_test)



In [None]:
# results[0] is the test loss; results[1] is the accuracy as a fraction, hence the * 100
print(f"Test Loss: {results[0]:.4f}")
# NOTE(review): y_train is roughly 2:1 non-spam to spam, so accuracy alone may overstate
# quality -- consider also checking precision/recall
print(f"Test Accuracy: {results[1] * 100:.2f}%") # Model is performing fairly well

Test Loss: 0.0225
Test Accuracy: 99.20%
