# Exercise 2

## Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

print("Setup Complete")

Setup Complete


In [2]:
# Path of the file to read
spam_filepath = "../datasets/spam_or_not_spam.csv"

# Read the file into a variable spam_data
spam_data = pd.read_csv(spam_filepath)

# Set seed for reproducibility
np.random.seed(0)

## Prepare The Dataset

In [3]:
spam_data

Unnamed: 0,email,label
0,mike bostock said received from trackingNUMBE...,0
1,no i was just a little confused because i m r...,0
2,this is just an semi educated guess if i m wro...,0
3,jm URL justin mason writes except for NUMBER t...,0
4,i just picked up razor sdk NUMBER NUMBER and N...,0
...,...,...
1495,abc s good morning america ranks it the NUMBE...,1
1496,hyperlink hyperlink hyperlink let mortgage le...,1
1497,thank you for shopping with us gifts for all ...,1
1498,the famous ebay marketing e course learn to s...,1


In [4]:
# Check for missing values
spam_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   1499 non-null   object
 1   label   1500 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 23.6+ KB


In [5]:
# Drop missing values
spam_data.dropna(inplace=True)

In [6]:
# Split the data
X = spam_data["email"].values
y = spam_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [7]:
# Prepare tokenizer
t = Tokenizer()
t.fit_on_texts(X_train)

In [8]:
# Integer encode the documents
encoded_train = t.texts_to_sequences(X_train)
encoded_test = t.texts_to_sequences(X_test)

In [9]:
# Pad documents to a max length of 1000 words
max_length = 1000
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')

## Build Deep Learning Model

In [10]:
vocab_size = len(t.word_index) + 1 # Add 1 because of 0s of padding

# Define the model
model = Sequential()
model.add(Embedding(vocab_size, 24, input_length=max_length))
model.add(Flatten())
model.add(Dense(500, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 24)          473088    
_________________________________________________________________
flatten (Flatten)            (None, 24000)             0         
_________________________________________________________________
dense (Dense)                (None, 500)               12000500  
_________________________________________________________________
dense_1 (Dense)              (None, 200)               100200    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1

In [11]:
# Define early stopping
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

# Fit the model
model.fit(x=padded_train,
         y=y_train,
         epochs=50,
         validation_data=(padded_test, y_test), verbose=1,
         callbacks=[early_stop]
         )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 00013: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f4aa50f1070>

In [12]:
# Set a threshold for results
preds = (model.predict(padded_test) > 0.5).astype("int32")

In [13]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       246
           1       0.97      0.95      0.96       129

    accuracy                           0.97       375
   macro avg       0.97      0.97      0.97       375
weighted avg       0.97      0.97      0.97       375

