In [2]:
import tensorflow as tf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Input, LSTM, Dense, GlobalMaxPooling1D, Embedding
from tensorflow.keras.models import Model

In [4]:
df = pd.read_csv("spam_or_not_spam.csv")
df.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [7]:
df.isna().sum()

email    1
label    0
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['email'].values, df['label'].values, test_size=0.33)

### 1 Tokenization

- convert strings to tokens
- convert tokens to sequences of indices


In [11]:
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train) # only want to fit train set
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

In [15]:
# pad sequences to get N x T matrix, T = length of sequence of equal length
data_train = pad_sequences(sequences_train)
print(f"Shape of train data tensor: {data_train.shape}")

# cannot guess how long test sequences will be, set some value equal to training max
# may truncate possible future sentences
T = data_train.shape[1]
data_test = pad_sequences(sequences_test, maxlen=T)
print(f"Shape of test data tensor: {data_test.shape}")

Shape of train data tensor: (2009, 13303)
Shape of test data tensor: (990, 13303)


### 2 Build the LSTM model


In [18]:
word2idxmapping = tokenizer.word_index
V = len(word2idxmapping)
print(f"Found {V} unique tokens")

### Embedding dimensionality (learned param)
D = 20 # word vector size

### Hidden state dimensionality (learned param)
M = 15

### Output state dimensionality
K = 1

input = Input(shape=(T, )) # takes in sequence of integers to get corresponding sequence vector
x = Embedding(V + 1, D)(input) # V + 1 because of index 1, 0 used to pad + Embedding layer - to retrieve sequence vector
# becomes T x D after embedding
x = LSTM(M, return_sequences=True)(x) # M = no. of hidden nodes, learned param
x = GlobalMaxPooling1D()(x) # helps in learning long-term dependency by focusing on the most impt hidden state features
x = Dense(K, activation='sigmoid')(x) # 1 for spam or no spam prediction

model = Model(input, x)

### 3 Compile & Train model


In [None]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

print("Training model....")
result = model.fit(
    data_train, y_train,
    epochs=10,
    validation_data=(data_test, y_test)
)

Training model....
Epoch 1/10

In [None]:
plt.figure(figsize=(15, 6))
plt.plot(result.history['loss'], label='loss')
plt.plot(result.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

plt.figure(figsize=(15, 6))
plt.plot(result.history['accuracy'], label='accuracy')
plt.plot(result.history['val_accuracy'], label='val_accuracy')
plt.legend()
plt.show()