# Import libraries

In [3]:
import pandas as pd

import re

import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS

import tensorflow as tf
# from tensorflow.keras.metrics import Sparse
from tensorflow.keras import Sequential
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, LSTM


import plotly.express as px
from plotly import graph_objects as go




# Import data

In [4]:
df = pd.read_csv('src/spam.csv', encoding='cp1252')

UTF-8 encoding doesn't work on this file, we need to use another encoding type.

# Basic Statistics

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [7]:
fig = px.pie(
    values=df['v1'].value_counts(),
    names=df['v1'].unique(),
    height=500,
    width=700
    )
fig.show()

In [8]:
df[df['Unnamed: 4'].isnull()==False]

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
2255,ham,I just lov this line: \Hurt me with the truth,I don't mind,i wil tolerat.bcs ur my someone..... But,"Never comfort me with a lie\"" gud ni8 and swe..."
3525,ham,\HEY BABE! FAR 2 SPUN-OUT 2 SPK AT DA MO... DE...,HAD A COOL NYTHO,TX 4 FONIN HON,"CALL 2MWEN IM BK FRMCLOUD 9! J X\"""""
4668,ham,"When I was born, GOD said, \Oh No! Another IDI...",GOD said,"\""OH No! COMPETITION\"". Who knew","one day these two will become FREINDS FOREVER!"""
5048,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""


The csv file use a comma separator. The unnamed columns were created by mistake, because the original text contains commas.
<br>Let's concatenate them, and do some cleaning on the dataset.

In [9]:
# replace empty values with an empty string
df = df.fillna('')

In [10]:
#Concatenate columns with a comma separator
df['text'] = df[['v2','Unnamed: 2','Unnamed: 3','Unnamed: 4']].apply(lambda x: ','.join(x), axis=1)


In [11]:
#rename and drop unused columns
df=df.rename(columns={'v1':'target'})
df = df[['target','text']]

# Preprocessing

## Normalization

- Punctuation
- Stop words
- Text kept in case shape as it might be a spam indicator.
- Target be encoded into a binary form (HAM=0, SPAM=1)

- stemming/Lemmatization

In [12]:
nlp = en_core_web_sm.load()

In [13]:
#remove punctuation
df['cleaned_text'] = df['text'].apply(lambda x: re.sub('[!\"#$%&()*+,-./:;<=>?@\[\]^_`{|}~\\\]+'," ", x))

#replace doubles whitespaces with single and trim starting and finishing whitespaces
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.replace(" +"," ").strip())

df['cleaned_text'] = df['cleaned_text'].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)]))

In [14]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000) # instanciate the tokenizer
tokenizer.fit_on_texts(df["cleaned_text"])
df["encoded_text"] = tokenizer.texts_to_sequences(df['cleaned_text'])

#Add a column with the length of the encoded text.
df['len_text'] = df["encoded_text"].apply(lambda x: len(x))

#Remove empty encoded text.
df = df[df["len_text"]!=0]

After being processed, some sentences didn't get encoded. It is probably due to the fact that the words were not contained in the vocabulary of the tokenizer.

In [15]:
#Zero-padding the encoded text, to keep same length for all sequences
text_pad = tf.keras.preprocessing.sequence.pad_sequences(df['encoded_text'], padding="post")

In [16]:
#Binary encode the target
df['encoded_target']=df['target'].map({'ham':0,'spam':1})

In [17]:
full_ds = tf.data.Dataset.from_tensor_slices((text_pad, df['encoded_target']))

In [18]:
# Train Test Split
TAKE_SIZE = int(0.7*df.shape[0])

train_ds = full_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(64)
test_ds = full_ds.skip(TAKE_SIZE).batch(64)

In [19]:
 # Regardons un batch 
for text, target in train_ds.take(1):
  print(text, target)

tf.Tensor(
[[975  29  61 ...   0   0   0]
 [110 242 671 ...   0   0   0]
 [672   0   0 ...   0   0   0]
 ...
 [124   5 112 ...   0   0   0]
 [ 37  57  11 ...   0   0   0]
 [  6  84 538 ...   0   0   0]], shape=(64, 58), dtype=int32) tf.Tensor(
[0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0], shape=(64,), dtype=int64)


# Model Training

## Base line model (RNN)

In [20]:
vocab_size = 1000
embedding_dim= 8

RNN_model = Sequential([
    Embedding(vocab_size+1,embedding_dim,input_shape=[text.shape[1],],name='embedding',
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
  SimpleRNN(units=64, return_sequences=True), # maintains the sequential nature
  SimpleRNN(units=32, return_sequences=False), # returns the last output
  Dense(16, activation='relu'), # a dense layer
  Dense(1, activation="sigmoid") # the prediction layer
])


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [21]:
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()

def f1_score_metric(y_true, y_pred):
    
    precision_calc= precision(y_true,y_pred)
    recall_calc=recall(y_true,y_pred)
    f1_score = 2 * (precision_calc * recall_calc) / (precision_calc + recall_calc + K.epsilon())
    return f1_score

In [22]:
RNN_model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[f1_score_metric])

In [23]:
RNN_model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=30)

Epoch 1/30
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - f1_score_metric: 0.1940 - loss: 0.5238 - val_f1_score_metric: 0.5713 - val_loss: 0.1595
Epoch 2/30
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - f1_score_metric: 0.6549 - loss: 0.1228 - val_f1_score_metric: 0.7539 - val_loss: 0.0764
Epoch 3/30
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - f1_score_metric: 0.7831 - loss: 0.0551 - val_f1_score_metric: 0.8246 - val_loss: 0.0607
Epoch 4/30
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - f1_score_metric: 0.8378 - loss: 0.0359 - val_f1_score_metric: 0.8610 - val_loss: 0.0689
Epoch 5/30
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - f1_score_metric: 0.8680 - loss: 0.0215 - val_f1_score_metric: 0.8835 - val_loss: 0.0704
Epoch 6/30
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - f1_score_metric: 0.8879 - loss: 0.0191

<keras.src.callbacks.history.History at 0x1f1ab6f7d90>

In [24]:
history = RNN_model.history.history

fig = go.Figure(data=[
                      go.Scatter(
                          y=history["loss"],
                          name="Training loss",
                          mode="lines",
                          ),
                      go.Scatter(
                          y=history["val_loss"],
                          name="Validation loss",
                          mode="lines",
                          )
])
fig.update_layout(
    title='Training and val loss across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy' ,
    height=600,
    width=900
     
)
fig.show()

In [25]:
RNN_model.evaluate(train_ds)
RNN_model.evaluate(test_ds)

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - f1_score_metric: 0.9617 - loss: 6.2951e-05
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - f1_score_metric: 0.9623 - loss: 0.1349   


[0.12159130722284317, 0.9621945023536682]

The model seems to be good quickly. After a few epochs, the model starts to diverge.

## LSTM Model

In [26]:
LSTM_model = Sequential([
  Embedding(vocab_size+1,embedding_dim,input_shape=[text.shape[1],],name='embedding',
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
  LSTM(units=64, return_sequences=True), # maintains the sequential nature
  LSTM(units=32, return_sequences=False), # returns the last output
  Dense(16, activation='relu'), # a dense layer
  Dense(1, activation="sigmoid") # the prediction layer
])

In [27]:
LSTM_model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[f1_score_metric])

In [28]:
LSTM_model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=50)

Epoch 1/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - f1_score_metric: 0.9592 - loss: 0.6040 - val_f1_score_metric: 0.9517 - val_loss: 0.2252
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - f1_score_metric: 0.9508 - loss: 0.1757 - val_f1_score_metric: 0.9502 - val_loss: 0.0881
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - f1_score_metric: 0.9501 - loss: 0.0661 - val_f1_score_metric: 0.9502 - val_loss: 0.0651
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - f1_score_metric: 0.9503 - loss: 0.0459 - val_f1_score_metric: 0.9505 - val_loss: 0.0602
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - f1_score_metric: 0.9506 - loss: 0.0410 - val_f1_score_metric: 0.9510 - val_loss: 0.0529
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - f1_score_metric: 0.9512 - loss: 0.0189

<keras.src.callbacks.history.History at 0x1f1a8e7b2e0>

In [29]:
history = LSTM_model.history.history

fig = go.Figure(data=[
                      go.Scatter(
                          y=history["loss"],
                          name="Training loss",
                          mode="lines",
                          ),
                      go.Scatter(
                          y=history["val_loss"],
                          name="Validation loss",
                          mode="lines",
                          )
])
fig.update_layout(
    title='Training and val loss across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy' ,
    height=600,
    width=900
     
)
fig.show()


We can see that the model starts to diverge after 22-23 epochs

In [30]:
LSTM_model.evaluate(train_ds)
LSTM_model.evaluate(test_ds)

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - f1_score_metric: 0.9658 - loss: 4.0212e-05
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - f1_score_metric: 0.9659 - loss: 0.2086


[0.21355025470256805, 0.9659081697463989]

## Conclusion

After comparing the two models, it can be seen that the performance of both models are really close.