In [None]:
 import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data import

df = pd.read_csv('/content/drive/MyDrive/Jedha_Fullstack/AT&T_Spam_Detector_Project/Spam_Dataset.csv', encoding='latin1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
# Fast data analysis

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
df.loc[df['Unnamed: 3'].notna()]

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
95,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
281,ham,\Wen u miss someone,the person is definitely special for u..... B...,why to miss them,"just Keep-in-touch\"" gdeve.."""
899,spam,Your free ringtone is waiting to be collected....,PO Box 5249,"MK17 92H. 450Ppw 16""",
1038,ham,"Edison has rightly said, \A fool can ask more ...",GN,GE,"GNT:-)"""
2170,ham,\CAN I PLEASE COME UP NOW IMIN TOWN.DONTMATTER...,JUST REALLYNEED 2DOCD.PLEASE DONTPLEASE DONTIG...,"U NO THECD ISV.IMPORTANT TOME 4 2MORO\""""",
2255,ham,I just lov this line: \Hurt me with the truth,I don't mind,i wil tolerat.bcs ur my someone..... But,"Never comfort me with a lie\"" gud ni8 and swe..."
3145,ham,\SHIT BABE.. THASA BIT MESSED UP.YEH,SHE SHUDVETOLD U. DID URGRAN KNOW?NEWAY,"ILLSPEAK 2 U2MORO WEN IM NOT ASLEEP...\""""",
3506,ham,Two fundamentals of cool life: \Walk,"like you are the KING\""...! OR \""Walk like yo...","whoever is the KING\""!... Gud nyt""",
3525,ham,\HEY BABE! FAR 2 SPUN-OUT 2 SPK AT DA MO... DE...,HAD A COOL NYTHO,TX 4 FONIN HON,"CALL 2MWEN IM BK FRMCLOUD 9! J X\"""""
4668,ham,"When I was born, GOD said, \Oh No! Another IDI...",GOD said,"\""OH No! COMPETITION\"". Who knew","one day these two will become FREINDS FOREVER!"""


In [None]:
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [None]:
# Fast encoding of target variable values

df['v1'] = df['v1'].replace({"ham" : 0, "spam" : 1})

In [None]:
# First preprocessing of messages with Pandas

import re

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

df['clean_text'] = df['v2'].astype(str).apply(remove_emoji)
df['clean_text'] = df['clean_text'].replace(r"[^a-zA-Z0-9\s]", " ")
df['clean_text'] = df['clean_text'].apply(lambda x: ''.join(elt for elt in x if elt.isalnum() or elt==" "))
df['clean_text'] = df['clean_text'].replace(r"\s+", " ")
df['clean_text'] = df['clean_text'].apply(lambda x: x.lower())
df['clean_text'][:5]

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: clean_text, dtype: object

In [None]:
# Preprocessing and tokenization of messages with Spacy

!pip install spacy -q

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
!python -m spacy download en_core_web_sm -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import en_core_web_sm

nlp = spacy.load('en_core_web_sm')

In [None]:
%%time

df['tokens'] = df['clean_text'].apply(lambda x: [token.lemma_ for token in nlp(x) if token.text not in STOP_WORDS])

CPU times: user 58.5 s, sys: 156 ms, total: 58.7 s
Wall time: 1min 7s


In [None]:
df['clean_text'] = df['tokens'].apply(lambda txt: ' '.join(txt).replace(",", ""))

In [None]:
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,clean_text,tokens
0,0,"Go until jurong point, crazy.. Available only ...",,,,jurong point crazy available bugis n great wor...,"[jurong, point, crazy, available, bugis, n, gr..."
1,0,Ok lar... Joking wif u oni...,,,,ok lar joke wif u oni,"[ok, lar, joke, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."


In [None]:
df['clean_text'].isna().sum()

0

In [None]:
# Text encoding

import numpy as np
import tensorflow as tf

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])
df['encoded_text'] = tokenizer.texts_to_sequences(df['clean_text'])
df["len_text"] = df['encoded_text'].apply(lambda x: len(x))
df = df[df["len_text"]!=0]
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,clean_text,tokens,encoded_text,len_text
0,0,"Go until jurong point, crazy.. Available only ...",,,,jurong point crazy available bugis n great wor...,"[jurong, point, crazy, available, bugis, n, gr...","[3626, 242, 460, 476, 958, 38, 54, 216, 959, 8...",15
1,0,Ok lar... Joking wif u oni...,,,,ok lar joke wif u oni,"[ok, lar, joke, wif, u, oni]","[12, 207, 477, 301, 1, 1472]",6
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...","[15, 311, 4, 547, 678, 36, 1473, 866, 436, 147...",22


In [None]:
# Padding

text_pad = tf.keras.preprocessing.sequence.pad_sequences(df['encoded_text'], padding="post")

In [None]:
full_ds = tf.data.Dataset.from_tensor_slices((text_pad, df['v1']))

In [None]:
# Splitting dataset into train and test set

TAKE_SIZE = int(0.7*df.shape[0])

train_data = full_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE)
train_data = train_data.batch(64)

test_data = full_ds.skip(TAKE_SIZE)
test_data = test_data.batch(64)

In [None]:
# Viz of text and target

for text, cat in train_data.take(1):
    print(text, cat)

tf.Tensor(
[[ 575   34 3158 ...    0    0    0]
 [   2   77   39 ...    0    0    0]
 [   5 6241 6242 ...    0    0    0]
 ...
 [  75   70    2 ...    0    0    0]
 [ 436  194   52 ...    0    0    0]
 [  21  171 2092 ...    0    0    0]], shape=(64, 73), dtype=int32) tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0], shape=(64,), dtype=int64)


In [None]:
# SimpleRNN model

from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, GRU, LSTM

vocab_size = df['encoded_text'].apply(lambda x: max(x)).max()

model_rnn = tf.keras.Sequential([
    # Word Embedding layer
    Embedding(vocab_size + 1, 64, input_shape=[text.shape[1], ], name="embedding"),
    # SimpleRNN layers
    SimpleRNN(units=16, return_sequences=True),  # Maintains the sequential nature
    SimpleRNN(units=8, return_sequences=False),  # Returns the last output
    # Dense layers once the data is flat
    Dense(4, activation='relu'),
    # Output layer with a single neuron for binary classification and sigmoid activation
    Dense(1, activation="sigmoid")
])

model_rnn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 73, 64)            527360    
                                                                 
 simple_rnn_10 (SimpleRNN)   (None, 73, 16)            1296      
                                                                 
 simple_rnn_11 (SimpleRNN)   (None, 8)                 200       
                                                                 
 dense_15 (Dense)            (None, 4)                 36        
                                                                 
 dense_16 (Dense)            (None, 1)                 5         
                                                                 
Total params: 528,897
Trainable params: 528,897
Non-trainable params: 0
_________________________________________________________________


In [None]:
optimizer = tf.keras.optimizers.Adam()

model_rnn.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [None]:
# Adding class weights to compensate the very unbalanced distribution of the two classes in the original dataset

weights = 1/df['v1'].value_counts()
weights = weights * len(df)/2
weights = {index : values for index , values in zip(weights.index,weights.values)}
weights

{0: 0.5776184538653367, 1: 3.720883534136546}

In [None]:
# Model training and performance assessment over 20 epochs

history_rnn = model_rnn.fit(train_data,
                            epochs=20,
                            validation_data=test_data,
                            class_weight=weights
                            )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Viz of the loss evolution over the epochs

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(y=history_rnn.history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=history_rnn.history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

In [None]:
# Viz of the binary accuracy evolution over the epochs

fig = go.Figure()
fig.add_trace(go.Scatter(y=history_rnn.history['binary_accuracy'],
                    mode='lines',
                    name='binary_accuracy'))
fig.add_trace(go.Scatter(y=history_rnn.history["val_binary_accuracy"],
                    mode='lines',
                    name='val_binary_accuracy'))
fig.show()

In [None]:
# Model's predictions

rnn_predicted_labels = model_rnn.predict(test_data)
rnn_predicted_labels



array([[0.00715643],
       [0.00667989],
       [0.00675104],
       ...,
       [0.8294013 ],
       [0.00715346],
       [0.00669129]], dtype=float32)

In [None]:
rnn_predicted_classes = (rnn_predicted_labels >= 0.5).astype(int)
rnn_predicted_classes

array([[0],
       [0],
       [0],
       ...,
       [1],
       [0],
       [0]])

In [None]:
true_labels = []
for _, labels in test_data:
    true_labels.extend(labels.numpy().tolist())

In [None]:
for i in range(len(rnn_predicted_classes)):
    print(f"Valeur prédite : {rnn_predicted_classes[i][0]}, Valeur réelle : {true_labels[i]}")

Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 1, Valeur réelle : 1
Valeur prédite : 1, Valeur réelle : 1
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 1, Valeur réelle : 1
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 1, Valeur réelle : 1
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédi

In [None]:
count_non_zero_predictions = 0

for i in range(len(rnn_predicted_classes)):
    if rnn_predicted_classes[i][0] != 0:
        count_non_zero_predictions += 1

print("Number of predicted spam messages :", count_non_zero_predictions)

Number of predicted spam messages : 283


In [None]:
# Reset training and validation sets for a new model

TAKE_SIZE = int(0.7*df.shape[0])

train_data = full_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE)
train_data = train_data.batch(64)

test_data = full_ds.skip(TAKE_SIZE)
test_data = test_data.batch(64)

In [None]:
for text, cat in train_data.take(1):
    print(text, cat)

tf.Tensor(
[[   8 3989    0 ...    0    0    0]
 [  36  149  231 ...    0    0    0]
 [1713  586   80 ...    0    0    0]
 ...
 [  50   71  892 ...    0    0    0]
 [  15  418  101 ...    0    0    0]
 [  25   44  659 ...    0    0    0]], shape=(64, 73), dtype=int32) tf.Tensor(
[0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0], shape=(64,), dtype=int64)


In [None]:
# GRU model

from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, GRU, LSTM

vocab_size = df['encoded_text'].apply(lambda x: max(x)).max()

model_gru = tf.keras.Sequential([
    # Word Embedding layer
    Embedding(vocab_size + 1, 64, input_shape=[text.shape[1], ], name="embedding"),
    # SimpleRNN layers
    GRU(units=16, return_sequences=True),  # Maintains the sequential nature
    GRU(units=8, return_sequences=False),  # Returns the last output
    # Dense layers once the data is flat
    Dense(4, activation='relu'),
    # Output layer with a single neuron for binary classification and sigmoid activation
    Dense(1, activation="sigmoid")
])

model_gru.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 73, 64)            527360    
                                                                 
 gru_2 (GRU)                 (None, 73, 16)            3936      
                                                                 
 gru_3 (GRU)                 (None, 8)                 624       
                                                                 
 dense_2 (Dense)             (None, 4)                 36        
                                                                 
 dense_3 (Dense)             (None, 1)                 5         
                                                                 
Total params: 531,961
Trainable params: 531,961
Non-trainable params: 0
_________________________________________________________________


In [None]:
optimizer = tf.keras.optimizers.Adam()

model_gru.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()])

In [None]:
# Model training and performance assessment over 20 epochs

history_gru = model_gru.fit(train_data,
                            epochs=20,
                            validation_data=test_data,
                            class_weight=weights
                            )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Viz of the loss evolution over the epochs

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(y=history_gru.history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=history_gru.history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

In [None]:
# Viz of the binary accuracy evolution over the epochs

fig = go.Figure()
fig.add_trace(go.Scatter(y=history_gru.history['binary_accuracy'],
                    mode='lines',
                    name='binary_accuracy'))
fig.add_trace(go.Scatter(y=history_gru.history["val_binary_accuracy"],
                    mode='lines',
                    name='val_binary_accuracy'))
fig.show()

In [None]:
# Model's predictions

gru_predicted_labels = model_gru.predict(test_data)
gru_predicted_labels



array([[0.49725673],
       [0.49725673],
       [0.49725673],
       ...,
       [0.49725673],
       [0.49725673],
       [0.49725673]], dtype=float32)

In [None]:
gru_predicted_classes = (gru_predicted_labels >= 0.5).astype(int)
gru_predicted_classes

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [None]:
for i in range(len(gru_predicted_classes)):
    print(f"Valeur prédite : {gru_predicted_classes[i][0]}, Valeur réelle : {true_labels[i]}")

Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 1
Valeur prédite : 0, Valeur réelle : 1
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 1
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 1
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédite : 0, Valeur réelle : 0
Valeur prédi

In [None]:
count_non_zero_predictions = 0

for i in range(len(gru_predicted_classes)):
    # Vérifie si la valeur prédite n'est pas égale à zéro
    if gru_predicted_classes[i][0] != 0:
        count_non_zero_predictions += 1

print("Number of predicted spam messages :", count_non_zero_predictions)

Number of predicted spam messages : 0
