<a href="https://colab.research.google.com/github/AshvinVignesh/Spam-Classification-with-Embeddings-and-LSTM/blob/main/Spam_Classification_with_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
data = pd.read_csv("Spam-Classification.csv")
data.head(5)

Unnamed: 0,CLASS,SMS
0,ham,"said kiss, kiss, i can't do the sound effects..."
1,ham,&lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.
2,spam,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3,spam,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4,spam,**FREE MESSAGE**Thanks for using the Auction S...


In [15]:
data.count

<bound method DataFrame.count of      CLASS                                                SMS
0      ham   said kiss, kiss, i can't do the sound effects...
1      ham      &lt;#&gt; ISH MINUTES WAS 5 MINUTES AGO. WTF.
2     spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3     spam  * FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4     spam  **FREE MESSAGE**Thanks for using the Auction S...
...    ...                                                ...
1495   ham       Yup, no need. I'll jus wait 4 e rain 2 stop.
1496   ham  Yup... From what i remb... I think should be c...
1497   ham                           Yup... How Ã¼ noe leh...
1498   ham  Yup... Ok i go home look at the timings then i...
1499  spam  <Forwarded from 21870000>Hi - this is your Mai...

[1500 rows x 2 columns]>

In [4]:
target = data["CLASS"]
sms = data["SMS"]
target

0        ham
1        ham
2       spam
3       spam
4       spam
        ... 
1495     ham
1496     ham
1497     ham
1498     ham
1499    spam
Name: CLASS, Length: 1500, dtype: object

In [11]:
import tensorflow as tf
from sklearn import preprocessing


In [18]:
label_encoder = preprocessing.LabelEncoder()

spam_classes = label_encoder.fit_transform(target)

#Convert target to one-hot encoding vector
spam_classes = tf.keras.utils.to_categorical(spam_classes,2)


In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
     

In [21]:
VOCABULARY_WORDS = 10000
MAX_SEQUNCE_LENGTH = 100

This step builds a vocabulary of unique words from the SMS list and assigns a unique ID (integer) to each word.

In [22]:
spam_tokenizer = Tokenizer(num_words=VOCABULARY_WORDS)
spam_tokenizer.fit_on_texts(sms)

The first print statement displays the total number of unique tokens (words) found in the spam messages. The second print statement demonstrates how to retrieve the token ID for a specific word (in this case, "me").

In [26]:
print("Total unique tokens found: ", len(spam_tokenizer.word_index))
print("Example token ID for word \"said\":", spam_tokenizer.word_index.get("said"))
print("Example token ID for word \"kiss\":", spam_tokenizer.word_index.get("kiss"))

Total unique tokens found:  4688
Example token ID for word "said": 260
Example token ID for word "kiss": 921


The blow code converts each sentence in the spam_messages list into a sequence of token IDs based on the tokenizer's vocabulary.

In [27]:
spam_sequences = spam_tokenizer.texts_to_sequences(sms)


The pad_sequences function is used to ensure that all sequences have the same length. Sequences shorter than the specified maxlen are padded with zeros at the beginning, while longer sequences are truncated. The resulting sequences will have a fixed length of MAX_SEQUENCE_LENGTH.

In [29]:
spam_padded = pad_sequences(spam_sequences, maxlen=MAX_SEQUNCE_LENGTH)

In [31]:
print("\nTotal sequences found : ", len(spam_padded))
print("Example Sequence for sentence : ", sms[0] )
print(spam_padded[0])


Total sequences found :  1500
Example Sequence for sentence :   said kiss, kiss, i can't do the sound effects! He is a gorgeous man isn't he! Kind of person who needs a smile to brighten his day! 
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  260  921  921    4  430   55    6 1488 2294  148   10
    3 1489  464 1143  148  922   19  514   77 1144    3  515    1 2295
  397   89]


In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X_train, X_test, Y_train, Y_test = train_test_split(spam_padded,spam_classes,test_size=0.2)

##Building the embeddding matrix using GLOVE dictionary

In [34]:
import numpy as np

#Read pretrained embeddings into a dictionary
glove_dict = {} 

#Loading a 50 feature (dimension) embedding with 6 billion words
with open('glove.6B.50d.txt', "r", encoding="utf8") as glove_file:     
    for line in glove_file:
        
        emb_line = line.split()      
        emb_token = emb_line[0]         
        emb_vector = np.array(emb_line[1:], dtype=np.float32)
        
        if emb_vector.shape[0] == 50:    
            glove_dict[emb_token] = emb_vector



In [35]:
vocab_len = len(spam_tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_len, 50))

for word, id in spam_tokenizer.word_index.items():  
    try:
        embedding_vector = glove_dict.get(word) 
        if embedding_vector is not None:         
            embedding_matrix[id] = embedding_vector
    except:
        pass

In [36]:
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.regularizers import l2
from keras.layers import LSTM,Dense

In [40]:
no_of_output= 2
model = tf.keras.models.Sequential()

model.add(keras.layers.Embedding(vocab_len,
                                 50,   #50: The size of the embedding vector. Each word will be represented by a dense vector of length 50 in the embedding space.
                                 name="Embedding-Layer",
                                 weights=[embedding_matrix],
                                 input_length=MAX_SEQUNCE_LENGTH,
                                 trainable=True))

#Add LSTM Layer
model.add(LSTM(256))
model.add(keras.layers.Flatten())

model.add(keras.layers.Dense(no_of_output,
                             name='Output-Layer',
                             activation='softmax'))

model.compile(loss='categorical_crossentropy',
              metrics=['accuracy'])

In [41]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding-Layer (Embedding)  (None, 100, 50)          234450    
                                                                 
 lstm (LSTM)                 (None, 256)               314368    
                                                                 
 flatten (Flatten)           (None, 256)               0         
                                                                 
 Output-Layer (Dense)        (None, 2)                 514       
                                                                 
Total params: 549,332
Trainable params: 549,332
Non-trainable params: 0
_________________________________________________________________


In [42]:
VERBOSE=1

#Setup Hyper Parameters for training
BATCH_SIZE=256
EPOCHS=10
VALIDATION_SPLIT=0.2

print("\nTraining Progress:\n------------------------------------")

history=model.fit(X_train,
          Y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          verbose=VERBOSE,
          validation_split=VALIDATION_SPLIT)

print("\nEvaluation against Test Dataset :\n------------------------------------")
model.evaluate(X_test,Y_test)


Training Progress:
------------------------------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Evaluation against Test Dataset :
------------------------------------


[0.15506385266780853, 0.9466666579246521]