In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

In [None]:
# Load the raw SMS data set (comma-separated, read with latin-1 encoding)
# and preview the first rows as plain text.
df = pd.read_csv('spam.csv', sep=',', encoding='latin-1')
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [None]:
# Keep only the label column (v1) and the message text column (v2).
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' tolerates the columns already being absent, so this cell
# can be re-run safely (the in-place version raised KeyError on a second run).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Features: the raw message texts.
X = df['v2']

# Targets: integer-encode the class labels, then reshape to a single column
# (-1 lets NumPy use as many rows as necessary to accommodate the data).
le = LabelEncoder()
Y = le.fit_transform(df['v1']).reshape(-1, 1)

print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object


In [None]:
import tensorflow as tf

# Fixed seed so the split (and everything derived from it) is the same on
# every Restart & Run All.
SEED = 42

# Split the data into train and test.
# stratify=Y keeps the ham/spam proportion the same in both partitions.
X_train,X_test,Y_train,Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=SEED, stratify=Y
)

In [None]:
# Settings used when converting the text into padded token sequences
max_words = 1_000  # passed to Tokenizer(num_words=...) and used as the Embedding input size
max_len = 150      # length every sequence is padded to

In [None]:
# Create the tokenizer; num_words=max_words caps the vocabulary that is used
# when texts are converted to integer sequences.
# Its vocabulary stays empty until fit_on_texts is called on the training texts.
tok = Tokenizer(num_words=max_words)

In [None]:
# Build the tokenizer's internal vocabulary from the training texts only;
# the actual text -> integer conversion happens in texts_to_sequences.
tok.fit_on_texts(X_train)

In [None]:
# Transform each training text into a sequence of integer token ids
sequences = tok.texts_to_sequences(X_train)
# Show only the first few sequences: printing the whole list dumps thousands
# of rows into the notebook output and buries the narrative.
print(sequences[:5])

[[1, 240, 49, 777, 3, 22, 19, 27, 373, 74], [7, 81, 334, 122, 517, 292, 3, 34, 2, 734, 7], [214, 25, 14, 196, 23, 1, 833, 161], [496, 21, 497, 44, 23, 4, 302, 26, 464, 264, 53, 351, 1, 196, 697, 21, 536], [537, 272, 15, 21, 38, 219, 184, 965, 735, 429, 374, 96], [446, 6, 44, 20, 966, 393, 325, 44, 518, 224, 447, 375, 88, 736, 375, 29, 88, 8, 10, 66, 366], [211, 24, 352, 3, 8, 4, 498, 40, 9, 14, 61, 7, 19, 3, 54, 19, 39, 167, 3, 335, 1, 167, 376, 23, 103, 130, 97, 11, 66, 17, 4, 125, 63], [34, 288, 12, 42, 41, 16], [273, 303, 1, 13, 3, 22, 175, 499], [1, 105, 17, 142, 1, 84, 481, 282, 44, 4, 135, 2, 154], [56, 19, 3], [1, 65, 8, 13, 377, 834], [121, 24, 289, 128, 119, 394, 430, 10, 82, 6, 147, 66, 236], [155, 162, 318], [129, 265, 34, 5, 629, 431, 28, 538, 18, 93, 69, 230, 28, 2, 21, 594, 53, 76, 81, 336, 212], [27, 21, 77, 25, 518, 89, 2, 137, 231, 143, 465, 572, 1, 353, 778, 351, 175, 225], [112, 168, 251, 78, 18, 698, 123, 16, 967, 1, 4, 779, 737, 7, 894, 16, 319, 111, 6, 50, 1, 34, 

In [None]:
# Bring every token sequence to exactly max_len entries; shorter sequences
# are filled with leading zeros (visible in the first row printed below).
sequences_matrix = tf.keras.utils.pad_sequences(sequences, maxlen=max_len)
print(sequences_matrix[0])
# Sanity check: one padded row per training text
print(len(sequences_matrix), len(X_train))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 240  49 777
   3  22  19  27 373  74]
4736 4736


In [None]:
def RNN(vocab_size=None, seq_len=None, embed_dim=50, lstm_units=64,
        dense_units=256, dropout_rate=0.5):
    """Build the (uncompiled) LSTM binary text classifier.

    Architecture: Embedding -> LSTM -> Dense + ReLU -> Dropout ->
    Dense(1) + sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table. Defaults to the
            notebook-level ``max_words`` when left as ``None``.
        seq_len: Length of each padded input sequence. Defaults to the
            notebook-level ``max_len`` when left as ``None``.
        embed_dim: Size of each embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden dense layer ``FC1``.
        dropout_rate: Dropout rate applied after the hidden dense layer.

    Returns:
        An uncompiled Keras ``Model`` that maps ``(batch, seq_len)`` token-id
        inputs to a ``(batch, 1)`` sigmoid output.
    """
    # Resolve defaults at call time so RNN() keeps working exactly as before
    # while callers can override the sizes explicitly.
    if vocab_size is None:
        vocab_size = max_words
    if seq_len is None:
        seq_len = max_len

    inputs = Input(name='inputs', shape=(seq_len,))
    # input_length is intentionally not passed: the sequence length is already
    # fixed by the Input layer, and the argument is deprecated in newer Keras.
    layer = Embedding(vocab_size, embed_dim)(inputs)

    layer = LSTM(lstm_units)(layer)
    layer = Dense(dense_units, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(dropout_rate)(layer)

    # Single sigmoid unit for the binary decision
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)

    model = Model(inputs=inputs, outputs=layer)

    return model

In [None]:
# Build and compile the classifier
model = RNN()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(),
    metrics=['accuracy'],
)

# Stop as soon as the validation loss stops improving and keep the best
# weights, instead of always running all 10 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Keep the History object so the training curves can be inspected later
# (and so the cell does not end with a bare object repr).
history = model.fit(
    sequences_matrix,
    Y_train,
    batch_size=120,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stop],
)

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding_10 (Embedding)    (None, 150, 50)           50000     
                                                                 
 lstm_9 (LSTM)               (None, 64)                29440     
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_18 (Activation)  (None, 256)               0         
                                                                 
 dropout_9 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

<keras.src.callbacks.History at 0x7f841c3ee910>

In [None]:
# Prepare the test data by converting text to padded sequences with the
# tokenizer that was fitted on the training texts, then score the model.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = tf.keras.utils.pad_sequences(test_sequences, maxlen=max_len)
accr = model.evaluate(test_sequences_matrix, Y_test)
# accr[0] is the loss, accr[1] the accuracy metric requested at compile time
print(
    "Test Set \n loss:{:0.3f}\nAccuracy:{:0.3f}".format(accr[0], accr[1])
)

Test Set 
 loss:0.055
Accuracy:0.990
