In [373]:
# Start LSTM Model for Spam Detection
# LSTMs are the industry standard model to counter vanishing/exploding gradients
#
# NOTE(review): later cells call `train_test_split`, `LabelEncoder`,
# `to_categorical` and `read_glove_vecs`, none of which is imported or defined
# in the cells visible here. A fresh-kernel "Run All" needs those names brought
# into scope in this cell first.
import numpy as np
import tensorflow
np.random.seed(0)  # overridden by the np.random.seed(1) call at the end of this cell
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform
import pandas as pd

# NumPy's global seed after this cell has run.
np.random.seed(1)

In [377]:
# Utils
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: sentences_to_indices

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of word indices.
    The output has shape (m, max_len), so it can be given to `Embedding()`.

    Each sentence is lower-cased and split on whitespace. Words that are not
    keys of `word_to_index` are skipped and do not consume a position.
    Positions that are not filled keep the padding value 0. A sentence with
    more than `max_len` known words is truncated to its first `max_len` known
    words instead of raising an IndexError.

    Arguments:
    X -- array-like of sentences (strings), of shape (m,)
    word_to_index -- a dictionary mapping each word to its index
    max_len -- number of columns of the output, i.e. the maximum number of words kept per sentence

    Returns:
    X_indices -- float array of word indices for the sentences in X, of shape (m, max_len)
    """
    # Accept lists and pandas Series as well as numpy arrays.
    X = np.asarray(X)
    m = X.shape[0]                                   # number of examples

    # Zero-initialised so that unused trailing positions act as padding.
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # Indices of the in-vocabulary words, in sentence order.
        known = [word_to_index[w] for w in X[i].lower().split() if w in word_to_index]
        # Truncate so that writing the row can never run past max_len columns.
        kept = known[:max_len]
        X_indices[i, :len(kept)] = kept

    return X_indices

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Build a non-trainable Keras Embedding() layer whose weights are pre-trained word vectors
    (GloVe vectors in this notebook).

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- Keras Embedding instance with its weights set from word_to_vec_map
    """
    # The weight matrix has one more row than the vocabulary has entries.
    n_rows = len(word_to_index) + 1
    # The vector length is read from an arbitrary entry of the map.
    sample_vector = word_to_vec_map[next(iter(word_to_vec_map))]
    n_cols = sample_vector.shape[0]

    # Row idx holds the vector of the word whose index is idx; rows never assigned stay zero.
    weights = np.zeros((n_rows, n_cols))
    for token in word_to_index:
        weights[word_to_index[token], :] = word_to_vec_map[token]

    # trainable=False: the vectors are loaded as-is and excluded from training.
    layer = Embedding(input_dim=n_rows, output_dim=n_cols, trainable=False)

    # The layer has to be built before its weights can be set.
    layer.build((None,))
    layer.set_weights([weights])

    return layer

In [378]:
### YOU CANNOT EDIT THIS CELL

# UNIT TEST
def pretrained_embedding_layer_test(target):
    """
    Unit test for an embedding-layer factory such as `pretrained_embedding_layer`.

    Builds a 14-word vocabulary of 2-dimensional vectors, assigns the indices 0..13 in
    insertion order, calls `target(word_to_vec_map, word_to_index)` and checks the
    returned layer's type, `input_dim`, `output_dim` and weights.

    Arguments:
    target -- callable taking (word_to_vec_map, word_to_index) and returning the layer under test

    Raises:
    AssertionError -- if any of the checks on the returned layer fails
    """
    # Create a controlled word to vec map
    word_to_vec_map = {'a': [3, 3], 'synonym_of_a': [3, 3], 'a_nw': [2, 4], 'a_s': [3, 2], 'a_n': [3, 4], 
                       'c': [-2, 1], 'c_n': [-2, 2],'c_ne': [-1, 2], 'c_e': [-1, 1], 'c_se': [-1, 0], 
                       'c_s': [-2, 0], 'c_sw': [-3, 0], 'c_w': [-3, 1], 'c_nw': [-3, 2]
                      }
    # Convert lists to np.arrays
    for key in word_to_vec_map.keys():
        word_to_vec_map[key] = np.array(word_to_vec_map[key])
        
    # Create a word_to_index dictionary (index = position of the word in the map, starting at 0)
    word_to_index = {}
    for idx, val in enumerate(list(word_to_vec_map.keys())):
        word_to_index[val] = idx;
        
    np.random.seed(1)
    embedding_layer = target(word_to_vec_map, word_to_index)
    
    assert type(embedding_layer) == Embedding, "Wrong type"
    # One more row than there are words in the vocabulary.
    assert embedding_layer.input_dim == len(list(word_to_vec_map.keys())) + 1, "Wrong input shape"
    assert embedding_layer.output_dim == len(word_to_vec_map['a']), "Wrong output shape"
    # Expected weights: the 14 vectors in index order, followed by one all-zero row.
    assert np.allclose(embedding_layer.get_weights(), 
                       [[[ 3, 3], [ 3, 3], [ 2, 4], [ 3, 2], [ 3, 4],
                       [-2, 1], [-2, 2], [-1, 2], [-1, 1], [-1, 0],
                       [-2, 0], [-3, 0], [-3, 1], [-3, 2], [ 0, 0]]]), "Wrong vaulues"
    print("\033[92mAll tests passed!")
       
    
# Run the unit test against the implementation defined above.
pretrained_embedding_layer_test(pretrained_embedding_layer)

[92mAll tests passed!


In [371]:
# Smoke-check sentences_to_indices on three short sentences, padded to 5 columns.
# NOTE(review): in the visible notebook `word_to_index` is only assigned by the
# `read_glove_vecs(...)` cell further down, so this cell raises NameError on a
# fresh top-to-bottom run unless that cell has been executed first.
X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
X1_indices = sentences_to_indices(X1, word_to_index, max_len=5)
print("X1 =", X1)
print("X1_indices =\n", X1_indices)

X1 = ['funny lol' 'lets play baseball' 'food is ready for you']
X1_indices =
 [[155345. 225122.      0.      0.      0.]
 [220930. 286375.  69714.      0.      0.]
 [151204. 192973. 302254. 151349. 394475.]]


In [394]:
# Load in csv dataset
# The file is read as latin-1. The previous mid-cell `df.head()` was removed:
# only a cell's last expression is displayed, so its result was discarded.
df = pd.read_csv('data/spam.csv', delimiter=',', encoding='latin-1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [395]:
# Keep only v1 and v2 (used below as the labels and the message text).
# Rebinding `df` with errors='ignore' instead of dropping in place makes the
# cell safe to re-run: a second execution no longer raises KeyError for
# columns that are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [449]:
# Get input and outputs
# Only the first 100 rows of the dataset are used.
X = df.v2.iloc[:100]
Y = df.v1.iloc[:100]

# random_state pins the shuffle so the split (and everything derived from it,
# such as maxLen) is the same on every run. stratify=Y keeps the label
# proportions equal in the train and test parts, so the small test set cannot
# end up without one of the classes.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=1, stratify=Y
)

In [450]:
# Word count (whitespace-separated) of the longest training message.
maxLen = max(len(message.split()) for message in X_train)
maxLen

55

In [376]:
# Read the GloVe file and unpack the three results into the word->index,
# index->word and word->vector lookups used by the rest of the notebook.
# NOTE(review): `read_glove_vecs` is not defined or imported in the visible cells — confirm where it comes from.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [452]:
def SMS_SPAM_DETECT_LSTM(input_shape, word_to_vec_map, word_to_index):
    """
    Build the graph of the two-layer LSTM classifier used for SMS spam detection.

    Layer order: int32 index input -> pretrained embedding -> LSTM(128, full sequence)
    -> Dropout(0.5) -> LSTM(128, last state only) -> Dropout(0.5) -> Dense(2) -> softmax.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a Keras Model instance (not compiled)
    """
    # The input carries word indices, hence the integer dtype.
    token_ids = Input(shape=input_shape, dtype='int32')

    # Look the indices up in the pretrained embedding layer.
    glove_lookup = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    word_vectors = glove_lookup(token_ids)

    # First recurrent layer returns the hidden state at every time step.
    seq_states = LSTM(128, return_sequences=True)(word_vectors)
    seq_states = Dropout(0.5)(seq_states)

    # Second recurrent layer returns only its final hidden state.
    last_state = LSTM(128, return_sequences=False)(seq_states)
    last_state = Dropout(0.5)(last_state)

    # Two output units turned into class probabilities by a softmax.
    logits = Dense(2)(last_state)
    class_probs = Activation('softmax')(logits)

    return Model(inputs=token_ids, outputs=class_probs)

In [453]:
# Instantiate the classifier for index sequences of length maxLen and print its layer summary.
model = SMS_SPAM_DETECT_LSTM((maxLen,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 55)]              0         
                                                                 
 embedding_8 (Embedding)     (None, 55, 50)            20000050  
                                                                 
 lstm_14 (LSTM)              (None, 55, 128)           91648     
                                                                 
 dropout_14 (Dropout)        (None, 55, 128)           0         
                                                                 
 lstm_15 (LSTM)              (None, 128)               131584    
                                                                 
 dropout_15 (Dropout)        (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 2)                 258 

In [454]:
# Configure training: categorical cross-entropy (the labels are one-hot encoded
# with to_categorical below), the Adam optimizer, and accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [455]:
# Turn the training messages into index sequences of width maxLen.
X_train_np = np.array(X_train)
X_train_indices = sentences_to_indices(X_train_np, word_to_index, maxLen)
# Map the string labels to integer ids with a LabelEncoder fitted on the
# training labels, then one-hot encode those ids for the softmax output.
Y_train_np = np.array(Y_train)
label_encoder = LabelEncoder()
Y_train_vec = label_encoder.fit_transform(Y_train_np)
Y_train_oh = to_categorical(Y_train_vec) 

In [456]:
# Train for 50 epochs with batches of 32 examples; shuffle=True reshuffles the training data before each epoch.
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x18199933d00>

In [457]:
# Evaluate the trained model on the held-out messages.
X_test_np = np.array(X_test)
Y_test_np = np.array(Y_test)
X_test_indices = sentences_to_indices(X_test_np, word_to_index, max_len = maxLen)

# Reuse the LabelEncoder that was fitted on the training labels instead of
# fitting a new one on the test labels: a separate fit can assign different
# integer ids (or fewer classes) than the ones the model was trained on.
# transform() raises ValueError if the test set contains an unseen label.
Y_test_vec = label_encoder.transform(Y_test_np)

# num_classes keeps the one-hot width equal to the number of training classes
# even when a class is absent from the test split.
Y_test_oh = to_categorical(Y_test_vec, num_classes=len(label_encoder.classes_))

loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  1.0


In [458]:
# Map each integer class id back to its original label, in the encoder's class order.
classes_dict = dict(enumerate(label_encoder.classes_))
classes_dict

{0: 'ham', 1: 'spam'}

In [460]:
# Print every test message whose predicted label differs from its true label,
# followed by the number of such messages.
pred = model.predict(X_test_indices)
count = 0
for i, message in enumerate(X_test_np):
    # Predicted class = position of the largest softmax output for this message.
    num = np.argmax(pred[i])
    predicted, expected = classes_dict[num], Y_test_np[i]
    if predicted == expected:
        continue
    print('Mislabeled input:', message)
    count += 1
    print('EXPECT:' + expected + ' PREDICT:' + predicted)
    print()
print('Total count:', count)

Total count: 0


In [465]:
# Change the sentence below to see your prediction.
# Words that are not in the GloVe vocabulary are skipped by sentences_to_indices.
x_test = np.array(['*((*(AMDDD 1800 NOW)))'])
X_test_indices_input = sentences_to_indices(x_test, word_to_index, maxLen)
predicted_class = np.argmax(model.predict(X_test_indices_input))
print(x_test[0] + ' ' + classes_dict[predicted_class])

*((*(AMDDD 1800 NOW))) ham
