This repo is a complementary to [my other git repo](https://github.com/ErfanEbrahimiBazaz/spam_detection_with_nltk) for spam detection.

In [my previous repo](https://github.com/ErfanEbrahimiBazaz/spam_detection_with_nltk) we constructed a TF-IDF vector and trained a naive Baise network for spam detection. In this repository we use LSTM for spam detection. Both repositories work with the same data set.

I make use of [this link](https://towardsdatascience.com/spam-detection-in-emails-de0398ea3b48) to implement the code. Some of the methods in the link are not implemented properly but altogether it shows the big picture and the logic behind text classification. Some minor changes were necessary to methods like remove_stop_words which I have corrected in this repo.

In [43]:
import keras
import string
import sklearn
import re

### Lower casing all words

In [2]:
def to_lower(word):
    result = word.lower()
    return result

### Remove of special characters

In [6]:
import string


string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
def remove_special_characters(word):
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result

### Remove special characters

In [37]:
from sklearn import feature_extraction


def remove_stop_words(sentence):
    result = [word for word in sentence.split() if word not in feature_extraction.text.ENGLISH_STOP_WORDS]
    return result

In [29]:
# sklearn.feature_extraction.text.ENGLISH_STOP_WORDS()

# TypeError: 'frozenset' object is not callable

In [38]:
remove_stop_words('remove an apple.')

['remove', 'apple.']

In [30]:
sklearn.feature_extraction.text.ENGLISH_STOP_WORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### Removal of hyperlinks

In [41]:
def remove_hyperlink(word):
    return re.sub(r"http\S+", "", word)

In [44]:
remove_hyperlink('Test if hyperlinks like https://github.com/ErfanEbrahimiBazaz?tab=repositories are removed properly.')

'Test if hyperlinks like  are removed properly.'

In [45]:
def remove_additional_white_space(sentence):
    return re.sub("  ", " ", sentence)

In [49]:
remove_additional_white_space(
    remove_hyperlink('Test if hyperlinks like https://github.com/ErfanEbrahimiBazaz?tab=repositories are removed properly.'))

'Test if hyperlinks like are removed properly.'

### Tokenizing clean data

In [51]:
from keras.preprocessing.text import Tokenizer


max_feature = 50000 #number of unique words to consider
tokenizer = Tokenizer(num_words=max_feature)
tokenizer.fit_on_texts(x_train)
x_train_features = np.array(tokenizer.texts_to_sequences(x_train))
x_test_features = np.array(tokenizer.texts_to_sequences(x_test))

### Padding

Making all tokens of equal size

In [None]:
from keras.preprocessing.sequence import pad_sequences
x_train_features = pad_sequences(x_train_features,maxlen=max_len)
x_test_features = pad_sequences(x_test_features,maxlen=max_len)

### Lable encoding target value

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_y = le.fit_transform(target_train.values)
test_y = le.transform(target_test.values)

### Embedding

Embedding is the process of converting formatted text data into numerical values/vectors which a machine can interpret.

In [54]:
import tensorflow as tf
from keras.layers import Dense,LSTM, Embedding, Dropout, Activation, Bidirectional


# The length of all tokenized emails post-padding is set using ‘max_len’
max_len = 50


#size of the output vector from each layer
embedding_vector_length = 32
#Creating a sequential model
model = tf.keras.Sequential()
#Creating an embedding layer to vectorize
model.add(Embedding(max_feature, embedding_vector_length, input_length=max_len))
#Addding Bi-directional LSTM
model.add(Bidirectional(tf.keras.layers.LSTM(64)))
#Relu allows converging quickly and allows backpropagation
model.add(Dense(16, activation='relu'))
#Deep Learninng models can be overfit easily, to avoid this, we add randomization using drop out
model.add(Dropout(0.1))
#Adding sigmoid activation function to normalize the output
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 32)            1600000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               49664     
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,651,745
Trainable params: 1,651,745
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
history = model.fit(x_train_features, train_y, batch_size=512, epochs=20, validation_data=(x_test_features, test_y))
y_predict = [1 if o>0.5 else 0 for o in model.predict(x_test_features)]

### Performance

According to [this link](https://towardsdatascience.com/spam-detection-in-emails-de0398ea3b48):

"Precision and recall are the two most widely used performance metrics for a classification problem to get a better understanding of the problem. Precision is the fraction of the relevant instances from all the retrieved instances. Precision helps us to understand how useful the results are. The recall is the fraction of relevant instances from all the relevant instances. Recall helps us understand how complete the results are."

In [None]:
from sklearn.metrics import confusion_matrix,f1_score, precision_score,recall_score


cf_matrix =confusion_matrix(test_y,y_predict)
tn, fp, fn, tp = confusion_matrix(test_y,y_predict).ravel()
print("Precision: {:.2f}%".format(100 * precision_score(test_y, y_predict)))
print("Recall: {:.2f}%".format(100 * recall_score(test_y, y_predict)))
print("F1 Score: {:.2f}%".format(100 * f1_score(test_y,y_predict)))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


ax= plt.subplot()
#annot=True to annotate cells
sns.heatmap(cf_matrix, annot=True, ax = ax,cmap='Blues',fmt='');
# labels, title and ticks
ax.set_xlabel('Predicted labels');
ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(['Not Spam', 'Spam']); ax.yaxis.set_ticklabels(['Not Spam', 'Spam']);

### Testing ideas

In [1]:
from keras.preprocessing.text import one_hot, Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Embedding, Flatten, Dense

In [2]:
docs = ['Well done!',
'Good work',
'Great effort',
'nice work',
'Excellent!',
'Weak',
'Poor effort!',
'not good',
'poor work',
'Could have done better.']
labels = [1,1,1,1,1,0,0,0,0,0]

In [3]:
text = 'a sample text to tokenize'
tokens = text_to_word_sequence(text)

In [4]:
tokens

['a', 'sample', 'text', 'to', 'tokenize']

In [5]:
len(tokens)

5

Set the length for creating one hot  encoder according to [this link](https://stackoverflow.com/questions/57653204/keras-pre-processing-of-text-using-one-hot-class).

In [6]:
one_hot(text,5,lower=True)

[4, 1, 1, 1, 1]

In [7]:
one_hot(text,5*1.25,lower=True)

[2.0, 1.75, 3.75, 3.5, 4.75]

### Q1: It is not indeed one hot encoding, but a vector of ints with 1s in different positions to map the input text to a number, right?

In [10]:
import pandas as pd

In [21]:
!dir

 Volume in drive E is WorkSpace
 Volume Serial Number is 6AD8-FF46

 Directory of E:\Fad\Advpy\s13\hw

05/31/2021  08:50 PM    <DIR>          .
05/31/2021  08:50 PM    <DIR>          ..
05/31/2021  08:44 PM    <DIR>          .ipynb_checkpoints
05/18/2021  02:46 AM            14,478 label.txt
05/30/2021  01:48 AM               556 README.md
05/31/2021  08:50 PM            41,994 Spam detection with RNN.ipynb
05/31/2021  08:44 PM           916,713 spam detection.ipynb
05/18/2021  02:46 AM           170,099 test.txt
05/30/2021  01:57 AM            69,549 Text_Mining_Session02.ipynb
05/18/2021  02:46 AM           288,974 train.txt
01/01/2021  09:03 PM            18,837 word_embedding_with_keras .ipynb
               8 File(s)      1,521,200 bytes
               3 Dir(s)  40,949,358,592 bytes free


In [58]:
df = pd.read_csv('train.txt', delimiter = "\n", header=None,  quotechar="'") #, error_bad_lines=False )
df.columns = ['message']

In [59]:
df.head(10)

Unnamed: 0,message
0,The basket's gettin full so I might be by tonight
1,Can i get your opinion on something first?
2,Company is very good.environment is terrific a...
3,Its a valentine game. . . Send dis msg to all ...
4,S.i'm watching it in live..
5,Don know:)this week i'm going to tirunelvai da.
6,7 lor... Change 2 suntec... Wat time u coming?
7,"""Garbage bags, eggs, jam, bread, hannaford whe..."
8,You see the requirements please
9,"""Are you being good, baby? :)"""


In [60]:
df.tail()

Unnamed: 0,message
3497,Ok lor. I'm in town now lei.
3498,"""Aight I've been set free, think you could tex..."
3499,No no:)this is kallis home ground.amla home to...
3500,excellent. I spent &lt;#&gt; years in the Ai...
3501,Watching tv lor...


In [61]:
df_label = pd.read_csv('label.txt', delimiter='\n', header = None, quotechar="'")
df_label.columns = ['message_type']

In [62]:
df_label.tail()

Unnamed: 0,message_type
3495,ham
3496,ham
3497,ham
3498,ham
3499,ham


In [8]:
def read_and_concat_datasets(train_dataset='train.txt', labels='label.txt', delimiter = "\n"):
    df = pd.read_csv(train_dataset, delimiter = delimiter, header=None, quotechar="'" )
    df.columns = ['message']
    
    df_label = pd.read_csv(labels, delimiter=delimiter, header = None, quotechar="'")
    df_label.columns = ['message_type']
    
    df_final = pd.concat([df, df_label], axis=1)
    return df_final

#### The error "ParserError: Error tokenizing data. C error: Expected 1 fields in line 8, saw 2" is because of having comma in a field where using pd.read_csv().

To resolve the error refer to [this link](https://stackoverflow.com/questions/32743479/pandas-read-csv-with-extra-commas-in-column). To resolve the issue, use  quotechar="'" in pd.read_csv().

In [11]:
read_and_concat_datasets().head(10)

Unnamed: 0,message,message_type
0,The basket's gettin full so I might be by tonight,ham
1,Can i get your opinion on something first?,ham
2,Company is very good.environment is terrific a...,ham
3,Its a valentine game. . . Send dis msg to all ...,ham
4,S.i'm watching it in live..,ham
5,Don know:)this week i'm going to tirunelvai da.,ham
6,7 lor... Change 2 suntec... Wat time u coming?,ham
7,"""Garbage bags, eggs, jam, bread, hannaford whe...",ham
8,You see the requirements please,ham
9,"""Are you being good, baby? :)""",ham


In [12]:
df = read_and_concat_datasets()
df.tail()

Unnamed: 0,message,message_type
3497,Ok lor. I'm in town now lei.,ham
3498,"""Aight I've been set free, think you could tex...",ham
3499,No no:)this is kallis home ground.amla home to...,ham
3500,excellent. I spent &lt;#&gt; years in the Ai...,
3501,Watching tv lor...,


## There is length mismatch. Downloading data sets again to start the work.

#### Determining length of one-hot encoder to avoid collision.

In [13]:
import pandas as pd
df = pd.read_csv('train.txt', delimiter = "\n", header=None )
df.columns = ['message']

In [14]:
df.head(10)

Unnamed: 0,message
0,The basket's gettin full so I might be by tonight
1,Can i get your opinion on something first?
2,Company is very good.environment is terrific a...
3,Its a valentine game. . . Send dis msg to all ...
4,S.i'm watching it in live..
5,Don know:)this week i'm going to tirunelvai da.
6,7 lor... Change 2 suntec... Wat time u coming?
7,"Garbage bags, eggs, jam, bread, hannaford whea..."
8,You see the requirements please
9,"Are you being good, baby? :)"


In [16]:
df_label = pd.read_csv('label.txt', delimiter='\n', header = None)
df_label.columns = ['message_type']

In [17]:
df_label.head()

Unnamed: 0,message_type
0,ham
1,ham
2,ham
3,ham
4,ham


In [18]:
df = pd.concat([df, df_label],axis=1)
df.tail()

Unnamed: 0,message,message_type
3495,Ok lor. I'm in town now lei.,ham
3496,"Aight I've been set free, think you could text...",ham
3497,No no:)this is kallis home ground.amla home to...,ham
3498,excellent. I spent &lt;#&gt; years in the Ai...,ham
3499,Watching tv lor...,ham


In [19]:
max([len(message) for message in df["message"]])

910

In [20]:
import math

In [21]:
one_hot_vec_len = math.ceil(max([len(message) for message in df["message"]]) * 1.25)
one_hot_vec_len

1138

In [22]:
encoded_docs = [one_hot(message, one_hot_vec_len) for message in df["message"]]

In [23]:
len(encoded_docs)

3500

In [24]:
encoded_docs[3499]

[343, 549, 229]

In [25]:
max([len(enc_doc) for enc_doc in encoded_docs])

189

In [26]:
# bad way
i = 0
for enc_doc in encoded_docs:
    i += 1
    if len(enc_doc) == max([len(enc_doc) for enc_doc in encoded_docs]):
        print(i,  enc_doc)

3078 [298, 552, 821, 139, 497, 1079, 527, 60, 431, 497, 687, 864, 431, 451, 50, 298, 287, 32, 552, 666, 497, 548, 821, 326, 255, 182, 163, 995, 698, 50, 431, 1121, 1079, 821, 103, 716, 25, 342, 527, 50, 666, 497, 548, 955, 298, 287, 431, 184, 139, 352, 548, 327, 646, 698, 298, 542, 478, 50, 535, 698, 271, 497, 449, 32, 50, 698, 271, 352, 548, 68, 432, 50, 431, 1121, 604, 298, 50, 352, 786, 691, 698, 471, 716, 1135, 691, 50, 149, 431, 352, 548, 322, 432, 200, 527, 152, 298, 50, 431, 352, 548, 995, 139, 646, 431, 352, 548, 941, 821, 671, 62, 298, 50, 139, 352, 548, 646, 431, 1047, 29, 432, 975, 152, 864, 698, 549, 1071, 821, 940, 177, 537, 944, 821, 325, 419, 431, 352, 215, 548, 499, 323, 298, 50, 139, 352, 548, 646, 431, 1079, 653, 765, 325, 707, 716, 25, 653, 998, 168, 271, 352, 548, 821, 940, 177, 327, 352, 864, 298, 983, 716, 421, 714, 298, 821, 103, 917, 666, 1071, 527, 552, 431, 1121, 41, 432, 741, 1082, 1015, 352, 452, 1114]


In [27]:
df["message"].iloc[3078 ]

'You do got a shitload of diamonds though'

In [28]:
df[df["message"]=="hi baby im cruisin with my girl friend what r u up 2? give me a call in and hour at home if thats alright or fone me on this fone now love jenny xxx"]

Unnamed: 0,message,message_type
771,hi baby im cruisin with my girl friend what r ...,ham


In [31]:
max_length = max([len(enc_doc) for enc_doc in encoded_docs])
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs[0])

[ 821  515  127   31  921  431  426  548 1039  130    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]


In [32]:
df.iloc[0]

message         The basket's gettin full so I might be by tonight
message_type                                                  ham
Name: 0, dtype: object

In [35]:
from keras.layers import Dense,LSTM, Embedding, Dropout, Activation, Bidirectional
from keras import Sequential


#size of the output vector from each layer
embedding_vector_length = 32

model = Sequential()
#Creating an embedding layer to vectorize
#max_feature is 1.25 * length of the mapping space.
model.add(Embedding(one_hot_vec_len, embedding_vector_length, input_length=max_length))
#Addding Bi-directional LSTM
model.add(Bidirectional(LSTM(64)))
#Relu allows converging quickly and allows backpropagation
model.add(Dense(16, activation='relu'))
#Deep Learninng models can be overfit easily, to avoid this, we add randomization using drop out
model.add(Dropout(0.1))
#Adding sigmoid activation function to normalize the output
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 189, 32)           36416     
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               49664     
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 88,161
Trainable params: 88,161
Non-trainable params: 0
_________________________________________________________________
None


In [38]:
labels = df["message_type"]
labels

0       ham
1       ham
2       ham
3       ham
4       ham
       ... 
3495    ham
3496    ham
3497    ham
3498    ham
3499    ham
Name: message_type, Length: 3500, dtype: object

Lables must be encoded to numerical value, otherwise, there will be the following error:

UnimplementedError:  Cast string to float is not supported
	 [[node binary_crossentropy/Cast (defined at <ipython-input-39-c86fac56b2a7>:1) ]] [Op:__inference_train_function_5826]

In [41]:
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
labels_enc = le.fit_transform(labels)

labels_enc

array([0, 0, 0, ..., 0, 0, 0])

In [42]:
type(labels_enc)

numpy.ndarray

In [52]:
label_sr = pd.Series(labels_enc)[:15]
label_sr.values 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

In [46]:
df["message_type"].head(15)

0      ham
1      ham
2      ham
3      ham
4      ham
5      ham
6      ham
7      ham
8      ham
9      ham
10     ham
11     ham
12     ham
13    spam
14    spam
Name: message_type, dtype: object

In [54]:
df1 = df.assign(e = pd.Series(labels_enc).values)

In [55]:
df1.head()

Unnamed: 0,message,message_type,e
0,The basket's gettin full so I might be by tonight,ham,0
1,Can i get your opinion on something first?,ham,0
2,Company is very good.environment is terrific a...,ham,0
3,Its a valentine game. . . Send dis msg to all ...,ham,0
4,S.i'm watching it in live..,ham,0


In [56]:
df1 = df.assign(labels = labels_enc)

In [59]:
df1.head(15)

Unnamed: 0,message,message_type,labels
0,The basket's gettin full so I might be by tonight,ham,0
1,Can i get your opinion on something first?,ham,0
2,Company is very good.environment is terrific a...,ham,0
3,Its a valentine game. . . Send dis msg to all ...,ham,0
4,S.i'm watching it in live..,ham,0
5,Don know:)this week i'm going to tirunelvai da.,ham,0
6,7 lor... Change 2 suntec... Wat time u coming?,ham,0
7,"Garbage bags, eggs, jam, bread, hannaford whea...",ham,0
8,You see the requirements please,ham,0
9,"Are you being good, baby? :)",ham,0


In [60]:
labels = df1["labels"]

In [61]:
model.fit(padded_docs, labels, epochs=40, verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x1e22fd4f730>

In [62]:
loss, accuracy = model.evaluate(padded_docs, labels)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 100.000000


In [68]:
df_test = pd.read_csv('test.txt', delimiter='\n', header=None, quotechar="'")
df_test.columns = ["message"]

In [69]:
df_test.tail()

Unnamed: 0,message
2067,Our Prashanthettan's mother passed away last n...
2068,either way works for me. I am &lt;#&gt; year...
2069,Not yet had..ya sapna aunty manege y'day hogid...
2070,What happen dear. Why you silent. I am tensed
2071,Don't b floppy... b snappy & happy! Only gay c...


In [71]:
one_hot_msg_len= math.ceil(max([len(message) for message in df_test["message"]]) * 1.25)
one_hot_msg_len

738

In [72]:
encoded_docs = [one_hot(msg, one_hot_msg_len) for msg in df_test["message"]]

In [73]:
# "Beautiful tomorrow never comes.. When it comes, it's already TODAY.. In the hunt of beautiful tomorrow don't waste your wonderful TODAY.. GOODMORNING:)"
encoded_docs[0]

[400,
 432,
 69,
 61,
 86,
 424,
 61,
 330,
 331,
 33,
 736,
 267,
 264,
 286,
 400,
 432,
 14,
 94,
 470,
 652,
 33,
 50]

In [74]:
# "Beautiful tomorrow never comes.. When it comes, it's already TODAY.. In the hunt of beautiful tomorrow don't waste your wonderful TODAY.. GOODMORNING:)"
df_test.iloc[0]

message    "Beautiful tomorrow never comes.. When it come...
Name: 0, dtype: object

In [76]:
max_length = max([len(message) for message in df_test["message"]])
max_length

590

In [77]:
test_padded_msgs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [79]:
test_padded_msgs[0]

array([400, 432,  69,  61,  86, 424,  61, 330, 331,  33, 736, 267, 264,
       286, 400, 432,  14,  94, 470, 652,  33,  50,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [80]:
model.predict(test_padded_msgs)



array([[2.4121206e-08],
       [9.9986744e-01],
       [3.4668432e-07],
       ...,
       [7.4099103e-07],
       [9.4126940e-01],
       [3.3551952e-08]], dtype=float32)

In [81]:
spam_pred = model.predict(test_padded_msgs)

In [85]:
spam_pred

array([[2.4121206e-08],
       [9.9986744e-01],
       [3.4668432e-07],
       ...,
       [7.4099103e-07],
       [9.4126940e-01],
       [3.3551952e-08]], dtype=float32)

In [87]:
spam_pred[0][0]

2.4121206e-08

In [96]:
max(spam_pred)

array([0.9999549], dtype=float32)

In [89]:
spam_pred_list = []
for i in range(len(spam_pred)):
    spam_pred_list.append(spam_pred[i][0])
    
spam_pred_list[:10]

[2.4121206e-08,
 0.99986744,
 3.4668432e-07,
 3.3845222e-09,
 5.6716118e-08,
 2.821918e-05,
 2.4148803e-06,
 0.00025257468,
 0.006676048,
 1.2434184e-05]

In [99]:
predicted_lables = [0 if val<0.5 else 1 for val in spam_pred_list ]
predicted_lables[:10]

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [100]:
df_lbl = pd.DataFrame(predicted_lables, columns=None)
df_lbl.columns = ['predicted_label']
df_lbl.head()

Unnamed: 0,predicted_label
0,0
1,1
2,0
3,0
4,0


In [103]:
df_lbl.to_csv('predicted_lbl.csv', header=False, index=False)

In [104]:
df_lbl

Unnamed: 0,predicted_label
0,0
1,1
2,0
3,0
4,0
...,...
2067,1
2068,0
2069,0
2070,1


In [105]:
#size of the output vector from each layer
embedding_vector_length = 32

model = Sequential()
max_length = max([len(message) for message in df_test["message"]])
model.add(Embedding(max_length, embedding_vector_length, input_length=max_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 590, 32)           18880     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               49664     
_________________________________________________________________
dense_2 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 70,625
Trainable params: 70,625
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
# model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())