This repo complements [my other git repo](https://github.com/ErfanEbrahimiBazaz/spam_detection_with_nltk) for spam detection.

In [my previous repo](https://github.com/ErfanEbrahimiBazaz/spam_detection_with_nltk) we constructed a TF-IDF vector and trained a naive Bayes classifier for spam detection. In this repository we use an LSTM for spam detection. Both repositories work with the same data set.

I used [this link](https://towardsdatascience.com/spam-detection-in-emails-de0398ea3b48) as a guide for the implementation. Some of the methods in the link are not implemented properly, but overall it shows the big picture and the logic behind text classification. Some methods, such as remove_stop_words, needed minor changes, which I have made in this repo.

In [43]:
import keras
import string
import sklearn
import re

### Lower casing all words

In [2]:
def to_lower(word):
    """Return ``word`` converted to lower case."""
    return word.lower()

### Removal of special characters

In [6]:
import string


string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
def remove_special_characters(word):
    """Return ``word`` with every character of ``string.punctuation`` removed."""
    # The third argument of maketrans lists the characters to delete.
    deletion_table = str.maketrans("", "", string.punctuation)
    return word.translate(deletion_table)

### Removal of stop words

In [37]:
from sklearn import feature_extraction


def remove_stop_words(sentence):
    """Split ``sentence`` on whitespace and drop scikit-learn's English stop words.

    Returns the remaining tokens as a list, in their original order. Tokens
    are compared to the stop-word set exactly as they appear in the sentence,
    so a token with attached punctuation (e.g. ``'apple.'``) is kept as is.
    """
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
    kept_tokens = []
    for token in sentence.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [29]:
# sklearn.feature_extraction.text.ENGLISH_STOP_WORDS()

# TypeError: 'frozenset' object is not callable

In [38]:
remove_stop_words('remove an apple.')

['remove', 'apple.']

In [30]:
sklearn.feature_extraction.text.ENGLISH_STOP_WORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### Removal of hyperlinks

In [41]:
def remove_hyperlink(word):
    """Delete every ``http...`` token (up to the next whitespace) from ``word``.

    Only the link text is removed; the whitespace around it is left in place.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", word)

In [44]:
remove_hyperlink('Test if hyperlinks like https://github.com/ErfanEbrahimiBazaz?tab=repositories are removed properly.')

'Test if hyperlinks like  are removed properly.'

In [45]:
def remove_additional_white_space(sentence):
    """Collapse every run of two or more spaces in ``sentence`` into one space.

    Replacing the literal two-space string only halves a run (three spaces
    became two, four became two), so the pattern matches the whole run instead.
    Only the space character is affected; tabs and newlines are left untouched.
    """
    return re.sub(r" {2,}", " ", sentence)

In [49]:
remove_additional_white_space(
    remove_hyperlink('Test if hyperlinks like https://github.com/ErfanEbrahimiBazaz?tab=repositories are removed properly.'))

'Test if hyperlinks like are removed properly.'

### Tokenizing clean data

In [51]:
from keras.preprocessing.text import Tokenizer


# Upper bound on the vocabulary size handed to the tokenizer (and to the embedding layer).
max_feature = 50000
tokenizer = Tokenizer(num_words=max_feature)
# Build the word index from the training texts only; the test texts are
# encoded with that same index.
tokenizer.fit_on_texts(x_train)
# texts_to_sequences returns one list of word indices per text, and the lists
# have different lengths. They are kept as plain lists: pad_sequences accepts
# them directly, `np` is not imported in the visible cells, and wrapping ragged
# lists in np.array fails on recent NumPy versions.
x_train_features = tokenizer.texts_to_sequences(x_train)
x_test_features = tokenizer.texts_to_sequences(x_test)

### Padding

Pad (or truncate) every tokenized message so that all sequences have the same length.

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Common length of every sequence after padding. It is set here because the
# notebook otherwise defines max_len only in the later model cell, so running
# the cells top to bottom raised a NameError. Keep it equal to the
# input_length used by the embedding layer.
max_len = 50
x_train_features = pad_sequences(x_train_features, maxlen=max_len)
x_test_features = pad_sequences(x_test_features, maxlen=max_len)

### Label encoding the target values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training targets, then apply
# that same mapping to both the training and the test targets.
le = LabelEncoder()
le.fit(target_train.values)
train_y = le.transform(target_train.values)
test_y = le.transform(target_test.values)

### Embedding

Embedding is the process of converting formatted text data into numerical values/vectors which a machine can interpret.

In [54]:
import tensorflow as tf
from keras.layers import Dense,LSTM, Embedding, Dropout, Activation, Bidirectional


# Length of every tokenized message after padding.
max_len = 50

# Size of the vector the embedding layer produces for each word index.
embedding_vector_length = 32

# Layers are stacked in order on a sequential model.
model = tf.keras.Sequential()
# Embedding layer: turns each word index (vocabulary of max_feature words)
# into a vector of embedding_vector_length values.
model.add(Embedding(max_feature, embedding_vector_length, input_length=max_len))
# Bidirectional LSTM with 64 units per direction.
model.add(Bidirectional(tf.keras.layers.LSTM(64)))
# ReLU allows quick convergence and allows backpropagation.
model.add(Dense(16, activation='relu'))
# Deep learning models overfit easily; dropout adds randomization to counter that.
model.add(Dropout(0.1))
# A single sigmoid unit normalizes the output for the binary spam / not-spam decision.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 32)            1600000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               49664     
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,651,745
Trainable params: 1,651,745
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the model; the test split is also passed as validation data, so the
# validation metrics in `history` are computed on the test set.
history = model.fit(
    x_train_features,
    train_y,
    batch_size=512,
    epochs=20,
    validation_data=(x_test_features, test_y),
)

# Turn each predicted score into a hard label: 1 above 0.5, otherwise 0.
predicted_scores = model.predict(x_test_features)
y_predict = [1 if score > 0.5 else 0 for score in predicted_scores]

### Performance

According to [this link](https://towardsdatascience.com/spam-detection-in-emails-de0398ea3b48):

"Precision and recall are the two most widely used performance metrics for a classification problem to get a better understanding of the problem. Precision is the fraction of the relevant instances from all the retrieved instances. Precision helps us to understand how useful the results are. The recall is the fraction of relevant instances from all the relevant instances. Recall helps us understand how complete the results are."

In [None]:
from sklearn.metrics import confusion_matrix,f1_score, precision_score,recall_score


# Confusion matrix of true vs. predicted labels; it is flattened into its four
# cells for convenience.
cf_matrix = confusion_matrix(test_y, y_predict)
tn, fp, fn, tp = cf_matrix.ravel()

# Report each score as a percentage with two decimals.
precision = precision_score(test_y, y_predict)
print("Precision: {:.2f}%".format(100 * precision))
recall = recall_score(test_y, y_predict)
print("Recall: {:.2f}%".format(100 * recall))
f1 = f1_score(test_y, y_predict)
print("F1 Score: {:.2f}%".format(100 * f1))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Tick labels shared by both axes.
class_names = ['Not Spam', 'Spam']

ax = plt.subplot()
# Draw the confusion matrix as a heatmap; annot=True annotates the cells.
sns.heatmap(cf_matrix, annot=True, ax=ax, cmap='Blues', fmt='')
# Labels, title and ticks.
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')
ax.xaxis.set_ticklabels(class_names)
# The trailing semicolon keeps the notebook from echoing the returned value.
ax.yaxis.set_ticklabels(class_names);

### Testing ideas

In [5]:
from keras.preprocessing.text import one_hot, Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Embedding, Flatten, Dense

In [1]:
# Small toy corpus of ten short phrases used for the experiments below.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# One label per phrase, in the same order: the first five are 1, the last five 0.
labels = [1] * 5 + [0] * 5

In [7]:
# Sample sentence reused by the one_hot experiments further down.
text = 'a sample text to tokenize'
# Split the sentence into a list of word tokens.
tokens = text_to_word_sequence(text)

In [9]:
tokens

['a', 'sample', 'text', 'to', 'tokenize']

In [11]:
len(tokens)

5

Set the vocabulary size for the one-hot encoder according to [this link](https://stackoverflow.com/questions/57653204/keras-pre-processing-of-text-using-one-hot-class).

In [12]:
one_hot(text,5,lower=True)

[1, 2, 2, 1, 3]

In [13]:
# NOTE(review): 5*1.25 evaluates to the float 6.25, so the size passed to
# one_hot is not an integer, and the recorded output of this cell contains
# non-integer values. Presumably an integer size such as round(5 * 1.25) was
# intended -- confirm.
one_hot(text,5*1.25,lower=True)

[5.0, 1.75, 6.0, 2.0, 6.0]

### Q1: It is not indeed one hot encoding, but a vector of ints with 1s in different positions to map the input text to a number, right?

In [14]:
import pandas as pd

In [21]:
!dir

 Volume in drive E is WorkSpace
 Volume Serial Number is 6AD8-FF46

 Directory of E:\Fad\Advpy\s13\hw

05/31/2021  08:50 PM    <DIR>          .
05/31/2021  08:50 PM    <DIR>          ..
05/31/2021  08:44 PM    <DIR>          .ipynb_checkpoints
05/18/2021  02:46 AM            14,478 label.txt
05/30/2021  01:48 AM               556 README.md
05/31/2021  08:50 PM            41,994 Spam detection with RNN.ipynb
05/31/2021  08:44 PM           916,713 spam detection.ipynb
05/18/2021  02:46 AM           170,099 test.txt
05/30/2021  01:57 AM            69,549 Text_Mining_Session02.ipynb
05/18/2021  02:46 AM           288,974 train.txt
01/01/2021  09:03 PM            18,837 word_embedding_with_keras .ipynb
               8 File(s)      1,521,200 bytes
               3 Dir(s)  40,949,358,592 bytes free


In [28]:
# train.txt holds one message per line. The lines are read directly instead of
# going through pd.read_csv: delimiter="\n" is rejected by pandas >= 1.3, and
# the quotechar="'" workaround for commas in the text misparses lines that
# start with an apostrophe. Empty lines are skipped, as read_csv did.
with open('train.txt', encoding='utf-8') as train_file:
    df = pd.DataFrame({'message': [line for line in train_file.read().split('\n') if line]})

In [29]:
df.head(10)

Unnamed: 0,message
0,The basket's gettin full so I might be by tonight
1,Can i get your opinion on something first?
2,Company is very good.environment is terrific a...
3,Its a valentine game. . . Send dis msg to all ...
4,S.i'm watching it in live..
5,Don know:)this week i'm going to tirunelvai da.
6,7 lor... Change 2 suntec... Wat time u coming?
7,"""Garbage bags, eggs, jam, bread, hannaford whe..."
8,You see the requirements please
9,"""Are you being good, baby? :)"""


In [30]:
def read_and_concat_datasets(train_dataset='train.txt', labels='label.txt', delimiter = "\n"):
    """Load the messages and their labels and return them side by side.

    Args:
        train_dataset: Path of the text file holding one message per record.
        labels: Path of the text file holding one label per record.
        delimiter: Field separator of both files. With the default ``"\\n"``
            every non-empty line is one record and is kept verbatim as a
            string (commas and quotes included). Any other value is passed on
            to ``pd.read_csv`` as before.

    Returns:
        A DataFrame with the columns ``message`` and ``message_type``. Rows
        are matched by position; if one file has fewer records than the other,
        the missing cells are NaN.
    """
    def _read_single_column(path, column_name):
        # Read one single-column file into a DataFrame named ``column_name``.
        if delimiter == "\n":
            # pandas >= 1.3 rejects "\n" as a separator, and the CSV parser
            # misreads lines starting with the quote character, so
            # line-oriented files are read directly. Empty lines are skipped,
            # as read_csv did.
            with open(path, encoding="utf-8") as handle:
                records = [line for line in handle.read().split("\n") if line]
            return pd.DataFrame({column_name: records})
        frame = pd.read_csv(path, delimiter=delimiter, header=None, quotechar="'")
        frame.columns = [column_name]
        return frame

    df = _read_single_column(train_dataset, 'message')
    df_label = _read_single_column(labels, 'message_type')

    # axis=1 places the two frames next to each other, aligned on the row index.
    df_final = pd.concat([df, df_label], axis=1)
    return df_final

In [32]:
# The error "ParserError: Error tokenizing data. C error: Expected 1 fields in line 8, saw 2" is because of having comma in text.
read_and_concat_datasets().head(10)

Unnamed: 0,message,message_type
0,The basket's gettin full so I might be by tonight,ham
1,Can i get your opinion on something first?,ham
2,Company is very good.environment is terrific a...,ham
3,Its a valentine game. . . Send dis msg to all ...,ham
4,S.i'm watching it in live..,ham
5,Don know:)this week i'm going to tirunelvai da.,ham
6,7 lor... Change 2 suntec... Wat time u coming?,ham
7,"""Garbage bags, eggs, jam, bread, hannaford whe...",ham
8,You see the requirements please,ham
9,"""Are you being good, baby? :)""",ham


In [33]:
# Load messages and labels with the default file names.
df = read_and_concat_datasets()
# NOTE(review): the recorded tail shows the last two messages without a
# message_type, i.e. the two files did not yield the same number of rows --
# verify the inputs before training on this frame.
df.tail()

Unnamed: 0,message,message_type
3497,Ok lor. I'm in town now lei.,ham
3498,"""Aight I've been set free, think you could tex...",ham
3499,No no:)this is kallis home ground.amla home to...,ham
3500,excellent. I spent &lt;#&gt; years in the Ai...,
3501,Watching tv lor...,
