In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from nltk.tokenize import word_tokenize
from keras.utils import pad_sequences 
from keras.preprocessing.text import one_hot

In [2]:
data = pd.read_csv("datasets/spam.csv", encoding="latin1")

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data_input = data["v2"]
data_input

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [5]:
data_output = data["v1"]
data_output

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [6]:
tokenized_input = [word_tokenize(sentence) for sentence in data_input]
tokenized_output = [word_tokenize(sentence) for sentence in data_output]

In [7]:
tokenized_input

[['Go',
  'until',
  'jurong',
  'point',
  ',',
  'crazy',
  '..',
  'Available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  '...',
  'Cine',
  'there',
  'got',
  'amore',
  'wat',
  '...'],
 ['Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...'],
 ['Free',
  'entry',
  'in',
  '2',
  'a',
  'wkly',
  'comp',
  'to',
  'win',
  'FA',
  'Cup',
  'final',
  'tkts',
  '21st',
  'May',
  '2005',
  '.',
  'Text',
  'FA',
  'to',
  '87121',
  'to',
  'receive',
  'entry',
  'question',
  '(',
  'std',
  'txt',
  'rate',
  ')',
  'T',
  '&',
  'C',
  "'s",
  'apply',
  '08452810075over18',
  "'s"],
 ['U',
  'dun',
  'say',
  'so',
  'early',
  'hor',
  '...',
  'U',
  'c',
  'already',
  'then',
  'say',
  '...'],
 ['Nah',
  'I',
  'do',
  "n't",
  'think',
  'he',
  'goes',
  'to',
  'usf',
  ',',
  'he',
  'lives',
  'around',
  'here',
  'though'],
 ['FreeMsg',
  'Hey',
  'there',
  'darling',
  'it',
  "'s",
  'been',
  '3',
  'week',
  "'s"

In [8]:
dictionary = []
for sentence in tokenized_input:
    for word in sentence:
        dictionary.append(word)
len(dictionary)

104193

In [9]:
dictionary = set(dictionary)
len(dictionary)

11520

In [10]:
embedded_input = [one_hot(sent, len(dictionary)) for sent in data_input]
embedded_input

[[180,
  3584,
  10414,
  8636,
  909,
  10061,
  599,
  647,
  3720,
  3893,
  4223,
  4685,
  9728,
  5549,
  2419,
  6766,
  8233,
  6344,
  4098,
  2488],
 [8064, 8310, 509, 1203, 10253, 10644],
 [990,
  470,
  647,
  11325,
  2622,
  10057,
  6361,
  8716,
  10079,
  1886,
  874,
  4213,
  306,
  5050,
  8711,
  4029,
  5325,
  1886,
  8716,
  5007,
  8716,
  8479,
  470,
  4538,
  9969,
  3723,
  7460,
  382,
  4398,
  3374,
  286],
 [10253, 4170, 7760, 4465, 8981, 408, 10253, 11354, 9197, 2556, 7760],
 [8335, 836, 2459, 6174, 744, 7650, 8716, 7988, 744, 1282, 1876, 10759, 907],
 [2247,
  1881,
  8233,
  2672,
  1259,
  787,
  3699,
  11321,
  9070,
  1092,
  10075,
  9645,
  2849,
  3996,
  886,
  3936,
  7055,
  3786,
  8040,
  7538,
  4277,
  4007,
  3551,
  8064,
  4362,
  9969,
  7093,
  8716,
  8373,
  10450,
  662,
  8716,
  2187],
 [1773,
  7766,
  7256,
  6645,
  10993,
  886,
  8716,
  5843,
  7226,
  9023,
  11244,
  6867,
  9023,
  886,
  7757,
  638],
 [247,
  2356,


In [11]:
tokenized_output

[['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['spam'],
 ['ham'],
 ['spam'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['spam'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['

In [12]:
embedded_output = []
for sentence in tokenized_output:
    if sentence[0] == "ham":
        embedded_output.append([1])
    else:
        embedded_output.append([0])
    

In [13]:
embedded_output

[[1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [0],
 [1],
 [0],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [0],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [0],
 [1],
 [1],
 [1],
 [0],
 [0],
 [1],
 [0]

In [14]:
word_count = lambda sentence: len(word_tokenize(sentence))
longest_sentence = max(data_input, key=word_count)
length_long_sentence = len(word_tokenize(longest_sentence))
length_long_sentence

220

In [15]:
padded_input = pad_sequences(embedded_input, length_long_sentence, padding='post')
padded_input

array([[  180,  3584, 10414, ...,     0,     0,     0],
       [ 8064,  8310,   509, ...,     0,     0,     0],
       [  990,   470,   647, ...,     0,     0,     0],
       ...,
       [ 1285,  4244,   647, ...,     0,     0,     0],
       [  831,  9726,  8437, ...,     0,     0,     0],
       [10367,  9592,  8234, ...,     0,     0,     0]])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(padded_input.tolist(), embedded_output, test_size=0.2, random_state=0)

In [17]:
model = Sequential()
model.add(Embedding(len(dictionary), 20, input_length=length_long_sentence))
model.add(Flatten())
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 220, 20)           230400    
                                                                 
 flatten (Flatten)           (None, 4400)              0         
                                                                 
 dense (Dense)               (None, 15)                66015     
                                                                 
 dense_1 (Dense)             (None, 1)                 16        
                                                                 
Total params: 296,431
Trainable params: 296,431
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [19]:
history = model.fit(X_train, y_train, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
y_pred = model.predict(X_test)
y_pred



array([[1.        ],
       [0.99999976],
       [0.99999946],
       ...,
       [1.        ],
       [0.99999994],
       [0.99984556]], dtype=float32)

In [21]:
y_pred = y_pred.tolist()
for item in y_pred:
    item[0] = round(item[0])
y_pred

[[1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [0],
 [0],
 [1],
 [1]

In [22]:
y_test

[[1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [0],
 [0],
 [1],
 [1]

In [23]:
results = [y_pred[i] == y_test[i] for i in range(len(y_pred))]
results

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 T

In [24]:
results.count(True)/len(results)

0.9811659192825112

In [25]:
def refactor(text: str) -> list:
    dictionary_length = 11520
    length_long_sentence = 220
    embedded_input = [one_hot(text, dictionary_length)]
    padded_input = pad_sequences(embedded_input, length_long_sentence, padding='post')
    return padded_input.tolist()

In [28]:
model.predict(refactor("All information on donations, sponsorship, etc. will be treated in
strictest confidence!"))



array([[0.9995278]], dtype=float32)