## Spam Classification

### import libraries

In [1]:
import pandas as pd
import numpy as np

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Dense, LSTM
from tensorflow.keras.models import load_model

In [2]:
data = pd.read_csv('SMSSpamCollection', sep='\t', names=['label','message'])
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
#check for nan 
data.isnull().sum()

label      0
message    0
dtype: int64

In [4]:
data.shape

(5572, 2)

#### Data Cleaning

In [3]:
#download stopword
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Romiyo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(data)):
    review = re.sub('[^a-zA-Z]',' ',data.message[i]) #keep only english word
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] #remove stopword
    review = ' '.join(review)
    corpus.append(review)

In [7]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

#### one_hot representation

In [8]:
voc_size=5000
onehot_rep = [one_hot(words, voc_size) for words in corpus]
onehot_rep

[[1939,
  1012,
  4101,
  3842,
  1843,
  4797,
  3233,
  2697,
  1588,
  4562,
  4343,
  4564,
  3904,
  2777,
  4997,
  2304],
 [3661, 4430, 1908, 4663, 3561, 2464],
 [4982,
  4833,
  2072,
  2269,
  193,
  483,
  1547,
  864,
  1786,
  2365,
  1227,
  1318,
  483,
  3999,
  4833,
  636,
  2827,
  4538,
  1055,
  444,
  1387],
 [3561, 2211, 4843, 1057, 107, 3561, 444, 24, 4843],
 [200, 3257, 3248, 4288, 1406, 2293, 3831],
 [4605,
  3097,
  4626,
  2547,
  4894,
  157,
  1747,
  2398,
  4660,
  885,
  3661,
  4936,
  2827,
  27,
  3913,
  677],
 [4276, 4186, 1747, 2264, 2105, 1747, 586, 475],
 [4097,
  3208,
  2007,
  2007,
  214,
  2816,
  2821,
  4396,
  3096,
  1760,
  3381,
  3616,
  2159,
  3883,
  1760],
 [3196,
  4155,
  2851,
  3005,
  4065,
  796,
  3387,
  3724,
  2965,
  4772,
  2965,
  3427,
  1666,
  4254,
  1812],
 [1236,
  1967,
  3561,
  4024,
  203,
  170,
  50,
  1160,
  1236,
  3114,
  4982,
  4772,
  1236,
  170,
  4006,
  4982],
 [254, 1722, 1127, 2614, 578, 3982,

#### Embedding representation

In [11]:
sent_len = 20
embedded_docs = pad_sequences(onehot_rep, padding='pre', maxlen=sent_len)
embedded_docs

array([[   0,    0,    0, ..., 2777, 4997, 2304],
       [   0,    0,    0, ..., 4663, 3561, 2464],
       [4833, 2072, 2269, ..., 1055,  444, 1387],
       ...,
       [   0,    0,    0, ..., 3151, 2623, 2900],
       [   0,    0,    0, ..., 3536, 4928, 4982],
       [   0,    0,    0, ..., 2572, 2475, 4692]])

In [21]:
embedded_docs[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0, 3661, 4430, 1908, 4663, 3561, 2464])

In [28]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [29]:
data['target'] = data['label'].apply(lambda x : 1 if x == 'spam' else 0)
data.target.value_counts()

0    4825
1     747
Name: target, dtype: int64

In [30]:
x = np.array(embedded_docs)
y = np.array(data.target)

In [31]:
print(x.shape)
print(y.shape)

(5572, 20)
(5572,)


In [34]:
#split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

### create model

In [37]:
embedding_vector_feature = 40
model = Sequential()

model.add(Embedding(voc_size, embedding_vector_feature, input_length=sent_len))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 20, 40)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________


In [38]:
#comile
model.compile(
    loss='binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)

In [39]:
#train model
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs = 10,
    batch_size = 32
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x28916d60208>

### prediction

In [48]:
prediction = model.predict_classes(x_test)

In [49]:
#confusion_matrix(y_test, prediction)
prediction

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [50]:
confusion_matrix(y_test, prediction)

array([[1591,    2],
       [  22,  224]], dtype=int64)

In [51]:
accuracy_score(y_test, prediction)

0.9869494290375204

#### test model

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Romiyo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
test_sen = 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&Cs apply 08452810075over18' 
ps = PorterStemmer()

def process_data(test_sen):
    test_sen = test_sen.lower()
    test_sen = test_sen.split()
    test_sen = [ps.stem(word) for word in test_sen if not word in stopwords.words('english')]
    test_sen = ' '.join(test_sen)
    return test_sen

In [15]:
processed = process_data(test_sen)
processed

'free entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri question(std txt rate)t&c appli 08452810075over18'

In [16]:
oh = one_hot(processed,5000)
oh

[4070,
 4181,
 2750,
 1674,
 1031,
 1565,
 4904,
 3703,
 4452,
 3140,
 3864,
 2195,
 430,
 117,
 4904,
 3183,
 4833,
 4181,
 3190,
 2861,
 2786,
 299,
 1457,
 2388,
 2483,
 962]

In [17]:
sent_len = 20
embedded_docs = pad_sequences([oh], padding='pre', maxlen=sent_len)
embedded_docs

array([[4904, 3703, 4452, 3140, 3864, 2195,  430,  117, 4904, 3183, 4833,
        4181, 3190, 2861, 2786,  299, 1457, 2388, 2483,  962]])

In [8]:
embedded_docs[0]

array([3660, 2750,  191, 4984, 4984, 2007, 2991, 4684, 2750, 1775, 4742,
       1403, 3879, 2314, 1265, 1794, 2261, 4122, 2112, 1638])

In [9]:
X = np.array(embedded_docs)

In [10]:
model = load_model('spam_classification.h5')

In [11]:
model.predict_classes(X)[0][0]

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


0

In [12]:
print(data['message'].loc[2])

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
