In [1]:
import numpy as np 
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
STOPWORDS = set(stopwords.words('english'))

In [2]:
df = pd.read_csv('spam.csv',delimiter=',',encoding='latin-1')
del df["Unnamed: 2"]
del df["Unnamed: 3"]
del df["Unnamed: 4"]
df.columns = ['label','text']
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 

def clean_text(text):
    
    text = [lemmatizer.lemmatize(word,pos="v") for word in text.split() if word not in STOPWORDS]
    text= ' '.join(text)
    text = [lemmatizer.lemmatize(word,pos="a") for word in text.split() if word not in STOPWORDS]
    text= ' '.join(text)
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    return text
df['text'] = df['text'].apply(clean_text)

In [4]:
df['text'] = df['text'].str.replace('\d+', '')
df['text']

0       go jurong point  crazy available bugis n great...
1                                 ok lar joking wif u oni
2       free entry  wkly comp win fa cup final tkts st...
3                     u dun say early hor u c already say
4                  nah i think go usf  live around though
                              ...                        
5944                                                 yaoi
5945                                        yellow shower
5946                                                yiffy
5947                                            zoophilia
5948                                                     
Name: text, Length: 5949, dtype: object

In [5]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 8114 unique tokens.


In [6]:
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (5949, 250)


In [7]:
Y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', Y.shape)
Y

Shape of label tensor: (5949, 2)


array([[1, 0],
       [1, 0],
       [0, 1],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]], dtype=uint8)

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(5354, 250) (5354, 2)
(595, 250) (595, 2)


In [9]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
Y_train

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 250, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 2)                 202       
Total params: 5,080,602
Trainable params: 5,080,602
Non-trainable params: 0
_________________________________________________________________
None


array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [10]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
#model.fit(X_train,Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=callbacks_list,shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [20]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.065
  Accuracy: 0.976


In [21]:
new_complaint = ['shit']
new_complaint = [clean_text(new_complaint[0])]
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['ham','spam']
print(pred, labels[np.argmax(pred)])

[[0.03150873 0.9684912 ]] spam


In [22]:
model.save('spam_classifier.h5')

In [25]:
from tensorflow.keras.models import load_model
models = load_model('spam_classifier.h5')

new_complaint = ['fuck off']
new_complaint = [clean_text(new_complaint[0])]
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['ham','spam']
print(pred, labels[np.argmax(pred)])

[[0.11364656 0.8863535 ]] spam


In [24]:
models.weights

[<tf.Variable 'embedding/embeddings:0' shape=(50000, 100) dtype=float32, numpy=
 array([[-0.03643825, -0.07546821,  0.09804107, ..., -0.04415632,
         -0.09394564,  0.00342683],
        [ 0.00442739,  0.06300038, -0.03702718, ...,  0.02492158,
          0.05203681, -0.06668912],
        [-0.04261263,  0.06723834, -0.00900643, ..., -0.01918705,
          0.07445677, -0.02190241],
        ...,
        [ 0.017804  ,  0.04536338, -0.01568151, ..., -0.03696734,
          0.02097887,  0.00338512],
        [ 0.02779299, -0.03373472,  0.04159513, ..., -0.03186712,
          0.00074468,  0.00013179],
        [ 0.02618251, -0.0059527 ,  0.00331489, ...,  0.03223563,
         -0.03865696,  0.02071199]], dtype=float32)>,
 <tf.Variable 'lstm/lstm_cell_2/kernel:0' shape=(100, 400) dtype=float32, numpy=
 array([[ 0.00725064,  0.08192713,  0.02255873, ...,  0.09886636,
         -0.0723936 ,  0.02858222],
        [ 0.12658955,  0.08196021,  0.09211063, ...,  0.08329953,
          0.08700854,  0.050

In [28]:
import pickle
f = open('Tokenizer.pkl','wb')
pickle.dump(tokenizer,f)
f.close()