In [1]:
import pandas as pd

In [2]:
# Load the SMS spam dataset; the CSV is latin-1 encoded.
df = pd.read_csv(
    'data/spam.csv',
    sep=',',
    encoding='latin-1',
)

In [3]:
# Drop the empty spill-over columns, keeping only label (v1) and text (v2).
# Re-assigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
# Preview the first rows: v1 holds the ham/spam label, v2 the message text.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Pull out the message text of row 2 as a one-element Series.
df['v2'].iloc[2:3]

2    Free entry in 2 a wkly comp to win FA Cup fina...
Name: v2, dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [7]:
# Features are the raw message strings.
X = df['v2']

# Encode the string labels as integers and shape them as an (n, 1) column.
le = LabelEncoder()
Y = le.fit_transform(df['v1']).reshape(-1, 1)

In [8]:
# Hold out 15% of the messages for testing. A fixed seed makes the split
# itself reproducible across kernel restarts (an unseeded split yields
# different train/test rows on every run).
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=SEED)

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

Using TensorFlow backend.


In [10]:
# Peek at the first ten raw training messages before tokenization.
X_train[:10]

3854    Free msg: Single? Find a partner in your area!...
4206                   Lets use it next week, princess :)
207                          Aight yo, dats straight dogg
878     U have a secret admirer who is looking 2 make ...
5538    I can't believe how attached I am to seeing yo...
3566    Collect your VALENTINE'S weekend to PARIS inc ...
1935    Did either of you have any idea's? Do you know...
2584              Hi happy birthday. Hi hi hi hi hi hi hi
3512       Staff of placement training in Amrita college.
6       Even my brother is not like to speak with me. ...
Name: v2, dtype: object

In [11]:
# Encoded labels for the same ten training rows, as a column vector.
Y_train[:10]

array([[1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0]])

In [12]:
# Decode class 0 back to its string label to confirm the encoding direction.
le.inverse_transform([0])

array(['ham'], dtype=object)

In [13]:
# Vocabulary cap and fixed sequence length; RNN() reads both of these.
max_words, max_len = 1000, 150

# Fit the word index on the training messages only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Turn each message into a list of word indices, then pad to max_len.
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(
    sequences,
    maxlen=max_len,
)

In [14]:
# Bundle the fitted preprocessing objects and the sequence length together.
dict_params = dict(
    label_encoder=le,
    tokenizer=tok,
    max_len=max_len,
)

In [15]:
import pickle

In [16]:
# Write the preprocessing bundle to disk so it can be reloaded without
# refitting (presumably by a separate inference script — TODO confirm).
# NOTE(review): only unpickle this file from a trusted location.
with open('dict_params.pickle', 'wb') as f:
    pickle.dump(dict_params, f)

In [17]:
from keras.layers import Input, Embedding, LSTM, Dense, Activation, Dropout
from keras.models import Model

In [18]:
def RNN(vocab_size=None, seq_len=None):
    """Build the uncompiled LSTM spam classifier.

    Layers: Embedding (50-d) -> LSTM(64) -> Dense(256) + ReLU -> Dropout(0.5)
    -> Dense(1) + sigmoid, giving one score in (0, 1) per message.

    Parameters
    ----------
    vocab_size : int, optional
        Number of rows in the embedding table. Defaults to the notebook-level
        ``max_words`` so that ``RNN()`` keeps its original behaviour.
    seq_len : int, optional
        Length of each padded input sequence. Defaults to the notebook-level
        ``max_len``.

    Returns
    -------
    keras.models.Model
        The assembled model; the caller is responsible for compiling it.
    """
    # Fall back to the notebook globals only when no explicit size is given.
    if vocab_size is None:
        vocab_size = max_words
    if seq_len is None:
        seq_len = max_len

    inputs = Input(name='inputs', shape=[seq_len])
    layer = Embedding(vocab_size, 50, input_length=seq_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=inputs, outputs=layer)

In [19]:
from keras.optimizers import RMSprop

In [20]:
# Instantiate the network, print its layer table, then configure training.
model = RNN()
model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)  

In [21]:
from keras.callbacks import EarlyStopping

In [22]:
# Stop training once validation loss stops improving by at least 1e-4
# (the callback's other settings are left at their defaults).
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001)

# 20% of the training rows are held back for validation.
model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stop],
)

Instructions for updating:
Use tf.cast instead.
Train on 3788 samples, validate on 948 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x133343cc0>

In [23]:
# Encode the test messages with the tokenizer fitted on the training set,
# then pad them to the same length as the training matrix.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [24]:
# Score the held-out test set; the result is read below as [loss, accuracy].
accr = model.evaluate(test_sequences_matrix,Y_test)



In [25]:
# Report test loss and accuracy to three decimals.
report = 'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'
print(report.format(accr[0], accr[1]))

Test set
  Loss: 0.041
  Accuracy: 0.989


In [26]:
# Imported here because the notebook's own `import os` only appears in a
# later cell.
import os

# Create the output directory first; saving into a missing directory fails.
os.makedirs('trained_models', exist_ok=True)
model.save_weights('trained_models/lstm_model.h5')

In [27]:
import os

In [28]:
# Show the layer table again for the trained model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [29]:
import numpy as np

In [30]:
# Sanity check on a single training row; the 7:8 slice keeps a 2-D batch of one.
model.predict(sequences_matrix[7:8])

array([[5.3829677e-05]], dtype=float32)

In [31]:
# Inspect the first padded sequence (leading zeros, then the word indices).
sequences_matrix[0:1]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  48,
        126, 175,   4,   9,  13,  15, 313, 277,  22, 251,   2, 211,  21,
         68, 211,   2,  68, 243, 227, 126]], dtype=int32)

In [32]:
# Stack 150 copies of the first padded sequence into one 2-D array.
# NOTE(review): the 150 is a literal, not max_len — confirm whether the two
# are meant to match or this is just an arbitrary batch size.
np.array(150*[sequences_matrix[0]])

array([[  0,   0,   0, ..., 243, 227, 126],
       [  0,   0,   0, ..., 243, 227, 126],
       [  0,   0,   0, ..., 243, 227, 126],
       ...,
       [  0,   0,   0, ..., 243, 227, 126],
       [  0,   0,   0, ..., 243, 227, 126],
       [  0,   0,   0, ..., 243, 227, 126]], dtype=int32)

In [33]:
# Number of training samples (rows of the padded matrix).
len(sequences_matrix)

4736

In [34]:
# `__file__` is not defined inside a notebook kernel, so fall back to the
# current working directory instead of raising NameError. When this code is
# run as a plain script, the script's own path is still used.
try:
    notebook_path = os.path.realpath(__file__)
except NameError:
    notebook_path = os.path.realpath(os.getcwd())
notebook_path

In [35]:
# Build a fresh, untrained copy of the architecture to load the saved weights into.
new_model = RNN()

In [36]:
# Restore the weights written by model.save_weights earlier in the notebook.
new_model.load_weights("trained_models/lstm_model.h5")

In [37]:
# Predict with the reloaded model on the first training row to check the weights loaded.
new_model.predict(sequences_matrix[0:1])

array([[0.99937797]], dtype=float32)