In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/spam.csv', delimiter=',',encoding='latin-1')

In [3]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.iloc[2:3]['v2']

2    Free entry in 2 a wkly comp to win FA Cup fina...
Name: v2, dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [36]:
raw_text = df.v2

In [7]:
X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv = CountVectorizer()

In [25]:
dict_prams_mnb = {
    'labelencoder': le,
    'counteVectorizer': cv
}

In [26]:
import pickle 

In [27]:
with open('mnb_params.pickle', 'wb') as f:
    pickle.dump(dict_prams_mnb, f)

In [14]:
X = cv.fit_transform(X).toarray()

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.15)

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
classifier = MultinomialNB()

In [19]:
classifier.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [41]:
cv.transform([raw_text.iloc[0]]).toarray()

(1, 8672)

In [20]:
y_hat = classifier.predict(X_test)

In [21]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(y_hat, Y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       717
           1       0.92      0.92      0.92       119

    accuracy                           0.98       836
   macro avg       0.96      0.96      0.96       836
weighted avg       0.98      0.98      0.98       836



In [35]:
classifier.predict_proba([X[0]])[0][1]

1.8786184415875638e-07

In [30]:
X[0].reshape(1,-1)

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
from sklearn.externals import joblib



In [24]:
joblib.dump(classifier, 'trained_models/mnv.joblib')

['trained_models/mnv.joblib']

In [14]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [77]:
X_train[:10]

4028             [Û_] anyway, many good evenings to u! s
1040    Today am going to college so am not able to at...
277                        Awesome, I'll see you in a bit
2966    NEFT Transaction with reference number  &lt;#&...
1351                       Yo theres no class tmrw right?
2162    1) Go to write msg 2) Put on Dictionary mode 3...
347                       One small prestige problem now.
1776                    Call FREEPHONE 0800 542 0578 now!
533                                       I'll be late...
1201                                 I know she called me
Name: v2, dtype: object

In [80]:
Y_train[:10]

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0]])

In [81]:
le.inverse_transform([0])

array(['ham'], dtype=object)

In [15]:
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [31]:
dict_params = {
    "label_encoder": le,
    "tokenizer": tok,
    "max_len" : max_len
}

In [32]:
import pickle

In [33]:
with open('dict_params.pickle', 'wb') as f:
    pickle.dump(dict_params, f)

In [16]:
from keras.layers import Input, Embedding, LSTM, Dense, Activation, Dropout
from keras.models import Model

In [17]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [19]:
from keras.optimizers import RMSprop

In [20]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_3 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [22]:
from keras.callbacks import EarlyStopping

In [23]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Instructions for updating:
Use tf.cast instead.
Train on 3788 samples, validate on 948 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x12b72dbe0>

In [24]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [25]:
accr = model.evaluate(test_sequences_matrix,Y_test)



In [26]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.057
  Accuracy: 0.978


In [30]:
model.save_weights('trained_models/lstm_model.h5')

In [34]:
import os

In [57]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_3 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [61]:
import numpy as np

In [82]:
model.predict(sequences_matrix[7:8])

array([[0.97412497]], dtype=float32)

In [76]:
sequences_matrix[0:1]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        704, 492, 206,  55,   2,   6, 236]], dtype=int32)

In [69]:
np.array(150*[sequences_matrix[0]])

array([[  0,   0,   0, ...,   2,   6, 236],
       [  0,   0,   0, ...,   2,   6, 236],
       [  0,   0,   0, ...,   2,   6, 236],
       ...,
       [  0,   0,   0, ...,   2,   6, 236],
       [  0,   0,   0, ...,   2,   6, 236],
       [  0,   0,   0, ...,   2,   6, 236]], dtype=int32)

In [70]:
len(sequences_matrix)

4736

In [38]:
os.path.realpath(__file__)

NameError: name '__file__' is not defined

In [83]:
new_model = RNN()

In [84]:
new_model.load_weights("trained_models/lstm_model.h5")

In [85]:
new_model.predict(sequences_matrix[0:1])

array([[0.0062418]], dtype=float32)