In [12]:
import tqdm
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from sklearn.model_selection import train_test_split
from trange import trange
from tqdm import tqdm
import time
import pickle

In [13]:
# --- Model input / embedding sizes ---
SEQ_LEN = 50      # length passed as `maxlen` when padding token sequences
EMBED_SIZE = 50   # output width of the Embedding layer

# --- Training setup ---
TEST_SIZE = 0.1   # fraction handed to train_test_split as `test_size`
BATCH_SIZE = 64   # batch size used by model.fit

# Label name <-> class id. The reverse mapping is derived from the forward
# one so the two can never disagree.
label2int = {'ham': 0, 'spam': 1}
int2label = {class_id: name for name, class_id in label2int.items()}

In [14]:
def load_data(path='/home/tim/Datasets/smsspamcollection/SMSSpamCollection'):
    """Read the SMS spam corpus into parallel lists of texts and labels.

    Every non-blank line is split on whitespace: the first token is taken as
    the label and the remaining tokens, re-joined with single spaces, as the
    message text.

    Args:
        path: Location of the corpus file. Defaults to the path that was
            previously hard-coded in this function.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists of strings.
    """
    texts, labels = [], []
    # Explicit encoding so decoding does not depend on the machine's locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: there is no label token, so skip it instead of
                # failing with IndexError on fields[0].
                continue
            labels.append(fields[0])
            texts.append(' '.join(fields[1:]))
    return texts, labels

In [15]:
# Load the raw corpus: X receives the message texts, y the label strings.
X, y = load_data()

In [17]:
# Show one (text, label) pair to sanity-check the parsing.
X[1], y[1]

('Ok lar... Joking wif u oni...', 'ham')

In [18]:
# Learn the vocabulary from the messages, then replace every message with its
# sequence of integer word ids.
# X is rebound from texts to id sequences here, so re-running this cell on its
# own would feed id sequences (not text) back into the tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

In [19]:
# The tokenized messages differ in length, so they can only be held as a 1-D
# array of Python lists. Ask for dtype=object explicitly: letting NumPy infer
# it from ragged input is deprecated and is an error in newer NumPy releases.
X = np.array(X, dtype=object)
y = np.array(y)

  X = np.array(X)


In [23]:
# NOTE(review): this cell's execution counter (In [23]) is higher than that of
# the cells below it, so the recorded shapes were presumably produced after
# padding and label encoding. On a top-to-bottom run X is not padded and y is
# not one-hot encoded yet at this point, so the output would differ.
X.shape, y.shape

((5574, 50), (5574, 2))

In [20]:
# Bring every id sequence to the common length SEQ_LEN so X becomes a
# rectangular 2-D array.
X = pad_sequences(X, maxlen=SEQ_LEN)

In [21]:
# Translate each label string to its class id, then expand the ids into
# one-hot rows (one column per class).
y = to_categorical([label2int[name] for name in y])
print(y[0])

[1. 0.]


In [22]:
# Confirm the feature and label array shapes before splitting.
X.shape, y.shape

((5574, 50), (5574, 2))

In [None]:
# Hold out TEST_SIZE of the samples for evaluation; the fixed seed keeps the
# partition the same between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=42,
)

In [None]:
# Check the sizes of the train and test partitions.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Inspect one padded test sequence.
x_test[0]

In [None]:
def get_embedding_vector(tokenizer, dim=50, glove_path=None):
    """Build an embedding matrix for the tokenizer's vocabulary from GloVe vectors.

    Row ``i`` of the result holds the GloVe vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Rows with no matching GloVe word stay
    all zeros.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> int.
        dim: Length of each word vector. Also selects the default GloVe file.
        glove_path: Optional path to the GloVe text file. When omitted, the
            ``glove.6B.<dim>d.txt`` file in the original directory is read,
            so the file always matches ``dim``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(tokenizer.word_index) + 1, dim)``.

    Raises:
        ValueError: If a vector needed from the file does not have ``dim``
            entries.
    """
    if glove_path is None:
        glove_path = '/home/tim/trained/glove/glove.6B.{}d.txt'.format(dim)

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, dim))

    with open(glove_path, encoding='utf-8') as f:
        for line in tqdm(f, 'Reading Glove'):
            values = line.split()
            if not values:
                continue
            # Only words in our vocabulary are needed; skipping the rest
            # avoids parsing and storing the whole GloVe table in memory.
            row = word_index.get(values[0])
            if row is None:
                continue
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape != (dim,):
                raise ValueError(
                    'Expected {}-dimensional vectors in {!r}, got {} values for word {!r}'.format(
                        dim, glove_path, vector.shape[0], values[0]))
            embedding_matrix[row] = vector

    return embedding_matrix

In [None]:
def get_model(tokenizer, lstm_unit):
    """Build and compile the LSTM classifier on top of frozen GloVe embeddings.

    The network is Embedding -> LSTM -> Dropout -> Dense(2, softmax), compiled
    with categorical cross-entropy, the rmsprop optimizer and an accuracy
    metric. A summary of the model is printed as a side effect.

    Args:
        tokenizer: Fitted tokenizer; its ``word_index`` determines the
            vocabulary size and which pre-trained vectors are loaded.
        lstm_unit: Number of units in the LSTM layer.

    Returns:
        The compiled Keras model.
    """
    # Ask for vectors of exactly the width the Embedding layer is declared
    # with, so the two stay in sync if EMBED_SIZE is changed.
    embedding_matrix = get_embedding_vector(tokenizer, dim=EMBED_SIZE)
    vocab_size = len(tokenizer.word_index) + 1

    model = Sequential()
    # Pre-trained vectors are loaded as the layer weights and kept fixed
    # during training (trainable=False).
    model.add(Embedding(vocab_size, EMBED_SIZE, weights=[embedding_matrix], trainable=False))
    model.add(LSTM(lstm_unit, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    # Two outputs, matching the two-column one-hot labels.
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.summary()

    return model

In [None]:
# Build the classifier with a 128-unit LSTM layer.
model = get_model(tokenizer=tokenizer, lstm_unit=128)

In [None]:
# Training callbacks:
#  - ModelCheckpoint: keeps only improved checkpoints (save_best_only=True);
#    the validation loss is formatted into the checkpoint name.
#  - TensorBoard: writes training logs. The log directory now starts with '/'
#    like every other path in this notebook; without it the logs landed in a
#    'home/tim/...' tree relative to the current working directory.
callbacks = [ModelCheckpoint('/home/tim/trained/sms/spamclassifier_{val_loss:.2f}', save_best_only=True, verbose=1),
             TensorBoard('/home/tim/trained/sms/spamlogs_glove')]

In [None]:
# Train the freshly built model from epoch 0. The former `initial_epoch=8`
# was a leftover from resuming an earlier session: on a new model it makes
# Keras run only the last two of the ten epochs.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=BATCH_SIZE,
    validation_data=(x_test, y_test),
    callbacks=callbacks,
)

In [None]:
# Persist the trained model to disk (the .h5 suffix selects Keras' HDF5 format).
model.save('/home/tim/trained/sms/spamclassifier.h5')

In [None]:
from tensorflow.keras.models import load_model

# Reload the model from the file written by model.save(...).
model = load_model('/home/tim/trained/sms/spamclassifier.h5')

# SEQ_LEN is intentionally not redefined here: inference must pad to the same
# length that was used for training, so the value from the configuration cell
# is reused instead of a second hard-coded copy that could drift.

In [None]:
def get_predictions(text):
    """Classify a single message and return its label name.

    The text is converted to word ids with the notebook's ``tokenizer``,
    padded to ``SEQ_LEN``, and passed to ``model``. The index of the highest
    score in the model's output is mapped back to a label via ``int2label``.

    Args:
        text: The message to classify.

    Returns:
        The label string stored in ``int2label`` for the top-scoring class.
    """
    # Wrap the text in a list: the tokenizer and the model work on batches.
    padded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQ_LEN)
    # Scores for the only item in the batch, one entry per class.
    class_scores = model.predict(padded)[0]
    best_class = np.argmax(class_scores)
    return int2label[best_class]

In [None]:
# Try the classifier on a message worded like a prize-scam SMS.
text = "Congratulations! you have won 100,000$ this week, click here to claim fast"
print(get_predictions(text))

In [None]:
# Try the classifier on an ordinary conversational message.
text = "Hi man, I was wondering if we can meet tomorrow."
print(get_predictions(text))