In [9]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import tensorflow as tf
from keras.models import load_model


from sklearn.preprocessing import LabelEncoder

In [10]:
# Standard script to load GPU.
# List the GPUs TensorFlow can see and fail early with a clear message when
# there are none (indexing the empty list would raise IndexError instead).
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUS Available: ", len(physical_devices))
if not physical_devices:
  raise SystemError('GPU device not found')

# Memory growth has to be configured the same way on every visible GPU,
# so enable it on all of them rather than only on the first one.
for gpu in physical_devices:
  tf.config.experimental.set_memory_growth(gpu, True)

# The training cell pins work to GPU 0, so insist that this exact device exists.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Num GPUS Available:  1
Found GPU at: /device:GPU:0


In [11]:
# Load the SMS spam dataset; the file is not UTF-8, hence the explicit encoding.
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# Keeping only the necessary columns: v2 (message text) and v1 (ham/spam label).
# .copy() makes the subset an independent frame so the column assignments below
# never write into a view of the originally loaded frame.
data = data[['v2','v1']].copy()
print(data)

# Normalise the text: lowercase, then strip everything that is not an ASCII
# letter, digit or whitespace. The range must be A-Z, not A-z: "A-z" also
# spans the characters [ \ ] ^ _ ` and would leave them in the text.
data['v2'] = data['v2'].str.lower()
data['v2'] = data['v2'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

print(data)

                                                     v2    v1
0     Go until jurong point, crazy.. Available only ...   ham
1                         Ok lar... Joking wif u oni...   ham
2     Free entry in 2 a wkly comp to win FA Cup fina...  spam
3     U dun say so early hor... U c already then say...   ham
4     Nah I don't think he goes to usf, he lives aro...   ham
...                                                 ...   ...
5567  This is the 2nd time we have tried 2 contact u...  spam
5568              Will Ì_ b going to esplanade fr home?   ham
5569  Pity, * was in mood for that. So...any other s...   ham
5570  The guy did some bitching but I acted like i'd...   ham
5571                         Rofl. Its true to its name   ham

[5572 rows x 2 columns]
                                                     v2    v1
0     go until jurong point crazy available only in ...   ham
1                               ok lar joking wif u oni   ham
2     free entry in 2 a wkly comp to win fa c

In [13]:
print(data)

# Vocabulary cap handed to the tokenizer (num_words) and reused later as the
# embedding layer's input dimension.
max_fatures = 2000
messages = data['v2'].values

# Learn the word index from the cleaned messages, splitting on single spaces.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(messages)

# Convert each message to a sequence of word ids and pad them to a common length.
X = pad_sequences(tokenizer.texts_to_sequences(messages))

print(X[0])

                                                     v2    v1
0     go until jurong point crazy available only in ...   ham
1                               ok lar joking wif u oni   ham
2     free entry in 2 a wkly comp to win fa cup fina...  spam
3           u dun say so early hor u c already then say   ham
4     nah i dont think he goes to usf he lives aroun...   ham
...                                                 ...   ...
5567  this is the 2nd time we have tried 2 contact u...  spam
5568                will _ b going to esplanade fr home   ham
5569  pity  was in mood for that soany other suggest...   ham
5570  the guy did some bitching but i acted like id ...   ham
5571                          rofl its true to its name   ham

[5572 rows x 2 columns]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0   

In [15]:
embed_dim = 128
lstm_out = 196


def createmodel():
    """Build and compile the sequential LSTM classifier.

    Reads the module-level names ``max_fatures`` (embedding input dimension),
    ``embed_dim``, ``lstm_out`` and ``X`` (its second dimension is used as the
    embedding ``input_length``).

    Returns:
        A compiled ``Sequential`` model: Embedding -> LSTM -> Dense(2, softmax),
        using categorical cross-entropy loss, the Adam optimizer and the
        accuracy metric.
    """
    layers = [
        Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
        LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
        Dense(2, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model
# print(model.summary())

In [20]:
# Turn the string labels in v1 into integer codes, then one-hot encode them.
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['v1'])
y = to_categorical(integer_encoded)

# Show the one-hot vector of one ham row (index 1) and one spam row (index 2)
# to see which position stands for which class.
for row, caption in ((1, " Is Ham"), (2, " Is Spam")):
    print(y[row], caption)

# Hold out 33% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

[1. 0.]  Is Ham
[0. 1.]  Is Spam


In [28]:
  # Training hyper-parameters for this run.
  batch_size = 30
  epoch = 1

  # Build, train and evaluate the model with ops placed on the first GPU.
  with tf.device('/gpu:0'):
    model = createmodel()
    model.fit(
      X_train,
      Y_train,
      epochs=epoch,
      batch_size=batch_size,
      verbose=2,
    )
    # evaluate() returns the values in the order given by model.metrics_names.
    score, acc = model.evaluate(
      X_test,
      Y_test,
      verbose=2,
      batch_size=batch_size,
    )

  print('The loss: ', score)
  print('The accuracy: ', acc)
  print(model.metrics_names)


125/125 - 63s - loss: 0.1559 - accuracy: 0.9467
62/62 - 2s - loss: 0.0664 - accuracy: 0.9826
The loss:  0.06643752753734589
The accuracy:  0.9825992584228516
['loss', 'accuracy']
