In [2]:
#https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/text/text_classification_rnn.ipynb#scrollTo=z682XYsrjkY9
%tensorflow_version 2.x

from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, GlobalAveragePooling1D, Conv1D, MaxPooling1D, Bidirectional, LSTM

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import os, re, json, functools

#dark mode
plt.rc_context({'xtick.color':'w', 'ytick.color':'w', 'text.color':'w', 'axes.labelcolor':'w'})

seed=1234
np.random.seed(seed)
tf.random.set_seed(seed)

!pip install kaggle --upgrade

Requirement already up-to-date: kaggle in /usr/local/lib/python3.6/dist-packages (1.5.6)


In [4]:
os.environ['KAGGLE_USERNAME'] = "fredymarroquin"
os.environ['KAGGLE_KEY'] = "eb82530674115717dfba82e38b9742df"

#https://www.kaggle.com/team-ai/spam-text-message-classification
!kaggle datasets download -d team-ai/spam-text-message-classification

Downloading spam-text-message-classification.zip to /content
  0% 0.00/208k [00:00<?, ?B/s]
100% 208k/208k [00:00<00:00, 80.4MB/s]


In [5]:
!unzip -o 'spam-text-message-classification.zip'
os.listdir()

Archive:  spam-text-message-classification.zip
  inflating: SPAM text message 20170820 - Data.csv  


['.config',
 'spam-text-message-classification.zip',
 'SPAM text message 20170820 - Data.csv',
 'sample_data']

In [6]:
data = pd.read_csv('SPAM text message 20170820 - Data.csv')
data.head

<bound method NDFrame.head of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [0]:
data['is_spam']=data['Category'].apply(lambda x: 1 if x == "spam" else 0)

In [8]:
data.drop(columns=['Category'])

Unnamed: 0,Message,is_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [0]:
from sklearn.model_selection import train_test_split
X = data.Message
Y = data.is_spam

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20)

In [0]:
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = pad_sequences(sequences,maxlen=max_len)

In [16]:
model = Sequential([  Embedding(max_words,50,input_length=max_len),
                      Bidirectional(LSTM(64)),
                      Dense(64, activation='relu'),
                      Dense(1)])
model.summary()
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 50)           50000     
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               58880     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 117,201
Trainable params: 117,201
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2)

Train on 3565 samples, validate on 892 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f255c2d5e48>

In [0]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = pad_sequences(test_sequences,maxlen=max_len)

In [23]:
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Loss: 0.153
  Accuracy: 0.983
