In [4]:
import tensorflow_hub as hub
import pandas as pd
import tensorflow_text as text
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

In [5]:
# load data
df = pd.read_csv('spam_data.csv')

In [6]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.sample(15)

Unnamed: 0,Category,Message
4124,ham,"Cool, want me to go to kappa or should I meet ..."
1538,ham,All sounds good. Fingers . Makes it difficult ...
2232,ham,"K, wen ur free come to my home and also tel vi..."
3650,ham,Hmm ill have to think about it... ok you're fo...
2979,ham,Yar lor... Keep raining non stop... Or u wan 2...
3266,spam,"44 7732584351, Do you want a New Nokia 3510i c..."
1509,ham,Sounds like something that someone testing me ...
1383,ham,Its ok my arm is feeling weak cuz i got a shot...
194,ham,It will stop on itself. I however suggest she ...
9,spam,Had your mobile 11 months or more? U R entitle...


In [8]:
df.Message[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [9]:
# check count and unique and top values and their frequency
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [10]:
# check percentage of data - states how much data needs to be balanced
print(str(round(747/4825,2))+'%')

0.15%


In [11]:
# creating 2 new dataframe as df_ham , df_spam
df_spam = df[df['Category']=='spam']

df_ham = df[df['Category']=='ham']

print("Ham Dataset Shape:", df_ham.shape)

print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 2)
Spam Dataset Shape: (747, 2)


In [12]:
# downsampling ham dataset - take only random 747 example
# will use df_spam.shape[0] - 747
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [13]:
# concating both dataset - df_spam and df_ham_balanced to create df_balanced dataset
df_balanced = pd.concat([df_spam , df_ham_downsampled])

In [14]:
df_balanced['Category'].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [15]:
df_balanced.sample(10)


Unnamed: 0,Category,Message
2975,ham,"I'll text carlos and let you know, hang on"
5071,spam,5p 4 alfie Moon's Children in need song on ur ...
5232,spam,YOU ARE CHOSEN TO RECEIVE A £350 AWARD! Pls ca...
3422,spam,Welcome! Please reply with your AGE and GENDER...
3812,ham,Excellent! Wish we were together right now!
2247,spam,Hi ya babe x u 4goten bout me?' scammers getti...
177,ham,U still going to the mall?
4845,spam,YOU HAVE WON! As a valued Vodafone customer ou...
3763,spam,FREE for 1st week! No1 Nokia tone 4 ur mob eve...
327,ham,Hi da:)how is the todays class?


##### Preprocessing of Spam Detection Data

In [16]:
# creating numerical repersentation of category - one hot encoding
df_balanced['spam'] = df_balanced['Category'].apply(lambda x:1 if x=='spam' else 0)

In [17]:
df_balanced.sample(4)

Unnamed: 0,Category,Message,spam
840,spam,Last chance 2 claim ur £150 worth of discount ...,1
1056,ham,I'm at work. Please call,0
3014,spam,FREE UNLIMITED HARDCORE PORN direct 2 your mob...,1
3078,ham,There is no sense in my foot and penis.,0


In [18]:
df_balanced.Message[1888]

'Urgent! Please call 09061743811 from landline. Your ABTA complimentary 4* Tenerife Holiday or £5000 cash await collection SAE T&Cs Box 326 CW25WX 150ppm'

take the dataset
voc
token_ids

[121, 25, 689, [], [89>][], ]-------->1


--------> [25],[89],[689],[121]>>>>>>>Spam  
------->I am an [Indian] and i speak [hindi].<---- : Classification



Urgent: 121
[key]: [value]
[indian]------[hindi][tamil][telgu][punjabi]
[indiam]------[arabic][hebrew][french] 





In [19]:
# loading train test split
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(df_balanced['Message'], df_balanced['spam'],
                                                    stratify = df_balanced['spam'])

In [20]:
# downloading preprocessing files and model
bert_preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [22]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'Inputs')
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)
dropout = tf.keras.layers.Dropout(0.1, name = 'Dropout')(embeed['pooled_output'])
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Dense')(dropout)

In [23]:
# creating final model
model = tf.keras.Model(inputs = [text_input], outputs = [outputs])

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Inputs (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['Inputs[0][0]']                 
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [25]:
Metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')
           ]

In [26]:
# compiling our model
model.compile(optimizer ='adam',
               loss = 'binary_crossentropy',
               metrics = Metrics)

In [27]:
history = model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Evaluating performance
model.evaluate(X_test,y_test)



[0.25940293073654175,
 0.9117646813392639,
 0.8969072103500366,
 0.9304812550544739]