## Import Libraries

In [1]:
import tensorflow_hub as hub
import pandas as pd
import tensorflow_text as text
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

## Read Data

In [2]:
# load data
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Exploratory Data Analysis


In [3]:
# check count and unique and top values and their frequency
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

## Downsampling Data

Downsampling is a technique where the majority class is downsampled to match the minority class. Since our data has only one column(feature) it ok to use it.

We perform downsampling by just picking any random 747 samples from the ham class

In [4]:
# check percentage of data - states how much data needs to be balanced
print(str(round(747/4825,2))+'%')

0.15%


## filtering and separation

In [5]:
# creating 2 new dataframe as df_ham , df_spam

df_spam = df[df['v1']=='spam']

df_ham = df[df['v1']=='ham']

print("Ham Dataset Shape:", df_ham.shape)

print("Spam Dataset Shape:", df_spam.shape)

Ham Dataset Shape: (4825, 5)
Spam Dataset Shape: (747, 5)


In [6]:
# downsampling ham dataset - take only random 747 example
# will use df_spam.shape[0] - 747
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 5)

In [7]:
# concating both dataset - df_spam and df_ham_balanced to create df_balanced dataset
df_balanced = pd.concat([df_spam , df_ham_downsampled])

In [8]:
df_balanced['v1'].value_counts()

spam    747
ham     747
Name: v1, dtype: int64

In [9]:
df_balanced.sample(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
3862,spam,Oh my god! I've found your number again! I'm s...,,,
3858,spam,Win the newest åÒHarry Potter and the Order of...,,,
3284,ham,They are just making it easy to pay back. I ha...,,,
366,spam,"Update_Now - Xmas Offer! Latest Motorola, Sony...",,,
5497,spam,"SMS SERVICES. for your inclusive text credits,...",,,
5498,ham,Why did I wake up on my own &gt;:(,,,
5155,ham,MY NEW YEARS EVE WAS OK. I WENT TO A PARTY WIT...,,,
1520,spam,"URGENT! Your Mobile No was awarded a å£2,000 B...",,,
909,spam,"January Male Sale! Hot Gay chat now cheaper, c...",,,
3941,ham,She's borderline but yeah whatever.,,,


## Preprocessing of Spam Detection Data

In [10]:
# creating numerical repersentation of category - one hot encoding
df_balanced['spam'] = df_balanced['v1'].apply(lambda x:1 if x=='spam' else 0)

In [11]:
# displaying data - spam -1 , ham-0
df_balanced.sample(4)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",,,,1
1904,ham,Wah... Okie okie... Muz make use of e unlimite...,,,,0
3787,spam,Want to funk up ur fone with a weekly new tone...,,,,1
566,ham,Oooh bed ridden ey? What are YOU thinking of?,,,,0


## Performing Train Test Split

In [12]:
# loading train test split
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(df_balanced['v2'], df_balanced['spam'],
                                                    stratify = df_balanced['spam'])

## Model Creation

## Downloading Prerequisites

In [13]:
# downloading preprocessing files and model
bert_preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

## Creating Model

Having downloaded the bert model, we can now use Keras Functional API to build our model.

In [14]:
text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'Inputs')
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)
dropout = tf.keras.layers.Dropout(0.1, name = 'Dropout')(embeed['pooled_output'])
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Dense')(dropout)
# creating final model
model = tf.keras.Model(inputs = [text_input], outputs = [outputs])

In [15]:
# check the summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Inputs (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['Inputs[0][0]']                 
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

## Compiling And Training Model

In [16]:
Metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')
           ]
# compiling our model
model.compile(optimizer ='adam',
               loss = 'binary_crossentropy',
               metrics = Metrics)

In [17]:
history = model.fit(X_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Model Evaluation

In [18]:
# Evaluating performance
model.evaluate(X_test,y_test)



[0.26949289441108704,
 0.9224599003791809,
 0.9488636255264282,
 0.893048107624054]

## Model Prediction for Spam Detection with some random data

In [25]:
predict_text = ['We’d all like to get a $10,000 deposit on our bank accounts out of the blue, but winning a prize—especially if you’ve never entered a contest', 
                'Netflix is sending you a refund of $12.99. Please reply with your bank account and routing number to verify and get your refund', 
                'Your account is temporarily frozen. Please log in to to secure your account ',
                'The article was published on 18th August itself',
                'Although we are unable to give you an exact time-frame at the moment, I would request you to stay tuned for any updates.',
                'The image you sent is a UI bug, I can check that your article is marked as regular and is not in the monetization program.'
]

In [33]:
test_results = model.predict(predict_text)
output = np.where(test_results<0.5,'spam', 'ham')



In [34]:
output

array([['spam'],
       ['ham'],
       ['ham'],
       ['spam'],
       ['spam'],
       ['ham']], dtype='<U4')