# Email Classification (SPAM/HAM) using BERT (Bidirectional Encoder Representations from Transformers)

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the spam/ham dataset and discard the three unnamed filler columns,
# keeping the label column (v1) and the message text (v2).
df = pd.read_csv(
    'https://raw.githubusercontent.com/Bhushan0130/Datasets/main/spam.csv',
    encoding='ISO-8859-1',
).drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [3]:
# Preview the first rows to confirm the load and the column cleanup.
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# !pip install tensorflow_text

In [7]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [9]:
# Class balance of the raw labels (number of ham vs. spam rows).
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [10]:
# Per-label summary of the messages: count, number of unique texts,
# most frequent text and its frequency.
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [11]:
# Binary label for the classifier: 1 for 'spam' rows, 0 for everything else.
df['Target'] = (df['v1'] == 'spam').astype('int64')

In [12]:
# Confirm the new Target column lines up with the v1 labels.
df.head()

Unnamed: 0,v1,v2,Target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [13]:
# Features: select the message text by name (kept as a one-column DataFrame)
# rather than dropping the other columns, so that any column added to `df`
# afterwards cannot leak into the model input if this cell is re-run.
x = df[['v2']]
# Labels: the 0/1 spam indicator.
y = df['Target']


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# 80/20 split, stratified on the label so both splits keep the same
# spam/ham ratio. A fixed random_state makes the split (and therefore every
# metric computed below) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Feature split sizes: training shape on the first line, test shape on the second.
print(x_train.shape, x_test.shape, sep='\n')

(4457, 1)
(1115, 1)


In [17]:
# Label split sizes: training shape on the first line, test shape on the second.
print(y_train.shape, y_test.shape, sep='\n')

(4457,)
(1115,)


In [18]:
# Class counts in the training split (the split was stratified on the label).
y_train.value_counts()

0    3859
1     598
Name: Target, dtype: int64

In [19]:
# BERT model collection on TensorFlow Hub: https://tfhub.dev/google/collections/bert/1

In [20]:
# Class counts in the test split (the split was stratified on the label).
y_test.value_counts()

0    966
1    149
Name: Target, dtype: int64

In [21]:
# TF Hub handle of the uncased English BERT encoder (L-12, H-768, A-12 per the handle name).
encoded = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
# TF Hub handle of the matching uncased English preprocessing model.
proprocessing = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [22]:
# Wrap the two TF Hub models as Keras layers so they can be called on data
# directly and reused inside the Keras model built later in the notebook.
bert_proprocessing = hub.KerasLayer(proprocessing)
bert_encoded = hub.KerasLayer(encoded)

In [23]:
def get_sentence_embeding(sentences):
  """Return the BERT pooled-output embedding for a batch of sentences.

  Args:
    sentences: Batch of raw text strings (the notebook passes a list of str).

  Returns:
    The 'pooled_output' entry of the encoder's result, one row per input
    sentence.
  """
  encoder_inputs = bert_proprocessing(sentences)
  encoder_outputs = bert_encoded(encoder_inputs)
  return encoder_outputs['pooled_output']

In [24]:
# Take the message text of the row labelled 0 as a sample input.
new_text = df.loc[0, 'v2']

In [25]:
# Sanity check: embed the single sample message (wrapped in a list to form a
# batch of one) and display the resulting embedding tensor.
get_sentence_embeding([new_text])

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-6.92833960e-01, -3.82204473e-01, -9.52783883e-01,
         7.51712263e-01,  8.26993883e-01, -1.11341521e-01,
         7.74034500e-01,  3.49266946e-01, -8.27566922e-01,
        -9.99938130e-01, -4.91911769e-01,  8.27597916e-01,
         9.51055884e-01,  5.97681642e-01,  8.68827343e-01,
        -6.57569349e-01, -2.09730461e-01, -5.72027624e-01,
         3.47762525e-01, -8.90989453e-02,  7.33236790e-01,
         9.99997258e-01, -3.19873020e-02,  3.63329798e-01,
         4.31966543e-01,  9.74061251e-01, -6.39868319e-01,
         8.87976766e-01,  9.05901790e-01,  6.73174262e-01,
        -6.16902351e-01,  1.64389327e-01, -9.71923470e-01,
        -2.86567569e-01, -9.75934863e-01, -9.69695449e-01,
         3.52540821e-01, -4.34979498e-01, -7.32300505e-02,
         1.02228358e-01, -8.72294605e-01,  2.68635660e-01,
         9.99939203e-01,  1.91501126e-01,  5.27790546e-01,
        -2.25497440e-01, -9.99999046e-01,  2.58211702e-01,
      

In [26]:
# Embed three fruit names (indices 0-2) and two people's names (indices 3-4)
# so their embeddings can be compared with each other.
e = get_sentence_embeding(["banana", "grapes", "mango", "jeff bezos", "elon musk"])

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Cosine similarity between the embeddings of "banana" (index 0) and
# "jeff bezos" (index 3); each vector is wrapped in a list to form a 2-D input.
cosine_similarity([e[0]],[e[3]])

array([[0.84703815]], dtype=float32)

## Deep learning model (Keras Functional API)

In [29]:
# bert_proprocessing = hub.KerasLayer(proprocessing)
# bert_encoded = hub.KerasLayer(encoded)

In [30]:
# def get_sentence_embeding(sentences):
#   preprocessed_text = bert_proprocessing(sentences)
#   return bert_encoded(preprocessed_text)['pooled_output']

In [31]:
# Model input: one raw string per example (scalar shape, string dtype).
text_input = tf.keras.layers.Input(shape=(),dtype=tf.string, name='text')
# Convert the raw text into the dictionary of inputs the encoder consumes.
preprocessed_txt = bert_proprocessing(text_input)
# Run the BERT encoder; the result is indexed by key ('pooled_output') in the next cell.
outputs = bert_encoded(preprocessed_txt)

In [32]:
# Apply dropout (rate 0.3) to the pooled sentence embedding from the encoder.
l = tf.keras.layers.Dropout(0.3, name='dropout')(outputs['pooled_output'])

In [33]:
# Classification head: a single sigmoid unit, giving one score between 0 and 1
# per message (Target uses 1 for spam, so higher means more spam-like).
l = tf.keras.layers.Dense(1, activation='sigmoid', name = 'output')(l)

In [34]:
# Assemble the end-to-end model: raw text in, sigmoid score out.
model = tf.keras.Model(inputs = [text_input],outputs = [l])
# Print the layer-by-layer structure and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer (KerasLayer)    {'input_word_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_type_ids': (None,                                           
                              128)}                                                               
                                                                                              

In [35]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [36]:
# Train on the training split for 10 epochs (all other fit options left at their defaults).
model.fit(x_train, y_train, epochs =10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d43391a1300>

In [38]:
# Score the held-out test split; the result lists the loss followed by the
# accuracy metric configured in compile().
model.evaluate(x_test, y_test)



[0.12209820002317429, 0.9713004231452942]

In [39]:
# Hand-written sample messages for a quick qualitative check of the model:
# the first is written in a promotional style, the other two are ordinary
# conversational messages.
reviews = [
    "Reply to win 1000 Rs  Weekly, where will the 2006 FIFA World Cup be held? Send STOP to 87234",
    "Hey Sam, Are you coming for a cricket game tomorrow",
    "Why don't you wait til at least wednesday to see if you get your ."
    ]

# One sigmoid score per message, in the same order as the list above.
model.predict(reviews)



array([[0.5737727 ],
       [0.04552301],
       [0.01899148]], dtype=float32)

In [40]:
# Raw sigmoid scores for every message in the test split.
model.predict(x_test)



array([[0.01356408],
       [0.00258096],
       [0.25204453],
       ...,
       [0.03223979],
       [0.00930908],
       [0.02845344]], dtype=float32)

In [42]:
# Inspect the full message-text column before scoring it.
df['v2']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [44]:
# Score every message in the dataset. This covers training and test rows
# alike, so these predictions are not a held-out performance estimate.
model_output = model.predict(df['v2'])



In [45]:
# Store the raw sigmoid score for each message next to its true label.
df['Model_output'] = model_output

In [48]:
# Turn the raw score into a hard 0/1 prediction: 1 when the score is strictly
# above 0.50, otherwise 0.
df['Model Output'] = (df['Model_output'] > 0.50).astype('int64')

In [49]:
# Display the frame with true labels, raw scores and thresholded predictions side by side.
df

Unnamed: 0,v1,v2,Target,Model_output,Model Output
0,ham,"Go until jurong point, crazy.. Available only ...",0,0.046135,0
1,ham,Ok lar... Joking wif u oni...,0,0.030154,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,0.784867,1
3,ham,U dun say so early hor... U c already then say...,0,0.019394,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0.014374,0
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,0.717496,1
5568,ham,Will Ì_ b going to esplanade fr home?,0,0.011676,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0,0.014758,0
5570,ham,The guy did some bitching but I acted like i'd...,0,0.030238,0
