In [145]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

In [101]:
# Load the spam dataset (CSV obtained from Kaggle) from the local Datasets/ folder
# and preview the first rows.
df = pd.read_csv('Datasets/spam.csv')
df.head(100)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
95,spam,Your free ringtone is waiting to be collected....
96,ham,Watching telugu movie..wat abt u?
97,ham,i see. When we finish we have loads of loans t...
98,ham,Hi. Wk been ok - on hols now! Yes on for a bit...


In [128]:
# Look at the full, untruncated text of one message (row label 94).
df.loc[94, 'Message']

'Havent planning to buy later. I check already lido only got 530 show in e afternoon. U finish work already?'

In [4]:
# Per-class summary of the Message column (count / unique / top / freq);
# the counts show how imbalanced the two categories are.
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


There are far more ham messages (4,825) than spam messages (747), so we downsample the ham class to balance the dataset.

In [6]:
# Keep only the rows labelled 'spam'; the shape gives the size of this class.
df_spam = df.loc[df['Category'].eq('spam')]
df_spam.shape

(747, 2)

In [137]:
# Keep only the rows labelled 'ham', then read one of them (row label 60) in full.
df_ham = df.loc[df['Category'].eq('ham')]
df_ham.loc[60, 'Message']

"Your gonna have to pick up a $1 burger for yourself on your way home. I can't even move. Pain is killing me."

In [94]:
# Downsample the ham class to exactly as many rows as there are spam rows so the
# two classes end up balanced.
# random_state is fixed so that the same rows are drawn after every kernel
# restart; without it the balanced dataset (and every metric computed from it)
# changes from run to run.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled

Unnamed: 0,Category,Message
1179,ham,"I'm outside islands, head towards hard rock an..."
832,ham,Hi mate its RV did u hav a nice hol just a mes...
1337,ham,"Sweet, we may or may not go to 4U to meet carl..."
5474,ham,Where's mummy's boy ? Is he being good or bad ...
1315,ham,Got but got 2 colours lor. One colour is quite...
...,...,...
334,ham,Any chance you might have had with me evaporat...
2695,ham,Hey sexy buns! What of that day? No word from ...
1256,ham,Just wait till end of march when el nino gets ...
5437,ham,Am slow in using biola's fne


In [11]:
# Stack the spam rows on top of the equally sized ham sample.
# Original row labels are kept (no index reset).
df_balanced = pd.concat([df_spam, df_ham_downsampled], axis=0)
df_balanced

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
2002,ham,Idea will soon get converted to live:)
162,ham,New car and house for my parents.:)i have only...
2498,ham,Dai what this da.. Can i send my resume to thi...
3932,ham,Nooooooo I'm gonna be bored to death all day. ...


In [12]:
# Sanity check: both categories should now have the same number of rows.
df_balanced['Category'].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [13]:
# Show one randomly chosen row of the balanced frame (differs on every run).
df_balanced.sample()

Unnamed: 0,Category,Message
309,spam,TheMob> Check out our newest selection of cont...


In [23]:
# Numeric target column: 1 when Category is 'spam', 0 for everything else.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('int64')
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
4985,spam,goldviking (29/M) is inviting you to be his fr...,1
2680,spam,"New Tones This week include: 1)McFly-All Ab..,...",1
4064,ham,How are you. Its been ages. How's abj,0
3266,spam,"44 7732584351, Do you want a New Nokia 3510i c...",1
2677,ham,* Am on a train back from northampton so i'm a...,0


In [25]:
from sklearn.model_selection import train_test_split

# Split messages and labels into train and test sets.
# - stratify keeps the spam/ham ratio identical in both parts.
# - random_state pins the shuffle so the split (and therefore the reported
#   metrics) can be reproduced after a kernel restart.
# test_size is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [28]:
# Quick look at the training messages; the index still carries the original row labels.
X_train.head()

2284                    I reach home safe n sound liao...
4791    Yup. Izzit still raining heavily cos i'm in e ...
2261    SplashMobile: Choose from 1000s of gr8 tones e...
955             Filthy stories and GIRLS waiting for your
978     Dont hesitate. You know this is the second tim...
Name: Message, dtype: object

In [29]:
# Text preprocessing model from TF Hub: turns raw strings into the input
# dictionary expected by the BERT encoder.
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_preprocess_model = hub.KerasLayer(preprocess_url)

# Matching uncased English BERT encoder (L-12 / H-768 / A-12 variant) from TF Hub.
url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
bert_model = hub.KerasLayer(url)

In [74]:
def get_sentence_embedding(msgs):
    """Return the BERT pooled-output embedding for a batch of strings.

    Args:
        msgs: The texts to embed, passed straight to ``bert_preprocess_model``
            (the notebook calls this with a list of Python strings).

    Returns:
        The ``'pooled_output'`` entry of the dictionary returned by
        ``bert_model`` -- presumably one embedding vector per input string.
    """
    encoder_inputs = bert_preprocess_model(msgs)
    return bert_model(encoder_inputs)['pooled_output']

# Embed a handful of fruit names and well-known business people so that their
# vectors can be compared with each other.
e = get_sentence_embedding([
    "banana", "grapes", "mango", "apple",
    "steve jobs", "mukesh ambani", "ratan tata",
    "bill gates", "jeff bezos", "elon musk",
])


These models are pretrained, so the embeddings above come from the downloaded weights; no training of the model takes place at this point.

In [75]:
# Cosine similarity measures how closely two vectors point in the same direction.
from sklearn.metrics.pairwise import cosine_similarity as cs
# Compare the embeddings of the last two inputs ("elon musk" and "jeff bezos").
cs([e[-1]],[e[-2]])

array([[0.98720354]], dtype=float32)

In [79]:
# Build the classifier with the Keras functional API (instead of Sequential).

# BERT part: raw string input -> preprocessing -> encoder outputs.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
preprocessed_text = bert_preprocess_model(text_input)
outputs = bert_model(preprocessed_text)

# Classification head: dropout (rate 0.1) on the pooled embedding, followed by
# a single sigmoid unit that outputs a score between 0 and 1.
dropout_layer = tf.keras.layers.Dropout(0.1, name="Dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')
l = output_layer(dropout_layer(outputs['pooled_output']))

# Tie input and output together into the final model.
model = tf.keras.Model(inputs=[text_input], outputs=[l])


In [80]:
# Print the layer-by-layer structure of the model built above.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [85]:
# Metrics reported during training and evaluation, in this order.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Binary classification: cross-entropy loss with the Adam optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

# Train for 10 epochs on the training split.
model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c2c237b0a0>

In [86]:
# Score the model on the held-out test split. The returned list is the loss
# followed by the compiled metrics in order: accuracy, precision, recall.
model.evaluate(X_test,y_test)



[0.25420600175857544,
 0.9278075098991394,
 0.9123711585998535,
 0.9465240836143494]

In [87]:
import numpy as np

# Flatten the model's sigmoid scores into a 1-D array and turn them into hard
# labels: 1 (spam) when the score is above 0.5, otherwise 0 (ham).
y_predict = np.where(model.predict(X_test).flatten() > 0.5, 1, 0)
y_predict



array([1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,

In [88]:
from sklearn.metrics import confusion_matrix as cm
cm1 = cm(y_test,y_predict)
cm1

array([[170,  17],
       [ 10, 177]], dtype=int64)

In [97]:
# Score three hand-picked messages with the trained model; each output row is
# the sigmoid score for the corresponding message.
reviews = [
    'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
    "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
]
model.predict(reviews)




array([[0.30993414],
       [0.8297009 ],
       [0.6183152 ]], dtype=float32)

In [149]:
# Record the TensorFlow version this notebook was run with.
# (tensorflow is already imported as tf in the first cell; this re-import is harmless.)
import tensorflow as tf
print(tf.__version__)

2.10.1
