In [None]:
#!pip install -U tensorflow
!pip install tensorflow_text==2.8.2
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub



In [None]:
# Load the BERT preprocessing and encoder layers from TF Hub.
bert_preprocess = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
bert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

# BERT part of the graph: raw string input -> preprocessing -> encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout on the encoder's pooled output, followed by a
# single sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

# Tie the string input and the sigmoid output together into the final model.
model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [None]:
# Print the layer-by-layer overview of the model built above.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [None]:
import pandas as pd

# Decode the CSV as Latin-1. Latin-1 assigns a character to every possible
# byte value, so this decode can never raise UnicodeDecodeError; a fallback to
# 'ISO-8859-1' (an alias of the very same codec) would be unreachable and
# would not behave differently even if it ran. If the text looks garbled,
# change the encoding here (e.g. to 'cp1252') instead of adding a try/except.
df = pd.read_csv("/content/sample_data/spam.csv", encoding='latin-1')

df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Summary statistics of every column, computed separately for each 'v1' label.
df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2,Unnamed: 2,Unnamed: 2,Unnamed: 2,Unnamed: 2,Unnamed: 3,Unnamed: 3,Unnamed: 3,Unnamed: 3,Unnamed: 4,Unnamed: 4,Unnamed: 4,Unnamed: 4
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ham,4825,4516,"Sorry, I'll call later",30,45,39,"bt not his girlfrnd... G o o d n i g h t . . .@""",3,10,9,GE,2,6,5,"GNT:-)""",2.0
spam,747,653,Please call our customer service representativ...,4,5,4,PO Box 5249,2,2,1,"MK17 92H. 450Ppw 16""",2,0,0,,


In [None]:
# Number of rows per label in 'v1' (shows how imbalanced the classes are).
df['v1'].value_counts()

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Split the messages by their 'v1' label.
df_spam = df[df['v1'] == 'spam']
df_ham = df[df['v1'] == 'ham']

# A notebook cell only displays its final expression, so a bare
# `df_spam.shape` in the middle of the cell is silently discarded. Report both
# shapes together as (spam_shape, ham_shape).
df_spam.shape, df_ham.shape

(4825, 5)

In [None]:
# Downsample ham to the same number of rows as spam so the classes are
# balanced. A fixed random_state makes the draw repeatable, so a
# Restart & Run All yields the same balanced dataset every time.
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)
df_ham_downsampled.shape

(747, 5)

In [None]:
# Stack the downsampled ham rows on top of all spam rows.
balanced_parts = [df_ham_downsampled, df_spam]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

(1494, 5)

In [None]:
# Sanity check: row count per label in the balanced frame.
df_balanced['v1'].value_counts()

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
ham,747
spam,747


In [None]:
# Encode the label as an integer target: 1 where 'v1' is 'spam', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the result is
# the same int64 column.
df_balanced['spam'] = (df_balanced['v1'] == 'spam').astype('int64')
df_balanced.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
2043,spam,4mths half price Orange line rental & latest c...,,,,1
1832,ham,What time is ur flight tmr?,,,,0
3700,ham,Shall i get my pouch?,,,,0
3513,ham,I always chat with you. In fact i need money c...,,,,0
2704,spam,FreeMsg: Fancy a flirt? Reply DATE now & join ...,,,,1


In [None]:
# The 'spam' target column was already created by the preceding cell, so it is
# not recomputed here; just draw another random preview of the balanced frame.
df_balanced.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,spam
4392,spam,RECPT 1/3. You have ordered a Ringtone. Your o...,,,,1
5367,ham,Just trying to figure out when I'm suppose to ...,,,,0
3289,ham,My tuition is at 330. Hm we go for the 1120 to...,,,,0
3295,ham,Hi there. We have now moved in2 our pub . Woul...,,,,0
5142,ham,Now that you have started dont stop. Just pray...,,,,0


In [None]:
from sklearn.model_selection import train_test_split
# Stratified split on the target so both splits keep the same spam/ham ratio.
# random_state pins the shuffle so the train/test membership (and therefore
# the reported metrics) can be reproduced on a fresh run.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['v2'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [None]:
# Peek at the first four training messages.
X_train.head(4)

Unnamed: 0,v2
867,PRIVATE! Your 2003 Account Statement for 07808...
2548,Honestly i've just made a lovely cup of tea an...
823,25p 4 alfie Moon's Children in need song on ur...
380,I guess that's why you re worried. You must kn...


In [None]:
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding for each input sentence.

    `sentences` is passed through the module-level `bert_preprocess` layer,
    the result is fed to `bert_encoder`, and the encoder's 'pooled_output'
    entry is returned.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

# Quick smoke test of the embedding helper on two example sentences.
sample_sentences = [
    "500$ discount. hurry up",
    "Bhavin, are you up for a volleybal game tomorrow?",
]
get_sentence_embeding(sample_sentences)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435172 , -0.5132728 , -0.88845754, ..., -0.7474889 ,
        -0.75314754,  0.91964495],
       [-0.87208337, -0.5054394 , -0.94446665, ..., -0.85847485,
        -0.71745324,  0.88082963]], dtype=float32)>

In [None]:
# Embed three fruit names (indices 0-2) followed by three people's names
# (indices 3-5).
e = get_sentence_embeding(
    ["banana", "grapes", "mango", "jeff bezos", "elon musk", "bill gates"]
)

In [None]:
# Display the embedding tensor computed in the previous cell.
e

<tf.Tensor: shape=(6, 768), dtype=float32, numpy=
array([[-0.760692  , -0.14219391,  0.49604604, ...,  0.42165324,
        -0.5322142 ,  0.8031219 ],
       [-0.8602322 , -0.21242957,  0.4915689 , ...,  0.3979803 ,
        -0.60506296,  0.84471667],
       [-0.7128861 , -0.15463915,  0.38401663, ...,  0.3527872 ,
        -0.5099134 ,  0.73474085],
       [-0.8253347 , -0.35550573, -0.5906969 , ..., -0.01613709,
        -0.6141758 ,  0.87230283],
       [-0.7504134 , -0.2681261 , -0.26689658, ...,  0.02839418,
        -0.5938099 ,  0.7974987 ],
       [-0.78544396, -0.29949713,  0.41027236, ...,  0.5222534 ,
        -0.4957357 ,  0.81507534]], dtype=float32)>

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the embedding at index 0 ("banana") and the one at
# index 4 ("elon musk"); each vector is wrapped in a list to form a 2-D input.
cosine_similarity([e[0]],[e[4]])

array([[0.89336324]], dtype=float32)

In [None]:
# Number of training examples.
len(X_train)

1120

In [None]:
# Metrics tracked during training and evaluation, reported in this order
# after the loss.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)

In [None]:
# Train the compiled model on the training split for 10 epochs.
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x785d34940f40>

In [None]:
# Evaluate on the held-out split; the displayed list is the loss followed by
# the metrics passed to compile (accuracy, precision, recall).
model.evaluate(X_test, y_test)



[0.2753761410713196,
 0.9064171314239502,
 0.8999999761581421,
 0.9144384860992432]

In [None]:
# Predict on the test split and collapse the result to a 1-D array.
y_predicted = model.predict(X_test).flatten()

In [None]:
import numpy as np

# Turn the sigmoid outputs into hard labels: 1 when the score is above 0.5,
# otherwise 0.
is_spam = y_predicted > 0.5
y_predicted = np.where(is_spam, 1, 0)
y_predicted

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,

In [None]:
# Hand-written example messages used to spot-check the trained model. The
# model ends in a single sigmoid unit, so each prediction is a score between
# 0 and 1; the target column was encoded with spam = 1.
reviews = [
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]
model.predict(reviews)

array([[0.7249197 ],
       [0.79679346],
       [0.786399  ],
       [0.15657091],
       [0.087358  ]], dtype=float32)