[Reference](https://medium.com/@aniketmohan/text-classification-using-bert-and-tensorflow-on-google-colab-b222762f6b48)

![photo](https://miro.medium.com/max/1302/1*uOSGRRS81T54b4X2d1em7w.png)

In [21]:
!pip install tensorflow
!pip install tensorflow_hub
!pip install tensorflow_text



## Import the Dataset
- The dataset is [here](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [22]:
import pandas as pd
df = pd.read_csv('https://storage.googleapis.com/kagglesdsdata/datasets/483/982/spam.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20211010%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20211010T033443Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=81fc23e9865efcb4aefb6f579c56cc07e6b7b8c4305a90ca41df8a9702bb4d138b930bb9fa798d41c3c30d99e120e044928dc9057a4bebd824dcbceb208dd8deefb024eda85e3ae571a35a66b7cca2b513afc362d4d65af1e4ba700a585db7700e2d19819be1978d1825def7a042d85e03d3eab444e0c9996c237ed40252b96b6478986a965c021a788b5e0ca0a170b8967de081b4408ace64d4290afecd80d3e5567250a930679bc7f721dbc9f527a31f543900461b6def26eb72e16e658b382d1865edd5e2f2726e0f5170a3b9a75de0b1f49f31fc6ceeb114bbdfe5b93c52f5a3e08b58aa45488f30ec7cc2dde3c722a76fbb3359db0bae5b183704cef155',encoding='latin-1')

In [23]:
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [24]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df = df.rename(columns={"v1":"category","v2":"message"})

In [25]:
df.head(5)

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Perform Basic Analysis

In [26]:
df.groupby('category').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [27]:
df['spam']=df['category'].apply(lambda x: 1 if x=='spam' else 0)
df.head(5)

Unnamed: 0,category,message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Train Test Split

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['message'],df['spam'], stratify=df['spam'])

In [29]:
X_train.head()

2905    Helloooo... Wake up..! \Sweet\" \"morning\" \"...
4777    U R THE MOST BEAUTIFUL GIRL IVE EVER SEEN. U R...
4837    All boys made fun of me today. Ok i have no pr...
370     Hello my boytoy ... Geeee I miss you already a...
5218            I accidentally brought em home in the box
Name: message, dtype: object

## Creating Embedding using BERT

In [30]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

In [31]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [32]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']
get_sentence_embeding(
    ['500$, discount, hurry up',
     "Branden, are you up for a soccer game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.7557014 , -0.4842263 , -0.7586232 , ..., -0.59951204,
        -0.7264979 ,  0.8702544 ],
       [-0.8961442 , -0.45222998, -0.9513207 , ..., -0.89478016,
        -0.6995349 ,  0.8847723 ]], dtype=float32)>

## Creating a Model

In [33]:
# bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name = 'text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name='dropout')(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(l)
# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

## Model Summary

In [34]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      {'input_type_ids': ( 0           text[0][0]                       
__________________________________________________________________________________________________
keras_layer_3 (KerasLayer)      {'sequence_output':  109482241   keras_layer_2[0][0]              
                                                                 keras_layer_2[0][1]              
                                                                 keras_layer_2[0][2]              
____________________________________________________________________________________________

## Compile Model


In [35]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [36]:
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3a1dc6ef10>

## Evaluate Model


In [37]:
model.evaluate(X_test, y_test)



[0.1615161895751953, 0.9318018555641174]

## Predict on Emails

In [38]:
reviews = [
           'Reply to win $100 weekly! Where will the 2006 FIFA World Cup be held? Send STOP to 87239 to end service',
           'You are awrarded a SiPix Digital Camera! call XXX-XXX-XXX from landline. Delivery within 28days.',
           'It is 80488. Your 500 free text messages are valid until 31 December 2021',
           'Hey Branden, Are you coming for a soccer game tomorrow',
           'Why do not you wait til at least wednesday to see if you get your mail'
]
model.predict(reviews)

array([[0.29653773],
       [0.42460907],
       [0.4019745 ],
       [0.02064326],
       [0.02839532]], dtype=float32)