# Training of BERT model

In [28]:
import pandas as pd

In [29]:
df = pd.read_csv("training.csv", encoding='latin-1', header = None)
df.columns=['label', 'id', 'Date', 'Query', 'User', 'text']
df = df.drop(columns=['id', 'Date', 'Query', 'User'], axis=1)
df.loc[df["label"]==4, "label"] = 1

In [30]:
df = pd.concat([df[df["label"]==0].head(1000), df[df["label"]==1].head(700)])
df.reset_index()

Unnamed: 0,index,label,text
0,0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,0,is upset that he can't update his Facebook by ...
2,2,0,@Kenichan I dived many times for the ball. Man...
3,3,0,my whole body feels itchy and like its on fire
4,4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...,...
1695,800695,1,i fell of the tredmill today in sport
1696,800696,1,Look closely at the sign ?
1697,800697,1,going to the sunshine coast on Thursday shoul...
1698,800698,1,@Cookcj I love reading your tweets but it woul...


In [31]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from math import log, sqrt
import pandas as pd
import numpy as np
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaspalmgren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
hashtags = re.compile(r"^#\S+|\s#\S+")
mentions = re.compile(r"^@\S+|\s@\S+")
urls = re.compile(r"https?://\S+")

def process_text(text):
    text = re.sub(r'http\S+', '', text)
    text = hashtags.sub(' hashtag', text)
    text = mentions.sub(' entity', text)
    return text.strip().lower()

In [33]:
df['text'] = df.text.apply(process_text)

In [34]:
df

Unnamed: 0,label,text
0,0,"entity - awww, that's a bummer. you shoulda ..."
1,0,is upset that he can't update his facebook by ...
2,0,entity i dived many times for the ball. manage...
3,0,my whole body feels itchy and like its on fire
4,0,"entity no, it's not behaving at all. i'm mad. ..."
...,...,...
800695,1,i fell of the tredmill today in sport
800696,1,look closely at the sign ?
800697,1,going to the sunshine coast on thursday shoul...
800698,1,entity i love reading your tweets but it would...


In [35]:
from sklearn.model_selection import train_test_split

TRAIN_SIZE = 0.75
VAL_SIZE = 0.05
dataset_count = len(df)


X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'])

# Model training

In [36]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [37]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [38]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435168 , -0.5132725 , -0.8884571 , ..., -0.7474886 ,
        -0.75314724,  0.91964483],
       [-0.8720836 , -0.5054397 , -0.9444667 , ..., -0.85847515,
        -0.7174535 ,  0.8808299 ]], dtype=float32)>

In [39]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [40]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [41]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [42]:
model.fit(X_train, y_train, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa64a5fbb20>

In [43]:
model.evaluate(X_test, y_test)



[0.6513199210166931, 0.6023529171943665, 0.0, 0.0]

In [44]:
reviews = [
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your ."
]
model.predict(reviews)

array([[0.38063595],
       [0.37575477],
       [0.37697938],
       [0.3367617 ],
       [0.31647128]], dtype=float32)