# BERT 2

In [1]:
!pip install bert-for-tf2
!pip install sentencepiece



### Imports

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
import pandas as pd
from collections import namedtuple
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
from sklearn.model_selection import train_test_split
import keras


TensorFlow Version: 2.2.0-rc3
Hub version:  0.8.0


Using TensorFlow backend.


### BERT Embedding Layer

In [0]:
bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True)

In [0]:
MAX_SEQ_LEN=25
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                    name="segment_ids")

In [0]:
def get_masks(tokens, max_seq_length):
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

In [0]:
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

### Tokenization

In [0]:
FullTokenizer=bert.bert_tokenization.FullTokenizer

vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()

do_lower_case=bert_layer.resolved_object.do_lower_case.numpy()

tokenizer=FullTokenizer(vocab_file,do_lower_case)

def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

### Load Data 

In [8]:
df1 = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
df2 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
# re-order attibute columns in df2
df2 = df2[['article_link','headline','is_sarcastic']]
df = pd.concat([df1, df2], axis=0)
df = df.drop(['article_link'], axis=1)
print(len(df))
df.head()


55328


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [0]:
df.reset_index(inplace=True, drop=True)

In [10]:
df.sort_values("headline", inplace = True)
df.drop_duplicates(subset ="headline", 
                     keep = 'first', inplace = True) 
print(len(df))

28503


### Split data in train, test
75% for train, 25% for test.

In [0]:
train, test = train_test_split(df, test_size=0.25)

In [12]:
print('train:',train.shape)
print('test:',test.shape)

train: (21377, 2)
test: (7126, 2)


### Prepare Training Data 

In [0]:
train_sentences = train['headline'].values
train_y = train['is_sarcastic'].values

In [0]:
def create_single_input(sentence,MAX_LEN):
  
    stokens = tokenizer.tokenize(sentence)

    stokens = stokens[:MAX_LEN]

    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    ids = get_ids(stokens, tokenizer, MAX_SEQ_LEN)
    masks = get_masks(stokens, MAX_SEQ_LEN)
    segments = get_segments(stokens, MAX_SEQ_LEN)

    return ids,masks,segments

def create_input_array(sentences):

    input_ids, input_masks, input_segments = [], [], []

    for sentence in tqdm(sentences,position=0, leave=True):

        ids,masks,segments=create_single_input(sentence,MAX_SEQ_LEN-2)

        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]

### Create and Train model

In [0]:
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(1, activation="sigmoid", name="dense_output")(x)

model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=out)

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy',keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.TruePositives()])

In [16]:
inputs=create_input_array(train_sentences)

model.fit(inputs,train_y,epochs=6,batch_size=32,validation_split=0.2,shuffle=True)

100%|██████████| 21377/21377 [00:03<00:00, 5874.14it/s]


Epoch 1/6


ValueError: ignored

### Predict

In [17]:
test_sentences = test['headline'].values
test_inputs=create_input_array(test_sentences)
test_y = test['is_sarcastic'].values

100%|██████████| 7126/7126 [00:01<00:00, 5802.03it/s]


In [18]:
model.metrics_names

['loss', 'accuracy', 'precision_1', 'recall_1', 'true_positives_1']

In [19]:
loss, accuracy, precision, recall, true_positives = model.evaluate(test_inputs, test_y, batch_size=32)



In [0]:
mult_pr=precision*recall
sum_pr=precision+recall
div=mult_pr/sum_pr
f1_score=2*div

#### Loss, Accuracy, Precision, Recall and F1

In [21]:
print('Loss:',loss)
print('Accuracy:',accuracy)
print('Precision:',precision)
print('Recall:',recall)
print('f1 score:',f1_score)
print('True positives:',true_positives)

Loss: 0.6916602253913879
Accuracy: 0.5300308465957642
Precision: 0.47785142064094543
Recall: 0.3092227280139923
f1 score: 0.37547293384866504
True positives: 19449.0
