In [1]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm 
from tensorflow import keras
import math

from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

#import bert
#from bert import run_classifier
#from bert import optimization
#from bert import tokenization

#from tensorflow.keras.models import Model
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

In [2]:
tf.__version__

'2.0.0'

In [2]:
# Define path
#path = 'C:/Users/zhangmen/Downloads/QA_Google/data'
# path = '/Users/Mengying/Desktop/QA_Google' 
path = 'gs://question_answering_bkt' 

In [3]:
# read in dataset from 01_Preprocess
long_df = pd.read_csv(path+"/data/yesno_df_4kobs.csv")
long_df.shape

(4555, 4)

In [4]:
# split into train and test
from sklearn.utils import shuffle
long_df = shuffle(long_df,random_state=221)
cutoff = int(long_df.shape[0]*0.8)
train = long_df.iloc[0:cutoff,]
test = long_df.iloc[cutoff:,]
print("training: {}, test: {}".format(train.shape[0],test.shape[0]))
train.head(2)   

training: 3644, test: 911


Unnamed: 0,paragraph,question,is_answer,example_id
2121,"<P> An invoice , bill or tab is a commercial d...",is a bill the same as an invoice,YES,8182025560135036765
3076,<P> Stand by Me is a 1986 American coming - of...,stand by me movie based on a true story,NO,-2957418540183737743


## use bert-for-tf2 to build BERT model
Reference: https://github.com/kpe/bert-for-tf2

In [5]:
class YesnoData:
    '''
       Process yesno_df's train and test df to token ids in 2-d np arrays.     
    '''
    DATA_COLUMN = ['question','paragraph']
    LABEL_COLUMN = 'is_answer'

    def __init__(self, tokenizer: FullTokenizer, train, test, max_seq_len=1024):
        self.tokenizer = tokenizer
        self.max_seq_len = 0
             
        ((self.train_x, self.train_y, self.train_tokens),
         (self.test_x, self.test_y, self.test_tokens)) = map(self._prepare, [train, test])

        print("input max seq_len: ", self.max_seq_len, ", capped at: ", min(self.max_seq_len, max_seq_len))
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        
       
        (self.train_segment_ids,
         self.test_segment_ids) = map(self.get_segments, [self.train_tokens, self.test_tokens])

        (self.train_x,
         self.test_x) = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        ''' tokenize everyrow in training/test data '''
        x, y = [], []
        x_tokens = []
        
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                question,paragraph, label = row[YesnoData.DATA_COLUMN[0]],row[YesnoData.DATA_COLUMN[1]], row[YesnoData.LABEL_COLUMN]
                question_tokens = self.tokenizer.tokenize(question)
                paragraph_tokens = self.tokenizer.tokenize(paragraph)
                tokens = ["[CLS]"] +  question_tokens + ["[SEP]"] + paragraph_tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x_tokens.append(tokens)
                x.append(token_ids)
                # Edited:
                label = 1 if label == "YES" else 0
                y.append(label)
                pbar.update()
        return np.array(x), np.array(y), x_tokens

    
    def _pad(self, ids):
        ''' add padding to each sentence array and return input and mask id '''
        x= []
        # one row, one data
        for input_ids in ids: # one concatenated ids of question + paragraph
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]  
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
           
        print("input matrix dim: ", np.array(x).shape)
        return np.array(x)
    
    def get_segments(self, tokens_data):
        """ segments: 0 for the first sequence, 1 for the second
            return segment id
        """
        s = []
        for tokens in tokens_data: 
            segments = []
            current_segment_id = 0
            for token in tokens[0:min(self.max_seq_len, len(tokens))]:
                segments.append(current_segment_id)
                if token == "[SEP]":
                    current_segment_id = 1
            if current_segment_id != 1: print("No paragraph reached!")
            s.append(segments + [0] * (self.max_seq_len - len(tokens)))
        return np.array(s)

In [6]:
# Specify pre-trained BERT model
bert_model_name="uncased_L-12_H-768_A-12"
bert_ckpt_dir= os.path.join(path,"model",bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
bert_ckpt_dir

'gs://question_answering_bkt/model/uncased_L-12_H-768_A-12'

In [7]:
# Prepare data
tokenizer = FullTokenizer(vocab_file= os.path.join(bert_ckpt_dir, "vocab.txt"))
data = YesnoData(tokenizer, 
                       train, test,
                       max_seq_len=128)


100%|██████████| 3.64k/3.64k [00:35<00:00, 101it/s] 
100%|██████████| 911/911 [00:08<00:00, 105it/s] 


input max seq_len:  74459 , capped at:  128
input matrix dim:  (3644, 128)
input matrix dim:  (911, 128)


In [8]:
def flatten_layers(root_layer):
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
        l_bert.embeddings_layer.trainable = False


# create model
def create_model(max_seq_len, adapter_size=64):
    """Creates a classification model."""

    #adapter_size = 64  # see - arXiv:1902.00751

    # create the bert layer
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert_params.adapter_size = adapter_size
        bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    segment_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="segment_ids")    
    output         = bert([input_ids,segment_ids])
    
    print("bert shape", output.shape)
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=2, activation="softmax")(logits)

    model = keras.Model(inputs=[input_ids, segment_ids], outputs=logits)
    model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
    

    # load the pre-trained model weights
    load_stock_weights(bert, bert_ckpt_file)

    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(bert)

    model.compile(optimizer=keras.optimizers.Adam(),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    model.summary()

    return model

In [14]:
adapter_size = 64 # None # use None to fine-tune all of BERT
model = create_model(data.max_seq_len, adapter_size=adapter_size)

bert shape (None, 128, 768)
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-down/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/kernel] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-down/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/bias] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-up/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/kernel] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-up/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/bias] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/output/adapter-down/kernel:0], i.

In [9]:
def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):

    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
        else:
            res = max_learn_rate*math.exp(math.log(end_learn_rate/max_learn_rate)*(epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
        return float(res)
    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

    return learning_rate_scheduler

In [11]:
%%time

#timestamp = datetime.now().strftime("%Y%m%d-%H%M%s")
timestamp = "20200107-21261578432386"
print("Timestamp: ", timestamp)
log_dir = path+"/log/" + timestamp
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# Edited: add model checkpoints
checkpoint_path = path+"/checkpoint/" + timestamp +".ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)


total_epoch_count = 5
# model.fit(x=(data.train_x, data.train_x_token_types), y=data.train_y,
model.fit(x=(data.train_x, data.train_segment_ids), y=data.train_y,
          validation_split=0.1,
          batch_size=36,
          shuffle=True,
          epochs=total_epoch_count,
          callbacks=[create_learning_rate_scheduler(max_learn_rate=1e-5,
                                                    end_learn_rate=1e-7,
                                                    warmup_epoch_count=2,
                                                    total_epoch_count=total_epoch_count),
                     keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
                     tensorboard_callback,
                     cp_callback])

#model.save_weights(path+'/log/longdf_10k.h5', overwrite=True)

 


Timestamp:  20200107-21261578432386
Train on 3279 samples, validate on 365 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 5e-06.
Epoch 1/5
Epoch 00001: saving model to gs://question_answering_bkt/checkpoint/20200107-21261578432386.ckpt

Epoch 00002: LearningRateScheduler reducing learning rate to 1e-05.
Epoch 2/5
Epoch 00002: saving model to gs://question_answering_bkt/checkpoint/20200107-21261578432386.ckpt

Epoch 00003: LearningRateScheduler reducing learning rate to 3.162277660168379e-06.
Epoch 3/5
Epoch 00003: saving model to gs://question_answering_bkt/checkpoint/20200107-21261578432386.ckpt

Epoch 00004: LearningRateScheduler reducing learning rate to 1e-06.
Epoch 4/5

Epoch 00004: saving model to gs://question_answering_bkt/checkpoint/20200107-21261578432386.ckpt

KeyboardInterrupt: 

In [10]:
# Evaluation

model = create_model(data.max_seq_len, adapter_size=None)
# Loads the weights
checkpoint_path = "gs://question_answering_bkt/checkpoint/20200107-21261578432386.ckpt"

model.load_weights(checkpoint_path)

bert shape (None, 128, 768)
Done loading 197 BERT weights from: gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fcda3c36978> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
________________________________

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fcd7d391e10>

In [17]:
# _, train_acc = model.evaluate((data.train_x,data.train_segment_ids), data.train_y,verbose=2)
# print("train acc", train_acc) # loss: 0.5023 - acc: 0.7431

# _, test_acc = model.evaluate((data.test_x,data.test_segment_ids), data.test_y, verbose=2)
# print(" test acc", test_acc) # loss: 0.5088 - acc: 0.7482


In [11]:
# Inspect
test_pred = model.predict((data.test_x,data.test_segment_ids))#.argmax(axis=-1)

In [13]:
test_pred[1]

array([0.01299549, 0.9870046 ], dtype=float32)

In [14]:
test['pred'] = test_pred[:,1]
test.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,paragraph,question,is_answer,example_id,pred
881,<P> During Microsoft 's E3 2015 press conferen...,will an xbox 360 game work on an xbox one,YES,-5000357343921088794,0.981851


In [15]:
test.to_csv(path+"/data/yesno_test_pred.csv",index=False)

In [30]:
# true positive
print("TP: ", test[(test.pred==1) & (test.is_answer=="YES")].shape[0]/test[test.is_answer=="YES"].shape[0])
print("FP: ", test[(test.pred==1) & (test.is_answer=="NO")].shape[0]/test[test.is_answer=="NO"].shape[0])
print("Accuracy: ",test[(test.pred==1) & (test.is_answer=="YES") | (test.pred==0) & (test.is_answer=="NO") ].shape[0]/test.shape[0])


TP:  0.963855421686747
FP:  0.8848484848484849
Accuracy:  0.6564215148188803


In [29]:
# missed (false negative)
print("missed: ", test[(test.pred==0) & (test.is_answer=="YES")].shape[0]/test[test.is_answer=="YES"].shape[0])

missed:  0.03614457831325301


In [37]:
test[test.is_answer=="YES"].shape[0]/test.shape[0]

0.6377607025246982

In [34]:
test[test.pred==1].shape[0]/test.shape[0]

0.9352360043907794

In [94]:
# Read in test predict data
test_pred = pd.read_csv(path+"/test_pred.csv")

In [95]:
test_pred.iloc[126:130,]

Unnamed: 0,paragraph,question,is_answer,example_id,pred
126,<Tr> <Td> 2 . </Td> <Td> `` Ogre Hunters / Fai...,what song is at the end of shrek,0,6006233773350327013,0
127,"<Table> <Tr> <Th_colspan=""2""> `` ( I 'd Be ) A...",who sang i'd be a legend in my time,0,7811273331944324574,1
128,<Ul> <Li> David Essex ... Jim Maclaine </Li> <...,where was that'll be the day filmed,0,-2889700351156361235,0
129,<P> `` I Wanna Be Your Man '' is a Lennon -- M...,who wrote i want to be your man,1,8107395391359962444,0


In [45]:
train[train.example_id==8107395391359962444	]

Unnamed: 0,paragraph,question,is_answer,example_id
8683,<Tr> <Th> Genre </Th> <Td> <Ul> <Li> Rock and ...,who wrote i want to be your man,0,8107395391359962444


In [46]:
test_pred[test.example_id==8107395391359962444]

Unnamed: 0,paragraph,question,is_answer,example_id,pred
129,<P> `` I Wanna Be Your Man '' is a Lennon -- M...,who wrote i want to be your man,1,8107395391359962444,0
2534,"<Tr> <Td_colspan=""2""> <Table> <Tr> <Td> `` Com...",who wrote i want to be your man,0,8107395391359962444,0


In [48]:
test_pred.iloc[129,0]

"<P> `` I Wanna Be Your Man '' is a Lennon -- McCartney - penned song recorded and released as a single by the Rolling Stones , a"

In [44]:
test_pred.iloc[1072,0]

"<P> `` ( I 'd Be ) A Legend in My Time '' is a song written and recorded by Don Gibson in 1960 . It appeared as the B - side of his hit `` Far Far Away '' , from the album Sweet Dreams . Gibson re-recorded the song on the 1972 album Country Green . </P>"

In [43]:
test_pred.iloc[127,1]

"who sang i'd be a legend in my time"

In [12]:
test_pred.iloc[0,1]

'plates that are bounded in part by the mid-atlantic ridge'

In [53]:
# Predict on new unseen data
unseen = pd.read_csv(path+"/data/long_df_test.csv")
tokenizer = FullTokenizer(vocab_file= os.path.join(bert_ckpt_dir, "vocab.txt"))
unseen_data = LongDfData(tokenizer, 
                       test, unseen,
                       max_seq_len=128)


100%|██████████| 3.00k/3.00k [00:11<00:00, 265it/s]
100%|██████████| 7.60k/7.60k [00:27<00:00, 279it/s]


input max seq_len:  54838 , capped at:  128
input matrix dim:  (2999, 128)
input matrix dim:  (7596, 128)


In [57]:
model = create_model(unseen_data.max_seq_len, adapter_size=None)
# Loads the weights
checkpoint_path = "gs://question_answering_bkt/checkpoint/20191203-17281575394094.ckpt"

model.load_weights(checkpoint_path)

unseen_pred = model.predict((unseen_data.test_x,unseen_data.test_segment_ids)).argmax(axis=-1)

bert shape (None, 128, 768)
Done loading 197 BERT weights from: gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7f7bd76194e0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
________________________________

In [58]:
unseen['pred'] = unseen_pred
unseen.head(1)

Unnamed: 0,paragraph,question,is_answer,example_id,pred
0,"<Table> London Underground <Tr> <Td_colspan=""2...",which was the first tube station in london,1,8778323820284358127,0


In [59]:
# accuracy
print("Acc: ", unseen[unseen.pred==unseen.is_answer].shape[0]/unseen.shape[0])
# true positive
print("TP: ", unseen[(unseen.pred==1) & (unseen.is_answer==1)].shape[0]/unseen[unseen.is_answer==1].shape[0])
print("FP: ", unseen[(unseen.pred==1) & (unseen.is_answer==0)].shape[0]/unseen[unseen.is_answer==0].shape[0])
# missed (false negative)
print("missed: ", unseen[(unseen.pred==0) & (unseen.is_answer==1)].shape[0]/unseen[unseen.is_answer==1].shape[0])


Acc:  0.7392048446550816
TP:  0.7176656151419558
FP:  0.25
missed:  0.2823343848580442


In [64]:
unseen[(unseen.pred==0) & (unseen.is_answer==1)]

Unnamed: 0,paragraph,question,is_answer,example_id,pred
0,"<Table> London Underground <Tr> <Td_colspan=""2...",which was the first tube station in london,1,8778323820284358127,0
3,"<Li> Poppy Drayton as Elizabeth , the mermaid ...",who's going to be the little mermaid,1,-3655724318328977880,0
9,<Table> <Tr> <Th> Wrestler </Th> <Th> Victorie...,who won the most money in the bank,1,4925287453027636288,0
18,<P> The courts of the United States are closel...,the american judicial system is divided into t...,1,-8210862213696434902,0
33,<Table> <Tr> <Th> Year </Th> <Th> Title </Th> ...,who played dan pruitt on grey's anatomy,1,4365737497942094400,0
...,...,...,...,...,...
7563,<P> Plantations were an important aspect of th...,who provided most of the labor on southern pla...,1,7984379203023000937,0
7575,<P> In 2009 Humphrey appeared in the Canadian ...,who plays the new pastor on when calls the heart,1,-1955031650897404973,0
7584,<Table> <Tr> <Th> Episode Title </Th> <Th> Son...,what is the episode of phineas and ferb with s...,1,-3562346415240663031,0
7590,<P> A common misconception is that a person mu...,when can you call in a missing person,1,1837476516469930674,0


In [None]:
ind =  7593
print(unseen.iloc[ind,1])
unseen.iloc[ind,0]

In [None]:
# Prediction

pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
pred_tokens    = map(tokenizer.tokenize, pred_sentences)
pred_tokens    = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

pred_token_ids = map(lambda tids: tids +[0]*(data.max_seq_len-len(tids)),pred_token_ids)
pred_token_ids = np.array(list(pred_token_ids))

print('pred_token_ids', pred_token_ids.shape)

res = model.predict(pred_token_ids).argmax(axis=-1)

for text, sentiment in zip(pred_sentences, res):
  print(" text:", text)
  print("  res:", ["negative","positive"][sentiment])


### Access pre-trained model in tensorflow-hub, by keras
Reference: https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22, 
 https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/
 
The BERT layer requires 3 input sequence:
<li>Token ids: for every token in the sentence. We restore it from the BERT vocab dictionary</li>
<li>Mask ids: for every token to mask out tokens used only for the sequence padding (so every sequence has the same length).</li>
<li>Segment ids: 0 for one-sentence sequence, 1 if there are two sentences in the sequence and it is the second one (see the original paper or the corresponding part of the BERT on GitHub for more details: convert_single_example in the run_classifier.py).</li>

In [None]:
# Reference: tensorflowhub: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1
max_seq_length = 512  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])