In [1]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm 
from tensorflow import keras

from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

from keras.activations import softmax

def softMaxAxis1(x):
    return tf.nn.log_softmax(x,axis=1)
    #return softmax(x,axis=1)

Using TensorFlow backend.


In [37]:
tf.__version__

'2.0.0'

In [2]:
# Define path
#path = 'C:/Users/zhangmen/Downloads/QA_Google/data'
# path = '/Users/Mengying/Desktop/QA_Google' 
path = 'gs://question_answering_bkt' 

In [3]:
# read in dataset from 01_Preprocess
short_df = pd.read_csv(path+"/data/short_df_60kobs.csv")
short_df.shape

(26410, 4)

In [4]:
# split into train and test
from sklearn.utils import shuffle
short_df = shuffle(short_df,random_state=221)
cutoff = int(short_df.shape[0]*0.8)
train = short_df.iloc[0:cutoff,]
test = short_df.iloc[cutoff:,]
print("training: {}, test: {}".format(train.shape[0],test.shape[0]))   

training: 21128, test: 5282


In [5]:
#test.to_csv(path+"/data/test_short_df.csv",index=False)

In [5]:
train.head(2)

Unnamed: 0,paragraph,question,is_answer,example_id
2803,<P> Ammonium nitrate is used in some instant c...,what happens when ammonium nitrate reacts with...,endothermic,-7122609906755824580
16720,"<P> The film premiered June 25 , 2004 , in the...",when did the movie the notebook come out,"June 25 , 2004",-2916262218690511803


## use bert-for-tf2 to build BERT model
Reference: https://github.com/kpe/bert-for-tf2

In [5]:
class ShortDfData:
    '''
       Process short_df's train and test df to token ids in 2-d np arrays.     
    '''
    DATA_COLUMN = ['question','paragraph']
    LABEL_COLUMN = 'is_answer'

    def __init__(self, tokenizer: FullTokenizer, train, test, max_seq_len=1024):
        self.tokenizer = tokenizer
        self.max_seq_len = 0
             
        ((self.train_x, self.train_y, self.train_tokens),
         (self.test_x, self.test_y, self.test_tokens)) = map(self._prepare, [train, test])

        print("input max seq_len: ", self.max_seq_len, ", capped at: ", min(self.max_seq_len, max_seq_len))
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        
        ((self.train_x, self.train_y, self.train_tokens),
         (self.test_x, self.test_y, self.test_tokens)) = map(lambda x: self._shift_and_convert(x[0],x[1],x[2]), [(self.train_x, self.train_y, self.train_tokens), (self.test_x, self.test_y, self.test_tokens)])
       
        (self.train_segment_ids,
         self.test_segment_ids) = map(self.get_segments, [self.train_tokens, self.test_tokens])

        (self.train_x,
         self.test_x) = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        ''' tokenize everyrow in training/test data '''
        x, y = [], []
        x_tokens = []
        
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                question,paragraph, answer = row[ShortDfData.DATA_COLUMN[0]],row[ShortDfData.DATA_COLUMN[1]], row[ShortDfData.LABEL_COLUMN]
                question_tokens = self.tokenizer.tokenize(question)
                paragraph_tokens = self.tokenizer.tokenize(paragraph)
                tokens = ["[CLS]"] +  question_tokens + ["[SEP]"] + paragraph_tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

                # Edited: process answer, find start/end index in tokens.
                if answer != "YES" and answer != "NO":
                    answer_tokens = self.tokenizer.tokenize(answer)
                    start = -1
                    end = -1
                    for t in range(len(tokens)):
                        if tokens[t] == answer_tokens[0]:
                            start = t
                            i = 1
                            while i < len(answer_tokens): #and t+i < len(tokens):
                                if tokens[t+i] == answer_tokens[i]:
                                    i += 1
                                else: 
                                    break
                            if i == len(answer_tokens):
                                end = t+i # found it!
                                break

           
                    if start != -1 and (end == start + len(answer_tokens)): 
                        self.max_seq_len = max(self.max_seq_len, len(token_ids))
                        x_tokens.append(tokens)
                        x.append(token_ids)
                        y.append([start, end])
                        
#                 else: # Yes/No answer
#                     self.max_seq_len = max(self.max_seq_len, len(token_ids))
#                     x_tokens.append(tokens)
#                     x.append(token_ids)
#                     context_start_index = tokens.index('[SEP]')
#                     y.append([context_start_index+1, -1])
                        
                    
                                     
                pbar.update()
        return np.array(x), y, x_tokens

    
    def _shift_and_convert(self, tokens_id_array, span_array, tokens_array):
        ''' convert list of answer span [start,end] of tokens to output array shape '''
    
        y = []
        for i in range(len(span_array)):
            span = span_array[i]
            start = span[0]
            end = span[1]
            
            # YES/NO question: start = context_start_index, end = -1, ok
            
            # short answer:    
            if span[0] >= self.max_seq_len or span[1] >= self.max_seq_len: # let's shift if the span is not in the context or being cut off
                context_start_index = tokens_array[i].index('[SEP]') # find the context start index, only shift the context part

                # special case when we don't want to shift: span is way too long
                if start-context_start_index < int(self.max_seq_len/2): # only with we covered the start token, not end token case
                    end = -1

                else:
                    # at least half way we can shift and not reach the start token
                    shifted_start = span[0] - int(self.max_seq_len/2)

                    # update corresponding tokens and tokens_id
                    tokens_array[i] = tokens_array[i][:context_start_index+1] + tokens_array[i][shifted_start:]
                    tokens_id_array[i] = tokens_id_array[i][:context_start_index+1] + tokens_id_array[i][shifted_start:]

                    answer_diff = end-start
                    start = context_start_index + int(self.max_seq_len/2)+1
                    end = start + answer_diff
          
            one_hot_start = [0]*self.max_seq_len   
            one_hot_start[start] = 1
            one_hot_end = [0]*self.max_seq_len
            if end >= len(one_hot_end): # it has to cover the start, but end may not
                end = -1
            one_hot_end[end] = 1

            y.append([one_hot_start,one_hot_end])
        return np.array(tokens_id_array), np.array(y).astype('float32') , tokens_array 

            
                
        
    def _pad(self, ids):
        ''' add padding to each sentence array and return input '''
        x= []
        # one row, one data
        for input_ids in ids: # one concatenated ids of question + paragraph
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]  
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
           
        print("input matrix dim: ", np.array(x).shape)
        return np.array(x)
    
    def get_segments(self, tokens_data):
        """ segments: 0 for the first sequence, 1 for the second
            return segment id
        """
        s = []
        for tokens in tokens_data: 
            segments = []
            current_segment_id = 0
            for token in tokens[0:min(self.max_seq_len, len(tokens))]:
                segments.append(current_segment_id)
                if token == "[SEP]":
                    current_segment_id = 1
            if current_segment_id != 1: print("No paragraph reached!")
            s.append(segments + [0] * (self.max_seq_len - len(tokens)))
        return np.array(s)

In [6]:
# Specify pre-trained BERT model
bert_model_name="uncased_L-12_H-768_A-12"
bert_ckpt_dir= os.path.join(path,"model",bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
bert_ckpt_dir

'gs://question_answering_bkt/model/uncased_L-12_H-768_A-12'

In [7]:
# Prepare data
tokenizer = FullTokenizer(vocab_file= os.path.join(bert_ckpt_dir, "vocab.txt"))

data = ShortDfData(tokenizer, 
                       train, test,
                       max_seq_len=128)


100%|██████████| 21.1k/21.1k [02:25<00:00, 146it/s] 
100%|██████████| 5.28k/5.28k [00:36<00:00, 146it/s] 


input max seq_len:  55495 , capped at:  128
input matrix dim:  (20520, 128)
input matrix dim:  (5142, 128)


In [23]:
ind = 1
" ".join(data.train_tokens[ind])

'[CLS] who won the very first season of survivor [SEP] < p > richard holm ##an hatch jr ( born april 8 , 1961 ) is an american former reality television contestant . in 2000 , he won the first season of the cbs reality series survivor . he was a contestant on a subsequent all - stars season of survivor , on one season of celebrity apprentice , and on one season of the biggest loser . < / p > [SEP]'

In [24]:
_, [start,end] = np.nonzero(data.train_y[ind])
" ".join(data.train_tokens[ind][start:end])

'richard holm ##an hatch jr'

In [25]:
train.iloc[ind]

paragraph     <P> Richard Holman Hatch Jr ( born April 8 , 1...
question              who won the very first season of survivor
is_answer                               Richard Holman Hatch Jr
example_id                                  7539255480183006840
Name: 6693, dtype: object

In [8]:
def flatten_layers(root_layer):
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
        l_bert.embeddings_layer.trainable = False


In [9]:
# customize loss function
def compute_loss(y_true, y_pred):
    loss = -tf.reduce_mean(
        tf.reduce_sum(y_true * y_pred, axis=-1))
    return loss


# create model
def create_short_answer_model(max_seq_len, adapter_size=64):
    """Creates a classification model."""

    #adapter_size = 64  # see - arXiv:1902.00751
    
    # create the bert layer
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
        bert_params = map_stock_config_to_params(bc)
        bert_params.adapter_size = adapter_size
        bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    segment_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="segment_ids")    
    output         = bert([input_ids,segment_ids])
    
    print("bert shape", output.shape) #(None, 128, 768)
    #cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output) # Edited: we want all sequence output
    #cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dropout(0.5)(output)
    logits = keras.layers.Dense(units=2, activation=softMaxAxis1, use_bias =True)(logits) # output: (128,2)
    print(logits.shape)
    logits = tf.transpose(logits,perm=[0,2,1]) # output(y): (2,128)
    print(logits.shape)
    model = keras.Model(inputs=[input_ids, segment_ids], outputs=logits)
    model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
    

    # load the pre-trained model weights
    load_stock_weights(bert, bert_ckpt_file)

    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(bert)

    model.compile(optimizer=keras.optimizers.Adam(),
                  loss = compute_loss)
                #loss=keras.losses.CategoricalCrossentropy(from_logits=True),
                  #metrics=[compute_loss(name="loss")])
                #metrics=[keras.metrics.CategoricalAccuracy(name="acc")])

    model.summary()

    return model



def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):

    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
        else:
            res = max_learn_rate*math.exp(math.log(end_learn_rate/max_learn_rate)*(epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
        return float(res)
    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

    return learning_rate_scheduler

In [12]:
# first time training
adapter_size = 64 #None # use None to fine-tune all of BERT
model = create_short_answer_model(data.max_seq_len, adapter_size=adapter_size)

bert shape (None, 128, 768)
(None, 128, 2)
(None, 2, 128)
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-down/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/kernel] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-down/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-down/bias] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-up/kernel:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/kernel] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/attention/output/adapter-up/bias:0], i.e.:[bert/encoder/layer_0/attention/output/adapter-up/bias] in:[gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt]
loader: No value for:[bert/encoder/layer_0/out

In [10]:
# continue training
model = create_short_answer_model(data.max_seq_len, adapter_size=None)

# Loads the weights
checkpoint_path = "gs://question_answering_bkt/checkpoint/20191212-19361576179403.ckpt"

model.load_weights(checkpoint_path)

bert shape (None, 128, 768)
(None, 128, 2)
(None, 2, 128)
Done loading 197 BERT weights from: gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7f69011947b8> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f690258ac50>

In [None]:
%%time

#timestamp = datetime.now().strftime("%Y%m%d-%H%M%s")
timestamp="20191212-19361576179403"
print("Timestamp: ", timestamp)
log_dir = path+"/log/" + timestamp
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# Edited: add model checkpoints
checkpoint_path = path+"/checkpoint/" + timestamp +".ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)


total_epoch_count = 8
# model.fit(x=(data.train_x, data.train_x_token_types), y=data.train_y,
model.fit(x=(data.train_x, data.train_segment_ids), y=data.train_y,
          validation_split=0.1,
          batch_size=16, # was 48
          shuffle=True,
          epochs=total_epoch_count,
          callbacks=[create_learning_rate_scheduler(max_learn_rate=1e-5,
                                                    end_learn_rate=1e-7,
                                                    warmup_epoch_count=20,
                                                    total_epoch_count=total_epoch_count),
                     keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
                     tensorboard_callback,
                     cp_callback])

#model.save_weights(path+'/log/longdf_10k.h5', overwrite=True)

 


Timestamp:  20191212-19361576179403
Train on 18468 samples, validate on 2052 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 5.000000000000001e-07.
Epoch 1/8
Epoch 00001: saving model to gs://question_answering_bkt/checkpoint/20191212-19361576179403.ckpt

Epoch 00002: LearningRateScheduler reducing learning rate to 1.0000000000000002e-06.
Epoch 2/8
Epoch 00002: saving model to gs://question_answering_bkt/checkpoint/20191212-19361576179403.ckpt

Epoch 00003: LearningRateScheduler reducing learning rate to 1.5000000000000002e-06.
Epoch 3/8

In [10]:
# Evaluation

model = create_short_answer_model(data.max_seq_len, adapter_size=None)
# Loads the weights
checkpoint_path = "gs://question_answering_bkt/checkpoint/20191212-19361576179403.ckpt"

model.load_weights(checkpoint_path)

bert shape (None, 128, 768)
(None, 128, 2)
(None, 2, 128)
Done loading 197 BERT weights from: gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fc8d4c252b0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc8d608a940>

In [17]:
_, train_acc = model.evaluate((data.train_x,data.train_segment_ids), data.train_y,verbose=2)
print("train acc", train_acc) # loss: 0.5023 - acc: 0.7431

_, test_acc = model.evaluate((data.test_x,data.test_segment_ids), data.test_y, verbose=2)
print(" test acc", test_acc) # loss: 0.5088 - acc: 0.7482


In [11]:
# Inspect
test_pred = model.predict((data.test_x[1:200],data.test_segment_ids[1:200]))#.argmax(axis=-1)
test_pred.shape

(199, 2, 128)

In [27]:
os.listdir(path+"/data")

FileNotFoundError: [Errno 2] No such file or directory: 'gs://question_answering_bkt/data'

02_BERT-Long.ipynb  02_BERT-Short.ipynb  02_BERT-YesNo.ipynb  tutorials


In [34]:
import pickle
with open("test_short_df_tokens.pkl",'wb') as f:
    pickle.dump(data.test_tokens,f)

In [12]:
test.iloc[1,1]

'when the letters of a word stand for something'

In [13]:
test.iloc[1,0]

'<P> Whereas an abbreviation may be any type of shortened form , such as words with the middle omitted ( for example , Rd for road or Dr for Doctor ) , an acronym is a word formed from the first letter or first few letters of each word in a phrase ( such as sonar , created from so und na vigation and ranging ) . Attestations for Akronym in German are known from 1921 , and for acronym in English from 1940 . </P>'

In [14]:
test.iloc[1,2]

'acronym'

In [15]:
print(test_pred[1,0,:].argmax())
print(test_pred[1,1,:].argmax())

35
34


In [21]:
data.test_tokens[1][34:3]

107

ls: cannot access 'path': No such file or directory


In [48]:
import pickle
filename = '/test_pred'
outfile = open(path+filename,'wb')
pickle.dump(data,outfile)
outfile.close()

FileNotFoundError: [Errno 2] No such file or directory: 'gs://question_answering_bkt/test_pred'

In [14]:
test_pred_df = pd.DataFrame(test_pred, columns=["pred"])
test_pred_df.head()

ValueError: Must pass 2-d input

In [35]:
test['pred'] = test_pred_df['pred'].values
test.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,paragraph,question,is_answer,example_id,pred
5567,"<P> In the 1950s , mapping of the Earth 's oce...",plates that are bounded in part by the mid-atl...,0,4411529973849255531,1


In [38]:
test.to_csv(path+"/test_pred.csv",index=False)

In [29]:
# true positive
print("TP: ", test[(test.pred==1) & (test.is_answer==1)].shape[0]/test[test.is_answer==1].shape[0])
print("FP: ", test[(test.pred==1) & (test.is_answer==0)].shape[0]/test[test.is_answer==0].shape[0])

TP:  0.6264705882352941
FP:  0.1889843355229914


In [30]:
# missed (false negative)
print("missed: ", test[(test.pred==0) & (test.is_answer==1)].shape[0]/test[test.is_answer==1].shape[0])

missed:  0.3735294117647059


In [94]:
# Read in test predict data
test_pred = pd.read_csv(path+"/test_pred.csv")

In [95]:
test_pred.iloc[126:130,]

Unnamed: 0,paragraph,question,is_answer,example_id,pred
126,<Tr> <Td> 2 . </Td> <Td> `` Ogre Hunters / Fai...,what song is at the end of shrek,0,6006233773350327013,0
127,"<Table> <Tr> <Th_colspan=""2""> `` ( I 'd Be ) A...",who sang i'd be a legend in my time,0,7811273331944324574,1
128,<Ul> <Li> David Essex ... Jim Maclaine </Li> <...,where was that'll be the day filmed,0,-2889700351156361235,0
129,<P> `` I Wanna Be Your Man '' is a Lennon -- M...,who wrote i want to be your man,1,8107395391359962444,0


In [45]:
train[train.example_id==8107395391359962444	]

Unnamed: 0,paragraph,question,is_answer,example_id
8683,<Tr> <Th> Genre </Th> <Td> <Ul> <Li> Rock and ...,who wrote i want to be your man,0,8107395391359962444


In [46]:
test_pred[test.example_id==8107395391359962444]

Unnamed: 0,paragraph,question,is_answer,example_id,pred
129,<P> `` I Wanna Be Your Man '' is a Lennon -- M...,who wrote i want to be your man,1,8107395391359962444,0
2534,"<Tr> <Td_colspan=""2""> <Table> <Tr> <Td> `` Com...",who wrote i want to be your man,0,8107395391359962444,0


In [48]:
test_pred.iloc[129,0]

"<P> `` I Wanna Be Your Man '' is a Lennon -- McCartney - penned song recorded and released as a single by the Rolling Stones , a"

In [44]:
test_pred.iloc[1072,0]

"<P> `` ( I 'd Be ) A Legend in My Time '' is a song written and recorded by Don Gibson in 1960 . It appeared as the B - side of his hit `` Far Far Away '' , from the album Sweet Dreams . Gibson re-recorded the song on the 1972 album Country Green . </P>"

In [43]:
test_pred.iloc[127,1]

"who sang i'd be a legend in my time"

In [12]:
test_pred.iloc[0,1]

'plates that are bounded in part by the mid-atlantic ridge'

In [53]:
# Predict on new unseen data
unseen = pd.read_csv(path+"/data/long_df_test.csv")
tokenizer = FullTokenizer(vocab_file= os.path.join(bert_ckpt_dir, "vocab.txt"))
unseen_data = LongDfData(tokenizer, 
                       test, unseen,
                       max_seq_len=128)


100%|██████████| 3.00k/3.00k [00:11<00:00, 265it/s]
100%|██████████| 7.60k/7.60k [00:27<00:00, 279it/s]


input max seq_len:  54838 , capped at:  128
input matrix dim:  (2999, 128)
input matrix dim:  (7596, 128)


In [57]:
model = create_model(unseen_data.max_seq_len, adapter_size=None)
# Loads the weights
checkpoint_path = "gs://question_answering_bkt/checkpoint/20191203-17281575394094.ckpt"

model.load_weights(checkpoint_path)

unseen_pred = model.predict((unseen_data.test_x,unseen_data.test_segment_ids)).argmax(axis=-1)

bert shape (None, 128, 768)
Done loading 197 BERT weights from: gs://question_answering_bkt/model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7f7bd76194e0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
________________________________

In [58]:
unseen['pred'] = unseen_pred
unseen.head(1)

Unnamed: 0,paragraph,question,is_answer,example_id,pred
0,"<Table> London Underground <Tr> <Td_colspan=""2...",which was the first tube station in london,1,8778323820284358127,0


In [59]:
# accuracy
print("Acc: ", unseen[unseen.pred==unseen.is_answer].shape[0]/unseen.shape[0])
# true positive
print("TP: ", unseen[(unseen.pred==1) & (unseen.is_answer==1)].shape[0]/unseen[unseen.is_answer==1].shape[0])
print("FP: ", unseen[(unseen.pred==1) & (unseen.is_answer==0)].shape[0]/unseen[unseen.is_answer==0].shape[0])
# missed (false negative)
print("missed: ", unseen[(unseen.pred==0) & (unseen.is_answer==1)].shape[0]/unseen[unseen.is_answer==1].shape[0])


Acc:  0.7392048446550816
TP:  0.7176656151419558
FP:  0.25
missed:  0.2823343848580442


In [64]:
unseen[(unseen.pred==0) & (unseen.is_answer==1)]

Unnamed: 0,paragraph,question,is_answer,example_id,pred
0,"<Table> London Underground <Tr> <Td_colspan=""2...",which was the first tube station in london,1,8778323820284358127,0
3,"<Li> Poppy Drayton as Elizabeth , the mermaid ...",who's going to be the little mermaid,1,-3655724318328977880,0
9,<Table> <Tr> <Th> Wrestler </Th> <Th> Victorie...,who won the most money in the bank,1,4925287453027636288,0
18,<P> The courts of the United States are closel...,the american judicial system is divided into t...,1,-8210862213696434902,0
33,<Table> <Tr> <Th> Year </Th> <Th> Title </Th> ...,who played dan pruitt on grey's anatomy,1,4365737497942094400,0
...,...,...,...,...,...
7563,<P> Plantations were an important aspect of th...,who provided most of the labor on southern pla...,1,7984379203023000937,0
7575,<P> In 2009 Humphrey appeared in the Canadian ...,who plays the new pastor on when calls the heart,1,-1955031650897404973,0
7584,<Table> <Tr> <Th> Episode Title </Th> <Th> Son...,what is the episode of phineas and ferb with s...,1,-3562346415240663031,0
7590,<P> A common misconception is that a person mu...,when can you call in a missing person,1,1837476516469930674,0


In [None]:
ind =  7593
print(unseen.iloc[ind,1])
unseen.iloc[ind,0]

In [None]:
# Prediction

pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))
pred_tokens    = map(tokenizer.tokenize, pred_sentences)
pred_tokens    = map(lambda tok: ["[CLS]"] + tok + ["[SEP]"], pred_tokens)
pred_token_ids = list(map(tokenizer.convert_tokens_to_ids, pred_tokens))

pred_token_ids = map(lambda tids: tids +[0]*(data.max_seq_len-len(tids)),pred_token_ids)
pred_token_ids = np.array(list(pred_token_ids))

print('pred_token_ids', pred_token_ids.shape)

res = model.predict(pred_token_ids).argmax(axis=-1)

for text, sentiment in zip(pred_sentences, res):
  print(" text:", text)
  print("  res:", ["negative","positive"][sentiment])


### Access pre-trained model in tensorflow-hub, by keras
Reference: https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22, 
 https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/
 
The BERT layer requires 3 input sequence:
<li>Token ids: for every token in the sentence. We restore it from the BERT vocab dictionary</li>
<li>Mask ids: for every token to mask out tokens used only for the sequence padding (so every sequence has the same length).</li>
<li>Segment ids: 0 for one-sentence sequence, 1 if there are two sentences in the sequence and it is the second one (see the original paper or the corresponding part of the BERT on GitHub for more details: convert_single_example in the run_classifier.py).</li>

In [None]:
# Reference: tensorflowhub: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1
max_seq_length = 512  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])