In [62]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm 
from tensorflow import keras

from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

#import bert
#from bert import run_classifier
#from bert import optimization
#from bert import tokenization

#from tensorflow.keras.models import Model
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

In [58]:
# Display the installed TensorFlow version so the environment is recorded with the notebook.
tf.__version__

'2.0.0'

In [22]:
# Define the project root directory.
# The default is the original author's local checkout. Set the QA_GOOGLE_DIR
# environment variable to run the notebook from another location without
# editing this cell.
#path = 'C:/Users/zhangmen/Downloads/QA_Google/data'
path = os.environ.get('QA_GOOGLE_DIR', '/Users/Mengying/Desktop/QA_Google')

In [23]:
# Read in the dataset produced by 01_Preprocess from <path>/data,
# then display its (rows, columns) shape as the cell output.
long_df = pd.read_csv(path+"/data/long_df_50obs.csv")
long_df.shape

(5663, 4)

In [14]:
#long_df[long_df.is_answer==1]

In [24]:
# Shuffle the rows with a fixed seed, then use the first 80% as the training
# split and the remaining rows as the test split.
from sklearn.utils import shuffle

long_df = shuffle(long_df, random_state=22221)
cutoff = int(long_df.shape[0] * 0.8)
train = long_df.iloc[:cutoff]
test = long_df.iloc[cutoff:]
print("training: {}, test: {}".format(len(train), len(test)))
# Sum of the is_answer labels that landed in the test split.
test["is_answer"].sum()

training: 5288, test: 375


6

In [23]:
# Summary statistics of the per-row string length of the first column of the training split.
train.iloc[:,0].str.len().describe() # char length of paragraph

count     5288.000000
mean       297.455938
std       1119.309588
min         19.000000
25%         64.000000
50%        118.000000
75%        272.000000
max      49142.000000
Name: paragraph, dtype: float64

In [36]:
# Scratch check: iterating over a 2-d NumPy array yields one row per iteration.
m = np.array([[1,1,1,0],[1,1,0,0]])
print(m.shape)
for i in m:
    print(i)

(2, 4)
[1 1 1 0]
[1 1 0 0]


## Use bert-for-tf2 to build the BERT model
Reference: https://github.com/kpe/bert-for-tf2

In [79]:
class LongDfData:
    '''
    Process long_df's train and test frames into fixed-length BERT inputs.

    Every row is encoded as ``[CLS] question [SEP] paragraph [SEP]``.
    After construction the instance exposes:

    * ``train_x`` / ``test_x``: 2-d arrays of token ids, zero-padded or
      truncated to ``max_seq_len`` columns.
    * ``train_segment_ids`` / ``test_segment_ids``: 2-d arrays of the same
      shape; 0 up to and including the first ``[SEP]`` and for padding,
      1 for everything after the first ``[SEP]``.
    * ``train_y`` / ``test_y``: 1-d arrays of integer labels.
    * ``train_tokens`` / ``test_tokens``: the untruncated token lists.
    * ``max_seq_len``: the sequence length actually used.
    '''
    DATA_COLUMN = ['question','paragraph']
    LABEL_COLUMN = 'is_answer'

    def __init__(self, tokenizer: "FullTokenizer", train, test, max_seq_len=1024):
        '''
        Tokenize both frames and build the padded id / segment matrices.

        tokenizer:   object providing ``tokenize(text)`` and
                     ``convert_tokens_to_ids(tokens)``.
        train, test: DataFrames holding the DATA_COLUMN and LABEL_COLUMN columns.
        max_seq_len: upper bound on the sequence length; the length used is
                     the smaller of this bound and the longest tokenized row.
                     NOTE(review): the stock BERT checkpoints presumably only
                     support up to 512 positions - confirm before relying on
                     the 1024 default.
        '''
        self.tokenizer = tokenizer
        # Grows to the longest tokenized row while _prepare runs.
        self.max_seq_len = 0

        ((self.train_x, self.train_y, self.train_tokens),
         (self.test_x, self.test_y, self.test_tokens)) = map(self._prepare, [train, test])

        print("input max seq_len: ", self.max_seq_len, ", capped at: ", min(self.max_seq_len, max_seq_len))
        self.max_seq_len = min(self.max_seq_len, max_seq_len)

        (self.train_segment_ids,
         self.test_segment_ids) = map(self.get_segments, [self.train_tokens, self.test_tokens])

        (self.train_x,
         self.test_x) = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        '''
        Tokenize every row of a training/test frame.

        Returns ``(ids, labels, tokens)``: a list of per-row token-id lists
        (rows have different lengths, so this stays a plain list instead of
        a ragged NumPy array), a 1-d label array and the per-row token lists.
        Also updates ``self.max_seq_len`` with the longest row seen.
        '''
        x, y = [], []
        x_tokens = []

        question_col, paragraph_col = LongDfData.DATA_COLUMN
        rows = zip(df[question_col], df[paragraph_col], df[LongDfData.LABEL_COLUMN])
        for question, paragraph, label in tqdm(rows, total=df.shape[0], unit_scale=True):
            question_tokens = self.tokenizer.tokenize(question)
            paragraph_tokens = self.tokenizer.tokenize(paragraph)
            tokens = ["[CLS]"] + question_tokens + ["[SEP]"] + paragraph_tokens + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x_tokens.append(tokens)
            x.append(token_ids)
            y.append(int(label))
        return x, np.array(y), x_tokens

    def _pad(self, ids):
        '''
        Truncate / zero-pad every id sequence to ``self.max_seq_len``.

        Rows that already fit are only padded.  Rows that are too long are
        cut to ``max_seq_len - 1`` ids and closed with the ``[SEP]`` id, so
        the result lines up position by position with ``get_segments``.
        Returns a 2-d array with ``max_seq_len`` columns.
        '''
        sep_id = self.tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
        x = []
        # one row, one data
        for input_ids in ids:  # one concatenated ids of question + paragraph
            input_ids = list(input_ids)
            if len(input_ids) > self.max_seq_len:
                input_ids = input_ids[:self.max_seq_len - 1] + [sep_id]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(input_ids)

        x = np.array(x)
        print("input matrix dim: ", x.shape)
        return x

    def get_segments(self, tokens_data):
        """ segments: 0 for the first sequence, 1 for the second
            return segment id

            Each token list is cut to ``self.max_seq_len`` tokens; positions
            up to and including the first "[SEP]" get 0, later positions get 1,
            and padding positions get 0.  A message is printed for every row
            in which no "[SEP]" is seen inside the kept window.
        """
        s = []
        for tokens in tokens_data:
            segments = []
            current_segment_id = 0
            for token in tokens[:self.max_seq_len]:
                segments.append(current_segment_id)
                if token == "[SEP]":
                    current_segment_id = 1
            if current_segment_id != 1: print("No paragraph reached!")
            s.append(segments + [0] * (self.max_seq_len - len(segments)))
        return np.array(s)

In [26]:
# Specify pre-trained BERT model
# All files are expected under <path>/model/<bert_model_name>/.
bert_model_name="uncased_L-12_H-768_A-12"
bert_ckpt_dir= os.path.join(path,"model",bert_model_name)
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")
# Display the resolved checkpoint directory as the cell output.
bert_ckpt_dir

'/Users/Mengying/Desktop/QA_Google/model/uncased_L-12_H-768_A-12'

In [80]:
# Prepare data
# Build the tokenizer from the vocab.txt file in the checkpoint directory, then
# tokenize the train/test frames with sequences capped at 512 tokens.
tokenizer = FullTokenizer(vocab_file= os.path.join(bert_ckpt_dir, "vocab.txt"))
data = LongDfData(tokenizer, 
                       train, test,
                       max_seq_len=512)


100%|██████████| 5.29k/5.29k [00:08<00:00, 616it/s]
100%|██████████| 375/375 [00:00<00:00, 647it/s] 


input max seq_len:  22265 , capped at:  512
input matrix dim:  (5288, 512)
input matrix dim:  (375, 512)


In [None]:
def flatten_layers(root_layer):
    """Yield ``root_layer`` (if it is a Keras layer), then recursively every
    layer found through the ``_layers`` attribute, parents before children."""
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for child in root_layer._layers:
        yield from flatten_layers(child)


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.

    l_bert: the BERT layer to modify in place; it must expose an
            ``embeddings_layer`` attribute.

    Layers named "LayerNorm", "adapter-down" or "adapter-up" are marked
    trainable; every other layer without sub-layers is frozen.  The
    embeddings layer is frozen last, so that setting wins over any earlier
    per-layer setting.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
    # Loop-invariant, so it is applied once after the walk instead of once
    # per visited layer; the final state is the same.
    l_bert.embeddings_layer.trainable = False


In [81]:
# create model
def create_model(max_seq_len, adapter_size=64):
    """Creates a classification model.

    max_seq_len:  length of the token-id and segment-id input sequences.
    adapter_size: adapter size written into the BERT parameters
                  (see arXiv:1902.00751); when it is not None the BERT layer
                  is frozen via ``freeze_bert_layers``.  Pass None to
                  fine-tune all of BERT.

    Reads the notebook-level ``bert_config_file`` and ``bert_ckpt_file``.
    Returns a compiled ``keras.Model`` taking ``[input_ids, segment_ids]``
    and producing softmax probabilities over 2 classes.
    """
    # create the bert layer; the config file is only needed while parsing
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())
    bert_params = map_stock_config_to_params(bc)
    bert_params.adapter_size = adapter_size
    # Named l_bert so the imported ``bert`` module is not shadowed.
    l_bert = BertModelLayer.from_params(bert_params, name="bert")

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    segment_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="segment_ids")
    output = l_bert([input_ids, segment_ids])

    print("bert shape", output.shape)
    # Classify from the first position of the sequence output (the [CLS] slot).
    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    hidden = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    hidden = keras.layers.Dropout(0.5)(hidden)
    probs = keras.layers.Dense(units=2, activation="softmax")(hidden)

    model = keras.Model(inputs=[input_ids, segment_ids], outputs=probs)
    model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])

    # load the pre-trained model weights
    load_stock_weights(l_bert, bert_ckpt_file)

    # freeze weights if adapter-BERT is used
    if adapter_size is not None:
        freeze_bert_layers(l_bert)

    # The last layer already applies softmax, so the loss must treat the
    # model output as probabilities (from_logits=False), not raw logits.
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    model.summary()

    return model

In [82]:
# Build the classifier for the sequence length chosen by the data preparation step.
adapter_size = None # use None to fine-tune all of BERT
model = create_model(data.max_seq_len, adapter_size=adapter_size)

bert shape (None, 512, 768)
Done loading 197 BERT weights from: /Users/Mengying/Desktop/QA_Google/model/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x1a68e3f588> (prefix:bert_8). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________

In [86]:
def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):
    """Build a Keras LearningRateScheduler callback with warm-up and decay.

    max_learn_rate:     learning rate reached at the end of the warm-up.
    end_learn_rate:     learning rate the exponential decay heads towards.
    warmup_epoch_count: number of initial epochs with a linear ramp-up.
    total_epoch_count:  total number of epochs the schedule is laid out for.

    Returns a ``tf.keras.callbacks.LearningRateScheduler`` (verbose=1).
    """
    # Imported here because the notebook's import cell does not import math.
    import math

    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            # Linear warm-up: epoch 0 starts at max_learn_rate / warmup_epoch_count.
            res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
        else:
            # Exponential decay from max_learn_rate towards end_learn_rate.
            res = max_learn_rate*math.exp(math.log(end_learn_rate/max_learn_rate)*(epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
        return float(res)
    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

    return learning_rate_scheduler

In [None]:
%%time

timestamp = datetime.now().strftime("%Y%m%d-%H%M%s")
log_dir = path+"/log/" + timestamp
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

# Edited: add model checkpoints
checkpoint_path = path+"/checkpoint/" + timestamp +".ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)


total_epoch_count = 20
# model.fit(x=(data.train_x, data.train_x_token_types), y=data.train_y,
model.fit(x=(data.train_x, data.train_segment_ids), y=data.train_y,
          validation_split=0.1,
          batch_size=48,
          shuffle=True,
          epochs=total_epoch_count,
          callbacks=[create_learning_rate_scheduler(max_learn_rate=1e-5,
                                                    end_learn_rate=1e-7,
                                                    warmup_epoch_count=20,
                                                    total_epoch_count=total_epoch_count),
                     keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
                     tensorboard_callback,
                     cp_callback])

model.save_weights(path+'/log/longdf_10k.h5', overwrite=True)

 
# Loads the weights
#model.load_weights(checkpoint_path)

Train on 4759 samples, validate on 529 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 5.000000000000001e-07.
Epoch 1/20


In [83]:
# The model was built with two inputs (token ids and segment ids), so both
# arrays are passed for each split; passing only the token ids fails.
_, train_acc = model.evaluate([data.train_x, data.train_segment_ids], data.train_y)
_, test_acc = model.evaluate([data.test_x, data.test_segment_ids], data.test_y)

print("train acc", train_acc)
print(" test acc", test_acc)

'/Users/Mengying/Desktop/QA_Google/code'

In [None]:
# Save&Load weights: https://www.tensorflow.org/tutorials/keras/save_and_load
# maximize colab: https://medium.com/@oribarel/getting-the-most-out-of-your-google-colab-2b0585f82403

### Access pre-trained model in tensorflow-hub, by keras
Reference: https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22, 
 https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/
 
The BERT layer requires 3 input sequences:
<li>Token ids: one id for every token in the sentence, looked up in the BERT vocab dictionary.</li>
<li>Mask ids: one flag for every token, used to mask out the tokens that exist only as sequence padding (so that every sequence has the same length).</li>
<li>Segment ids: 0 for a one-sentence sequence, and 1 for the tokens of the second sentence when the sequence contains two sentences (see the original paper or the corresponding part of BERT on GitHub for more details: convert_single_example in run_classifier.py).</li>

In [None]:
# Reference: tensorflowhub: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1
# Declare the three int32 inputs the hub layer is called with below
# (word ids, padding mask, segment ids), each of length max_seq_length.
max_seq_length = 512  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
# Load the BERT module from TF-Hub as a Keras layer; trainable=True is passed
# so its weights can be fine-tuned.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
# The layer returns two outputs, unpacked here as the pooled and the per-token sequence output.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

NameError: name 'path' is not defined