In [1]:
import tensorflow as tf
import numpy as np

In [2]:
import matplotlib.pyplot as plt

In [3]:
import json
import string
from string import punctuation

In [4]:
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

In [5]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

#import evaluate
import sys
import os

def leaves(tree):
    """Collect the leaves of every NP (noun-phrase) subtree of a chunk tree.

    Returns a list with one entry per subtree labelled 'NP'; each entry is
    whatever that subtree's ``leaves()`` method returns.
    """
    def is_noun_phrase(node):
        return node.label() == 'NP'

    return [np_subtree.leaves()
            for np_subtree in tree.subtrees(filter=is_noun_phrase)]

def normalise(word):
    """Return ``word`` converted to lowercase.

    Lowercasing is the only normalisation applied; no stemming or
    lemmatisation is performed.
    """
    return word.lower()

def acceptable_word(word):
    """Tell whether a word may be kept in a candidate keyphrase.

    A word is accepted when it is 2 to 40 characters long (inclusive) and its
    lowercase form is not in the module-level ``stopwords`` collection.
    """
    if len(word) < 2 or len(word) > 40:
        return False
    return word.lower() not in stopwords


def get_terms(tree):
    """Turn the NP chunks of a chunk tree into candidate keyphrases.

    For every NP chunk returned by ``leaves``, the (word, tag) pairs are
    filtered with ``acceptable_word`` and the surviving words are passed
    through ``normalise``. Chunks with no surviving word are dropped.

    Returns a list of keyphrases, each a list of normalised words.
    """
    candidates = []
    for np_leaves in leaves(tree):
        kept_words = []
        for token, _tag in np_leaves:
            if acceptable_word(token):
                kept_words.append(normalise(token))
        if kept_words:
            candidates.append(kept_words)
    return candidates


In [6]:
def get_kp(text):
    """Extract candidate keyphrases (noun-phrase chunks) from tokenised text.

    Parameters
    ----------
    text : list of str
        Word tokens, handed unchanged to ``nltk.tag.pos_tag``. Despite the
        parameter name this must be a token list, not a raw string; the
        notebook passes the output of ``nltk.regexp_tokenize``.

    Returns
    -------
    list of list of str
        One entry per NP chunk that still has at least one word after
        filtering, as produced by ``get_terms``.
    """
    #Taken from Su Nam Kim Paper
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """

    # POS-tag the tokens, chunk them with the grammar above, then keep the
    # filtered/normalised words of every NP chunk.
    postoks = nltk.tag.pos_tag(text)
    chunker = nltk.RegexpParser(grammar)
    tree = chunker.parse(postoks)
    return get_terms(tree)

In [7]:
# Directory (relative to the working directory) whose files are read as the
# input documents; the trailing slash matters because paths are built by
# plain string concatenation.
data = 'Inspec/docsutf8/'

In [8]:
# Directory holding the reference keyphrase files, looked up later as
# keys + <document base name> + '.key'.
keys = 'Inspec/keys/'

In [9]:
# os.listdir returns entries in arbitrary order; sort them so that the
# files[:1600] selection and the later index-based train/test split pick the
# same documents on every run and every machine.
files = sorted(os.listdir(data))
key_files = sorted(os.listdir(keys))

In [10]:
def find_positions(document,bert_tocs, kps):
    '''Locate keyphrases in a tokenised document and return their token spans.

    Inputs:
        document : list of word strings. It is joined with single spaces and
            used only for a quick "does this phrase occur at all" substring test.
        bert_tocs : list of token strings for the same document; spans are
            expressed as indices into this list.
        kps : list of keyphrases, each a list of word strings,
            e.g. [['sun'], ['key', 'phrase']]. Keyphrases that join to an
            empty string are skipped.
    Outputs:
        If at least one keyphrase was located:
            ([starts, ends], spans) where starts/ends are parallel lists with
            one entry per located occurrence and spans is the set of
            (start, end) tuples.
            start is i + 1 for the token index i where the match begins.
            end is the scan index at which extending the match stopped: the
            index of the first token that did not extend the match, or the
            last token index when the scan reached the end of bert_tocs.
        Otherwise:
            [None, None]
    '''
    tot_doc_char = ' '.join(document)

    positions_for_all = []
    position_start,position_end =[],[]
    all_present_kps = []
    last_idx = len(bert_tocs) - 1
    for kp in kps:
        ans_string = ' '.join(kp)

        # An empty phrase (e.g. from a blank line in a key file) can never be
        # located; previously it reached the span check with search_idx unset.
        if not ans_string:
            continue
        # Cheap pre-filter: skip phrases that do not occur in the text at all.
        if ans_string not in tot_doc_char:
            continue

        positions_for_each = []
        # Try every token index as the start of the phrase.
        for i in range(0, len(bert_tocs) - len(kp) + 1):
            search_idx = i
            search_str = ''
            if ans_string.startswith(bert_tocs[i]):
                found = True
                search_str += bert_tocs[i]
                # Greedily append following tokens, glued directly (word
                # pieces) or with a space (new word), while the accumulated
                # text is still contained in the phrase.
                while found and search_idx < last_idx:
                    search_idx += 1
                    if search_str+bert_tocs[search_idx] in ans_string:
                        search_str += bert_tocs[search_idx]
                    elif search_str+' '+bert_tocs[search_idx] in ans_string:
                        search_str += ' '+bert_tocs[search_idx]
                    else:
                        found = False

            # Only an exact reconstruction of the phrase counts as a hit; the
            # i < search_idx test rejects starts where the scan never advanced.
            if (search_str==ans_string) and (i<search_idx):
                positions_for_each.append((i+1, search_idx))
                position_start.append(i+1)
                position_end.append(search_idx)

        if len(positions_for_each) > 0 :
            positions_for_all.extend(positions_for_each)
            all_present_kps.append(kp)

    assert len(positions_for_all) >= len(all_present_kps)

    if len(all_present_kps) == 0:
        return [None,None]
    return [position_start,position_end],set(positions_for_all)

In [11]:
def remove_punctuation(text):
    """Return ``text`` with its punctuation removed.

    Every element of ``text`` (every character when ``text`` is a string)
    that is found in ``string.punctuation`` is dropped; the remaining
    elements are joined back together without a separator.
    """
    return ''.join(filter(lambda piece: piece not in string.punctuation, text))

def remove_stopwords(text, stop_words=None):
    """Return the words of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Words to filter.
    stop_words : collection of str, optional
        Words to remove. Defaults to the module-level ``stopwords``
        collection.

    Returns
    -------
    list of str
        The words of ``text`` not found in ``stop_words``, in original order.
    """
    if stop_words is None:
        # The module-level collection is called `stopwords`; there is no
        # `stopword` name in this notebook.
        stop_words = stopwords
    return [word for word in text if word not in stop_words]

In [12]:
def calculate_f1(y_labels,y_preds,depth,levels):
    """Print and return mean F1 / precision / recall at several cut-offs.

    Parameters
    ----------
    y_labels : sequence of 1-D arrays
        One row per document; an entry equal to 1 marks a correct candidate.
    y_preds : sequence of 1-D arrays
        One row per document with a score per candidate. Candidates scoring
        at least 0.5 count as predicted and are ranked by decreasing score.
    depth : int
        Number of top-ranked predictions examined per document.
    levels : sequence of int
        Cut-offs k at which the metrics are reported; each must lie between
        1 and ``depth``.

    Returns
    -------
    tuple of np.ndarray
        ``(f1, precision, recall)``, each averaged over documents, with one
        value per entry of ``levels``. The same values are printed.
    """
    precision = []
    recall = []
    f1 = []

    for doc_idx, y_label in enumerate(y_labels):
        gold = set(np.flatnonzero(np.asarray(y_label) == 1).tolist())
        scores = np.asarray(y_preds[doc_idx])

        # Rank candidates by decreasing score and keep those at or above the
        # 0.5 threshold; indices and scores must share the same ordering.
        ranked = np.argsort(-scores, kind='stable')
        preds = ranked[scores[ranked] >= 0.5].tolist()

        tp = 0
        p = []
        r = []
        for i in range(depth):
            if i < len(preds) and preds[i] in gold:
                tp += 1
            # Precision@(i+1): divide by the number of predictions actually
            # considered, with a floor of 1 when nothing was predicted.
            p.append(tp / max(min(i + 1, len(preds)), 1))
            r.append(tp / max(len(gold), 1))

        level_f1 = []
        level_p = []
        level_r = []
        for level in levels:
            p_at_k = p[level - 1]
            r_at_k = r[level - 1]
            level_p.append(p_at_k)
            level_r.append(r_at_k)
            if p_at_k + r_at_k > 0:
                level_f1.append(2 * p_at_k * r_at_k / (p_at_k + r_at_k))
            else:
                level_f1.append(0)
        precision.append(level_p)
        recall.append(level_r)
        f1.append(level_f1)

    f1_mean = np.mean(np.array(f1), axis=0)
    precision_mean = np.mean(np.array(precision), axis=0)
    recall_mean = np.mean(np.array(recall), axis=0)

    print('F1',f1_mean,precision_mean,recall_mean)
    return f1_mean, precision_mean, recall_mean
    
                
            

In [13]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists. Presumably the vocabulary matches "bert-base-uncased" —
# confirm.
vocab = "D:/Word embedding/bert/assets/vocab.txt"
# Word-piece tokenizer built from that vocabulary file, with lowercasing on.
tokenizer = BertWordPieceTokenizer(vocab, lowercase=True)
# Pretrained BERT encoder, loaded by model name through transformers.
encoder = TFBertModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [14]:
# Tokenisation pattern handed to nltk.regexp_tokenize below.
sentence_re = r'''(?x)      # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''

max_kp = 0           # largest number of candidate spans seen in one document
min_len =1e100       # never updated in this cell; printed as-is at the end
all_reps = []        # per document: token ids padded/truncated to max_len
att_masks = []       # per document: 1 for real tokens, 0 for padding
key_positions = []   # per document: set of candidate (start, end) spans
ref_positions = []   # per document: set of reference (start, end) spans
vocab = "D:/Word embedding/bert/assets/vocab.txt"
tokenizer = BertWordPieceTokenizer(vocab, lowercase=True)
max_len = 512
for file in files[:1600]:
    # The documents live in a directory named docsutf8: decode them as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(data+file, 'r', encoding='utf-8') as in_file:
        text = in_file.read()

    txt = remove_punctuation(text)
    text_toc = nltk.regexp_tokenize(txt, sentence_re)
    bert_toc_idx = tokenizer.encode(txt)
    # Text covered by each word piece, recovered from the character offsets.
    bert_tocs = [txt[start:end] for start,end in bert_toc_idx.offsets]
    padding_length = max_len - len(bert_tocs)
    rep = bert_toc_idx.ids

    if padding_length<0:
        # Longer than max_len: truncate ids, every kept position is real.
        rep = np.array(rep[:max_len])
        attention_mask = np.array([1]*max_len)
    else:
        # Shorter: right-pad ids with 0 and mask the padding out.
        attention_mask = np.array(([1]*len(rep))+([0]*padding_length))
        rep = np.array(rep + ([0]*padding_length))

    # Candidate keyphrases and their token spans in this document.
    kps = get_kp(text_toc)
    pos,pos_set = find_positions(text_toc,bert_tocs,kps)
    if not pos_set:
        # No candidate could be located: skip without touching the key file.
        continue

    name = file.split('.')[0]
    # NOTE(review): the key file is still read with the platform default
    # encoding; confirm its encoding before forcing one.
    with open(keys+name+'.key', 'r') as in_file:
        # One reference keyphrase per line; blank lines carry no phrase.
        can = [line.split() for line in in_file if line.strip()]
    ref_pos,ref_set = find_positions(text_toc,bert_tocs,can)

    # Keep only documents with both candidate and reference spans.
    if ref_set:
        key_positions.append(pos_set)
        all_reps.append(rep)
        att_masks.append(attention_mask)
        max_kp = max(max_kp,len(pos_set))
        ref_positions.append(ref_set)

print(max_len,min_len)

512 1e+100


In [15]:
y_label = []          # per document: max_kp label pairs
final_positions = []  # per document: int32 tensor of shape (2, max_kp)
pos_mask = []         # per document: True for real spans, False for padding
final_kp_list = []    # per document: all candidate spans (before filtering)
for idx,kp in enumerate(key_positions):
    kp = list(kp)
    start = []
    end = []
    y_val = []
    for key in kp:
        # Keep only spans whose end falls below the max_len input length.
        if key[1]<max_len:
            # Spans are stored offset by one; shift both ends down by one.
            start.append(key[0]-1)
            end.append(key[1]-1)
            # [2,1] marks a span that is also a reference span, [1,2] one
            # that is not; [0,0] is reserved for padding below.
            y_val.append([2,1] if key in ref_positions[idx] else [1,2])

    n_pad = max_kp-len(start)
    y_val.extend([[0,0]]*n_pad)
    # Force int32 so a document with no surviving span yields an empty int
    # tensor rather than an empty float one, which could not be stacked with
    # the others.
    spans = tf.constant([start,end],dtype=tf.int32)
    final_positions.append(tf.pad(spans,[[0,0],[0,n_pad]]))
    pos_mask.append([True]*len(start)+[False]*n_pad)

    y_label.append(y_val)
    final_kp_list.append(kp)

In [16]:
# Spot check: number of label pairs stored for the document at index 164.
len(y_label[164])

146

In [17]:
# Stack the per-document arrays into batch tensors (documents on axis 0).
x_train = tf.stack(all_reps)
x_mask = tf.stack(att_masks)
x_pos = tf.stack(final_positions)
# Labels are stacked the same way and converted to floating point.
y_train = tf.cast(tf.stack(y_label), dtype=tf.float32)

In [18]:
# Shape of the stacked label tensor.
y_train.shape

TensorShape([1588, 146, 2])

In [19]:
# Shapes of the first two input rows and of the whole label tensor.
print(x_train[0].shape,x_train[1].shape,y_train.shape)

(512,) (512,) (1588, 146, 2)


In [20]:
# Token ids and attention mask, both of length max_len.
input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)

# Freeze BERT by name here, so the freeze does not depend on where the
# encoder ends up in the model's layer list.
encoder.trainable = False
embedding = encoder(input_ids, attention_mask=attention_mask)[0]

# merge_mode=None makes the wrapper return [forward, backward] sequence
# outputs separately instead of merging them.
bilstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(24,
                                                             dropout = 0.35,
                                                             return_sequences=True),
                                                             merge_mode=None)(embedding)

# Candidate spans per sample: row 0 holds start indices, row 1 end indices.
pos_mask = tf.keras.layers.Input(shape=(2,max_kp),dtype='int32')
# Slice along the span axis and keep the batch axis, so every sample uses
# its own span indices (indexing with [0] would select the first sample).
mask_start = pos_mask[:, 0]
mask_end = pos_mask[:, 1]

# batch_dims=1 pairs each sample's indices with that sample's sequence.
start_rep_fr = tf.gather(bilstm1[0],mask_start,axis=1,batch_dims=1)
start_rep_bk = tf.gather(bilstm1[1],mask_start,axis=1,batch_dims=1)
end_rep_fr = tf.gather(bilstm1[0],mask_end,axis=1,batch_dims=1)
# The backward end representation comes from the backward output.
end_rep_bk = tf.gather(bilstm1[1],mask_end,axis=1,batch_dims=1)

# Difference and element-wise product of span boundaries, per direction.
span_fe_diff_fr = start_rep_fr-end_rep_fr
span_fe_prod_fr = tf.math.multiply(start_rep_fr,end_rep_fr)
span_fe_diff_bk = start_rep_bk-end_rep_bk
span_fe_prod_bk = tf.math.multiply(start_rep_bk,end_rep_bk)

# Eight 24-wide pieces concatenated on the feature axis.
span_fe = tf.keras.layers.concatenate([start_rep_fr,
                     end_rep_fr,
                     start_rep_bk,
                     end_rep_bk,
                     span_fe_diff_fr,
                     span_fe_diff_bk,
                     span_fe_prod_fr,
                     span_fe_prod_bk
                    ],2)
# Second BiLSTM runs over the sequence of span features; the two directions
# are averaged.
bilstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4,return_sequences=True,dropout = 0.15),
                                         merge_mode='ave')(span_fe)
# Two-way softmax for every span.
output = tf.keras.layers.Dense(2,activation='softmax')(bilstm2)


In [21]:
# Three inputs (token ids, attention mask, span positions) -> per-span softmax.
kpe_model = tf.keras.models.Model(inputs=[input_ids,attention_mask,pos_mask], outputs=output)
# Keep the pretrained BERT weights fixed. Freezing the encoder object itself
# avoids relying on its position in kpe_model.layers, which changes whenever
# the graph is edited.
encoder.trainable = False

In [22]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
kpe_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 2, 146)]     0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 512, 768), ( 109482240   input_1[0][0]                    
                                                                 input_2[0][0]         

In [23]:
# Run a garbage-collection pass; the cell displays the integer that
# gc.collect() returns.
import gc
gc.collect()

20

In [24]:
# Keep one loss value per span: the padding mask has to be applied before
# averaging, which is impossible once the loss is reduced to a scalar.
loss_object = tf.keras.losses.CategoricalCrossentropy(reduction='none')

def loss_function(y_true, y_pred):
    """Categorical cross-entropy averaged over the non-padding spans.

    Parameters
    ----------
    y_true : tensor
        Labels with a trailing axis of size 2, using the notebook's
        encoding: 0 marks padding, and real entries are 1 or 2, mapped to
        0 or 1 by subtracting one.
    y_pred : tensor
        Predicted probabilities with the same shape as ``y_true``.

    Returns
    -------
    Scalar tensor: the sum of the per-span losses of the real spans divided
    by the number of real spans (0 when the batch contains none).
    """
    # A span is real when any of its label entries is non-zero.
    valid = tf.reduce_any(tf.math.not_equal(y_true, 0), axis=-1)
    targets = tf.clip_by_value(y_true-1, 0, 1)
    per_span = loss_object(targets, y_pred)
    valid = tf.cast(valid, dtype=per_span.dtype)
    return tf.math.divide_no_nan(tf.reduce_sum(per_span*valid),
                                 tf.reduce_sum(valid))

def ac_metrics(y_true,y_pred):
    """Masked accuracy of the thresholded predictions.

    Label entries equal to 0 are treated as padding and ignored. The
    remaining labels are shifted down by one and clipped to [0, 1];
    predictions are thresholded at 0.5. Agreement is summed over axis 1 and
    divided by the number of non-padding entries along that axis, then
    averaged over the remaining axes.

    A row with no non-padding entry contributes 0 rather than NaN.
    """
    valid = tf.cast(tf.math.not_equal(y_true, 0), tf.float32)
    targets = tf.cast(tf.clip_by_value(y_true-1, 0, 1), tf.int32)
    decisions = tf.cast(tf.where(y_pred>=0.5,1,0), tf.int32)
    # 1 where label and decision agree, 0 where they differ.
    agreement = tf.cast(1-tf.abs(targets-decisions), tf.float32)*valid
    # divide_no_nan returns 0 where the denominator is 0 (all-padding rows).
    per_row = tf.math.divide_no_nan(tf.reduce_sum(agreement,axis=1),
                                    tf.reduce_sum(valid,axis=1))
    return tf.reduce_mean(per_row)

In [25]:
# Adam optimiser with a learning rate of 3e-6.
opt = tf.keras.optimizers.Adam(learning_rate=0.000003)
# Compile with the custom masked loss and accuracy defined in this notebook.
kpe_model.compile(optimizer=opt,
              loss=loss_function,
              metrics=[ac_metrics])

# Checkpoint callback: writes a weights file whenever val_ac_metrics improves
# (mode='max'); the file name embeds the epoch number and the metric value.
# NOTE(review): callbacks_list is built here, but the fit call in this
# notebook has its callbacks argument commented out.
filepath="weights-improvement-{epoch:02d}-{val_ac_metrics:.2f}.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_ac_metrics', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Train on the first 1300 documents for 6 epochs with batches of 32;
# validation_split=0.1 asks Keras to hold out a tenth of them for validation.
# The rows from 1300 onwards are used for prediction further down.
# NOTE(review): use_multiprocessing presumably has no effect for in-memory
# tensors — confirm against the installed Keras version.
history = kpe_model.fit([x_train[:1300],x_mask[:1300],x_pos[:1300]], y_train[:1300], 
                          batch_size=32,epochs=6,#callbacks=callbacks_list, 
                          use_multiprocessing=True,validation_split=0.1)


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6

In [None]:
# One figure per tracked quantity: training curve first, validation second.
for train_key, val_key, plot_title, y_axis_label in (
        ('ac_metrics', 'val_ac_metrics', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss')):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(plot_title)
    plt.ylabel(y_axis_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
# Predict on the rows from index 1300 onwards, i.e. those not passed to fit.
# Row j of `predictions` therefore corresponds to row 1300 + j of
# x_train / y_train.
print("Generate predictions for test")
predictions = kpe_model.predict([x_train[1300:],x_mask[1300:],x_pos[1300:]])
print("predictions shape:", predictions.shape)


In [None]:
# Score the held-out rows. Labels: 1 where the first label column equals 2,
# else 0. Scores: the first output column. Examine the top 20 predictions
# and report at cut-offs 5 and 10.
calculate_f1(np.where(y_train[1300:,:,0]-1==1,1,0),predictions[:,:,0],20,[5,10])

In [None]:
# Among the non-padding spans of y_train row 1101, list those whose first
# output column in predictions row 101 is at least 0.5.
# NOTE(review): predictions was computed from x_train[1300:], so its row 101
# corresponds to y_train row 1401, not 1101 — confirm which row is intended.
np.where(predictions[101,:,0][np.where((y_train[1101,:,0])>0)]>=0.5)

In [None]:
# First output column for every span of prediction row 40.
predictions[40,:,0]

In [None]:
# Span indices of y_train row 1101 whose first label column equals 2.
# NOTE(review): row 1101 lies inside the [:1300] slice passed to fit, not in
# the rows that were predicted on — confirm the intended row.
np.where((y_train[1101,:,0]-1)==1)