In [1]:
import tensorflow as tf
import numpy as np
from tokenizers import BertWordPieceTokenizer
import collections
import pickle

In [2]:
from models import extractor as ext
from utils import extractionUtils as extUtil

In [4]:
import nltk
from nltk.corpus import stopwords
# NOTE(review): this rebinds the name `stopwords` from the imported NLTK corpus
# object to whatever `.words('english')` returns, so the corpus object is no
# longer reachable under this name for the rest of the notebook. None of the
# cells visible here read `stopwords` afterwards.
stopwords = stopwords.words('english')

In [23]:
def predict_kps(doc,model,mode='inspec',threshold=0.5):
    """Extract candidate keyphrases from ``doc`` and score them with ``model``.

    The function reads three module-level names that must be defined before it
    is called: ``tokenizer`` (used to encode/decode WordPiece ids), ``max_len``
    (length the token-id sequence is truncated/padded to) and ``max_kp``
    (number of candidate slots the position tensor is padded to).

    Parameters
    ----------
    doc : str
        Raw document text.
    model
        Object exposing ``predict([ids, mask, positions])``. Its output is
        indexed as ``y_pred[0, candidate, 0]``, i.e. one score per candidate
        slot in column 0.
    mode : str, optional
        Kept for backward compatibility; it has no effect on the result. It
        previously selected a candidate limit (154 for ``'inspec'``, 258
        otherwise) that was computed but never used -- the padding width has
        always been the module-level ``max_kp``.
    threshold : float, optional
        Minimum score (inclusive) a candidate needs to be returned.
        Defaults to 0.5, the value that used to be hard-coded.

    Returns
    -------
    collections.defaultdict(float)
        Maps each decoded phrase to the highest score among the candidates that
        decode to that phrase. Entries are inserted in descending score order.
        Empty when the document yields no usable candidate.

    Notes
    -----
    * Candidates beyond the first ``max_kp`` are dropped; previously such a
      document made ``tf.pad`` fail on a negative padding size.
    * Zero-padded candidate slots are never reported; previously a padded slot
      scoring above the threshold was decoded like a real candidate.
    """
    sentence_re = r'''(?x)      # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+(?:-\w+)*        # words with optional internal hyphens
          | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.\.\.              # ellipsis
          | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
        '''
    txt = extUtil.remove_punctuation(doc)
    text_toc = nltk.regexp_tokenize(txt, sentence_re)
    encoding = tokenizer.encode(txt)
    # Surface text of every WordPiece token, recovered from character offsets.
    bert_tocs = [txt[tok_start:tok_end] for tok_start, tok_end in encoding.offsets]

    # Fit the id sequence to exactly max_len: truncate if too long, otherwise
    # right-pad with 0. The attention mask is 1 on real tokens and 0 on padding.
    token_ids = list(encoding.ids)[:max_len]
    n_pad = max_len - len(token_ids)
    attention_mask = np.array(([1] * len(token_ids)) + ([0] * n_pad))
    rep = np.array(token_ids + ([0] * n_pad))

    kps = extUtil.get_kp(text_toc)
    # Only the second return value is needed. Each element is indexed as
    # key[0] / key[1] -- presumably (start, end) token positions; verify
    # against extUtil.find_positions.
    _, pos_set = extUtil.find_positions(text_toc, bert_tocs, kps)

    start = []
    end = []
    for key in pos_set:
        # Keep only candidates that end inside the (possibly truncated) sequence.
        if key[1] < max_len:
            # Positions are shifted by one before use, as in the original code;
            # the reason for the shift is not visible in this notebook.
            start.append(key[0] - 1)
            end.append(key[1] - 1)

    # The position tensor has exactly max_kp slots; surplus candidates are
    # dropped so the padding amount below can never become negative.
    start = start[:max_kp]
    end = end[:max_kp]
    n_candidates = len(start)

    keyphrases = collections.defaultdict(float)
    if n_candidates == 0:
        # Nothing to score. This also avoids building a float-typed tensor from
        # two empty lists and feeding it to the model as positions.
        return keyphrases

    x_pos = tf.pad([start, end], [[0, 0], [0, max_kp - n_candidates]])
    x_pred = tf.expand_dims(rep, 0)
    x_pred_mask = tf.expand_dims(attention_mask, 0)
    x_pred_pos = tf.expand_dims(x_pos, 0)

    y_pred = model.predict([x_pred, x_pred_mask, x_pred_pos])

    # Look only at the slots that hold real candidates; the rest is padding.
    cand_scores = y_pred[0, :n_candidates, 0]
    ranked = [i for i in np.argsort(-cand_scores) if cand_scores[i] >= threshold]

    for i in ranked:
        # Decode the inclusive [start, end] token span back to text.
        span_ids = rep[start[i]:end[i] + 1].tolist()
        phrase = tokenizer.decode(span_ids)
        keyphrases[phrase] = max(keyphrases[phrase], cand_scores[i])

    return keyphrases

In [24]:
# Path to the WordPiece vocabulary file handed to the tokenizer.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
vocab = "D:/Word embedding/bert/assets/vocab.txt"
# Module-level tokenizer; predict_kps reads this global to encode the document
# and decode phrases. lowercase=True presumably matches the uncased BERT
# backbone used by the model -- confirm.
tokenizer = BertWordPieceTokenizer(vocab, lowercase=True)


In [25]:
# Sizes shared by the model constructor and predict_kps:
#   max_len - length the token-id sequence is truncated/padded to
#   max_kp  - number of candidate-keyphrase slots per document
# max_kp depends on the dataset the model was trained on (presumably 154 is
# the Inspec value -- confirm before loading a different checkpoint).
max_len = 512
max_kp = 154

In [26]:
# Build the model with the same configuration that was used for training.
# NOTE(review): the meaning of the trailing arguments 36 and 6 is defined by
# ext.get_model and is not visible in this notebook -- confirm them against
# the training setup before changing checkpoints.
tp_model = ext.get_model(max_len,max_kp,36,6)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [27]:
# Restore trained weights into tp_model from a local checkpoint. The name
# suggests an Inspec-trained model -- TODO confirm it matches max_kp above.
tp_model.load_weights('./checkpoints/inspec_final')


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x15bbf793108>

In [28]:
text = '''AI in healthcare is often used for classification, whether to automate initial evaluation of a CT scan or EKG or to identify high-risk patients for population health. The breadth of applications is rapidly increasing. As an example, AI is being applied to the high-cost problem of dosage issues—where findings suggested that AI could save $16 billion. In 2016, a groundbreaking study in California found that a mathematical formula developed with the help of AI correctly determined the accurate dose of immunosuppressant drugs to give to organ patients. Artificial intelligence is assisting doctors. According to Bloomberg Technology, Microsoft has developed AI to help doctors find the right treatments for cancer. There is a great amount of research and drugs developed relating to cancer. In detail, there are more than 800 medicines and vaccines to treat cancer. This negatively affects the doctors, because there are too many options to choose from, making it more difficult to choose the right drugs for the patients. Microsoft is working on a project to develop a machine called "Hanover". Its goal is to memorize all the papers necessary to cancer and help predict which combinations of drugs will be most effective for each patient. 
One project that is being worked on at the moment is fighting myeloid leukemia, a fatal cancer where the treatment has not improved in decades. Another study was reported to have found that artificial intelligence was as good as trained doctors in identifying skin cancers. Another study is using artificial intelligence to try to monitor multiple high-risk patients, and this is done by asking each patient numerous questions based on data acquired from live doctor to patient interactions. One study was done with transfer learning, the machine performed a diagnosis similarly to a well-trained ophthalmologist, and could generate a decision within 30 seconds on whether or not the patient should be referred for treatment, with more than 95% accuracy.
According to CNN, a recent study by surgeons at the Children's National Medical Center in Washington successfully demonstrated surgery with an autonomous robot. The team supervised the robot while it performed soft-tissue surgery, stitching together a pig's bowel during open surgery, and doing so better than a human surgeon, the team claimed. IBM has created its own artificial intelligence computer, the IBM Watson, which has beaten human intelligence (at some levels). Watson has struggled to achieve success and adoption in healthcare.
'''

In [29]:
predict_kps(text,tp_model)

defaultdict(float,
            {'moment': 0.7169198,
             'pigs bowel': 0.6988427,
             'robot': 0.6974003,
             'patient': 0.6962047,
             'using': 0.69249254,
             'accuracy': 0.6915171,
             'multiple highrisk patients': 0.6894527,
             'team': 0.6885516,
             'transfer': 0.6859856,
             'help': 0.68410856,
             'mathematical formula': 0.6839912,
             'study': 0.68347615,
             'dosage issues': 0.68192583,
             'research': 0.6791351,
             'goal': 0.6772472,
             'artificial intelligence computer': 0.676593,
             'vaccines': 0.674519,
             'learning': 0.6727299,
             'seconds': 0.6722069,
             'surgery': 0.6712767,
             'papers': 0.6707486,
             'drugs': 0.6706586,
             'data': 0.6692536,
             'softtissue surgery': 0.66810095,
             'immunosuppressant drugs': 0.6666205,
             'project': 0.6