<a href="https://colab.research.google.com/github/aivscovid19/covid-19_research_collaboration/blob/master/notebooks/Extract_BioMedBERT_LG_Embedding_bioasq_8b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MRR for BERT and BERT + RESCORE

[Evaluation of Ranking for Medical Papers](https://github.com/aivscovid19/covid-19_research_collaboration/issues/9)

This notebook shows how to perform 3 Experiments (as of May 24 2020) on ranking using the dataset from Bioasq.

- Experiment 1: score: 0.9124373431039607
    This uses the vanilla elastic search ranking algorithm 

- Experiment 2: BioMedBERT LG @ 770K train step: 0.5496368744211725

- Experiment 3: 0.38342058777179133 using [universal-sentence-encoder-lite](https://tfhub.dev/google/universal-sentence-encoder-lite/2)


Right now the vanilla Elasticsearch ranking (Experiment 1) gives the best score.


# Setup Elastic Search Account

Login to [Kibana](https://d7e0e807713c441295ed9707b13a089f.us-central1.gcp.cloud.es.io:9243/app/kibana#/dev_tools/console)
Create an access token that expires in 30 days

```
POST /_security/api_key
{
  "name": "suzanne",
  "expiration": "30d", 
  "role_descriptors": { 
    "role-a": {
      "cluster": ["all"],
      "index": [
        {
          "names": ["bioasq-8b-baseline", "ncbi"],
          "privileges": ["read"]
        }
      ]
    }
  }
}
```


In [0]:
# Do not paste the token into the notebook: anything typed here ends up in
# version control. Export ES_API_KEY_ID / ES_API_KEY before starting the kernel,
# or enter the `id` and `api_key` fields of the POST /_security/api_key response
# when prompted.
import os
from getpass import getpass

ES_ACCOUNT = {
  "id" : os.environ.get("ES_API_KEY_ID") or getpass("Elasticsearch API key id: "),
  "name" : os.environ.get("ES_API_KEY_NAME", "bert_embedding"),
  "api_key" : os.environ.get("ES_API_KEY") or getpass("Elasticsearch API key: "),
}

## Setup

In [0]:
from google.colab.auth import authenticate_user
authenticate_user()

In [0]:
%tensorflow_version 1

In [0]:
!pip install --quiet elasticsearch

In [0]:
import tensorflow as tf
tf.__version__

In [0]:
import numpy as np
from elasticsearch import helpers, Elasticsearch
import csv
import pandas as pd
import os
import time
from tqdm import tqdm
import re
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

# Read BIOASQ 8b question json  

In [0]:
!gsutil cp gs://bioasq/DATA/training8b_list.json .

In [0]:
!ls .

In [0]:
import json
with open('./training8b_list.json') as fd:
    dataset = json.load(fd)

In [0]:
!head training8b_list.json

In [0]:
questions = dataset['questions']
len(dataset['questions'])

# Upload to Elastic Search 

In [0]:
ES_ENDPOINT='https://d1f43211bd5c4fc29e56a232832b7b17.us-central1.gcp.cloud.es.io:9243'

In [0]:
es = Elasticsearch([ES_ENDPOINT], api_key=(ES_ACCOUNT['id'], ES_ACCOUNT['api_key']))
es.info()

In [0]:
def SEARCH(text, index, key, limit=31):
    """Run a full-text `match` query against Elasticsearch and return scored hits.

    Args:
        text: Query string to match.
        index: Name of the Elasticsearch index to search.
        key: Name of the document field `text` is matched against.
        limit: Maximum number of hits to return.

    Returns:
        A list of `(source, score)` tuples, one per hit, in the order
        Elasticsearch returned them.
    """
    res = es.search(index=index,
                    body={
                        "query": {
                            "match": {
                                # Match on the field the caller asked for
                                # rather than a fixed 'context' field.
                                key: {
                                    "query": text,
                                    "operator": "or",
                                    "fuzziness": "0"
                                }
                            }
                        },
                        "min_score": -1,
                    },
                    size=limit)

    return [(hit.get('_source'), hit.get('_score')) for hit in res['hits']['hits']]


In [0]:
qin = questions[0]['question']
qin

In [0]:
total = SEARCH(qin, 'bioasq-8b-baseline', 'context', limit=700)

In [0]:
len(total)

In [0]:
total

# Create BERT embeddings from the BioASQ dataset

## Load BERT

In [0]:
import tensorflow as tf
import os

In [0]:
bucket_name = 'ekaba-assets'
model_dir = 'COPY_biomedbert_base_bert_weights_and_vocab'

In [0]:
![ -d bert ] || git clone https://github.com/google-research/bert

In [0]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import json

In [0]:
#@title get sent embedding
def get_sent_embed(dict_list):
    """Pull one embedding vector out of each extracted-feature record.

    For every record the vector is read from
    ``record['features'][0]['layers'][0]['values']`` (first token entry, first
    listed layer) and converted to a NumPy array.

    Returns:
        A list of NumPy arrays, one per record, in input order.
    """
    return [
        np.array(record['features'][0]['layers'][0]['values'])
        for record in dict_list
    ]
     
def cosine_rank(qe, ce):
    """Rank candidate embeddings by cosine similarity to a query embedding.

    Args:
        qe: Query embedding, array-like of shape (dim,).
        ce: Candidate embeddings, array-like of shape (n_candidates, dim),
            e.g. a list of per-abstract vectors. May be empty.

    Returns:
        (rank_index, scores_sorted): candidate row indices ordered from most
        to least similar, and the cosine similarities in that same order.
    """
    query = np.asarray(qe, dtype=float)
    # reshape() both validates the embedding width and turns an empty
    # candidate list into a well-formed (0, dim) array.
    candidates = np.asarray(ce, dtype=float).reshape(-1, query.size)

    # L2-normalise both sides so the dot product is the cosine similarity.
    query_unit = query / np.linalg.norm(query)
    candidates_unit = candidates / np.linalg.norm(candidates, axis=1, keepdims=True)

    cos_sim = np.dot(candidates_unit, query_unit)  # shape: (n_candidates,)
    rank_index = cos_sim.argsort()[::-1]
    scores_sorted = cos_sim[rank_index]

    return rank_index, scores_sorted

In [0]:
import sys
sys.path.append('/content/bert')

In [0]:
import modeling
import tokenization
import tensorflow as tf

from extract_features import InputExample

def read_examples(text_lines=None):
    """Build one `InputExample` per input line.

    A line of the form ``"<a> ||| <b>"`` becomes a sentence pair
    (`text_a`, `text_b`); any other line becomes a single sentence with
    `text_b=None`. `unique_id` is the zero-based position of the line.

    Args:
        text_lines: Iterable of text lines. ``None`` is treated as no input.

    Returns:
        List of `InputExample`, in input order.
    """
    if text_lines is None:
        text_lines = ()

    examples = []
    for unique_id, raw_line in enumerate(text_lines):
        # Use the converted text for stripping and matching; previously the
        # conversion result was computed and then thrown away.
        line = tokenization.convert_to_unicode(raw_line).strip()
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a, text_b = line, None
        else:
            text_a, text_b = m.group(1), m.group(2)
        examples.append(
            InputExample(unique_id=unique_id,
                         text_a=text_a,
                         text_b=text_b))
    return examples

In [0]:
def input_fn_builder(features, seq_length):
  """Creates an `input_fn` closure over pre-computed BERT input features.

  Args:
    features: Sequence of feature objects exposing `unique_id`, `input_ids`,
      `input_mask` and `input_type_ids`.
    seq_length: Width of every `input_ids` / `input_mask` / `input_type_ids`
      row; used as the second dimension of the constant tensors.

  Returns:
    A function `input_fn(params)` that reads `params["batch_size"]` and
    returns a batched `tf.data.Dataset` of the features.
  """
  all_unique_ids = []
  all_input_ids = []
  all_input_mask = []
  all_input_type_ids = []

  for feature in features:
    all_unique_ids.append(feature.unique_id)
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_input_type_ids.append(feature.input_type_ids)

  num_examples = len(all_unique_ids)

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "unique_ids":
            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_type_ids":
            tf.constant(
                all_input_type_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
    })

    d = d.batch(batch_size=batch_size, drop_remainder=False)
    return d

  return input_fn


In [0]:
from extract_features import convert_examples_to_features, model_fn_builder, input_fn_builder

import collections

def load_model(model_path, batch_size=8):
    """Build a BERT feature-extraction estimator and the matching tokenizer.

    Args:
        model_path: Directory (local or `gs://`) holding the checkpoint,
            `vocab.txt` and `bert_config.json`.
        batch_size: Prediction batch size handed to the estimator.

    Returns:
        `(estimator, tokenizer)`: a `TPUEstimator` configured with
        `use_tpu=False` that outputs the last layer only, and a lower-casing
        `FullTokenizer` built from the model's vocabulary.

    Raises:
        FileNotFoundError: if no checkpoint is found in `model_path`, or if
            `vocab.txt` / `bert_config.json` is missing.
    """
    init_checkpoint = tf.train.latest_checkpoint(model_path)
    vocab_file = os.path.join(model_path, 'vocab.txt')
    bert_config_file = os.path.join(model_path, 'bert_config.json')

    # Fail fast with a readable message instead of a late error from inside
    # the estimator when the model directory is incomplete.
    if init_checkpoint is None:
        raise FileNotFoundError('No checkpoint found in %s' % model_path)
    for required_file in (vocab_file, bert_config_file):
        if not tf.gfile.Exists(required_file):
            raise FileNotFoundError('Missing model file: %s' % required_file)

    # we only pick the last layer
    layer_indexes = [-1]

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    do_lower_case = True

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    master = None
    num_tpu_cores = 8
    run_config = tf.contrib.tpu.RunConfig(
        master=master,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=num_tpu_cores,
            per_host_input_for_training=is_per_host))

    use_tpu = False
    use_one_hot_embeddings = False
    # BERT MODEL
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=use_tpu,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=batch_size)

    return estimator, tokenizer


def predict_fn(input_values, estimator, tokenizer, max_seq_length=128):
    """Run BERT feature extraction over raw text lines.

    Args:
        input_values: Iterable of text lines; they are turned into examples
            by `read_examples`.
        estimator: Estimator returned by `load_model`.
        tokenizer: Tokenizer returned by `load_model`.
        max_seq_length: `seq_length` passed to `convert_examples_to_features`
            and used as the row width of the input tensors.

    Returns:
        A list with one `OrderedDict` per prediction, in the order the
        estimator yields them. Each has the keys ``"linex_index"`` (the
        example's unique id) and ``"features"`` (one entry per token with its
        ``"token"`` and ``"layers"``, each layer carrying ``"index"`` and
        ``"values"``). Empty input returns an empty list without touching the
        estimator.
    """
    input_values = list(input_values)
    if not input_values:
        return []

    # process input
    examples = read_examples(input_values)

    # Build
    features = convert_examples_to_features(
        examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)

    all_unique_ids = [feature.unique_id for feature in features]
    all_input_ids = [feature.input_ids for feature in features]
    all_input_mask = [feature.input_mask for feature in features]
    all_input_type_ids = [feature.input_type_ids for feature in features]
    # Captured here so the closure below does not depend on any name that is
    # reassigned while results are being collected.
    num_examples = len(features)

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # This is for demo purposes and does NOT scale to large data sets. We do
        # not use Dataset.from_generator() because that uses tf.py_func which is
        # not TPU compatible. The right way to load data is with TFRecordReader.
        d = tf.data.Dataset.from_tensor_slices({
            "unique_ids":
                tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
            "input_ids":
                tf.constant(
                    all_input_ids, shape=[num_examples, max_seq_length],
                    dtype=tf.int32),
            "input_mask":
                tf.constant(
                    all_input_mask,
                    shape=[num_examples, max_seq_length],
                    dtype=tf.int32),
            "input_type_ids":
                tf.constant(
                    all_input_type_ids,
                    shape=[num_examples, max_seq_length],
                    dtype=tf.int32),
        })

        d = d.batch(batch_size=batch_size)
        return d

    unique_id_to_feature = {feature.unique_id: feature for feature in features}

    # Same single (last) layer that load_model asks the model_fn to output.
    layer_indexes = [-1]
    result_list = []
    for result in estimator.predict(input_fn, yield_single_examples=True):
        unique_id = int(result["unique_id"])
        feature = unique_id_to_feature[unique_id]
        output_json = collections.OrderedDict()
        output_json["linex_index"] = unique_id
        all_features = []
        for (i, token) in enumerate(feature.tokens):
            all_layers = []
            for (j, layer_index) in enumerate(layer_indexes):
                layer_output = result["layer_output_%d" % j]
                layers = collections.OrderedDict()
                layers["index"] = layer_index
                layers["values"] = [
                    round(float(x), 6) for x in layer_output[i:(i + 1)].flat
                ]
                all_layers.append(layers)
            # Named token_entry (not `features`) so the list of input
            # features above is never shadowed.
            token_entry = collections.OrderedDict()
            token_entry["token"] = token
            token_entry["layers"] = all_layers
            all_features.append(token_entry)
        output_json["features"] = all_features
        result_list.append(output_json)
    return result_list

In [0]:
MODEL_LOCATION=f'gs://{bucket_name}/{model_dir}/'
MODEL_LOCATION

In [0]:
estimator, tokenizer = load_model(MODEL_LOCATION, batch_size=64)

# QUERY SYSTEM

In [0]:
def extract_embeddings(questions):
    """Compute sentence embeddings for every question and its context.

    Args:
        questions: Sequence of dicts with the keys 'question' and 'context'.

    Returns:
        `(query_embeddings, context_embeddings)`: two lists of NumPy vectors
        (as produced by `get_sent_embed`), aligned with `questions`.
    """
    # 1. extract context
    context_list = [q['context'] for q in questions]
    query_list = [q['question'] for q in questions]
    # 2. compute the embeddings in a single prediction pass:
    #    all queries first, then all contexts
    embedding_list = predict_fn(query_list + context_list, estimator, tokenizer)

    n_queries = len(query_list)
    query_embeddings = embedding_list[:n_queries]
    context_embeddings = embedding_list[n_queries:]
    assert len(query_embeddings) == len(context_embeddings)

    query_embeddings = get_sent_embed(query_embeddings)
    context_embeddings = get_sent_embed(context_embeddings)

    return query_embeddings, context_embeddings

In [0]:
def do_query(query, max_inputs=64):
    """Fetch abstracts for `query` from Elasticsearch and re-rank them with BERT.

    Args:
        query: Free-text question.
        max_inputs: Maximum number of texts (the query plus abstracts) that
            are embedded. The default, 64, is the batch size the estimator is
            loaded with in this notebook.

    Returns:
        A DataFrame with one row per abstract, best match first, with columns
        'abstract', 'original_rank' (position of the abstract among the
        Elasticsearch hits that had an abstract) and 'scores' (cosine
        similarity to the query). It is indexed by 'original_rank'.
    """
    # 1. Ask elastic search for the data
    results = SEARCH(query, 'ncbi', 'title', limit=100)
    abstracts_text_list = [doc['abstract'] for doc, _ in results if 'abstract' in doc]
    # One slot of the budget is taken by the query itself.
    abstracts_text_list = abstracts_text_list[:max_inputs - 1]
    if not abstracts_text_list:
        return pd.DataFrame({'abstract': [], 'original_rank': [], 'scores': []})

    # 2. Compute Embeddings for query + abstracts
    bert_output_list = predict_fn([query] + abstracts_text_list, estimator, tokenizer)
    embeddings = np.array(get_sent_embed(bert_output_list))
    query_embed, abstract_embeds = embeddings[0], embeddings[1:]

    # 3. Rank them by cosine similarity to the query
    cos_sim = np.dot(abstract_embeds, query_embed) / (
        np.linalg.norm(abstract_embeds, axis=1) * np.linalg.norm(query_embed))
    rank_index = cos_sim.argsort()[::-1]
    scores = cos_sim[rank_index]

    # Reorder the texts with the same permutation so every row pairs an
    # abstract with its own score.
    df = pd.DataFrame({'abstract': [abstracts_text_list[i] for i in rank_index],
                       "original_rank": rank_index,
                       "scores": scores},
                      index=rank_index)
    return df

In [0]:
query="What are the COVID-19 symptoms?"

In [0]:
%%capture
query_result = do_query(query)

In [0]:
%load_ext google.colab.data_table

In [0]:
query_result

# Compute MRR

The **mean reciprocal rank** (MRR) is a statistical measure for evaluating any process that produces a list of possible responses to a sample of queries, ordered by probability of correctness.

The reciprocal rank of a query response is the multiplicative inverse of the rank of the first correct answer: 
- 1 for first place, 
- $\frac12$ for second place, 
- $\frac13$ for third place and so on.

The mean reciprocal rank is the average of the reciprocal ranks of results for a sample of queries 

$$
 \text{MRR} = \frac{1}{|Q|} \sum_{i=1}^{|Q|} \frac{1}{\text{rank}_i}
$$

where $\text{rank}_i$ refers to the rank position of the *first* relevant document for the $i$-th query.



In [0]:
from tqdm import tqdm

In [0]:
import textwrap

# Experiment 1

In [0]:
# Reciprocal rank of each question's own context in the Elasticsearch ranking.
scores = []
all_results = []
it = tqdm(enumerate(questions), total=len(questions))

for qix, q in it:
    qin = q['question']
    ranking = SEARCH(qin, 'bioasq-8b-baseline', 'context', limit=700)
    all_results.append(ranking)
    for i, (r, _) in enumerate(ranking):
        if r['context'] == q['context']:
            scores.append(1./(i+1.))
            break
    else:
        # The gold context was not among the returned hits: count the query
        # with reciprocal rank 0 instead of dropping it, otherwise the mean
        # is taken over the successful queries only and overstates the MRR.
        scores.append(0.0)

In [0]:
len(scores)

In [0]:
np.average(scores)

# Experiment 2

In [0]:
# Texts to embed: every question and the context that belongs to it.
query_list = [q['question'] for q in questions]
context_list = [q['context'] for q in questions]

# One prediction pass over all queries followed by all contexts.
embedding_list = predict_fn(query_list + context_list, estimator, tokenizer)

# Split the combined output back into its two halves.
query_embeddings, context_embeddings = (
    embedding_list[:len(query_list)],
    embedding_list[len(query_list):],
)


In [0]:
assert len(query_embeddings) == len(context_embeddings)

In [0]:
query_embeddings_x = get_sent_embed(query_embeddings)

In [0]:
context_embeddings_x = get_sent_embed(context_embeddings)

In [0]:
QE = np.array(query_embeddings_x)

In [0]:
CE = np.array(context_embeddings_x)

In [0]:
QE.shape, CE.shape

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
result = cosine_similarity(QE, CE)

In [0]:
result.shape

In [0]:
cosine_similarity(CE, CE)


In [0]:
cosine_similarity(QE, QE)

In [0]:
qix = 100
qe = QE[qix]
questions[qix]['question'], questions[qix]['context'] 

In [0]:
CE.shape

In [0]:
CES= CE[:100,]

In [0]:
scores = cosine_similarity(QE, CES)
scores.shape

In [0]:
scores = cosine_similarity([qe], CE)
scores.shape

In [0]:
ranking = scores[0].argsort() # [::-1]
ranking.shape

In [0]:
ranking[rix]

In [0]:
qix

In [0]:
result_ix = np.where(ranking == qix)

In [0]:
rix = result_ix[0][0]

In [0]:
questions[rix]


In [0]:
questions[qix]

In [0]:
# Reciprocal rank of each question's own context among all context embeddings.
# NOTE(review): `count` is assigned but never used in this cell.
count = 0
scores = []
for qix, q in enumerate(questions):
    qe = QE[qix]
    
    # score[0] holds the similarity of this query to every row of CE.
    score = cosine_similarity([qe], CE)
    # Context indices ordered from most to least similar.
    ranking = score[0].argsort()[::-1]
    # Position of the question's own context (row qix) in that ordering.
    result_ix = np.where(ranking == qix)
    rix = result_ix[0][0]
    scores.append( 1./(rix + 1) )

    # Sanity check: the entry found at position rix is this question's context.
    assert questions[ranking[rix]]['context'] == questions[qix]['context']

In [0]:
np.average(scores)

# Experiment 3 
Use a Sentence Encoder to generate the embeddings.

https://tfhub.dev/google/universal-sentence-encoder-lite/2



In [0]:
!pip3 install sentencepiece
import sentencepiece as spm

In [0]:
import tensorflow_hub as hub

In [0]:
def process_to_IDs_in_sparse_format(sp, sentences):
  """Encodes sentences with a SentencePiece processor into sparse components.

  Args:
    sp: Processor exposing `EncodeAsIds(sentence)`, returning a list of ids.
    sentences: Sequence of sentences to encode. May be empty.

  Returns:
    `(values, indices, dense_shape)` in a tf.SparseTensor-similar format:
    `values` is the flat list of ids, `indices` the `[row, col]` position of
    each id, and `dense_shape` is `(len(sentences), longest_encoding)`.
  """
  ids = [sp.EncodeAsIds(x) for x in sentences]
  # default=0 keeps an empty batch from raising ValueError inside max().
  max_len = max((len(x) for x in ids), default=0)
  dense_shape = (len(ids), max_len)
  values = [item for sublist in ids for item in sublist]
  indices = [[row, col]
             for row, sublist in enumerate(ids)
             for col in range(len(sublist))]
  return (values, indices, dense_shape)


In [0]:
with tf.Session() as sess:
  module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-lite/2")
  spm_path = sess.run(module(signature="spm_path"))
  # spm_path now contains a path to the SentencePiece model stored inside the
  # TF-Hub module



In [0]:
tf.__version__

In [0]:
def compute_embeddings(sentences):
    """Embed sentences with the universal-sentence-encoder-lite TF-Hub module.

    Args:
        sentences: Sequence of strings; all of them are encoded and fed to
            the module in a single `session.run` call.

    Returns:
        The value `session.run` returns for the module output, i.e. the
        embeddings of `sentences`.
    """
    # Build the module in a private graph: every call loads a fresh copy, and
    # building it in the notebook's default graph would make that graph grow
    # (and be re-initialised) on each call.
    with tf.Graph().as_default(), tf.Session() as session:
        input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])

        module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-lite/2")

        # Path to the SentencePiece model stored inside the TF-Hub module.
        spm_path = session.run(module(signature="spm_path"))

        sp = spm.SentencePieceProcessor()
        sp.Load(spm_path)

        embeddings = module(
            inputs=dict(
                values=input_placeholder.values,
                indices=input_placeholder.indices,
                dense_shape=input_placeholder.dense_shape))

        values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, sentences)
        # initialize
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        # compute
        message_embeddings = session.run(
            embeddings,
            feed_dict={input_placeholder.values: values,
                       input_placeholder.indices: indices,
                       input_placeholder.dense_shape: dense_shape})
    return message_embeddings

In [0]:
context_list = [ q['context'] for q in questions]
query_list = [ q['question'] for q in questions] 

In [0]:
CE = compute_embeddings(context_list)

In [0]:
CE.shape

In [0]:
QE = compute_embeddings(query_list)


In [0]:
# Reciprocal rank of each question's own context among all context embeddings.
# NOTE(review): `count` is assigned but never used in this cell.
count = 0
scores = []
for qix, q in enumerate(questions):
    qe = QE[qix]
    
    # score[0] holds the similarity of this query to every row of CE.
    score = cosine_similarity([qe], CE)
    # Context indices ordered from most to least similar.
    ranking = score[0].argsort()[::-1]
    # Position of the question's own context (row qix) in that ordering.
    result_ix = np.where(ranking == qix)
    rix = result_ix[0][0]
    scores.append( 1./(rix + 1) )

    # Sanity check: the entry found at position rix is this question's context.
    assert questions[ranking[rix]]['context'] == questions[qix]['context']

In [0]:
np.average(scores)

# Check Vocabulary

In [0]:
!gsutil cp {MODEL_LOCATION}vocab.txt .

In [0]:
!head -n 10 vocab.txt

In [0]:
df = pd.read_table('vocab.txt', names=['WORD'], encoding='utf-8')

In [0]:
df.info()

In [0]:
!head -n 5028 vocab.txt | tail -n 20

In [0]:
vocab

In [0]:
# Read the vocabulary as a set of stripped lines. UTF-8 is requested
# explicitly so this agrees with the pd.read_table call on the same file.
with open('vocab.txt', encoding='utf-8') as fd:
    vocab = {line.strip() for line in fd}


In [0]:
tokenizer.tokenize("don't be so judgmental")

In [0]:
unique_words = []

In [0]:
# Collect the tokens of every question.
for q in questions:
    unique_words.extend(tokenizer.tokenize(q['question']))

# Number of distinct tokens that are present in the vocabulary. Counting once
# over the whole set (rather than resetting a counter for each question) makes
# `found / len(set(unique_words))` the fraction of distinct tokens covered.
found = sum(1 for word in set(unique_words) if word in vocab)

In [0]:
'judgment' in vocab

In [0]:
found

In [0]:
found /  len(set(unique_words))

In [0]:
!head -n 2500 vocab.txt | tail -n 50