## Sentence BioBERT
Stan and Emilie - 14.04.2020

### References

Initial tutorial 
https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

How to load the BioBERT model into PyTorch
https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py

How to handle the issue encountered when loading the model
https://github.com/dmis-lab/biobert/issues/2

NOTE: GitHub library doing BioBERT sentence embedding (mean pooling on the last encoded layer)
https://github.com/Overfitter/biobert_embedding/blob/master/biobert_embedding/embedding.py

In [None]:
# Name of the pretrained checkpoint used for the tokenizer vocabulary and for
# the plain-BERT model load in this notebook.
# NOTE(review): the BioBERT weights loaded further down come from a separate
# checkpoint directory -- confirm that this vocabulary matches those weights.
MODEL = "bert-large-uncased"

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, BertForPreTraining
#from transformers import BertConfig, BertForPreTraining

import logging
logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

# Load the pre-trained tokenizer (vocabulary) registered under the name MODEL.
# NOTE(review): the same tokenizer is later used to feed the BioBERT weights;
# presumably BioBERT ships its own vocab file -- verify that the token ids
# produced here line up with the BioBERT embedding table.
tokenizer = BertTokenizer.from_pretrained(MODEL)

In [None]:
# Wrap a sample sentence in the [CLS] / [SEP] marker tokens and split the
# result into tokens with the BERT tokenizer.
text = "Here is the sentence I want embeddings for."
marked_text = f"[CLS] {text} [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)

# Show the resulting token list.
print(tokenized_text)

In [None]:
# Load the pre-trained BERT model (weights) registered under the name MODEL.
# NOTE(review): `model` is reassigned to a BertForPreTraining instance in the
# next cell of this notebook, so this load only matters when that cell is
# skipped (plain-BERT experiments).
model = BertModel.from_pretrained(MODEL)

In [None]:
# Build a BertForPreTraining network whose architecture is described by the
# BioBERT v1.1 (PubMed) JSON config file. No weights are loaded at this point;
# the converted checkpoint is loaded into `model` in a separate cell.
#state_dict = torch.load('BioBERT_checkpoints/pytorch_model.bin')
bert_config_file = "BioBERT_checkpoints/biobert_v1.1_pubmed/bert_config.json"
config = BertConfig.from_json_file(bert_config_file) 
model = BertForPreTraining(config)

In [None]:
# Read the converted BioBERT weights from disk and copy them into `model`.
# `map_location='cpu'` lets a checkpoint that contains GPU tensors be
# deserialised on a CPU-only machine; everything else in this notebook runs
# on the CPU, so nothing is lost by pinning the tensors there.
# NOTE: torch.load unpickles the file -- only load checkpoints from a source
# you trust.
state_dict = torch.load('BioBERT_checkpoints/pytorch_model.bin', map_location='cpu')
model.load_state_dict(state_dict)

In [None]:
# Put the model in "evaluation" mode, meaning feed-forward operation only
# (as opposed to training mode). Being the last expression of the cell, the
# returned module is also displayed by the notebook.
model.eval()

In [None]:
# Example sentence in which the word "bank" appears with several meanings.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

# Surround the sentence with the [CLS] / [SEP] marker tokens.
marked_text = f"[CLS] {text} [SEP]"

# Tokenize, then look up the vocabulary index of every token.
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print one "token   index" row per token.
for token_str, token_id in zip(tokenized_text, indexed_tokens):
    print(f'{token_str:<12} {token_id:>6,}')

In [None]:
# Single-sentence input: assign every token to segment 0 (BERT's first
# segment), which is also what the sentence-embedding loop further down in
# this notebook uses. The list length follows the token count instead of
# assuming a fixed number of tokens.
# For a sentence pair the ids would switch to 1 for the second sentence,
# e.g. [0] * 5 + [1] * 17.
segments_ids = [0] * len(tokenized_text)
print(segments_ids)

In [None]:
# Turn the two Python lists into tensors and add a leading batch dimension
# of size 1 to each.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

In [None]:
# Predict hidden states features for each layer.
# Gradient tracking is switched off because only a forward pass is needed.
# `model.bert` is the BERT encoder inside the BertForPreTraining wrapper; the
# second return value is discarded.
# NOTE(review): this assumes `model` is still the BertForPreTraining instance
# -- a later cell rebinds `model` to a SentenceTransformer.
with torch.no_grad():
    encoded_layers, _ = model.bert(tokens_tensor, segments_tensors)

In [None]:
# Display the shape of the first layer's output tensor.
encoded_layers[0].size()

In [None]:
# Walk down the nesting of `encoded_layers` (layer -> batch -> token ->
# hidden unit) and report the length found at every level. The three index
# variables stay defined for the cells that follow.
layer_i = batch_i = token_i = 0

one_layer = encoded_layers[layer_i]
one_batch = one_layer[batch_i]
one_token = one_batch[token_i]

print("Number of layers:", len(encoded_layers))
print("Number of batches:", len(one_layer))
print("Number of tokens:", len(one_batch))
print("Number of hidden units:", len(one_token))

In [None]:
# Pick the vector of the token at index 5 as produced by the layer at index 5
# (both zero-based); `batch_i` is reused from the previous cell.
token_i = layer_i = 5
vec = encoded_layers[layer_i][batch_i][token_i]

# Histogram of the vector's values, to show how they are distributed.
plt.figure(figsize=(10, 10))
plt.hist(vec, bins=200)
plt.show()

In [None]:
# Combine the per-layer tensors into a single tensor. `stack` (rather than
# `cat`) is used so that the layers get a new leading dimension of their own.
token_embeddings = torch.stack(encoded_layers)

token_embeddings.shape

In [None]:
# Drop dimension 1 (the "batches" dimension).
token_embeddings = token_embeddings.squeeze(dim=1)

token_embeddings.shape

In [None]:
# Swap dimensions 0 and 1 so that the leading dimension indexes tokens and
# the second one indexes layers; the last dimension is left in place.
token_embeddings = token_embeddings.permute(1,0,2)

# Display the resulting shape.
token_embeddings.size()

In [None]:
# One vector per token, obtained by concatenating that token's vectors from
# the last four layers (last layer first). `token_embeddings` is indexed as
# [token x layer x hidden unit], so each `token` below is a
# [layer x hidden unit] tensor and each result is four hidden sizes long.
token_vecs_cat = [
    torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    for token in token_embeddings
]

print(f'Shape is: {len(token_vecs_cat):d} x {len(token_vecs_cat[0]):d}')

In [None]:
# One vector per token, obtained by summing that token's vectors from the
# last four layers. `token_embeddings` is indexed as
# [token x layer x hidden unit], so each `token` below is a
# [layer x hidden unit] tensor and each result keeps the hidden size.
token_vecs_sum = [token[-4:].sum(dim=0) for token in token_embeddings]

print(f'Shape is: {len(token_vecs_sum):d} x {len(token_vecs_sum[0]):d}')

In [None]:
# `encoded_layers` holds one [1 x num_tokens x hidden_size] tensor per
# encoder layer.

# Token vectors of the last layer for the only sequence in the batch.
# Indexing with -1 instead of a fixed 11 keeps this pointing at the last
# layer for models that do not have exactly 12 layers.
token_vecs = encoded_layers[-1][0]

# Average over the token dimension to get a single vector for the sentence.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [None]:
# Print every token next to its position, to look up the indices of "bank".
for position, piece in enumerate(tokenized_text):
    print(position, piece)

In [None]:
# Show the first five components of the summed-layer vector of each "bank"
# occurrence; the token positions are hard-coded for the example sentence.
print('First 5 vector values for each instance of "bank".')
print('')
for label, position in (("bank vault   ", 6), ("bank robber  ", 10), ("river bank   ", 19)):
    print(label, str(token_vecs_sum[position][:5]))

In [None]:
# Same report as the previous cell: first five components of the
# summed-layer vector of each "bank" occurrence (hard-coded token positions).
bank_positions = {"bank vault   ": 6, "bank robber  ": 10, "river bank   ": 19}

print('First 5 vector values for each instance of "bank".')
print('')
for label, position in bank_positions.items():
    print(label, str(token_vecs_sum[position][:5]))

In [None]:
# Show the first five components of the concatenated-layer vector of each
# "bank" occurrence; the token positions are hard-coded for the example
# sentence.
print('First 5 vector values for each instance of "bank".')
print('')
for label, position in (("bank vault   ", 6), ("bank robber  ", 10), ("river bank   ", 19)):
    print(label, str(token_vecs_cat[position][:5]))

In [None]:
from scipy.spatial.distance import cosine

# Summed-layer vectors of "bank" in "bank robber", "river bank" and
# "bank vault" (hard-coded token positions of the example sentence).
robber_vec, river_vec, vault_vec = (token_vecs_sum[i] for i in (10, 19, 6))

# scipy's `cosine` is a distance, so similarity = 1 - distance.
# "bank robber" vs "river bank": different meanings.
diff_bank = 1 - cosine(robber_vec, river_vec)
# "bank robber" vs "bank vault": same meaning.
same_bank = 1 - cosine(robber_vec, vault_vec)

print(f'Vector similarity for  *similar*  meanings:  {same_bank:.3f}')
print(f'Vector similarity for *different* meanings:  {diff_bank:.3f}')

In [None]:
from scipy.spatial.distance import cosine

# Concatenated-layer vectors of "bank" in "bank robber", "river bank" and
# "bank vault" (hard-coded token positions of the example sentence).
robber_vec, river_vec, vault_vec = (token_vecs_cat[i] for i in (10, 19, 6))

# scipy's `cosine` is a distance, so similarity = 1 - distance.
# "bank robber" vs "river bank": different meanings.
diff_bank = 1 - cosine(robber_vec, river_vec)
# "bank robber" vs "bank vault": same meaning.
same_bank = 1 - cosine(robber_vec, vault_vec)

print(f'Vector similarity for  *similar*  meanings:  {same_bank:.3f}')
print(f'Vector similarity for *different* meanings:  {diff_bank:.3f}')

### Test Embeddings

In [None]:
# Example sentences used to compare sentence embeddings against each other.
ex_sent = [
    "I like my phone",
    "Your cellphone looks great.",
    "Will it snow tomorrow?",
    "Hurricanes have hit the US",
    "How old are you?",
    "what is your age?",
]

In [None]:
# Embed every sentence of `ex_sent` and stack the per-sentence vectors into
# a single 2-D tensor (one row per sentence).
processed_ex_sent = []  # not filled in this cell; kept in case other cells use it
tag_ex_sent = []  # not filled in this cell; kept in case other cells use it
all_sent_emb = []
for text in ex_sent:
    marked_text = "[CLS] " + text + " [SEP]"
    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)
    # Map the token strings to their vocabulary indices.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Single-sentence input: every token belongs to segment 0.
    segments_ids = [0] * len(tokenized_text)
    # Convert inputs to PyTorch tensors (with a batch dimension of size 1).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        # BERT
        #encoded_layers, _ = model(tokens_tensor, segments_tensors)
        # BioBERT
        encoded_layers, _ = model.bert(tokens_tensor, segments_tensors)

    # SOLUTION 1 (disabled): per token, sum the last four layers, then
    # average the token vectors.
#     token_embeddings = torch.stack(encoded_layers, dim=0)
#     token_embeddings = torch.squeeze(token_embeddings, dim=1)
#     token_embeddings = token_embeddings.permute(1,0,2)
#     print(text)
#     print(tokenized_text)
#     token_vecs_sum = []
#     print(token_embeddings.size())
#     for token in token_embeddings:
#         sum_vec = torch.sum(token[-4:], dim=0)
#         # Use `sum_vec` to represent `token`.
#         token_vecs_sum.append(sum_vec)
#     token_vecs_sum_tensor = torch.stack(token_vecs_sum)
#     sentence_embedding = torch.mean(token_vecs_sum_tensor, dim=0)

    # SOLUTION 2 (active): token vectors of the second-to-last layer for the
    # only sequence in the batch.
    token_vecs = encoded_layers[-2][0]
    # Alternative: average all token vectors instead.
    # sentence_embedding = torch.mean(token_vecs, dim=0)
    # Use the vector of the first token ([CLS]) as the sentence embedding.
    # `clone().detach()` makes the same independent copy that
    # `torch.tensor(...)` did, without the copy-construct UserWarning.
    sentence_embedding = token_vecs[0].clone().detach()

    all_sent_emb.append(sentence_embedding)
all_sent_emb = torch.stack(all_sent_emb)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Display the type of the stacked sentence embeddings.
type(all_sent_emb)

In [None]:
# Heat map of the pairwise cosine similarity between the sentence embeddings.
f, ax = plt.subplots()
cs = ax.imshow(cosine_similarity(all_sent_emb, all_sent_emb), cmap='hot')
f.colorbar(cs)

# Pin one tick per sentence before labelling. Setting labels alone relies on
# wherever the automatic locator happens to put its ticks (the original
# padded the label list with a leading None to compensate), which breaks as
# soon as the figure size or the number of sentences changes.
tick_positions = list(range(len(ex_sent)))
ax.set_yticks(tick_positions)
ax.set_yticklabels(ex_sent)
ax.set_xticks(tick_positions)
ax.set_xticklabels(ex_sent, rotation=45, ha="right")

In [None]:
# Show only the part of the similarity matrix strictly above the diagonal
# (k=1 zeroes the diagonal and everything below it), with the colour scale
# fixed to [0, 1].
plt.imshow(np.triu(cosine_similarity(all_sent_emb, all_sent_emb), k=1), vmin=0, vmax=1)

In [None]:
### UKPLab

In [None]:
# Load the pretrained 'bert-base-nli-mean-tokens' sentence-embedding model
# from the sentence-transformers library.
# NOTE(review): this rebinds `model`, which the earlier cells use for the
# BioBERT network -- re-running those cells after this one will not work
# until `model` is rebuilt.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Sentences to embed with the sentence-transformers model.
sentences = [
    "I like my phone",
    "Your cellphone looks great.",
    "Will it snow tomorrow?",
    "Hurricanes have hit the US",
    "How old are you?",
    "what is your age?",
]

In [None]:
# Encode all sentences with the sentence-transformers model; the result is
# indexed per sentence in the cells below.
sentence_embeddings = model.encode(sentences)

In [None]:
# Print every sentence followed by its embedding and a blank separator line.
for sentence, embedding in zip(sentences, sentence_embeddings):
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print()

In [None]:
# Report how many embeddings were produced and the shape of the first one.
n_embeddings = len(sentence_embeddings)
first_embedding_shape = sentence_embeddings[0].shape
print(n_embeddings)
print(first_embedding_shape)

In [None]:
# Heat map of the pairwise cosine similarity between the
# sentence-transformers embeddings.
f, ax = plt.subplots()
cs = ax.imshow(cosine_similarity(sentence_embeddings, sentence_embeddings), cmap='hot')
f.colorbar(cs)

# Label the axes with `sentences` (the list that was actually encoded) and
# pin one tick per sentence first, so the labels cannot drift away from the
# matrix rows and columns when the automatic tick placement changes.
tick_positions = list(range(len(sentences)))
ax.set_yticks(tick_positions)
ax.set_yticklabels(sentences)
ax.set_xticks(tick_positions)
ax.set_xticklabels(sentences, rotation=45, ha="right")

In [None]:
## USE 

In [None]:
import tensorflow_hub as hub

In [None]:
# Load the Universal Sentence Encoder (large, v5) from TensorFlow Hub.
univ_sent_emb = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

# `query` was not defined anywhere in the notebook, so the cell raised a
# NameError; use one of the example sentences as the text to embed.
query = 'How old are you?'

# Embed the single query and drop the batch dimension of the result.
embedding = univ_sent_emb([query]).numpy().squeeze()