Run this notebook inside the `pafe` Docker image.

In [None]:
from pathlib import Path
from collections import defaultdict
import sqlite3
from time import time

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

import torch
import torch.nn

import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from sentence_transformers import SentenceTransformer

In [None]:
# Root directory of the dated data snapshot; the three sub-directories below are resolved from it.
data_path = Path("/raid/covid_data/data/2020-04-08")

cord_path = data_path / "CORD-19-research-challenge"
databases_path = data_path / "databases"
embeddings_path = data_path / "embeddings"
# Assets (model binaries, synonyms list, BioBERT files) live outside the dated snapshot.
assets_path = Path("/raid/covid_data/assets")

# Fail fast if any of the expected directories is missing.
assert data_path.exists()
assert cord_path.exists()
assert databases_path.exists()
assert embeddings_path.exists()
assert assets_path.exists()

## Load Models and Data

In [None]:
# One-time setup, intentionally left commented out: uncomment to download the NLTK
# resources (presumably required by `stopwords` and `word_tokenize` used in this notebook).
# import nltk

# nltk.download('stopwords')
# nltk.download('punkt')

In [None]:
# Load universal sentence encoder
# The version number is interpolated into the TF-Hub URL of the "large" model.
univsentenc_version = 5
univsentenc_embedder = hub.load(f"https://tfhub.dev/google/universal-sentence-encoder-large/{univsentenc_version}")

In [None]:
# Load SBERT
# The pretrained sentence-transformers model is selected by name.
sbert_embedder = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
%%time

# Load BioSentVec
# The sent2vec model is created empty and then populated from a binary file in the assets directory.
bsv_embedder = sent2vec.Sent2vecModel()
bsv_embedder.load_model(str(assets_path / 'BioSentVec_PubMed_MIMICIII-bigram_d700.bin'))

# English stopwords as a set; bsv_preprocess drops every token found in it.
bsv_stopwords = set(stopwords.words('english'))

def bsv_preprocess(text):
    """Normalise a raw string before it is embedded with the BioSentVec model.

    Slashes, ``.-`` sequences, periods and apostrophes are padded with spaces,
    the text is lower-cased and tokenised, and every token that is found in
    ``string.punctuation`` or in ``bsv_stopwords`` is discarded.

    Parameters
    ----------
    text : str
        Sentence to normalise.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Order matters: '.-' has to be padded before the bare '.' rule runs.
    padding_rules = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for needle, padded in padding_rules:
        text = text.replace(needle, padded)

    kept = []
    for token in word_tokenize(text.lower()):
        # `punctuation` is a string, so this is a substring test (as in the original).
        if token in punctuation or token in bsv_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Parse the synonyms file: every non-empty line is lower-cased and split on '='.
# The first field becomes the dictionary key, the remaining fields its list of synonyms.
synonyms_dict = dict()
# 'utf-8-sig' also accepts files that start with a byte-order mark.
with open(assets_path / 'synonyms_list.txt', 'r', encoding='utf-8-sig') as f:
    for l in [l_.strip().lower() for l_ in f]:
        if l:  # skip blank lines
            w = [l_.strip() for l_ in l.split('=')]
            synonyms_dict[w[0]] = w[1:]

In [None]:
# Remove the 'sars' entry so that its synonyms are not mapped onto it when the
# reverse index is built from synonyms_dict.
# pop() with a default keeps the cell idempotent: `del` raised KeyError when the cell was re-run.
synonyms_dict.pop('sars', None)

In [None]:
# Invert synonyms_dict: every synonym points to its reference term (the dictionary key).
synonyms_index = {x.lower(): k.lower() for k,v in synonyms_dict.items() for x in v}

In [None]:
# Open the articles SQLite database; `curs` is the module-level cursor that
# create_sentence_embeddings uses to read the `sections` table.
db_filename = str(databases_path / 'articles.sqlite')
db = sqlite3.connect(db_filename)
curs = db.cursor()

## Temporary: BioBERT

In [None]:
# Small sample of sentences used to smoke-test the embedding models below.
sentences = [
    "The weather is good today.",
    "I want to go outside.",
    "COVID-19 please go away.",
]

In [None]:
import torch
from pytorch_pretrained_bert import BertConfig, BertForPreTraining, BertTokenizer, BertModel

In [None]:
# BioBERT release name; used below to build the config, vocabulary and checkpoint paths.
name = "biobert_v1.1_pubmed"

In [None]:
# The JSON config sits inside the release directory; the checkpoint is a sibling `<name>.pth` file.
bert_config_path = assets_path / name / "bert_config.json"
checkpoint_path = assets_path / f"{name}.pth"

In [None]:
# Rebuild the BioBERT architecture from its JSON config and load the PyTorch weights into it.
config = BertConfig.from_json_file(bert_config_path)
# map_location="cpu" lets the checkpoint load on machines without a GPU, in case its
# tensors were saved from one; the model can be moved to a device afterwards.
state_dict = torch.load(checkpoint_path, map_location="cpu")

model_pre_training = BertForPreTraining(config)
model_pre_training.load_state_dict(state_dict)
# A freshly constructed module is in training mode; switch to eval mode so that
# dropout is disabled and the embeddings are deterministic.
model_pre_training.eval()
# Keep only the encoder; the pre-training heads are not needed to produce embeddings.
biobert_model = model_pre_training.bert

In [None]:
# The vocabulary is the cased one (this notebook checks it against the BioBERT vocab file),
# so lower-casing is disabled explicitly instead of relying on the library's default handling.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)

Check that the tokenizer vocabulary is identical to the vocabulary file shipped with the BioBERT model.

In [None]:
# Read the vocabulary file shipped with the model, one token per line.
with open(assets_path / name / "vocab.txt", 'r', encoding='utf-8') as f:
    vocab = [line.strip() for line in f]

# Compare the full token sequences. A zip()-based comparison stops at the shorter
# sequence and would report True even if one vocabulary had extra trailing tokens.
vocab == list(tokenizer.vocab.keys())

In [None]:
class SBioBERT(torch.nn.Module):
    """Sentence encoder that mean-pools the last hidden layer of a BioBERT model.

    Parameters
    ----------
    bert_config_path : str or pathlib.Path
        Path to the BERT JSON configuration file.
    checkpoint_path : str or pathlib.Path
        Path to a PyTorch checkpoint holding a ``BertForPreTraining`` state dict.
    """

    def __init__(self, bert_config_path, checkpoint_path):
        super().__init__()

        config = BertConfig.from_json_file(bert_config_path)
        # Load onto the CPU first so the checkpoint also opens on GPU-less machines;
        # callers move the whole module afterwards with ``.to(device)``.
        state_dict = torch.load(checkpoint_path, map_location="cpu")

        model_pre_training = BertForPreTraining(config)
        model_pre_training.load_state_dict(state_dict)

        # Only the encoder is kept; the pre-training heads are not used for embeddings.
        self.biobert_model = model_pre_training.bert
        # Cased vocabulary: disable lower-casing explicitly.
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-cased",
                                                       do_lower_case=False)
        # Longest token sequence the position embeddings can represent.
        self.max_seq_length = config.max_position_embeddings

        # Modules start in training mode; eval mode disables dropout so that the
        # same sentence always yields the same embedding.
        self.eval()

    def preprocess_sentence(self, sentence):
        """Turn one sentence into the tensors expected by the BERT encoder.

        Parameters
        ----------
        sentence : str
            Raw sentence.

        Returns
        -------
        tokens_tensor : torch.Tensor
            Vocabulary indices, shape ``(1, n_tokens)``.
        segments_tensors : torch.Tensor
            Segment ids (all ones), shape ``(1, n_tokens)``.
        """
        # Add the special tokens.
        marked_text = "[CLS] " + sentence + " [SEP]"

        # Split the sentence into tokens.
        tokenized_text = self.tokenizer.tokenize(marked_text)

        # Inputs longer than the position-embedding table would fail inside the
        # encoder: truncate and keep the closing [SEP] token.
        if len(tokenized_text) > self.max_seq_length:
            tokenized_text = tokenized_text[:self.max_seq_length - 1] + ["[SEP]"]

        # Map the token strings to their vocabulary indices.
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        # Mark each of the tokens as belonging to sentence "1".
        # NOTE(review): single-sentence BERT inputs conventionally use segment id 0;
        # kept as 1 here because changing it alters the embeddings -- confirm intent.
        segments_ids = [1] * len(tokenized_text)

        # Convert inputs to PyTorch tensors with a leading batch dimension of 1.
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        return tokens_tensor, segments_tensors

    def encode(self, sentences):
        """Embed sentences one at a time.

        Parameters
        ----------
        sentences : iterable of str
            Sentences to embed.

        Returns
        -------
        list of np.ndarray
            One 1-D array per sentence: the mean over tokens of the last
            encoder layer.
        """
        # Use this instance's own device. The previous code read a module-level
        # `bio_embedder` variable, which broke any instance bound to another name.
        device = next(self.parameters()).device

        results = []
        with torch.no_grad():
            for sentence in sentences:
                tokens_tensor, segments_tensors = self.preprocess_sentence(sentence)
                encoded_layers, _ = self.biobert_model(tokens_tensor.to(device),
                                                       segments_tensors.to(device))
                # `encoded_layers` holds one tensor per layer; take the last layer,
                # drop the batch dimension of 1 and average over the tokens.
                sentence_encoding = encoded_layers[-1][0].mean(dim=0)
                results.append(sentence_encoding.cpu().numpy())

        return results

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bio_embedder = SBioBERT(bert_config_path, checkpoint_path)
bio_embedder = bio_embedder.to(device)
# Smoke test: embed the sample sentences and display the shape of the stacked result.
bio_embeddings = bio_embedder.encode(sentences)
np.stack(bio_embeddings).shape

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of all BioBERT embedding components over the sample sentences
# (the trailing ';' suppresses the textual return value of plt.hist).
plt.hist(np.stack(bio_embeddings).ravel(), bins=50);

In [None]:
# Same histogram for the SBERT embeddings of the sample sentences, for visual comparison.
sbert_embeddings = sbert_embedder.encode(sentences)
plt.hist(np.stack(sbert_embeddings).ravel(), bins=50);

## Try Sentence Transformers

In [None]:
import sentence_transformers

In [None]:
# Use BERT for mapping tokens to embeddings
word_embedding_model = sentence_transformers.models.BERT("bert-base-cased")
word_embedding_model.bert = biobert_model
word_embedding_model.tokenizer = tokenizer
word_embedding_model.cls_token_id = word_embedding_model.tokenizer.convert_tokens_to_ids(
    [word_embedding_model.tokenizer.cls_token])[0]
word_embedding_model.sep_token_id = word_embedding_model.tokenizer.convert_tokens_to_ids(
    [word_embedding_model.tokenizer.sep_token])[0]

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = sentence_transformers.models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = sentence_transformers.SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
# Number of embeddings returned for the sample sentences (presumably one per sentence).
len(model.encode(sentences))

In [None]:
# Shape of the embedding returned for the first sample sentence.
model.encode(sentences)[0].shape

## Compute the Embeddings

In [None]:
def sent_preprocessing(sentences, 
                      synonyms_index):
    """Lower-case, tokenise and canonicalise synonyms in each sentence.

    Parameters
    ----------
    sentences : List[str]
        List of N strings.
    synonyms_index : dict
        Maps a synonym (key) to the reference term that replaces it (value).

    Returns
    -------
    List[str]
        N strings; each is the space-joined token sequence of the matching
        input sentence, with every token found in ``synonyms_index`` replaced
        by its reference term.
    """
    processed = []
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        # Tokens without an entry in the index are kept unchanged.
        canonical_tokens = [synonyms_index.get(token, token) for token in tokens]
        processed.append(" ".join(canonical_tokens))
    return processed

In [None]:
def embed_sentences(sentences,
                    embedding_name,
                    embedding_model):
    '''Embed a batch of sentences with the selected model.

    Parameters
    ----------
    sentences : Iterable[str]
        The N sentences to embed. Any iterable is accepted (callers pass the
        tuples produced by ``zip(*batch)``); it is converted to a list before
        being handed to the model.
    embedding_name : str
        Name of the embedding type. One of ('USE', 'SBERT', 'SBIOBERT', 'BSV').
    embedding_model : object
        Model matching ``embedding_name``: a callable whose result exposes
        ``.numpy()`` for 'USE', an object with ``encode(list)`` for 'SBERT'
        and 'SBIOBERT', an object with ``embed_sentences(list)`` for 'BSV'.

    Returns
    -------
    encoded_sentences : np.ndarray
        Numpy array of shape (N, n_dims).

    Raises
    ------
    NotImplementedError
        If ``embedding_name`` is not one of the supported names.
    '''
    # Hand every backend the same concrete type regardless of what the caller passed.
    sentences = list(sentences)

    if embedding_name == 'USE':
        return embedding_model(sentences).numpy()

    # Both BERT-based encoders return one vector per sentence from ``encode``.
    if embedding_name in ('SBERT', 'SBIOBERT'):
        return np.stack(embedding_model.encode(sentences), axis=0)

    if embedding_name == 'BSV':
        # This model expects text normalised by bsv_preprocess.
        preprocessed = [bsv_preprocess(x) for x in sentences]
        return embedding_model.embed_sentences(preprocessed)

    raise NotImplementedError(f'Embedding {repr(embedding_name)} not '
                              f'available!')

In [None]:
def create_sentence_embeddings(embedding_name,
                              embedding_model,
                              preprocessing=False,
                              batch_size=1_000,
                              output_dir=None):
    """Embed every tagged section of the database and save the result as ``.npz``.

    Rows are read through the module-level cursor ``curs`` with
    ``SELECT Id, Text FROM sections WHERE Tags IS NOT NULL`` and embedded
    batch by batch with ``embed_sentences``. The saved array has one row per
    section: the section id in the first column, followed by the embedding.

    Parameters
    ----------
    embedding_name : str
        Embedding type, forwarded to ``embed_sentences``; also used as the
        key inside the ``.npz`` archive and as the file-name prefix.
    embedding_model : object
        Model forwarded to ``embed_sentences``.
    preprocessing : bool, optional
        If True, sentences first go through ``sent_preprocessing`` with the
        module-level ``synonyms_index`` and the file name gets the
        ``_merged_synonyms`` suffix.
    batch_size : int, optional
        Number of rows fetched and embedded per iteration.
    output_dir : str or pathlib.Path, optional
        Directory the archive is written to. Defaults to the current working
        directory (the previous behaviour).

    Raises
    ------
    ValueError
        If the query returns no rows.
    """
    embedding_batches = []
    all_ids = []

    curs.execute('SELECT Id, Text FROM sections WHERE Tags IS NOT NULL')
    t0 = time()
    while True:
        batch = curs.fetchmany(batch_size)
        if not batch:
            break
        ids, sentences = zip(*batch)
        all_ids.extend(ids)

        if preprocessing:
            sentences = sent_preprocessing(sentences, synonyms_index)

        embedding_batches.append(embed_sentences(sentences,
                                                 embedding_name=embedding_name,
                                                 embedding_model=embedding_model))

        # Report the real number of rows handled so far; the last batch may be partial.
        print(f'Done processing {len(all_ids)} in {time()-t0:.1f} s.')

    if not embedding_batches:
        raise ValueError('No sections with tags found; nothing to embed.')

    print('Concatenate...')

    print(f'processing: {embedding_name}')
    # Concatenate
    embeddings = np.concatenate(embedding_batches, axis=0)
    id_column = np.array(all_ids).reshape((-1, 1))
    # NOTE: ids and embeddings share one array, so the ids take the common NumPy dtype.
    ids_and_embeddings = np.concatenate((id_column, embeddings), axis=1)

    print('Save...')

    if preprocessing:
        file_name = f"{embedding_name}_sentence_embeddings_merged_synonyms.npz"
    else:
        file_name = f"{embedding_name}_sentence_embeddings.npz"

    target = Path(file_name) if output_dir is None else Path(output_dir) / file_name
    np.savez_compressed(file=str(target), **{embedding_name: ids_and_embeddings})

In [None]:
%%time
# Compute and save the embeddings of every model with synonym replacement enabled.
# The two lists are paired positionally by zip(), so their order must match.
embedding_names = ['USE', 'SBERT', 'BSV', 'SBIOBERT']
embedding_models = [univsentenc_embedder, sbert_embedder, bsv_embedder, bio_embedder]
for embedding_name, embedding_model in zip(embedding_names, embedding_models):
    create_sentence_embeddings(embedding_name=embedding_name,
                               embedding_model=embedding_model,
                               preprocessing=True)

In [None]:
%%time
# Same run on the raw sentences (no synonym replacement).
# Relies on embedding_names / embedding_models defined in the previous cell.
for embedding_name, embedding_model in zip(embedding_names, embedding_models):
    create_sentence_embeddings(embedding_name=embedding_name,
                               embedding_model=embedding_model,
                               preprocessing=False)