### TODO

03.04.2020
- Use 'bert-large-nli-mean-tokens'.

06.04.2020
- Add a way to lower the ranking of some keywords (like 'diabetes').
- Explore how synonyms impact search in the sentence embedding space.

---

### Context

**Dataset**

Human-curated WHO papers + results of query* on PMC / bioRxiv / medRxiv.

**Query**

- "COVID-19"
- OR Coronavirus
- OR "Corona virus"
- OR "2019-nCoV"
- OR "SARS-CoV"
- OR "MERS-CoV"
- OR "Severe Acute Respiratory Syndrome"
- OR "Middle East Respiratory Syndrome"

---

In [None]:
import time
import sqlite3

import numpy as np
import pandas as pd
import tensorflow_hub as hub
import ipywidgets as widgets

import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class Color:
    """ANSI escape sequences for styling text printed to the output.

    Use as ``Color.X + text + Color.END``: the chosen sequence switches the
    style on and ``END`` resets it.
    """

    # Reset: turns off every attribute set before it.
    END = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Standard foreground color.
    DARKCYAN = '\033[36m'

    # Bright foreground colors.
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

In [None]:
%%time

# Load USE
# Version number interpolated into the TF Hub URL below.
use_version = 5
# `investigate` finds this model through its lowercase global name (`use`)
# and passes it to `embed_sentences` as the 'USE' model.
use = hub.load(f"https://tfhub.dev/google/universal-sentence-encoder-large/{use_version}")

In [None]:
%%time

# Load SBERT
# `investigate` finds this model through its lowercase global name (`sbert`)
# and passes it to `embed_sentences` as the 'SBERT' model.
sbert = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
%%time

# Load BioSentVec
# `investigate` finds this model through its lowercase global name (`bsv`)
# and passes it to `embed_sentences` as the 'BSV' model.
bsv = sent2vec.Sent2vecModel()
# The weights file is given as a relative path.
bsv.load_model('BioSentVec_PubMed_MIMICIII-bigram_d700.bin')

# English stop words that `bsv_preprocess` removes from the text.
bsv_stopwords = set(stopwords.words('english'))

def bsv_preprocess(text):
    """Normalize a sentence before it is embedded with BioSentVec.

    Pads '/', '.-', '.' and apostrophes with spaces, lowercases the text,
    tokenizes it with NLTK, drops punctuation and English stop words, and
    joins the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        Space-separated tokens that survived the filtering.
    """
    # The replacements are applied one after another, so their order matters:
    # '.-' has to be handled before the bare '.'.
    padded_separators = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for separator, padded in padded_separators:
        text = text.replace(separator, padded)

    kept = []
    for token in word_tokenize(text.lower()):
        # `punctuation` is a string, so this membership is a substring test.
        if token in punctuation or token in bsv_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
def embed_sentences(sentences, embedding_name, embedding_model):
    """Embed sentences with the model selected by ``embedding_name``.

    Parameters
    ----------
    sentences : list of str
        Sentences to embed.
    embedding_name : str
        One of 'USE', 'SBERT' or 'BSV'; selects how ``embedding_model`` is
        invoked.
    embedding_model
        The loaded model matching ``embedding_name``.

    Returns
    -------
    The embeddings as produced by the selected model:
    ``embedding_model(sentences).numpy()`` for 'USE', the encoded sentences
    stacked along axis 0 for 'SBERT', and the output of
    ``embedding_model.embed_sentences`` on the preprocessed sentences for
    'BSV'.

    Raises
    ------
    NotImplementedError
        If ``embedding_name`` is not one of the supported names.
    """
    if embedding_name == 'USE':
        tensor = embedding_model(sentences)
        return tensor.numpy()

    if embedding_name == 'SBERT':
        vectors = embedding_model.encode(sentences)
        return np.stack(vectors, axis=0)

    if embedding_name == 'BSV':
        # BioSentVec expects text normalized by `bsv_preprocess`.
        cleaned = list(map(bsv_preprocess, sentences))
        return embedding_model.embed_sentences(cleaned)

    raise NotImplementedError(f'Embedding {repr(embedding_name)} not '
                              f'available!')

In [None]:
# Names of the supported embedding models; these are the values that
# `embed_sentences` accepts as `embedding_name`.
EMBEDDINGS_NAMES = ['USE', 'SBERT', 'BSV']

In [None]:
# Precomputed sentence embeddings. `investigate` indexes this by model name
# and treats column 0 of each array as the section id and the remaining
# columns as the embedding vector.
embeddings = np.load('sentence_embeddings/sentence_embeddings.npz')

In [None]:
# Connection to the articles database; `investigate` queries its `sections`
# and `articles` tables.
db = sqlite3.connect('../cord19q/articles.sqlite')

In [None]:
def investigate():
    """Display an interactive widget for similarity search over sections.

    The widget lets the user pick an embedding model, the number of results
    and a free-text query. Clicking the button embeds the query, computes
    its cosine similarity with every precomputed embedding of the selected
    model, and prints the best-matching sections together with the title of
    the article they come from.

    Relies on the notebook globals `embeddings`, `db`, `embed_sentences`,
    `Color` and the loaded models `use`, `sbert` and `bsv`.
    """

    def on_clicked(b):
        """Run the search for the current widget values and print the hits."""
        # Read the widgets once so that every step uses the same values.
        model_name = wselect_model.value
        top_n = wselect_count.value

        wout.clear_output()
        with wout:
            # The models live in globals named after the lowercase option
            # ('USE' -> `use`). A plain lookup replaces eval() on the widget
            # value and reports a model that has not been loaded yet.
            embedding_model = globals().get(model_name.lower())
            if embedding_model is None:
                print(f'Model {model_name!r} is not loaded: run the cell '
                      f'defining `{model_name.lower()}` first.')
                return

            print()
            t0 = time.time()

            # The timings printed below are cumulative since `t0`.
            print('Embedding sentence...    ', end=' ')
            embedding_query = embed_sentences([wtext.value], model_name, embedding_model)
            print(f'{time.time()-t0:.2f} s.')

            print('Computing similarities...', end=' ')
            # For scalability, this part will be replaced with FAISS, as in
            # the other part of the code base.
            arr = embeddings[model_name]
            uids, embedding_docs = arr[:, 0], arr[:, 1:]
            # There is a single query, so take row 0. Unlike squeeze(), this
            # keeps a 1-D array even when only one document is stored.
            similarities = cosine_similarity(X=embedding_query, Y=embedding_docs)[0]
            print(f'{time.time()-t0:.2f} s.')

            print('Ranking documents...     ', end=' ')
            indices = np.argsort(-similarities)[:top_n]
            print(f'{time.time()-t0:.2f} s.')

            print()
            for i, (uid_, sim_) in enumerate(zip(uids[indices], similarities[indices])):
                # Bind a plain int rather than a NumPy scalar; the rank line
                # already assumes the ids are integral.
                section_id = int(uid_)
                print(f'Rank: {i} --- Section id: {section_id:>7,d} --- Similarity: {sim_:.2f}')

                section = db.execute('SELECT Article, Text FROM sections WHERE Id = ?', [section_id]).fetchone()
                if section is None:
                    # The embeddings and the database can be out of sync;
                    # report it instead of aborting the whole listing.
                    print(f'{Color.RED}Section not found in the database.{Color.END}')
                    print()
                    continue

                article_sha, text = section
                print(f'{Color.BLUE}{text}{Color.END}')

                article = db.execute('SELECT Title FROM articles WHERE Id = ?', [article_sha]).fetchone()
                # Both the article row and its title may be missing.
                article_title = article[0] if article is not None else None
                if article_title is None:
                    article_title = '(unknown title)'
                print(f'{Color.GREEN}From: {article_title}{Color.END}')
                print()

    wselect_model = widgets.ToggleButtons(
        options=['USE', 'SBERT', 'BSV'],
        description='Model:',
        tooltips=['Universal Sentence Encoder', 'Sentence BERT', 'BioSentVec'],
    )

    wselect_count = widgets.IntSlider(value=10, min=0, max=100, description='Top N:',)

    wtext = widgets.Textarea(layout=widgets.Layout(width='90%', height='80px'))

    button = widgets.Button(description='Investigate!')
    button.on_click(on_clicked)

    # All output of `on_clicked` is captured in this widget.
    wout = widgets.Output(layout={'border': '1px solid black'})

    display(widgets.VBox([wselect_model, wselect_count, wtext, button, wout]))

---

#### Investigations

- Inhibition of N-glycosylation (using N-glycosylation inhibitors or Lectins) is a potential therapeutic approach for COVID-19 therapy.
- Is high blood / plasma sugar level or hyperglycemia associated with higher susceptibility to coronavirus infection or higher virus replication?
- Glucose or sugar is a risk factor for COVID-19.
- Ketogenic diet is protective against COVID-19.

In [None]:
# Show the search widget; statements such as the ones listed above can be
# typed into its text area.
investigate()

In [None]:
# db.close()

In [None]:
# embeddings.close()