### TODO

03.04.2020
- Use 'bert-large-nli-mean-tokens'.

06.04.2020
- Add down-ranking of some keywords (like 'diabetes').
- Explore how synonyms impact search in the sentence-embedding space.

---

### Context

**Dataset**

Human-curated WHO papers + query* on PMC / bioRxiv / medRxiv.

**Query**

- "COVID-19"
- OR Coronavirus
- OR "Corona virus"
- OR "2019-nCoV"
- OR "SARS-CoV"
- OR "MERS-CoV"
- OR "Severe Acute Respiratory Syndrome"
- OR "Middle East Respiratory Syndrome"

---

## Imports

In [1]:
import hashlib
import html
import json
import sqlite3
import textwrap
import time
from pathlib import Path
from string import punctuation

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import ipywidgets as widgets
from IPython.core.display import HTML

import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

<img src="https://p1.hiclipart.com/preview/389/225/95/party-emoji-face-qualatex-smiley-party-guy-38-foil-balloon-emoticon-birthday-party-hat-burtonburton-northwest-greetingsballoon-world-gift-png-clipart.jpg" />

## Definitions

In [10]:
class Color:
    """ANSI escape sequences used to style text printed by this notebook.

    Prefix a string with one or more of these codes and append ``END``
    to switch the styling off again.
    """
    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every style/colour set before it
    END = '\033[0m'

In [11]:
# Print one sample line per style defined on Color (END is only the reset
# code, and dunder attributes are not styles).
for name in dir(Color):
    if name.startswith('__') or name == 'END':
        continue
    print(getattr(Color, name) + f"This is {name}" + Color.END)

[94mThis is BLUE[0m
[1mThis is BOLD[0m
[96mThis is CYAN[0m
[36mThis is DARKCYAN[0m
[92mThis is GREEN[0m
[95mThis is PURPLE[0m
[91mThis is RED[0m
[4mThis is UNDERLINE[0m
[93mThis is YELLOW[0m


In [12]:
# Codes can be concatenated to combine styles (here: bold + purple).
print(Color.BOLD + Color.PURPLE + "This is a test" + Color.END)

[1m[95mThis is a test[0m


## Load Data: SQL, JSON, Metadata

In [15]:
# Connection to the SQLite database whose `sections` and `articles` tables are
# queried further down.
# NOTE(review): the connection is never closed; the `db.close()` call in the
# last cells of the notebook is commented out.
db = sqlite3.connect('../cord19q/articles.sqlite')

In [2]:
# Root directory holding `metadata.csv` and the JSON files loaded below.
data_path = Path("../v6")

In [3]:
# Load the article metadata table.
# NOTE(review): the saved output of this cell shows the tail of a warning;
# presumably pandas' mixed-dtype warning. Confirm, and consider passing
# explicit dtypes for the affected columns.
df_metadata = pd.read_csv(data_path / "metadata.csv")
df_metadata.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,8q5ondtn,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(72)90077-4
1,pzfd0e50,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90355-5


Remove rows with no title and no SHA

In [4]:
# A row is unusable when it has neither a title nor a SHA.
mask_useless = df_metadata[['title', 'sha']].isna().all(axis=1)
df_metadata = df_metadata.loc[~mask_useless]

Generate surrogate SHAs (the SHA-1 hash of the title) for rows that have none

In [5]:
# Rows without a SHA receive a surrogate one: the SHA-1 hex digest of the
# UTF-8 encoded title.
mask = df_metadata['sha'].isna()
titles_without_sha = df_metadata.loc[mask, 'title']
df_metadata.loc[mask, 'sha'] = titles_without_sha.map(
    lambda title: hashlib.sha1(str(title).encode("utf-8")).hexdigest()
)
df_metadata.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,8q5ondtn,9656dc6b0a8f22905c6a7117e123d6ae754cc7d4,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(72)90077-4
1,pzfd0e50,f2e65cbf7654953918d9b88caa4d15b231fd23fd,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,False,custom_license,https://doi.org/10.1016/0002-8703(80)90355-5


Load JSON Files

In [6]:
# Parse every JSON document found (recursively) under `data_path`.
json_files = []

for json_path in data_path.rglob("*.json"):
    # The context manager closes each handle right after parsing instead of
    # leaving one open file per document; JSON text is read as UTF-8
    # regardless of the platform's default encoding.
    with json_path.open(encoding="utf-8") as json_handle:
        json_files.append(json.load(json_handle))

Fill in missing titles from the metadata

In [7]:
# Map every SHA to the first title recorded for it in the metadata. Building
# the lookup once replaces a scan of the whole `sha` column per JSON file.
titles_by_sha = (
    df_metadata
    .drop_duplicates(subset='sha', keep='first')
    .set_index('sha')['title']
    .to_dict()
)

for json_file in tqdm(json_files):
    if json_file['metadata']['title'] == '':
        new_title = titles_by_sha.get(json_file['paper_id'])
        # Only real strings are written back: an unknown SHA yields None and
        # a missing metadata title is NaN, and neither may end up as a title.
        if isinstance(new_title, str):
            json_file['metadata']['title'] = new_title

HBox(children=(FloatProgress(value=0.0, max=52097.0), HTML(value='')))




Create a dictionary of the JSON files keyed by their SHAs (`paper_id`)

In [8]:
# Index the parsed documents by their `paper_id` for direct lookup.
json_files_d = {doc['paper_id']: doc for doc in json_files}

In [16]:
def highlight_in_paragraph(uid, sentence, width=80, indent=0, color=Color.BOLD + Color.PURPLE):
    """Highlight a given sentence in the paragraph it belongs to.

    The article of the sentence is looked up in the `sections` table of
    `db`. The paragraph is then searched in the parsed JSON document of
    that article (title first, then abstract and body text chunks) or,
    when the article has no JSON document, in the title and abstract
    stored in `df_metadata`.

    Parameters
    ----------
    uid : int
        The identifier of the given sentence (`Id` in the `sections` table)
    sentence : str
        The sentence to highlight
    width : int
        The width to which to wrap the returned paragraph
    indent : int
        The indentation for the lines in the returned paragraph
    color : str
        The color to use for the highlight encoded as an ANSI
        escape code

    Returns
    -------
    formatted_paragraph : str
        The paragraph containing `sentence` with the sentence highlighted
        in color

    Raises
    ------
    ValueError
        If no section has the identifier `uid`, if the article is known
        neither to the JSON documents nor to the metadata, or if
        `sentence` does not occur in any candidate paragraph.
    """
    # Bind the identifier as a query parameter instead of formatting it into
    # the SQL text; int() also accepts the numeric ids read from the
    # embedding arrays.
    row = db.execute('SELECT Article FROM sections WHERE Id = ?', [int(uid)]).fetchone()
    if row is None:
        raise ValueError(f"Section not found: Id={uid}")
    sha = row[0]

    if sha in json_files_d:
        json_file = json_files_d[sha]
        # .get() tolerates documents that carry no 'abstract' entry.
        text_chunks = json_file.get('abstract', []) + json_file['body_text']
        candidates = [json_file['metadata']['title']]
        candidates.extend(text_chunk['text'] for text_chunk in text_chunks)
        not_found_message = "sentence not found in body_text and abstract"
    else:
        matching_rows = df_metadata[df_metadata['sha'] == sha]
        if matching_rows.empty:
            raise ValueError(f"SHA  not found:\nSHA={sha}\nsentence={sentence}")
        df_row = matching_rows.iloc[0]
        candidates = [df_row['title'], df_row['abstract']]
        not_found_message = "sentence not found in title and abstract"

    # Candidates that are not strings (e.g. NaN for a missing title or
    # abstract) are skipped rather than crashing the substring test.
    paragraph = next(
        (text for text in candidates if isinstance(text, str) and sentence in text),
        None,
    )
    if paragraph is None:
        raise ValueError(not_found_message)

    start = paragraph.index(sentence)
    end = start + len(sentence)
    highlighted_paragraph = ''.join([
        paragraph[:start],
        color + paragraph[start:end] + Color.END,
        paragraph[end:]
    ])
    # textwrap measures lines with len(), so the escape codes count toward
    # `width`; the indentation is added after wrapping.
    wrapped_lines = textwrap.wrap(highlighted_paragraph, width=width)
    formatted_paragraph = '\n'.join(' ' * indent + line for line in wrapped_lines)

    return formatted_paragraph

In [17]:
# Smoke test: highlight one known sentence inside its surrounding paragraph.
uid = 6797013
sentence = "Indeed, diabetes was seen as an important risk factor for Data about COVID-19 in patients with diabetes is limited at present."


print(highlight_in_paragraph(uid, sentence, width=80, indent=10))

          Individuals with diabetes are at risk of infections, especially influenza and
          pneumonia. This risk can be reduced, though not completely eliminated, by good
          glycaemic control. All people with diabetes (above 2 years of age) are
          recommended pneumococcal and annual influenza vaccinations. Not only this,
          patients with diabetes have a severe disease when infected with respiratory
          viruses. [1m[95mIndeed, diabetes was seen as an important risk factor for Data
          about COVID-19 in patients with diabetes is limited at present.[0m Diabetes was
          present in 42.3% of 26 fatalities due to COVID-19 in Wuhan, China [8] . In a
          study in 140 patients with COVID-19 in Wuhan, China, diabetes was not a risk
          factor for severe disease course [9] . However, another study in 150 patients
          (68 deaths and 82 recovered patients) in Wuhan showed that the number of co-
          morbidities to be a significan

## Load Models

In [18]:
%%time

# Load USE (Universal Sentence Encoder, "large" variant) from TF Hub.
# `use_version` selects the model version requested in the URL.
use_version = 5
use = hub.load(f"https://tfhub.dev/google/universal-sentence-encoder-large/{use_version}")

INFO:absl:Using /tmp/tfhub_modules to cache modules.


CPU times: user 17.5 s, sys: 3.43 s, total: 20.9 s
Wall time: 17.5 s


In [19]:
%%time

# Load SBERT (Sentence-BERT).
# NOTE: the TODO list at the top of the notebook suggests trying
# 'bert-large-nli-mean-tokens' instead of the base model used here.
sbert = SentenceTransformer('bert-base-nli-mean-tokens')

CPU times: user 4.78 s, sys: 815 ms, total: 5.59 s
Wall time: 3.58 s


In [20]:
# Fetch the stop-word lists used for the BioSentVec preprocessing below.
# NOTE(review): this import belongs in the imports cell at the top. Also,
# `word_tokenize` is used later and presumably needs NLTK tokenizer data
# that is not downloaded here; verify on a fresh environment.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
%%time

# Load BioSentVec
# The model file is looked up relative to the current working directory.
bsv = sent2vec.Sent2vecModel()
bsv.load_model('BioSentVec_PubMed_MIMICIII-bigram_d700.bin')

# English stop words that `bsv_preprocess` removes before embedding.
bsv_stopwords = set(stopwords.words('english'))

def bsv_preprocess(text):
    """Normalise a sentence before it is embedded with BioSentVec.

    Pads '/', '.-', '.' and the apostrophe with spaces (in that order),
    lower-cases the text, tokenizes it and drops every token that is
    found in `string.punctuation` or in `bsv_stopwords`.

    Parameters
    ----------
    text : str
        The raw sentence.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # The order of the substitutions matters: '.-' is padded before '.'.
    substitutions = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # `punctuation` is a string, so this is a substring test.
        if token in punctuation or token in bsv_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

CPU times: user 2.37 s, sys: 12 s, total: 14.4 s
Wall time: 14.3 s


## Preprocessing of Sentences

In [22]:
# Parse the synonyms file. Every non-empty line is split on '=': the first
# term is the reference term, the remaining ones are its synonyms. The whole
# line is lower-cased first, so all terms are lower case.
synonyms_dict = dict()
with open('../synonyms_list.txt', 'r', encoding='utf-8-sig') as synonyms_file:
    for raw_line in synonyms_file:
        line = raw_line.strip().lower()
        if not line:
            continue
        reference, *synonyms = [term.strip() for term in line.split('=')]
        synonyms_dict[reference] = synonyms

# Drop the 'sars' entry so its synonyms are not merged. pop() with a default
# keeps the cell working when the file does not list 'sars' at all.
synonyms_dict.pop('sars', None)

# Reverse lookup: synonym -> reference term.
synonyms_index = {synonym: reference
                  for reference, synonyms in synonyms_dict.items()
                  for synonym in synonyms}

def sent_preprocessing(sentences,
                      synonyms_index):
    """Lower-case and tokenize sentences, replacing synonyms by their reference term.

    The lookup in `synonyms_index` is done token by token; tokens that
    are not a key of the index are kept unchanged.

    Parameters
    ----------
    sentences : List[str]
        List of N strings.
    synonyms_index : dict
        Dictionary mapping a synonym term to the reference of this term.

    Returns
    -------
    List[str]
        The N preprocessed sentences, tokens joined by single spaces.
    """
    preprocessed = []
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        replaced = (synonyms_index.get(token, token) for token in tokens)
        preprocessed.append(" ".join(replaced))
    return preprocessed

In [23]:
def embed_sentences(sentences, embedding_name, embedding_model):
    """Embed sentences with the model selected by `embedding_name`.

    Parameters
    ----------
    sentences : List[str]
        The sentences to embed.
    embedding_name : str
        One of 'USE', 'SBERT' or 'BSV'.
    embedding_model
        The model object matching `embedding_name`: a callable whose
        result has a `.numpy()` method for 'USE', an object with
        `.encode()` for 'SBERT', an object with `.embed_sentences()`
        for 'BSV'.

    Returns
    -------
    The embeddings produced by the selected model (a NumPy array for
    'USE' and 'SBERT').

    Raises
    ------
    NotImplementedError
        If `embedding_name` is not one of the supported names.
    """
    if embedding_name == 'USE':
        use_output = embedding_model(sentences)
        return use_output.numpy()

    if embedding_name == 'SBERT':
        sbert_vectors = embedding_model.encode(sentences)
        return np.stack(sbert_vectors, axis=0)

    if embedding_name == 'BSV':
        # BioSentVec expects its own text normalisation.
        cleaned = [bsv_preprocess(sentence) for sentence in sentences]
        return embedding_model.embed_sentences(cleaned)

    raise NotImplementedError(f'Embedding {repr(embedding_name)} not '
                              f'available!')

In [24]:
# Names of the supported embedding models, i.e. the values `embed_sentences`
# accepts as `embedding_name`.
EMBEDDINGS_NAMES = ['USE', 'SBERT', 'BSV']

In [25]:
# Precomputed sentence embeddings, indexed by model name in `investigate`.
# Each array is sliced there as: column 0 = section id, remaining columns =
# embedding vector.
embeddings = np.load('sentence_embeddings/sentence_embeddings.npz')

In [26]:
# Same layout as `embeddings`, used by `investigate` when the "merge synonyms"
# box is ticked; presumably computed on synonym-merged sentences (see file name).
embeddings_syns = np.load('sentence_embeddings/sentence_embeddings_merged_synonyms.npz')

## Actual Widget

In [27]:
def investigate():
    """Build and display the interactive sentence-search widget.

    The widget lets the user pick an embedding model, the number of
    results, whether synonyms are merged, a query sentence and an optional
    exclusion sentence with its strength. Clicking the button ranks the
    precomputed sentence embeddings by

        (1 - alpha) * cos(x, query) - alpha * cos(x, exclusion)

    with ``alpha = slider / 200``, and prints the top matches (optionally
    inside their whole paragraph) below a link to the article reference.

    Relies on the notebook globals `use`, `sbert`, `bsv`, `embeddings`,
    `embeddings_syns`, `synonyms_index` and `db`.
    """

    def on_clicked(b):
        """Run the search with the current widget values and show the results."""
        wout.clear_output()
        with wout:
            print()
            t0 = time.time()

            model_name = wselect_model.value
            merge_synonyms = wcheck.value
            # The models live in notebook globals named after the lower-cased
            # option ('USE' -> `use`, ...); a namespace lookup replaces eval().
            embedding_model = globals()[model_name.lower()]

            if merge_synonyms:
                query_value = sent_preprocessing([wtext_query.value], synonyms_index)
                exclu_value = sent_preprocessing([wtext_exclusion.value], synonyms_index)
            else:
                query_value = [wtext_query.value]
                exclu_value = [wtext_exclusion.value]
            # A whitespace-only exclusion counts as "no exclusion".
            has_exclusion = bool(exclu_value[0].strip())

            print('Embedding query...    ', end=' ')
            embedding_query = embed_sentences(query_value, model_name, embedding_model)
            print(f'{time.time()-t0:.2f} s.')

            if has_exclusion:
                print('Embedding exclusion...    ', end=' ')
                embedding_exclu = embed_sentences(exclu_value, model_name, embedding_model)
                print(f'{time.time()-t0:.2f} s.')

            print('Computing similarities...', end=' ')
            # For scalability, we will replace this part with FAISS, as in the other part of the code base.
            arr = (embeddings_syns if merge_synonyms else embeddings)[model_name]
            uids, embedding_docs = arr[:, 0], arr[:, 1:]
            # One query row -> take row 0. Unlike squeeze(), this keeps a 1-D
            # array even when there is a single document.
            similarities_query = cosine_similarity(X=embedding_query, Y=embedding_docs)[0]

            if has_exclusion:
                similarities_exclu = cosine_similarity(X=embedding_exclu, Y=embedding_docs)[0]
            else:
                similarities_exclu = np.zeros_like(similarities_query)

            # now: maximize L = (1-a) * cos(x, query) - a * cos(x, exclusions)
            alpha = exclusion_floatslider.value / 100 / 2
            similarities = (1 - alpha) * similarities_query - alpha * similarities_exclu

            print(f'{time.time()-t0:.2f} s.')

            print('Ranking documents...     ', end=' ')
            indices = np.argsort(-similarities)[:wselect_count.value]
            print(f'{time.time()-t0:.2f} s.')

            print(Color.RED + f'\nInvestigating: {query_value[0]}\n' + Color.END)

            width = 80
            for i, uid_ in enumerate(uids[indices]):
                # Ids come out of a numeric array; bind them as plain ints.
                section_id = int(uid_)
                article_sha, text = db.execute(
                    'SELECT Article, Text FROM sections WHERE Id = ?', [section_id]).fetchall()[0]
                # Only the reference is displayed, so only it is fetched; a
                # missing article row or NULL reference gives an empty link.
                ref_row = db.execute(
                    'SELECT Reference FROM articles WHERE Id = ?', [article_sha]).fetchone()
                ref = ref_row[0] if ref_row and ref_row[0] else ''

                if w_check_whole_paragraph.value:
                    formatted_output = highlight_in_paragraph(section_id, text, width=width, indent=2)
                else:
                    formatted_output = textwrap.fill(text, width=width)
                # The reference comes from the database: escape it before
                # placing it inside an HTML attribute.
                display(HTML(f'<a href="{html.escape(ref, quote=True)}">&nbsp;[{i:2d}]</a>'))
                print(formatted_output)
                print()

    wselect_model = widgets.ToggleButtons(
        options=[ 'USE', 'SBERT', 'BSV'],
        description='Model:',
        tooltips=['Universal Sentence Encoder', 'Sentence BERT', 'BioSentVec'],
    )

    wselect_count = widgets.IntSlider(value=10, min=0, max=100, description='Top N:',)

    wcheck = widgets.Checkbox(value=False, description='merge synonyms')
    w_check_whole_paragraph = widgets.Checkbox(value=True, description='show whole paragraph')

    wtext_query = widgets.Textarea(layout=widgets.Layout(width='90%', height='80px'),
                                   value='Glucose is a risk factor for COVID-19.',
                                   description='Query: ')
    wtext_exclusion = widgets.Textarea(layout=widgets.Layout(width='90%', height='80px'),
                                       value='',
                                       description='Exclusion: ')
    # Integer percentage slider; its value represents alpha * 2 * 100.
    exclusion_floatslider = widgets.IntSlider(min=0, max=100,
                                                value=0,
                                                step=5,
                                                description='Exclusion strength [%]',
                                                style = {'description_width': 'initial'})

    button = widgets.Button(description='Investigate!')
    button.on_click(on_clicked)

    wout = widgets.Output(layout={'border': '1px solid black'})

    display(widgets.VBox([wselect_model,
                          wselect_count,
                          wcheck,
                          w_check_whole_paragraph,
                          wtext_query,
                          wtext_exclusion,
                          exclusion_floatslider,
                          button,
                          wout]))

In [27]:
# Render the search widget; results appear in its output area on each click.
investigate()

VBox(children=(ToggleButtons(description='Model:', options=('USE', 'SBERT', 'BSV'), tooltips=('Universal Sente…

---

#### Example Queries

1. Inhibition of N-glycosylation (using N-glycosylation inhibitors or Lectins) is a potential therapeutic approach for COVID-19 therapy.
1. Is high blood / plasma sugar level or hyperglycemia associated with higher susceptibility to coronavirus infection or higher virus replication?
1. Glucose or sugar is a risk factor for COVID-19.
1. Ketogenic diet is protective against COVID-19.

## Sandbox

In [22]:
# Inspect the synonyms registered for the reference term 'sugar'.
synonyms_dict['sugar']

['glucose', 'carbohydrates']

In [23]:
# Inspect the synonyms of a multi-word reference term.
# NOTE(review): `sent_preprocessing` looks synonyms up token by token, so
# multi-word synonyms such as these presumably never match; confirm.
synonyms_dict['risk factor']

['prediction factor', 'susceptibility factor', 'severity']

In [34]:
# Experiment: highlight a word with an HTML background colour instead of ANSI codes.
HTML('And everyone knows <font style="background-color: #992200"> coronavirus</font> is dangerous.')

In [None]:
# db.close()

In [None]:
# embeddings.close()

In [None]:
# embeddings_syns.close()