<!---
Blue Brain Search is a text mining toolbox focused on scientific use cases.

Copyright (C) 2020  Blue Brain Project, EPFL.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
-->

# Introduction

**Context**

We want to improve the way our models find hypotheses in CORD-19 papers.
We need to build a benchmark dataset of sentence pairs adapted to CORD-19 and the task.
This benchmark dataset will help to build models which better capture the semantic similarity.

**Solution**

A notebook helping to investigate and build such a dataset of sentence pairs.

This notebook follows these major steps:
1. Select sentences with some keywords.
2. Randomly sample a subset of N of them.
3. Pair the subset with the most similar ones.
4. Compute a word-based similarity for each pair.
5. Print & Export pairs in a human-readable format.

**Getting Started**

This notebook requires that:

- a database of sentences has been created.
- sentence embeddings have been computed.

The logic in the notebook is agnostic of the dataset and the model.
Any dataset of sentences and any sentence embedding model could be used.

However, for demonstration purposes, we reuse below the dataset and the model from the `README`.
This means that we reuse the values of:

- `DATABASE_URL` from [Create the database](https://github.com/BlueBrain/Search#initialize-the-database-server),
- `EMBEDDING_MODEL` and `BBS_SEARCH_EMBEDDINGS_PATH` from [Compute the sentence embeddings](https://github.com/BlueBrain/Search#compute-the-sentence-embeddings).

# Configuration

In [None]:
import os

# --- Plain constants -------------------------------------------------------

# How many sentence pairs to sample.
N = 100

# Seed making the random sampling reproducible.
SEED = 9173

# Upper bound on the number of sentences to consider.
# Handy for experimenting or debugging.
# When None, every sentence is loaded.
LIMIT = None

# Path to a file holding a dump of all the sentences.
# Loading from a dump is faster than loading from the DATABASE (see below).
# When None, the sentences are loaded from the DATABASE.
# Example: 'sentences.parquet'.
DUMP = None

# --- Values read from the environment --------------------------------------

# Name of the sentence embedding model guiding the pairing.
# It must be a key of the EMBEDDINGS file (see below).
# Example: 'Sent2Vec'.
MODEL = os.environ.get('EMBEDDING_MODEL')
print(f"MODEL='{MODEL}'")

# SQLAlchemy database URL.
# Example: '<dialect>+<driver>://<username>:<password>@<host>:<port>/<database>'.
DATABASE = f"mysql+pymysql://guest:guest@{os.getenv('DATABASE_URL')}"
print(f"DATABASE='{DATABASE}'")

# Location of the pre-computed sentence embeddings.
# They must be indexed on the sentence_id from the DATABASE (see above).
# Example: './embeddings.h5'.
EMBEDDINGS = os.environ.get('BBS_SEARCH_EMBEDDINGS_PATH')
print(f"EMBEDDINGS='{EMBEDDINGS}'")

# Imports

In [None]:
import re
from pathlib import Path
from datetime import datetime

In [None]:
import torch
import spacy
import sqlalchemy
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from bluesearch.utils import H5

# Load sentences

In [None]:
if DUMP:
    # Fast path: read the sentences from a local Parquet dump.
    print('<loading> from dump')
    sentences = pd.read_parquet(DUMP)
    if LIMIT:
        sentences = sentences[:LIMIT]
else:
    # Slow path: query the sentences from the SQL database.
    print('<loading> from database')
    engine = sqlalchemy.create_engine(DATABASE)
    statement = 'SELECT sentence_id, text FROM sentences'
    if LIMIT:
        statement += f' LIMIT {LIMIT}'
    # Index the sentences on their identifier.
    sentences = pd.read_sql(statement, engine, index_col='sentence_id')

# Count rows rather than cells: `DataFrame.size` is rows x columns and would
# overcount as soon as the frame carries more than the `text` column.
scount = len(sentences)
print(f'{scount:,} sentences')

# Timings for 20.5 million sentences:
#  - Parquet: 25 secs
#  - MySQL: 5 mins
# 20,510,932 sentences

In [None]:
# Create a dump of the sentences.
# sentences.to_parquet('sentences.parquet', index=True)

# Takes 20 secs for 20.5 million sentences.

# Deduplicate sentences

In [None]:
# Remove the rows whose column values duplicate those of another row.
sentences = sentences.drop_duplicates()

# Count rows rather than cells: `DataFrame.size` is rows x columns and would
# overcount as soon as the frame carries more than the `text` column.
dcount = len(sentences)
print(f'{dcount:,} sentences (- {scount-dcount:,} duplicates)')

# Takes 30 secs for 20.5 million sentences.
# 19,131,302 sentences (- 1,379,630 duplicates)

# Select sentences

In [None]:
# All keywords in bold from BBS Ontology v0.3 on 17.09.2020.
# keywords = {
#     'pathogens', 'cardiac injury', 'cardiovascular disease', 'sars',
#     'acute respiratory distress syndrome', 'gas exchange', 'inflammation',
#     'sars-cov-2 infection', 'viral entry', 'glucose metabolism', 'golgi', 'human',
#     'dry cough', 'mammals', 'cardiovascular injury', 'glycation', 'endoplasmic reticulum',
#     'carbohydrates', 'innate immunity', 'igt', 'polysaccharide', 'hypertension',
#     'thrombotic events', 'neutrophils', 'dc cells', 'obesity', 'congested cough',
#     'influenzavirus', 'viral replication', 'septic shock', 'macrophages', 'cvd', 'lactate',
#     'myalgia', 'chest pain', 'oxygen', 'mucociliary clearance', 'high blood sugar level',
#     'respiratory failure', 'fever', 'systemic disorder', 'flu', 'influenzae',
#     'hyperglycemia', 'impaired glucose tolerance', 'iron',
#     'severe acute respiratory syndrome', 'immunity', 'host defense',
#     'respiratory viral infection', 'multi-organs failure', 'blood clot',
#     'viral infection', 'hypoxia', 'glucose homeostasis', 'vasoconstriction', 'covid-19',
#     'sars-cov-2', 'fatigue', 'multiple organ failure', 'productive cough',
#     'adaptive immunity', 'atp', 'bacteria', 'nk cells', 'coagulation', 'ards', 'diarrhea',
#     'cytokine storm', 'dendritic cells', 'pneumonia', 'thrombosis', 'phagocytosis',
#     'alveolar macrophages', 'glucose', 'clearance', 'epithelial cells', 'glucose uptake',
#     'coronavirus', 'plasma membrane', 'lymphocytes', 'oxidative stress', 'glycans',
#     'glycolysis', 'pulmonary embolism', 'glycosylation', 'viruses',
#     'viral respiratory tract infection', 'diabetes', 'life-cycle', 'mammalia',
#     'antimicrobials activity', 'ketones', 'immune system', 'pathogen'
# }

# Pierre-Alexandre's keywords for glucose AND (covid-19 OR sars-cov-2).
keywords = {
    # COVID-19
    'covid-19', 'covid', 'cytokine', 'cytokines', 'hypercytokinemia',
    # SARS-CoV-2
    'sars-cov-2', '2019-ncov', 'hcov-19', 'coronavirus',
    # Glucose
    'glucose', 'd-glucose', 'l-glucose', 'sugar', 'sugars',
    'carbohydrate', 'carbohydrates', 'monosaccharide', 'monosaccharides',
    'polysaccharide', 'polysaccharides', 'glycan', 'glycans', 'glucan', 'glucans', 'glycogen',
    'glycation', 'glycogenolysis', 'glycosylation', 'glycolysis', 'glycosidic',
    'hyperglycemia', 'diabetes', 'diabetic', 'diabetics', 'insulin', 'obesity', 'obese',
}

# Punctuation removed from both ends of a token before the keyword lookup.
# Inner characters (e.g. the hyphen of 'covid-19') are left untouched.
_EDGE_PUNCTUATION = '.,;:!?()[]{}"\''

# A sentence must start with a capitalized word followed by a space.
_STARTS_CAPITALIZED = re.compile(r'[A-Z][a-z]+ ')

def ok(text):
    """Check if a sentence should be kept according to its content.

    A sentence is kept when it satisfies all of the following:

    - Its length is between 100 and 300 characters. Such sentences are long
      enough to be meaningful and short enough for humans to evaluate
      semantic similarity.
    - It starts with a capitalized word. Sentences which don't are incorrect
      sentences (extraction issue, tokenization error, ...).
    - It contains at least one of the `keywords`. Such sentences are more
      interesting for training / evaluating a model for a domain.

    The keyword lookup is case-insensitive and works on whitespace-separated
    tokens. Punctuation is stripped from both ends of each token, so that
    a keyword followed by a comma or ending the sentence is still found.

    Parameters
    ----------
    text : str
        The sentence content.

    Returns
    -------
    bool
        True if the sentence should be kept. False otherwise.
    """
    # The cheap checks come first: most sentences are rejected here, which
    # avoids tokenizing them for the keyword lookup.
    if not 100 <= len(text) <= 300:
        return False
    if _STARTS_CAPITALIZED.match(text) is None:
        return False
    tokens = {x.strip(_EDGE_PUNCTUATION).lower() for x in text.split()}
    return not tokens.isdisjoint(keywords)

# Keep only the sentences accepted by `ok`.
# The copy makes `filtered` an independent frame.
filtered = sentences[sentences.text.map(ok)].copy()

# Count rows rather than cells: `DataFrame.size` is rows x columns and would
# overcount as soon as the frame carries more than the `text` column.
fcount = len(filtered)
print(f'{fcount:,} sentences ({scount-fcount:,} not selected)')

# Takes 2 mins 45 secs for 20.5 million sentences and 40 or 100 keywords.
# 631,854 sentences (19,879,078 not selected)

In [None]:
# Mapping between the sentence ID and the index in the embeddings.
# The full embeddings are indexed on the sentence ID.
# This is no longer the case when loading a subset of the embeddings:
# the loaded subset is indexed from 0 to the number of selected sentences - 1.
# The length is taken from the frame itself so that the column always has
# one value per row.
filtered['mapping'] = np.arange(len(filtered))

# Sample sentences

In [None]:
# Draw N of the selected sentences at random; SEED makes the draw reproducible.
sampled = filtered.sample(n=N, random_state=SEED)

# Load embeddings

In [None]:
def load_embeddings(model, sentence_ids):
    """Read and normalize the pre-computed sentence embeddings of a model.

    The embeddings are read from the file located at `EMBEDDINGS`. Each
    embedding is then divided by its norm. Embeddings with a norm of zero
    are left as they are.

    Parameters
    ----------
    model : str
        The sentence embedding model.
    sentence_ids : np.ndarray
        The identifiers of the sentences.

    Returns
    -------
    torch.Tensor
        The pre-computed embeddings of the specified sentences.
    """
    vectors = H5.load(Path(EMBEDDINGS), model, indices=sentence_ids)
    tensor = torch.from_numpy(vectors)
    lengths = tensor.norm(dim=1, keepdim=True)
    # Divide all-zero rows by 1 instead of 0 to avoid producing NaNs.
    lengths[lengths == 0] = 1
    tensor /= lengths
    return tensor

# Load the embeddings of the selected sentences only.
embeddings = load_embeddings(MODEL, filtered.index.values)

# Number of loaded embeddings, i.e. the length of the first dimension.
ecount = embeddings.shape[0]
print(f'{ecount:,} embeddings (same as selected sentences? {ecount == fcount})')

# Takes 25 secs to select 630 thousand embeddings from 20.5 million in total.
# 631,854 embeddings (same as selected sentences? True)

# Pair sentences

In [None]:
# Load the spaCy pipeline named 'en_core_sci_lg'.
# It is passed below to `words_similarity`.
nlp = spacy.load('en_core_sci_lg')

# Takes 15 secs.

In [None]:
def pair(mapping, embeddings, sentences):
    """Match a sentence with a given one in a meaningful way.

    The matched sentence is the one, other than the given sentence itself,
    whose embedding has the highest dot product with the embedding of the
    given sentence. When the embeddings are normalized to unit length, this
    dot product is the cosine similarity.

    Parameters
    ----------
    mapping : int
        The index in `embeddings` (i.e. the value of `sentences.mapping`)
        of the sentence to match with another one.
    embeddings : torch.Tensor
        The pre-computed embeddings of the sentences in `sentences`,
        one row per sentence.
    sentences : pd.DataFrame
        The sentences in which to match the sentence specified by `mapping`.
        It needs a `text` column and a `mapping` column giving, for each
        sentence, the index of its row in `embeddings`.

    Returns
    -------
    sentence_id : int
        The identifier of the matched sentence.
    sentence_text : str
        The content of the matched sentence.
    vectors_similarity : float
        The similarity between the specified sentence and the matched one.

    Raises
    ------
    ValueError
        If `embeddings` holds fewer than two sentences.
    """
    if embeddings.shape[0] < 2:
        raise ValueError('At least two sentences are needed to form a pair.')
    embedding = embeddings[mapping]
    # Dot product of the given sentence with every sentence (a new tensor).
    similarities = torch.nn.functional.linear(embedding, embeddings)
    # Exclude the given sentence explicitly. Relying on it being ranked first
    # is not safe: with tied similarities (e.g. identical or all-zero
    # embeddings) it could be ranked second and be paired with itself.
    similarities[mapping] = float('-inf')
    best = similarities.max(dim=0)
    sim, idx = best.values.item(), best.indices.item()
    # Retrieve paired sentence ID and content.
    row = sentences.loc[sentences.mapping == idx]
    return row.index.item(), row.text.item(), sim

def words_similarity(text1, text2, nlp):
    """Compute a word-based similarity between sentences.

    Each sentence is reduced to the set of the lemmas of its tokens,
    punctuation excluded. For each sentence, the dissimilarity is the
    fraction of its lemmas which are absent from the other sentence.
    The similarity is 1 minus the smaller of the two dissimilarities.
    It is then 1 when the lemmas of one sentence are all found in the other
    one, and 0 when the two sentences share no lemma.

    Parameters
    ----------
    text1 : str
        The content of the first sentence.
    text2 : str
        The content of the second sentence.
    nlp : spacy.lang.en.English
        The spaCy model to use for tokenization and lemmatization.

    Returns
    -------
    float
        The word-based similarity between the two sentences.
        It is 0.0 when one of the sentences has no word at all.
    """
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    set1 = {x.lemma_ for x in doc1 if not x.is_punct}
    set2 = {x.lemma_ for x in doc2 if not x.is_punct}
    # A sentence without any word (empty or punctuation only) shares no word
    # with the other one. Returning early also avoids a division by zero.
    if not set1 or not set2:
        return 0.0
    dissimilarity12 = len(set1 - set2) / len(set1)
    dissimilarity21 = len(set2 - set1) / len(set2)
    return 1 - min(dissimilarity12, dissimilarity21)

cols = ['sentence_id_1', 'sentence_id_2', 'sentence_text_1', 'sentence_text_2',
        'vectors_similarity', 'words_similarity']
rows = []

for x in tqdm(sampled.itertuples(), total=N):
    # The sampled sentence: its identifier and its content.
    sid1 = x.Index
    stext1 = x.text

    # The partner is the selected sentence with the highest similarity
    # between its embedding and the embedding of the sampled sentence.
    sid2, stext2, vsimilarity = pair(x.mapping, embeddings, filtered)

    # Word-based similarity of the pair:
    #  - 1 means the two sentences use exactly the same wording,
    #  - 0 means the two sentences use a completely different wording.
    wsimilarity = words_similarity(stext1, stext2, nlp)

    rows.append((sid1, sid2, stext1, stext2, vsimilarity, wsimilarity))

# One row per pair, the most similar pairs (according to the embeddings) first.
pairs = pd.DataFrame(rows, columns=cols)
pairs = pairs.sort_values('vectors_similarity', ascending=False)

# Takes 10 secs for 100 pairs amongst 630 thousand selected embeddings / sentences.

In [None]:
# Preview the first 3 pairs.
pairs.head(3)

In [None]:
def format_results(pairs):
    """Render sentence pairs as text meant to be read by humans.

    Each pair is rendered as a header line (position of the pair, sentence
    identifiers, similarities rounded to 2 decimals) followed by the two
    sentences stripped of surrounding whitespace. Lines made of a single
    dash separate the header and the sentences.

    Parameters
    ----------
    pairs : pd.DataFrame
        The sentence pairs.

    Returns
    -------
    str
        The sentences pairs formatted in a human-readable format.
    """
    entries = []
    for i, x in enumerate(pairs.itertuples()):
        entries.append(
            f'pair: {i}  id_1: {x.sentence_id_1}  id_2: {x.sentence_id_2}  '
            f'vectors_sim: {x.vectors_similarity:.2f}  words_sim: {x.words_similarity:.2f}\n'
            f'-\n'
            f'{x.sentence_text_1.strip()}\n'
            f'-\n'
            f'{x.sentence_text_2.strip()}\n'
        )
    # Entries are separated by two empty lines.
    return '\n\n'.join(entries)

# Print the first 3 pairs in the human-readable format.
print(format_results(pairs[:3]))

# Export sentence pairs

In [None]:
def write_results_txt(pairs, n, directory):
    """Save sentence pairs on disk as a human-readable text file.

    The file is named after `n` and the current date and time, with a
    resolution of one minute. Its name is printed once written.

    Parameters
    ----------
    pairs : pd.DataFrame
        The sentence pairs.
    n : int
        The number of sentence pairs.
    directory : str
        The directory where to write the sentence pairs.
    """
    stamp = datetime.now().strftime("%Y-%m-%d_%Hh%M")
    filename = f'pairs_n{n}_{stamp}.txt'
    destination = Path(directory) / filename
    # UTF-8 is necessary as non ASCII characters are present.
    destination.write_text(format_results(pairs), encoding='utf-8')
    print(f'<wrote> {filename}')

# Export all the pairs into the current directory.
write_results_txt(pairs, N, '.')

# Experiment

In [None]:
# Keep only the pairs for which both similarities are at most 0.9.
pairs_exp_1 = pairs[(pairs.words_similarity <= 0.9) & (pairs.vectors_similarity <= 0.9)]

In [None]:
# Number of pairs kept for the experiment.
n_exp_1 = len(pairs_exp_1)

In [None]:
# Export the pairs of the experiment into the current directory.
write_results_txt(pairs_exp_1, n_exp_1, '.')