# Introduction

**Context**

We want to improve the way our models find hypotheses in CORD-19 papers.
We need to build a benchmark dataset of sentence pairs adapted to CORD-19 and the task.
This benchmark dataset will help to build models which better capture the semantic similarity.

**Solution**

A notebook helping to investigate and build such a dataset of sentence pairs.

This notebook follows these major steps:
1. Select sentences with some keywords.
2. Randomly sample a subset of N of them.
3. Pair the subset with the most similar ones.
4. Compute a word-based similarity for each pair.
5. Print & Export pairs in a human-readable format.

# Configuration

In [None]:
# How many sentence pairs to build (size of the random sample).
N = 20

# Upper bound on the number of sentences to load.
# Handy for experimenting or debugging.
# Set to None to load every sentence.
LIMIT = 10000

# Parquet dump holding all the sentences; loading it is faster than querying the database.
# Reading it requires pyarrow (pip install pyarrow).
# Set to None to load the sentences from DATABASE (defined below) instead.
DUMP = 'sentences-cord19_v47-2020-10-01.parquet'

# Sentence embedding model guiding the pairing.
# Must be a key of the EMBEDDINGS file (defined below), e.g. 'Sent2Vec', 'BSV'.
MODEL = 'Sent2Vec'

# Seed making the random sampling reproducible.
SEED = 9173

# SQLAlchemy URL of the database storing the sentences.
DATABASE = 'mysql+pymysql://guest:guest@dgx1.bbp.epfl.ch:8853/cord19_v47'

# Location of the pre-computed sentence embeddings.
# They must be indexed on the sentence_id used in DATABASE (defined above).
EMBEDDINGS = '/raid/sync/proj115/bbs_data/cord19_v47/embeddings/embeddings.h5'

# Imports

In [None]:
import re
from pathlib import Path
from typing import Tuple
from datetime import datetime

In [None]:
import torch
import spacy
import sqlalchemy
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from bbsearch.utils import H5

# Load sentences

In [None]:
# Load the sentences from the parquet dump when one is configured,
# otherwise query them from the database.
if DUMP:
    # Takes 25 secs.
    print('<loading> from dump')
    sentences = pd.read_parquet(DUMP)
    if LIMIT:
        # Keep only the first LIMIT rows of the dump.
        sentences = sentences[:LIMIT]
else:
    # Takes 5 mins for 20.5 million sentences.
    print('<loading> from database')
    engine = sqlalchemy.create_engine(DATABASE)
    statement = 'SELECT sentence_id, text FROM sentences'
    if LIMIT:
        statement += f' LIMIT {LIMIT}'
    # Index the frame on sentence_id so that 'text' is its only column.
    sentences = pd.read_sql(statement, engine, index_col='sentence_id')

# Count rows with len(): DataFrame.size is rows x columns, which only equals
# the number of sentences while the frame has exactly one column.
scount = len(sentences)
print(f'{scount:,} sentences')
# 20,510,932 sentences

In [None]:
# Create a dump of the sentences.
# Takes 20 secs for 20.5 million sentences.
# sentences.to_parquet('sentences-cord19_v47-2020-10-01.parquet', index=True)

# Deduplicate sentences

In [None]:
# Drop the rows whose column values repeat an earlier row (the index is not compared).
# Takes 26 secs for 20.5 million sentences.
sentences = sentences.drop_duplicates()

# Count rows with len(): DataFrame.size is rows x columns, which only equals
# the number of sentences while the frame has exactly one column.
dcount = len(sentences)
print(f'{dcount:,} sentences (- {scount-dcount:,} duplicates)')
# 19,131,302 sentences (- 1,379,630 duplicates)

# Select sentences

In [None]:
%%time

# All keywords in bold from BBS Ontology v0.3 on 17.09.2020.
# Every keyword is written in lowercase. The set mixes single words
# (e.g. 'fever'), multi-word phrases (e.g. 'cardiac injury') and hyphenated
# terms (e.g. 'sars-cov-2').
keywords = {'pathogens', 'cardiac injury', 'cardiovascular disease', 'sars',
            'acute respiratory distress syndrome', 'gas exchange', 'inflammation',
            'sars-cov-2 infection', 'viral entry', 'glucose metabolism', 'golgi', 'human',
            'dry cough', 'mammals', 'cardiovascular injury', 'glycation', 'endoplasmic reticulum',
            'carbohydrates', 'innate immunity', 'igt', 'polysaccharide', 'hypertension',
            'thrombotic events', 'neutrophils', 'dc cells', 'obesity', 'congested cough',
            'influenzavirus', 'viral replication', 'septic shock', 'macrophages', 'cvd', 'lactate',
            'myalgia', 'chest pain', 'oxygen', 'mucociliary clearance', 'high blood sugar level',
            'respiratory failure', 'fever', 'systemic disorder', 'flu', 'influenzae',
            'hyperglycemia', 'impaired glucose tolerance', 'iron',
            'severe acute respiratory syndrome', 'immunity', 'host defense',
            'respiratory viral infection', 'multi-organs failure', 'blood clot',
            'viral infection', 'hypoxia', 'glucose homeostasis', 'vasoconstriction', 'covid-19',
            'sars-cov-2', 'fatigue', 'multiple organ failure', 'productive cough',
            'adaptive immunity', 'atp', 'bacteria', 'nk cells', 'coagulation', 'ards', 'diarrhea',
            'cytokine storm', 'dendritic cells', 'pneumonia', 'thrombosis', 'phagocytosis',
            'alveolar macrophages', 'glucose', 'clearance', 'epithelial cells', 'glucose uptake',
            'coronavirus', 'plasma membrane', 'lymphocytes', 'oxidative stress', 'glycans',
            'glycolysis', 'pulmonary embolism', 'glycosylation', 'viruses',
            'viral respiratory tract infection', 'diabetes', 'life-cycle', 'mammalia',
            'antimicrobials activity', 'ketones', 'immune system', 'pathogen'}

# Keyword-matching pattern, compiled by ok() the first time it is needed.
# Re-running this cell resets it, so edits to `keywords` are picked up again.
_keywords_pattern = None


def ok(text: str) -> bool:
    """Tell whether a sentence is a good candidate for the benchmark dataset.

    A sentence is kept when it satisfies all of the following conditions:

    - Its length is between 100 and 300 characters. Such sentences are long
      enough to be meaningful and short enough for humans to evaluate
      semantic similarity.
    - It starts with a capitalized word followed by a space. Sentences which
      don't are incorrect sentences (extraction issue, tokenization error, ...).
    - It mentions at least one entry of the module-level `keywords`. Such
      sentences are more interesting for training / evaluating a model for
      a domain.

    Keywords are matched case-insensitively as whole words or whole phrases:
    a multi-word keyword such as 'cardiac injury' matches, a keyword followed
    by punctuation ('fever,') matches, and a keyword embedded in a longer
    word ('fever' in 'feverish') does not.

    Parameters
    ----------
    text
        The sentence to check.

    Returns
    -------
    bool
        True if the sentence satisfies every condition, False otherwise.
    """
    global _keywords_pattern

    # Cheapest checks first, so most sentences are rejected before keyword matching.
    if not 100 <= len(text) <= 300:
        return False
    if re.match('[A-Z][a-z]+ ', text) is None:
        return False
    # Without keywords, no sentence can mention one.
    if not keywords:
        return False

    if _keywords_pattern is None:
        # Words of a phrase may be separated by any run of whitespace.
        # Sorting only makes the compiled pattern deterministic.
        phrases = (
            r'\s+'.join(re.escape(word) for word in keyword.split())
            for keyword in sorted(keywords)
        )
        # The lookarounds forbid a word character right before or right after
        # the keyword, so only whole words / phrases are matched.
        _keywords_pattern = re.compile(r'(?<!\w)(?:' + '|'.join(phrases) + r')(?!\w)')

    return _keywords_pattern.search(text.lower()) is not None

# Keep only the sentences accepted by ok(); copy so that columns can be added later
# without touching `sentences`.
# Takes 2 mins 40 for 20.5 million sentences and 100 keywords.
filtered = sentences[sentences.text.map(ok)].copy()

# Count rows with len(): DataFrame.size is rows x columns, which only equals
# the number of sentences while the frame has exactly one column.
fcount = len(filtered)
print(f'{fcount:,} sentences ({scount-fcount:,} not selected)')
# 1,264,496 sentences (19,246,436 not selected)

In [None]:
# Mapping between the sentence ID and the index in the embeddings.
# The embeddings are indexed on the sentence ID.
# This is no longer the case when loading a subset of the embeddings.
# The loaded subset is indexed from 0 to the number of selected sentences - 1.
# The length is taken from the frame itself so that it always matches its row count.
filtered['mapping'] = np.arange(len(filtered))

# Sample sentences

In [None]:
# Draw N of the selected sentences at random; SEED makes the draw reproducible.
sampled = filtered.sample(n=N, random_state=SEED)

# Load embeddings

In [None]:
def load_embeddings(model: str, sentence_ids: np.ndarray) -> torch.Tensor:
    """Load the embeddings of the given sentences and scale each row to unit norm.

    Parameters
    ----------
    model
        Name of the embedding model, passed to H5.load to select the
        embeddings inside the EMBEDDINGS file.
    sentence_ids
        IDs of the sentences whose embeddings are loaded.

    Returns
    -------
    torch.Tensor
        The loaded embeddings, each row divided by its norm along dim=1.
        Rows with a zero norm are left unchanged.
    """
    raw = H5.load(Path(EMBEDDINGS), model, batch_size=10000, indices=sentence_ids)
    vectors = torch.from_numpy(raw)
    lengths = torch.norm(vectors, dim=1, keepdim=True)
    # Divide all-zero rows by 1 instead of 0 so they stay zero rather than become NaN.
    lengths[lengths == 0] = 1
    # In-place division: no second copy of the embeddings is allocated.
    vectors /= lengths
    return vectors

# Load the embeddings of the selected sentences only, in the order of `filtered`.
# Takes 5 mins 30 for 20.5 million embeddings and 225 thousand selected sentences.
embeddings = load_embeddings(MODEL, filtered.index.values)

# Sanity check: there should be one embedding per selected sentence.
ecount = embeddings.shape[0]
print(f'{ecount:,} embeddings (same as selected sentences? {ecount == fcount})')
# 224,343 embeddings (same as selected sentences? True)

# Pair sentences

In [None]:
# spaCy pipeline used to lemmatize the sentences for the word-based similarity.
# Loading takes 25 secs.
nlp = spacy.load(name='en_core_sci_lg')

In [None]:
%%time

def pair(mapping: int, embeddings: torch.Tensor, sentences: pd.DataFrame) -> Tuple[int, str, float]:
    """Find the sentence most similar to the one at row `mapping` of the embeddings.

    The similarity is the dot product between embeddings, which is the cosine
    similarity when the rows of `embeddings` have unit norm.

    Parameters
    ----------
    mapping
        Row of `embeddings` holding the embedding of the sentence to pair.
    embeddings
        Embeddings of all the candidate sentences, one per row.
    sentences
        Candidate sentences, with a 'text' column and a 'mapping' column giving
        the row of each sentence in `embeddings`.

    Returns
    -------
    Tuple[int, str, float]
        Index label (sentence ID), text, and similarity of the paired sentence.

    Raises
    ------
    ValueError
        If `embeddings` has fewer than two rows, i.e. there is nothing to pair with.
    """
    if embeddings.shape[0] < 2:
        raise ValueError('at least two embeddings are needed to pair a sentence')
    embedding = embeddings[mapping]
    similarities = torch.nn.functional.linear(embedding, embeddings)
    # Exclude the sentence itself explicitly instead of assuming it ranks first:
    # with tied similarities (e.g. identical embeddings) it may not be the top
    # element, and it would otherwise be returned as its own pair.
    similarities[mapping] = float('-inf')
    best = similarities.max(dim=0)
    sim, idx = best.values.item(), best.indices.item()
    # Retrieve paired sentence ID and content.
    row = sentences.loc[sentences.mapping == idx]
    return row.index.item(), row.text.item(), sim

def words_similarity(text1: str, text2: str, nlp: 'spacy.lang.en.English') -> float:
    """Compute a word-based similarity between two texts.

    Each text is reduced to the set of lemmas of its non-punctuation tokens.
    The similarity is the number of shared lemmas divided by the size of the
    smaller set (the overlap coefficient). It is 1 when every lemma of one
    text also appears in the other, and 0 when the texts share no lemma.

    Parameters
    ----------
    text1
        First text.
    text2
        Second text.
    nlp
        Callable turning a text into tokens exposing `lemma_` and `is_punct`.
        The annotation is quoted so that defining this function does not
        require `spacy.lang.en` to be loaded yet.

    Returns
    -------
    float
        Similarity between 0 and 1. It is 0.0 when either text has no
        non-punctuation token.
    """
    lemmas1 = {token.lemma_ for token in nlp(text1) if not token.is_punct}
    lemmas2 = {token.lemma_ for token in nlp(text2) if not token.is_punct}
    # A text without any word shares nothing; this also avoids dividing by zero.
    if not lemmas1 or not lemmas2:
        return 0.0
    # Same value as 1 - min(|A - B| / |A|, |B - A| / |B|).
    return len(lemmas1 & lemmas2) / min(len(lemmas1), len(lemmas2))

cols = ['sentence_id_1', 'sentence_id_2', 'sentence_text_1', 'sentence_text_2',
        'vectors_similarity', 'words_similarity']
rows = []

for record in tqdm(sampled.itertuples(), total=N):
    # Pair the sampled sentence with its most similar sentence among the selected ones,
    # i.e. the one whose embedding has the highest cosine similarity.
    paired_id, paired_text, vsimilarity = pair(record.mapping, embeddings, filtered)

    # Word-based similarity of the pair.
    # It is 1 when every lemma of one sentence also appears in the other.
    # It is 0 when the two sentences have no lemma in common.
    wsimilarity = words_similarity(record.text, paired_text, nlp)

    rows.append((record.Index, paired_id, record.text, paired_text, vsimilarity, wsimilarity))

# One row per pair, the pairs with the most similar embeddings first.
pairs = pd.DataFrame(rows, columns=cols)
pairs = pairs.sort_values('vectors_similarity', ascending=False)

In [None]:
def format_results(pairs: pd.DataFrame) -> str:
    """Render sentence pairs as human-readable text.

    Each pair is rendered as a header line (position of the pair in `pairs`,
    the two sentence IDs, and the two similarities with two decimals),
    followed by the two stripped sentence texts, each preceded by a '-' line.

    Parameters
    ----------
    pairs
        Pairs to render, with the columns sentence_id_1, sentence_id_2,
        sentence_text_1, sentence_text_2, vectors_similarity, words_similarity.

    Returns
    -------
    str
        The rendered pairs joined by two newlines; empty if there is no pair.
    """
    entries = []
    for position, record in enumerate(pairs.itertuples()):
        header = (
            f'pair: {position}  id_1: {record.sentence_id_1}  id_2: {record.sentence_id_2}  '
            f'vectors_sim: {record.vectors_similarity:.2f}  '
            f'words_sim: {record.words_similarity:.2f}\n'
        )
        first = f'-\n{record.sentence_text_1.strip()}\n'
        second = f'-\n{record.sentence_text_2.strip()}\n'
        entries.append(header + first + second)
    return '\n\n'.join(entries)

# Preview the 10 pairs with the most similar embeddings.
preview = format_results(pairs[:10])
print(preview)

# Export sentence pairs

In [None]:
def write_results_txt(pairs: pd.DataFrame, n: int, directory: str) -> None:
    """Write the sentence pairs to a timestamped text file and print its name.

    The file is named 'pairs_n<n>_<YYYY-MM-DD_HHhMM>.txt', using the current
    local time, and contains the output of format_results(pairs).

    Parameters
    ----------
    pairs
        Sentence pairs to export.
    n
        Number of pairs, used in the file name only.
    directory
        Directory in which the file is written.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%Hh%M")
    filename = f'pairs_n{n}_{timestamp}.txt'
    # UTF-8 is necessary as non ASCII characters are present.
    Path(directory, filename).write_text(format_results(pairs), encoding='utf-8')
    print(f'<wrote> {filename}')

# Export all the pairs to the current working directory.
write_results_txt(pairs, n=N, directory='.')