In [None]:
import sqlalchemy
import pandas as pd

from bbsearch.ml.sts import *

In [None]:
%%time

# Version identifier; it is passed to the embeddings loader and is also used
# as the database name at the end of the connection URL below.
version='cord19_v35'

# `load_model_bsv` and `load_embeddings_bsv` are presumably provided by the
# star import from `bbsearch.ml.sts` (TODO confirm).
model = load_model_bsv()
embeddings = load_embeddings_bsv(version)
# NOTE(review): host, port and the guest credentials are hardcoded in this
# URL; consider moving them to configuration or environment variables.
engine = sqlalchemy.create_engine(f'mysql+pymysql://guest:guest@dgx1.bbp.epfl.ch:8853/{version}')

# Observed runtime of this cell: < 45 s

In [None]:
%%time

# Read the `sentence_id` and `text` columns of the `sentences` table into a
# DataFrame indexed by `sentence_id`. The query has no placeholders, so a
# plain string is used, and `index_col` is passed by keyword for readability.
sentences = pd.read_sql("SELECT sentence_id, text FROM sentences", engine, index_col='sentence_id')
# All keywords in bold from BBS Ontology v0.3 on 17.09.2020.
# Kept as a set of lowercase strings: duplicates collapse and order is irrelevant.
keywords = {'pathogens', 'cardiac injury', 'cardiovascular disease', 'sars',
            'acute respiratory distress syndrome', 'gas exchange', 'inflammation',
            'sars-cov-2 infection', 'viral entry', 'glucose metabolism', 'golgi', 'human',
            'dry cough', 'mammals', 'cardiovascular injury', 'glycation', 'endoplasmic reticulum',
            'carbohydrates', 'innate immunity', 'igt', 'polysaccharide', 'hypertension',
            'thrombotic events', 'neutrophils', 'dc cells', 'obesity', 'congested cough',
            'influenzavirus', 'viral replication', 'septic shock', 'macrophages', 'cvd', 'lactate',
            'myalgia', 'chest pain', 'oxygen', 'mucociliary clearance', 'high blood sugar level',
            'respiratory failure', 'fever', 'systemic disorder', 'flu', 'influenzae',
            'hyperglycemia', 'impaired glucose tolerance', 'iron',
            'severe acute respiratory syndrome', 'immunity', 'host defense',
            'respiratory viral infection', 'multi-organs failure', 'blood clot',
            'viral infection', 'hypoxia', 'glucose homeostasis', 'vasoconstriction', 'covid-19',
            'sars-cov-2', 'fatigue', 'multiple organ failure', 'productive cough',
            'adaptive immunity', 'atp', 'bacteria', 'nk cells', 'coagulation', 'ards', 'diarrhea',
            'cytokine storm', 'dendritic cells', 'pneumonia', 'thrombosis', 'phagocytosis',
            'alveolar macrophages', 'glucose', 'clearance', 'epithelial cells', 'glucose uptake',
            'coronavirus', 'plasma membrane', 'lymphocytes', 'oxidative stress', 'glycans',
            'glycolysis', 'pulmonary embolism', 'glycosylation', 'viruses',
            'viral respiratory tract infection', 'diabetes', 'life-cycle', 'mammalia',
            'antimicrobials activity', 'ketones', 'immune system', 'pathogen'}
# Sampling configuration: the seed, the keyword arguments for the sampling
# strategy, and the strategy callable itself.
seed = 3179

sparams = {'sentences': sentences, 'keywords': keywords, 'seed': seed}
sstrategy = sampling_keywords

# < 4m 30s

In [None]:
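# Alternative (disabled): use `sampling_random` instead of `sampling_keywords`.
# Uncomment to override the `sstrategy` / `sparams` defined in the previous cell.
# limit = sentences_count(engine)
# seed = 3179

# sstrategy = sampling_random
# sparams = dict(limit=limit, seed=seed)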

In [None]:
# Pairing configuration: keyword arguments for the pairing strategy, followed
# by the strategy callable itself.
pparams = {'step': 1, 'power': 6}
pstrategy = pairing_powerlaw

In [None]:
%%time

# `n` and `groups` are the first two positional arguments of pair_sentences();
# presumably the number of sentences to sample and the number of similarity
# groups (TODO confirm against pair_sentences). Both are reused later in the
# output file names.
n = 750
groups = 5

# Build the sentence pairs from the sampling and pairing configuration, the
# model, the embeddings and the database engine defined in the cells above.
pairs = pair_sentences(n, groups, sstrategy, sparams, pstrategy, pparams, model, embeddings, engine)

# Observed runtime: < 48m 30s, apparently measured with n = 1000 (not the n = 750 set above).

In [None]:
# Save the generated pairs (presumably as a pickle file); the file name
# encodes n, groups and seed.
write_results_pkl(pairs, f'pairs_n{n}_groups{groups}_seed{seed}.pkl')

In [None]:
# Preview the first `groups` pairs in formatted form.
print(format_results(pairs[:groups]))

---

In [None]:
import re

import spacy

In [None]:
%%time

# Load the spaCy pipeline 'en_core_sci_lg'; `nlp` is used by ok() below to
# obtain token lemmas.
nlp = spacy.load('en_core_sci_lg')

# Observed load time: < 15 s

In [None]:
# To reload previously saved pairs instead of regenerating them:
# import pickle
# with open(f'pairs_n{n}_groups{groups}_seed{seed}.pkl', 'rb') as f:
#     pairs = pickle.load(f)

In [None]:
def ok(pair: Pair) -> bool:
    """Decide whether a sentence pair should be kept in the selection.

    The checks run from cheapest to most expensive and stop at the first
    failure, so the spaCy pipeline only runs on pairs that already passed the
    regex and length checks:

    1. ``pair.right.text`` starts with one uppercase ASCII letter followed by
       one or more lowercase ASCII letters and a space.
    2. ``pair.right.text`` is between 100 and 300 characters long (inclusive).
    3. The alphabetic lemmas of the two texts are dissimilar enough: the size
       of the symmetric difference of the two lemma sets divided by the size
       of their union is strictly greater than 0.6.

    Parameters
    ----------
    pair : Pair
        Object exposing ``left.text`` and ``right.text`` strings.

    Returns
    -------
    bool
        ``True`` if every check passes, ``False`` otherwise. Always a real
        ``bool``, never ``None`` or a match object.
    """
    right_text = pair.right.text

    # Check 1: `re.match` anchors at the start of the string by itself, so the
    # leading `^` and trailing `.*` of the former pattern were redundant.
    if re.match(r'[A-Z][a-z]+ ', right_text) is None:
        return False

    # Check 2: length bounds on the right-hand sentence.
    if not 100 <= len(right_text) <= 300:
        return False

    # Check 3: lexical dissimilarity over alphabetic lemmas (expensive, last).
    left_lemmas = {token.lemma_ for token in nlp(pair.left.text) if token.is_alpha}
    right_lemmas = {token.lemma_ for token in nlp(right_text) if token.is_alpha}
    all_lemmas = left_lemmas | right_lemmas
    if not all_lemmas:
        # No alphabetic tokens on either side: the ratio is undefined, so the
        # pair is rejected instead of raising ZeroDivisionError.
        return False

    dissimilarity = len(left_lemmas ^ right_lemmas) / len(all_lemmas)
    return dissimilarity > 0.6

In [None]:
%%time

# Keep only the pairs whose target equals 4 and that pass the ok() filter.
selection = list(filter(lambda pair: pair.target == 4 and ok(pair), pairs))

# ~ 2 s

In [None]:
# Number of pairs that survived the filtering above.
len(selection)

In [None]:
# Preview the first five selected pairs in formatted form.
print(format_results(selection[:5]))

In [None]:
# Save the selected pairs (presumably as plain text); the file name encodes
# n, groups and seed.
write_results_txt(selection, f'selected_pairs_n{n}_groups{groups}_seed{seed}.txt')