In [1]:
import torch
from torch.utils import data
from transformers import AutoTokenizer, AutoModel
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from itertools import islice
import os
import json
from time import time
from collections import Counter
import numpy as np
import pandas as pd
import torch.nn.functional as F
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tnrange, tqdm
from utils import *
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
%load_ext autoreload
%autoreload 2
text_path = 'data/preprocessed_text.json'

In [22]:
# Load the raw paper dump and its pre-processed counterpart.
# Both are read as UTF-8 explicitly so the result does not depend on the platform default encoding.
print("Loading all paper data...")
with open('data/all_text.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Reuse the path configured in the setup cell (`text_path`) instead of repeating the literal.
with open(text_path, 'r', encoding='utf-8') as f:
    articles = json.load(f)

# Number of pre-processed papers available
len(articles)

Loading all paper data...


33375

In [3]:
# Each transformer checkpoint is requested twice: first for its tokenizer, then for its weights.

# SciBERT as published (original checkpoint)
tokenizer_scibert, model_scibert = (
    loader.from_pretrained('allenai/scibert_scivocab_uncased')
    for loader in (AutoTokenizer, AutoModel)
)

# BERT fine-tuned on COVID literature
tokenizer_covid, model_covid = (
    loader.from_pretrained('deepset/covid_bert_base')
    for loader in (AutoTokenizer, AutoModel)
)

# Sentence-level BERT; it is used without a separate tokenizer object
model_sent = SentenceTransformer('bert-base-nli-mean-tokens')

In [6]:
def sentence_similarity(first, second):
    """Return the cosine similarity between two sentences under SciBERT.

    Each sentence is embedded with `sentence_embedding` using the SciBERT
    tokenizer and model, and the two embeddings are then compared with
    `cosine_similarity` (both helpers come from `utils`).
    """
    first_embedding = sentence_embedding(tokenizer_scibert, model_scibert, first)
    second_embedding = sentence_embedding(tokenizer_scibert, model_scibert, second)
    return cosine_similarity(first_embedding, second_embedding)

In [7]:
# Quick smoke test: compare a question with a loosely related statement.
risk_question = "What are the risk factors for the virus?"
symptom_statement = "Fever was one of the symptoms of the virus."
sentence_similarity(risk_question, symptom_statement)

0.784419059753418

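## Generate title embeddings (or load previously saved ones)
**TODO:** this instruction is not clear yet; state when the embeddings should be generated and when they should be loaded.

Generation takes about 30 minutes per model for the full set of titles.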

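### Crop paper titles to the first sentence and drop those that are still too long
Each title is cut at its first period; titles that are still longer than `max_length` for either tokenizer are dropped.

**TODO:** explain why we do this.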

In [26]:
# Sample size and the maximum title length passed to the length filter
n = 2000
max_length = 30
selection = take(n, articles)
selected_papers = {paper_id: articles[paper_id] for paper_id in selection}

# Raw (non-processed) records; only needed to display the original abstracts at the end
selected_papers_original = {paper_id: json_data[paper_id] for paper_id in selection}
paper_ids = list(selected_papers_original)

titles = [paper['title'] for paper in selected_papers.values()]

# Keep everything up to and including the first period; titles without a period stay whole.
# `partition` yields (head, ".", tail) when a period exists and (title, "", "") otherwise.
cropped_titles = ["".join(title.partition(".")[:2]) for title in titles]

# Two filtering rounds over both tokenizers:
#   round 1 removes the titles that are too long for either tokenizer,
#   round 2 builds the final encodings once no further title needs to be dropped.
# The three parallel lists are trimmed together so their positions stay aligned.
for filter_round in range(2):
    encoded_scibert, indices_to_drop = get_encodings_drop_long(cropped_titles, tokenizer_scibert, max_length=max_length)
    drop_from_lists([cropped_titles, titles, paper_ids], indices_to_drop)

    encoded_covid, indices_to_drop = get_encodings_drop_long(cropped_titles, tokenizer_covid, max_length=max_length)
    drop_from_lists([cropped_titles, titles, paper_ids], indices_to_drop)

# Row position in the filtered lists -> paper id
index_to_paperid_map = dict(enumerate(paper_ids))

# Sanity check: pre-processed and original data must still line up after filtering
assert len(cropped_titles) == len(titles) == len(paper_ids)

Dropped 249 titles
Dropped 178 titles
Dropped 0 titles
Dropped 0 titles


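#### Generate
For SciBERT and COVID-BERT, each title embedding is the model's output vector at the first token position of the encoded title. For Sentence-BERT, the cropped titles are encoded directly with `model_sent.encode`.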

In [9]:
batch_size = 32


def _first_token_embeddings(model, encoded, batch_size=32, num_workers=4):
    """Run `model` over `encoded` in batches and collect the first-position output vectors.

    Args:
        model: a transformer whose call returns a sequence whose first element is
            indexed as (batch, position, hidden) and which exposes
            `config.hidden_size`.
        encoded: tensor of encoded titles, one row per title
            (assumes a 2-D tensor of token ids — TODO confirm against
            `get_encodings_drop_long`).
        batch_size: number of titles per forward pass.
        num_workers: worker processes used by the DataLoader.

    Returns:
        Tensor of shape (encoded.shape[0], model.config.hidden_size); row i is the
        output vector at position 0 for title i.
    """
    loader = data.DataLoader(encoded, batch_size=batch_size, num_workers=num_workers)
    # Size the buffer from this model and this encoding rather than a hard-coded 768
    # or another model's encoding.
    embeddings = torch.zeros(encoded.shape[0], model.config.hidden_size)
    offset = 0
    with torch.no_grad():
        for batch in tqdm(loader, leave=False):
            first_token = model(batch)[0][:, 0, :]
            # Advance by the real batch length so the final, shorter batch is written
            # explicitly instead of relying on slice clipping.
            embeddings[offset: offset + first_token.shape[0]] = first_token
            offset += first_token.shape[0]
    return embeddings


embeddings_scibert = _first_token_embeddings(model_scibert, encoded_scibert, batch_size=batch_size)
embeddings_covid = _first_token_embeddings(model_covid, encoded_covid, batch_size=batch_size)

# Sentence-BERT encodes the raw (cropped) strings itself.
embeddings_sent = torch.tensor(model_sent.encode(cropped_titles))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



In [8]:
#torch.save(embeddings, "embeddings.pt")

#### Load
**TODO:** the save and load cells are currently commented out and do nothing; either restore them or remove them.

In [9]:
#embeddings = torch.load("embeddings.pt")

## Similarity search

In [10]:
query = "Risk factors for covid-19 death"

# Embed the same query once per model, in the order SciBERT, COVID-BERT, Sentence-BERT.
# The sentence model is passed no tokenizer.
query_embedding_scibert, query_embedding_covid, query_embedding_sent = (
    get_query_embedding(tokenizer, model, query, max_length=max_length)
    for tokenizer, model in (
        (tokenizer_scibert, model_scibert),
        (tokenizer_covid, model_covid),
        (None, model_sent),
    )
)

In [11]:
# Number of titles to retrieve. NOTE: this rebinds `n`, which earlier held the paper sample size.
n = 20
# Compare the query embedding with the SciBERT title embeddings
# (presumably returns the indices and titles of the n most similar entries — see utils.find_top_n_similar).
indices_scibert, titles_scibert = find_top_n_similar(embeddings_scibert, query_embedding_scibert, titles, n=n)
titles_scibert

['Predicting COVID-19 malignant progression with AI techniques',
 'Coherence of Influenza Surveillance Data across Different Sources and Age Groups',
 'Parallel evolution of influenza across multiple spatiotemporal scales',
 'Tackling dengue fever: Current status and challenges',
 'Harnessing host-virus evolution in antiviral therapy and immunotherapy',
 'Human Bocavirus: Lessons Learned to Date',
 'Associated Herpesvirus in Endothelial Cells',
 'Estimating epidemic exponential growth rate and basic reproduction number',
 'Emodin Inhibits EBV Reactivation and Represses NPC Tumorigenesis',
 'Evolution and emergence of novel human infections',
 'Machine intelligence design of 2019-nCoV drugs',
 'A Note on COVID-19 Diagnosis Number Prediction Model in China',
 'Patterns of Positive Selection in Six Mammalian Genomes',
 'Position Paper on Road Map for RNA Virus Research in India',
 "PERSPECTIVE One Group's Historical Reflections on DNA Vaccine Development",
 'Virology Journal Viruses and t

In [12]:
# Same retrieval as above, using the COVID-BERT title and query embeddings.
indices_covid, titles_covid = find_top_n_similar(embeddings_covid, query_embedding_covid, titles, n=n)
titles_covid

['Estimates of the severity of COVID-19 disease',
 'Morbidity and Mortality Weekly Report',
 'Clinical characteristics of 82 death cases with COVID-19',
 'Restriction Factor in Yeasts 2 3',
 'Dynamic profile of severe or critical COVID-19 cases',
 'Pathogenesis of Lassa Fever',
 'Ebola virus disease in the Democratic',
 'Incorporating one health into medical education',
 'District Health Office (Ministry of Health)',
 'Pangolin homology associated with 2019-nCoV',
 'Correcting under-reported COVID-19 case numbers',
 'Predicting COVID-19 malignant progression with AI techniques',
 'Retrospective Analysis of Clinical Features in 101 Death Cases with COVID-19',
 'Molecular Sciences Review B Cells and Antibodies in Kawasaki Disease',
 'Epidemic analysis of COVID-19 in China by dynamical modeling',
 'Comment',
 'Planning horizon affects prophylactic decision-making and epidemic dynamics',
 'Viral etiology of Acute Respiratory Infections in Hospitalized',
 'Advance of Novel Coronavirus Regis

In [13]:
# Same retrieval as above, using the Sentence-BERT title and query embeddings.
indices_sent, titles_sent = find_top_n_similar(embeddings_sent, query_embedding_sent, titles, n=n)
titles_sent

['Estimation of risk factors for COVID-19 mortality -preliminary results',
 'Estimates of the severity of COVID-19 disease',
 'Building a COVID-19 Vulnerability Index',
 'Potential Factors for Prediction of Disease Severity of COVID-19 Patients',
 'Investigating the Impact of Asymptomatic Carriers on COVID-19 Transmission',
 'Risk factors related to hepatic injury in patients with corona virus disease 2019',
 'Potential biochemical markers to identify severe cases among COVID-19 patients',
 'Dynamic profile of severe or critical COVID-19 cases',
 'Assessing the Global Tendency of COVID-19 Outbreak',
 'Impact of the contact and exclusion rates on the spread of COVID-19 pandemic',
 'Potential Biases in Estimating Absolute and Relative Case-Fatality Risks during Outbreaks',
 'The time scale of asymptomatic transmission affects estimates of epidemic potential in the COVID-19 outbreak',
 'Estimating the cure rate and case fatality rate of the ongoing epidemic COVID-19',
 'Relations of param

## Visualization

In [14]:
# Project each model's title embeddings with t-SNE (one projection per model, same order as before).
tsne_scibert, tsne_covid, tsne_sent = map(
    get_tsne_embeddings,
    (embeddings_scibert, embeddings_covid, embeddings_sent),
)

In [15]:
def plot_query_embeddings(query, n=40):
    """Plot the t-SNE title projections of all three models, highlighting the titles closest to `query`.

    One matplotlib subplot is drawn per model (SciBERT, COVID-BERT, Sentence-BERT).
    The titles returned by `find_top_n_similar` for the query are drawn as larger
    red markers, all other titles as small blue markers.

    Args:
        query: free-text query, embedded separately with each model.
        n: number of most similar titles to highlight per model.

    Returns:
        None. The figure is drawn on the current matplotlib backend.
    """
    models = [model_scibert, model_covid, model_sent]
    tokenizers = [tokenizer_scibert, tokenizer_covid, None]
    embeddings = [embeddings_scibert, embeddings_covid, embeddings_sent]
    tsnes = [tsne_scibert, tsne_covid, tsne_sent]
    plot_titles = ["Scibert", "Covid", "Bert Sentence"]
    fig, ax = plt.subplots(1, len(models), figsize=(20, 5))
    panels = zip(models, tokenizers, embeddings, plot_titles, tsnes)
    for index, (model, tokenizer, model_embeddings, plot_title, tsne) in enumerate(panels):
        query_embedding = get_query_embedding(tokenizer, model, query)
        similar, _ = find_top_n_similar(model_embeddings, query_embedding, titles, n=n)
        similar = similar[:n].tolist()

        points = np.asarray(tsne)
        is_similar = np.isin(np.arange(points.shape[0]), similar)
        # Two vectorised scatter calls instead of one call per point. The hits are
        # drawn last so they are never hidden underneath background points.
        ax[index].scatter(points[~is_similar, 0], points[~is_similar, 1], c='b', s=4)
        ax[index].scatter(points[is_similar, 0], points[is_similar, 1], c='r', s=16)
        ax[index].set_title(plot_title)

In [16]:
def plot_query_embeddings_plotly(query, titles, n=40):
    """Interactive version of the query visualisation: one plotly subplot per model.

    For each model (SciBERT, COVID-BERT, Sentence-BERT) the t-SNE projection of all
    titles is drawn; the titles returned by `find_top_n_similar` for the query are
    red and slightly larger, the rest blue. Hovering over a point shows its title.
    After the figure, the first ten hits of the last model in the list are printed.

    Args:
        query: free-text query, embedded separately with each model.
        titles: titles aligned row-by-row with the embeddings / t-SNE projections.
        n: number of most similar titles to highlight per model.

    Returns:
        None. The figure is shown and the top results are printed.
    """
    models = [model_scibert, model_covid, model_sent]
    tokenizers = [tokenizer_scibert, tokenizer_covid, None]
    embeddings = [embeddings_scibert, embeddings_covid, embeddings_sent]
    tsnes = [tsne_scibert, tsne_covid, tsne_sent]
    plot_titles = ["Scibert", "Covid", "Bert Sentence"]
    # Label every panel; previously `plot_titles` was built but never displayed.
    fig = make_subplots(rows=1, cols=len(models), subplot_titles=plot_titles)
    point_ids = range(len(titles))
    last_similar = []
    panels = zip(models, tokenizers, embeddings, plot_titles, tsnes)
    for index, (model, tokenizer, model_embeddings, plot_title, tsne) in enumerate(panels):
        query_embedding = get_query_embedding(tokenizer, model, query)
        similar, _ = find_top_n_similar(model_embeddings, query_embedding, titles, n=n)
        last_similar = similar[:n].tolist()
        similar_set = set(last_similar)
        fig.add_trace(
            go.Scatter(
                x=tsne[:, 0],
                y=tsne[:, 1],
                mode="markers",
                text=titles,
                name=plot_title,
                marker=dict(
                    size=[6 if i in similar_set else 4 for i in point_ids],
                    color=['red' if i in similar_set else 'blue' for i in point_ids],
                ),
            ),
            1,
            index + 1,
        )
    fig.update_layout(height=400, width=1000, title_text="Visualization of Search Results for '{}'".format(query))
    fig.show()
    # The printed list has always come from the last model in `models`; the label
    # now says so instead of relying on a leftover loop variable.
    print("Top 10 results ({}):".format(plot_titles[-1]))
    for i in last_similar[:10]:
        print(titles[i])

In [None]:
# Show, for each of the three models, where the titles closest to this query fall in the t-SNE projection.
plot_query_embeddings_plotly("Risk factors for covid-19 death", titles)

In [None]:
# Same visualisation for a second example query.
plot_query_embeddings_plotly("Asymptomatic carriers of the virus", titles)

### Task 1
#### What is known about transmission, incubation, and environmental stability? What do we know about natural history, transmission, and diagnostics for the virus? What have we learned about infection prevention and control?
* Range of incubation periods for the disease in humans (and how this varies across age and health status) and how long individuals are contagious, even after recovery.
* Prevalence of asymptomatic shedding and transmission (e.g., particularly children)
* Seasonality of transmission
* Physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding).
* Persistence and stability on a multitude of substrates and sources (e.g., nasal discharge, sputum, urine, fecal matter, blood).
* Disease models, including animal models for infection, disease and transmission
* Effectiveness of movement control strategies to prevent secondary transmission in health care and community settings

In [27]:
def display_closest_papers(indices: list):
    """Print the original title and abstract of the papers at the given row indices.

    Each index is translated to a paper id through `index_to_paperid_map`, and the
    title and abstract are read from the non-processed records in
    `selected_papers_original`. A dashed separator line precedes every paper.
    """
    separator = "------------------------------------------------"
    for row in indices:
        print(separator)
        record = selected_papers_original[index_to_paperid_map[row]]
        # Read both fields before printing either of them.
        abstract, title = record['abstract'], record['title']
        print("Title: ", title)
        print("Abstract: ", abstract)

### Approach 1: Query for all subtasks of the task jointly
In this experiment, we represent **all subtasks of Task 1 as a single query**. We then obtain the embedding of this query from Sentence-BERT and retrieve the papers whose Sentence-BERT title embeddings are most similar to it.


In [28]:
# Number of papers to retrieve. NOTE: rebinds the notebook-wide `n` used by earlier cells.
n = 5
# Task 1 wording with punctuation mostly stripped and a few synonyms added
# (e.g. "Seasonality season", "health care healthcare").
task1_questions = [
    # general questions
    "What is known about transmission incubation and environmental stability What do we know about natural history transmission and diagnostics for the virus What have we learned about infection prevention and control",
    
    # sub-task questions
    "Range of incubation periods for the disease in humans and how this varies across age and health status and how long individuals are contagious, even after recovery.",
    "Prevalence of asymptomatic shedding and transmission particularly children", 
    "Seasonality season of transmission",
    "Physical science of the coronavirus charge distribution, adhesion to hydrophilic phobic surfaces, environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding",
    "Persistence and stability on a multitude of substrates and sources nasal discharge sputum  urine  fecal matter  blood",
    "Disease models including animal models for infection disease and transmission",
    "Effectiveness of movement control strategies to prevent secondary transmission in health care healthcare and community settings"]

# Concatenate all questions into one long query string, separated by ". "
task1_query = '. '.join(task1_questions)
# Embed the joint query with Sentence-BERT (no tokenizer is passed); max_length is raised to 1000 for this long query
task1_embedding = get_query_embedding(None, model_sent, task1_query, max_length=1000)
# Retrieve the n closest title embeddings and show the original title and abstract of each hit
indices, relevant_titles = find_top_n_similar(embeddings_sent, task1_embedding, titles, n=n)
display_closest_papers(indices.tolist())

['Feline immunodeficiency virus in puma: Estimation of force of infection reveals insights into transmission', 'Middle East respiratory syndrome coronavirus: transmission, virology and therapeutic targeting to aid in outbreak control', 'Emergence of evidence during disease outbreaks: lessons learnt from the Zika virus outbreak', 'Spectrum of Viral Infections Among Primary Immunodeficient Children: Report From a National Registry', 'The Infectious Bronchitis Virus Coronavirus Envelope Protein Alters Golgi pH to Protect Spike Protein and Promote Release of Infectious Virus']
------------------------------------------------
Title:  Feline immunodeficiency virus in puma: Estimation of force of infection reveals insights into transmission
Abstract:  1. Determining parameters that govern pathogen transmission (such as the force of infection, FOI), and pathogen impacts on morbidity and mortality, is exceptionally challenging for wildlife. Vital parameters can vary, for example across host pop

------------------------------------------------
Title:  Machine Learning the Phenomenology of COVID-19 From Early Infection Dynamics
Abstract:  We present a data-driven machine learning analysis of COVID-19 from its early infection dynamics, with the goal of extracting actionable public health insights. We focus on the transmission dynamics in the USA starting from the first confirmed infection on January 21 2020. We find that COVID-19 has a strong infectious force if left unchecked, with a doubling time of under 3 days. However it is not particularly virulent. Our methods may be of general interest.
------------------------------------------------
Title:  Potential inhibitors for 2019-nCoV coronavirus M protease from clinically approved medicines
Abstract:  Starting from December 2019, a novel coronavirus, named 2019-nCoV, was found to cause Severe Acute Respiratory (SARI) symptoms and rapid pandemic in China. With the hope to identify candidate drugs for 2019-nCoV, we adopted a comp

Title:  Window of Opportunity for Mitigation to Prevent Overflow of ICU capacity in Chicago by COVID-19
Abstract:  Please note: this is a working document and has not been submitted for journal publication. It is planned that a later version of this document will be submitted for peer-reviewed publication, but in the interests of sharing information during a rapidly changing epidemic landscape, we are making this early version available. We estimate the growth in demand for ICU beds in Chicago during the emerging COVID-19 epidemic, using state-of-the-art computer simulations calibrated for the SARS-CoV-2 virus. The questions we address are these: (1) Will the ICU capacity in Chicago be exceeded, and if so by how much? (2) Can strong mitigation strategies, such as lockdown or shelter in place order, prevent the overflow of capacity? (3) When should such strategies be implemented? Our answers are as follows: (1) The ICU capacity may be exceeded by a large amount, probably by a factor of 

### Task 1: Querying subtasks separately
 
#### Subtask: Prevalence of asymptomatic shedding and transmission particularly children.

In [None]:
# Query a single Task 1 subtask on its own with Sentence-BERT.
question = "Prevalence of asymptomatic shedding and transmission particularly children."
# `question` is already one string. Calling '. '.join() on it would insert ". "
# between every character ("P. r. e. v. ...") and embed a meaningless query,
# so the string is used as-is.
question_query = question
question_embedding = get_query_embedding(None, model_sent, question_query, max_length=1000)
# `n` is the number of papers to show, as set in the joint-query cell above.
indices, _ = find_top_n_similar(embeddings_sent, question_embedding, titles, n=n)
display_closest_papers(indices.tolist())