In [1]:
import os
import re
import json
import utils
import scipy
import torch
import random
import gensim
import warnings

import numpy as np
import pandas as pd

from tasks import *
from pprint import pprint
from transformers import *
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

In [2]:
complete_df = pd.read_csv("data/clean_df.csv")
complete_df.shape

(40152, 11)

In [3]:
complete_df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,0,0,6f005a0677f24f697c1ec3ce065ec4144519e357,Antiviral Mechanisms of Human Defensins Introd...,"Sarah S Wilson, Mayim E Wiens, Jason G Smith","Sarah S Wilson (University of Washington, 1705...",Defensins are an effector component of the inn...,Defensins are one of the most abundant classes...,Direct inactivation of viruses by human granul...,"[{'first': 'Sarah', 'middle': ['S'], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Direct ..."
1,1,1,4994fa72322bbf19120592304d92629226948d8e,Rapid Identification of Malaria Vaccine Candid...,"V Villard, G W Agak, G Frank, A Jafarshad, C S...","V Villard, G W Agak, G Frank, A Jafarshad, C S...",To identify malaria antigens for vaccine devel...,Human Plasmodium falciparum (Pf) infection is ...,Identification of vaccine candidates against s...,"[{'first': 'V', 'middle': [], 'last': 'Villard...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Identif..."


In [4]:
complete_df.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1
count,40152.0,40152.0
mean,29705.053248,29705.053248
std,17183.494309,17183.494309
min,0.0,0.0
25%,14787.75,14787.75
50%,29717.0,29717.0
75%,44551.25,44551.25
max,59560.0,59560.0


In [5]:
# Keep only texts with minimal number of words 

complete_df = complete_df[complete_df['text'].apply(lambda x: len(re.findall(r"(?i)\b[a-z]+\b", x))) > 1000]                                         
complete_df.shape

(38265, 11)

In [6]:
frac_of_articles = 1
train_df  = complete_df.sample(frac = frac_of_articles, random_state = 42)
train_corpus = (list(utils.read_corpus(train_df, 'abstract'))) 

In [7]:
# Using distributed memory model

model = gensim.models.doc2vec.Doc2Vec(dm = 1, vector_size = 50, min_count = 10, dm_mean = 1, epochs = 20, seed = 42, workers = 6)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples = model.corpus_count, epochs = model.epochs)

In [8]:
list_of_tasks = [task_1, task_2, task_3, task_4, task_5, task_6, task_7, task_8, task_9]

In [9]:
abstract_vectors = model.docvecs.vectors_docs
array_of_tasks = [utils.get_doc_vector(task, model) for task in list_of_tasks]

In [10]:
train_df['abstract_vector'] = [vec for vec in abstract_vectors]

### Nearest Neigbors search

In [11]:
train_df = train_df[train_df['abstract'].apply(lambda x: len(re.findall(r"(?i)\b[a-z]+\b", x))) > 40]
train_df.shape

(37302, 12)

In [12]:
train_array = train_df['abstract_vector'].values.tolist()

In [13]:
ball_tree = NearestNeighbors(algorithm = 'ball_tree', leaf_size = 20).fit(train_array)

In [14]:
# Query for all tasks

distances, indices = ball_tree.kneighbors(array_of_tasks, n_neighbors = 3)

In [15]:
print("="*80, f"\n\nTask = {list_of_tasks[3]}\n", )
df =  train_df.iloc[indices[3]]
abstracts = df['abstract']
titles = df['title']
dist = distances[3]
for l in range(len(dist)):
    print(f" Text index = {indices[3][l]} \n Distance = {distances[3][l]} \n Title: {titles.iloc[l]} \n Abstract extract: {abstracts.iloc[l]}\n\n")


Task = What do we know about vaccines and therapeutics? What has been published concerning research and development and evaluation efforts of vaccines and therapeutics?
Effectiveness of drugs being developed and tried to treat COVID-19 patients.
Clinical and bench trials to investigate less common viral inhibitors against COVID-19 such as naproxen, clarithromycin, and minocyclinethat that may exert effects on viral replication.
Methods evaluating potential complication of Antibody-Dependent Enhancement (ADE) in vaccine recipients.
Exploration of use of best animal models and their predictive value for a human vaccine.
Capabilities to discover a therapeutic (not vaccine) for the disease, and clinical effectiveness studies to discover therapeutics, to include antiviral agents.
Alternative models to aid decision makers in determining how to prioritize and distribute scarce, newly proven therapeutics as production ramps up. This could include identifying approaches for expanding productio

## Level 2 Abstraction using SciBERT

In [43]:
model = AutoModelWithLMHead.from_pretrained('models/COVID-scibert-latest').to('cuda')
tokenizer = AutoTokenizer.from_pretrained('models/COVID-scibert-tokenizer')

In [44]:
number_top_matches = 3

In [45]:
def convert(sentence):
    with torch.no_grad():
        vector = model(torch.tensor(tokenizer.encode(sentence, add_special_tokens = True, padding = True, truncation = True)).to('cuda').unsqueeze(0))[0][0].cpu().numpy().flatten().tolist()
    return vector

In [46]:
query = 'What are the possible medications against COVID-19?' 
query_embeddings = convert(query)

In [47]:
def truncate(vector):
    if vect
    return(vector[:len(query_embeddings)])

In [55]:
[len(i) for i in abstracts_vector[0]]

[415233, 415233, 415233, 415233, 415233, 159705, 415233]

In [52]:
print("="*80, f"\n\nTask = \n\n {list_of_tasks[3]}\n", )

print("\n\n======================\n\n")
print("Searching in Abstracts")
    
dfs =  train_df.iloc[indices[3]]
abstracts = [i.split(".") for i in df['abstract']]
for abstract in abstracts:
    print("\n\n======================\n\n")
    print("Abstract:", '.'.join(abstract))
    abstracts_vector = [truncate(convert(i)) for i in abstract if not(len(i) < 5)]
    if(len(np.array(abstracts_vector).shape) != 2):
        abstracts_vector = [abstracts_vector]
    distance = scipy.spatial.distance.cdist([query_embeddings], abstracts_vector, "cosine")[0]

    results = zip(range(len(distance)), distance)
    results = sorted(results, key = lambda x: x[1])
    
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 3 most similar sentences are:")

    for idx, dist in results[0:number_top_matches]:
        print(abstract[idx].strip(), "(Cosine Score: %.4f)" % (1-dist))

        
print("\n\n======================\n\n")
print("Searching in Texts")

texts = [i.split(".") for i in df['text']]
for text in texts:
    print("\n\n======================\n\n")
    print("Text:", text[:100])
    text_vector = [truncate(convert(i)) for i in text if not(len(i) < 5)]
    distance = scipy.spatial.distance.cdist([query_embeddings], text_vector, "cosine")[0]

    results = zip(range(len(distance)), distance)
    results = sorted(results, key = lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 3 most similar sentences are:")

    for idx, dist in results[0:number_top_matches]:
        print(text[idx].strip(), "\n(Cosine Score: %.4f)" % (1-dist))


Task = 

 What do we know about vaccines and therapeutics? What has been published concerning research and development and evaluation efforts of vaccines and therapeutics?
Effectiveness of drugs being developed and tried to treat COVID-19 patients.
Clinical and bench trials to investigate less common viral inhibitors against COVID-19 such as naproxen, clarithromycin, and minocyclinethat that may exert effects on viral replication.
Methods evaluating potential complication of Antibody-Dependent Enhancement (ADE) in vaccine recipients.
Exploration of use of best animal models and their predictive value for a human vaccine.
Capabilities to discover a therapeutic (not vaccine) for the disease, and clinical effectiveness studies to discover therapeutics, to include antiviral agents.
Alternative models to aid decision makers in determining how to prioritize and distribute scarce, newly proven therapeutics as production ramps up. This could include identifying approaches for expanding produc

ValueError: XA and XB must have the same number of columns (i.e. feature dimension.)