In [1]:
import os
import re
import fse
import json
import utils
import random
import gensim
import warnings

import numpy as np
import pandas as pd

from tasks import *
from pprint import pprint
from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors

In [2]:
biorxiv_dir = 'data/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 885


In [3]:
all_files = []

for filename in filenames:
    filename = biorxiv_dir + filename
    file = json.load(open(filename, 'rb'))
    all_files.append(file)

In [4]:
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        utils.format_authors(file['metadata']['authors']),
        utils.format_authors(file['metadata']['authors'], 
                       with_affiliation = True),
        utils.format_body(file['abstract']),
        utils.format_body(file['body_text']),
        utils.format_bib(file['bib_entries']),
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

HBox(children=(FloatProgress(value=0.0, max=885.0), HTML(value='')))




In [5]:
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns = col_names)

In [6]:
pmc_dir = 'data/custom_license/custom_license/'
pmc_files = utils.load_files(pmc_dir)
pmc_df = utils.generate_clean_df(pmc_files)

100%|██████████| 16959/16959 [04:27<00:00, 63.28it/s]
100%|██████████| 16959/16959 [00:43<00:00, 388.58it/s]


In [7]:
comm_dir = 'data/comm_use_subset/comm_use_subset/'
comm_files = utils.load_files(comm_dir)
comm_df = utils.generate_clean_df(comm_files)

100%|██████████| 9118/9118 [02:17<00:00, 66.38it/s]
100%|██████████| 9118/9118 [00:25<00:00, 364.05it/s]


In [8]:
noncomm_dir = 'data/noncomm_use_subset/noncomm_use_subset/'
noncomm_files = utils.load_files(noncomm_dir)
noncomm_df = utils.generate_clean_df(noncomm_files)

100%|██████████| 2353/2353 [00:25<00:00, 91.44it/s] 
100%|██████████| 2353/2353 [00:10<00:00, 233.63it/s]


In [9]:
complete_df = pd.concat([clean_df, pmc_df, comm_df, noncomm_df])

In [10]:
complete_df = complete_df[complete_df['text'].apply(lambda x: len(re.findall(r"(?i)\b[a-z]+\b", x))) > 1000] 

In [11]:
frac_of_articles = 1
train_df  = complete_df.sample(frac = frac_of_articles, random_state = 42)
train_corpus = [i.split() for i in train_df["abstract"]] 
il = fse.IndexedList(train_corpus)

In [12]:
model = gensim.models.Word2Vec(min_count = 10, seed = 42, workers = 6)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples = model.corpus_count, epochs = model.epochs)

(16976348, 23391565)

In [13]:
se = fse.models.SIF(model)
se.train(il)

(20566, 4247590)

In [14]:
vecs = []
for i in range(len(se.sv)):
    vecs.append(se.sv[i])

In [15]:
train_df['abstract_vector'] = vecs

In [16]:
train_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,abstract_vector
10486,ac93c3f7b9207fea7f888e2c53950c66e96b072f,Importance of Viral Disease in Dairy Cow Ferti...,"D Claire Wathes, Chike F Oguejiofor, Carole Th...","D Claire Wathes (Royal Veterinary College, AL9...",Many viral diseases are endemic in cattle popu...,Although viral disease remains a major cause o...,Potential applications for antiviral therapy a...,"[{'first': 'D', 'middle': ['Claire'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Potenti...","[0.023329236, 0.14939576, 0.13574848, 0.137557..."
10111,118eb919e64fc55d82d5071ae893561a3f4af4b6,Engineering RNA for Targeted siRNA Delivery an...,"Peixuan Guo, Oana Coban, Nicholas M Snead, Joe...","Peixuan Guo (University of Cincinnati, 45221, ...",RNA engineering for nanotechnology and medical...,Cationic One research area in the emergent pop...,The developments of semisynthetic DNA-protein ...,"[{'first': 'Peixuan', 'middle': [], 'last': 'G...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The dev...","[-0.0290225, -0.10409663, 0.028612107, -0.1599..."
10160,88830e061b50d27536ef609ba2306bacdee78d57,"Labouring geography: Negotiating scales, strat...","Steven Tufts, Lydia Savage","Steven Tufts (York University, 4700 Keele Stre...",,In our editorial introduction to this themed i...,"Geographies of the justice for janitors, L L M...","[{'first': 'Steven', 'middle': [], 'last': 'Tu...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Geograp...","[1.12653794e-07, 1.1506873e-07, 1.4067794e-07,..."
3634,e10a3c96080f79b0406b3f938edd28236ab6b439,Immunogenicity of a killed Leishmania vaccine ...,"Rodolfo Cordeiro Giunchetti, Rodrigo Corrêa-Ol...",Rodolfo Cordeiro Giunchetti (Universidade Fede...,Cellular and humoral immune responses of dogs ...,Canine visceral leishmaniasis (CVL) is caused ...,Leishmaniasis: current situation and new persp...,"[{'first': 'Rodolfo', 'middle': ['Cordeiro'], ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Leishma...","[0.06994428, -0.15551212, -0.26030055, 0.00672..."
3265,cb91fc4ab4d7aff52357bfc8830954bcacdf5f30,In vitro and ex vivo analyses of co-infections...,"I Dobrescu, B Levast, K Lai, M Delgado-Ortega,...","I Dobrescu (University of Saskatchewan, 120 Ve...",,In vitro and ex vivo analyses of co-infections...,Differential sensitivity of well-differentiate...,"[{'first': 'I', 'middle': [], 'last': 'Dobresc...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Differe...","[1.12653794e-07, 1.1506873e-07, 1.4067794e-07,..."


In [17]:
list_of_tasks = [task_1, task_2, task_3, task_4, task_5, task_6, task_7, task_8, task_9]

In [18]:
def get_sif_vector(string, model):
    tmp = (string.split(), 0)
    return model.infer([tmp])

array_of_tasks = [get_sif_vector(task, se) for task in list_of_tasks]

In [19]:
# Nearest Neigbors search

train_df = train_df[train_df['abstract'].apply(lambda x: len(re.findall(r"(?i)\b[a-z]+\b", x))) > 40]
train_df.shape

(20123, 10)

In [31]:
tasks = []
for i in range(len(array_of_tasks)):
    tasks.append(array_of_tasks[i][0].tolist())

[[-0.09367571026086807,
  0.10938385874032974,
  0.46399176120758057,
  0.032029248774051666,
  -0.09903774410486221,
  -0.1217045709490776,
  -0.22442170977592468,
  -0.042359113693237305,
  -0.00272524356842041,
  -0.06361312419176102,
  0.15612637996673584,
  -0.13468889892101288,
  -0.08263194561004639,
  -0.08375783264636993,
  0.04024039953947067,
  -0.0055435895919799805,
  -0.17467966675758362,
  0.0353347584605217,
  -0.16941505670547485,
  0.012467660009860992,
  -0.0013268440961837769,
  -0.014757356606423855,
  0.2571451663970947,
  -0.027673691511154175,
  0.130849689245224,
  0.33539432287216187,
  -0.259182333946228,
  0.1682261973619461,
  -0.015593096613883972,
  0.06979138404130936,
  -0.1654740273952484,
  0.14121292531490326,
  -0.23015287518501282,
  -0.232635959982872,
  -0.08317632228136063,
  0.011912986636161804,
  -0.03374043107032776,
  -0.2675316333770752,
  -0.09394082427024841,
  0.05619344115257263,
  0.28531894087791443,
  0.2099190056324005,
  -0.023486

In [32]:
train_array = train_df['abstract_vector'].values.tolist()
ball_tree = NearestNeighbors(algorithm = 'ball_tree', leaf_size = 20).fit(train_array)

# Query for all tasks
distances, indices = ball_tree.kneighbors(tasks, n_neighbors = 3)
for i, info in enumerate(list_of_tasks):
    print("="*80, f"\n\nTask = {info[:100]}\n", )
    df =  train_df.iloc[indices[i]]
    abstracts = df['abstract']
    titles = df['title']
    dist = distances[i]
    for l in range(len(dist)):
        print(f" Text index = {indices[i][l]} \n Distance = {distances[i][l]} \n Title: {titles.iloc[l]} \n Abstract extract: {abstracts.iloc[l][:200]}\n\n")


Task = What is known about transmission, incubation, and environmental stability of COVID-19? What do we kn

 Text index = 4189 
 Distance = 0.6686317731444134 
 Title: Microbiological Safety of Drinking Water: United States and Global Perspectives 
 Abstract extract: Waterborne disease statistics only begin to estimate the global burden of infectious diseases from contaminated drinking water. Diarrheal disease is dramatically underreported and etiologies seldom di


 Text index = 12626 
 Distance = 0.6910183870236938 
 Title: Local risk perception enhances epidemic control 
 Abstract extract: As infectious disease outbreaks emerge, public health agencies often enact vaccination and social distancing measures to slow transmission. Their success depends on not only strategies and resources, 


 Text index = 18537 
 Distance = 0.696400797384311 
 Title: Roles of sunlight and natural ventilation for controlling infection: historical and current perspectives 
 Abstract extract: Airborne t