In [1]:
import os
import re
import fse
import json
import utils
import random
import gensim
import warnings

import numpy as np
import pandas as pd

from tasks import *
from pprint import pprint
from tqdm.notebook import tqdm

In [2]:
biorxiv_dir = 'data/biorxiv_medrxiv/biorxiv_medrxiv/'
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting pins the order in which articles are loaded, so the row order of the
# resulting frame (and the seeded shuffle applied to it later) is the same on
# every machine and every re-run.
filenames = sorted(os.listdir(biorxiv_dir))
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 885


In [3]:
# Parse every bioRxiv/medRxiv JSON article into a list of dicts.
all_files = []

for filename in filenames:
    # The context manager closes each handle as soon as the article is parsed;
    # a bare json.load(open(...)) leaves one open file descriptor per article
    # until the garbage collector gets around to it.
    with open(os.path.join(biorxiv_dir, filename), 'rb') as handle:
        all_files.append(json.load(handle))

In [4]:
# Flatten each parsed article into one row of display-ready and raw fields.
cleaned_files = []

for paper in tqdm(all_files):
    metadata = paper['metadata']
    raw_authors = metadata['authors']
    raw_bib = paper['bib_entries']

    # Field order must match the column labels used when the frame is built:
    # id, title, authors, affiliations, abstract, text, bibliography, then the
    # two untouched raw structures.
    cleaned_files.append([
        paper['paper_id'],
        metadata['title'],
        utils.format_authors(raw_authors),
        utils.format_authors(raw_authors, with_affiliation = True),
        utils.format_body(paper['abstract']),
        utils.format_body(paper['body_text']),
        utils.format_bib(raw_bib),
        raw_authors,
        raw_bib,
    ])

HBox(children=(FloatProgress(value=0.0, max=885.0), HTML(value='')))




In [5]:
# Column labels, listed in the same order as the per-paper feature rows.
col_names = [
    'paper_id', 'title', 'authors', 'affiliations', 'abstract',
    'text', 'bibliography', 'raw_authors', 'raw_bibliography',
]

# One row per bioRxiv/medRxiv article.
clean_df = pd.DataFrame(data = cleaned_files, columns = col_names)

In [6]:
# custom_license subset: read the raw files, then build the cleaned frame.
# NOTE(review): load_files / generate_clean_df live in utils and are not shown
# here; presumably they mirror the manual bioRxiv load-and-clean steps — verify.
pmc_dir = 'data/custom_license/custom_license/'
pmc_files = utils.load_files(pmc_dir)
pmc_df = utils.generate_clean_df(pmc_files)

100%|██████████| 16959/16959 [04:50<00:00, 58.34it/s]
100%|██████████| 16959/16959 [00:42<00:00, 397.04it/s]


In [7]:
# comm_use_subset: read the raw files, then build the cleaned frame.
# NOTE(review): load_files / generate_clean_df live in utils and are not shown
# here; presumably they mirror the manual bioRxiv load-and-clean steps — verify.
comm_dir = 'data/comm_use_subset/comm_use_subset/'
comm_files = utils.load_files(comm_dir)
comm_df = utils.generate_clean_df(comm_files)

100%|██████████| 9118/9118 [02:23<00:00, 63.62it/s]
100%|██████████| 9118/9118 [00:26<00:00, 346.75it/s]


In [8]:
# noncomm_use_subset: read the raw files, then build the cleaned frame.
# NOTE(review): load_files / generate_clean_df live in utils and are not shown
# here; presumably they mirror the manual bioRxiv load-and-clean steps — verify.
noncomm_dir = 'data/noncomm_use_subset/noncomm_use_subset/'
noncomm_files = utils.load_files(noncomm_dir)
noncomm_df = utils.generate_clean_df(noncomm_files)

100%|██████████| 2353/2353 [00:32<00:00, 71.35it/s] 
100%|██████████| 2353/2353 [00:04<00:00, 487.72it/s]


In [9]:
# Stack the four sources into a single frame.
# No ignore_index is passed, so every source keeps its own row labels; labels
# may therefore repeat across sources. Downstream code should not rely on the
# index being unique — TODO confirm whether a reset is wanted.
source_frames = [clean_df, pmc_df, comm_df, noncomm_df]
complete_df = pd.concat(source_frames)

In [10]:
# Keep only papers whose body text contains more than 1000 purely alphabetic
# words (case-insensitive runs of a-z delimited by word boundaries).
word_pattern = re.compile(r"(?i)\b[a-z]+\b")
word_counts = complete_df['text'].apply(lambda body: len(word_pattern.findall(body)))
complete_df = complete_df[word_counts > 1000]

In [11]:
# Fraction of the filtered corpus to train on; 1 keeps every row (shuffled).
frac_of_articles = 1
train_df = complete_df.sample(random_state = 42, frac = frac_of_articles)

# Whitespace-tokenise each abstract. An empty abstract becomes an empty token
# list rather than being dropped, so it still occupies a slot in the corpus.
train_corpus = [abstract.split() for abstract in train_df["abstract"]]

# Indexed view of the corpus, in the form the fse sentence-embedding model takes.
il = fse.IndexedList(train_corpus)

In [12]:
# Train word vectors on the tokenised abstracts; words seen fewer than 10
# times are excluded from the vocabulary.
# NOTE(review): gensim documents that `seed` alone does not give a fully
# reproducible run when more than one worker thread is used — with workers = 6
# the vectors can differ slightly between runs.
model = gensim.models.Word2Vec(
    min_count = 10,
    seed = 42,
    workers = 6,
)
model.build_vocab(train_corpus)
model.train(
    train_corpus,
    total_examples = model.corpus_count,
    epochs = model.epochs,
)

(16976173, 23391565)

In [13]:
# Build an fse SIF sentence-embedding model on top of the trained word vectors
# and fit it on the indexed abstract corpus.
se = fse.models.SIF(model)
se.train(il)

In [33]:
# Pull every sentence vector out of the trained model, in corpus order.
vecs = [se.sv[row] for row in range(len(se.sv))]

In [35]:
# `vecs` is a plain list, so it is assigned by position, not by index label.
# This relies on the sentence vectors being in the same order as train_df's
# rows, which holds because the corpus was built by iterating train_df["abstract"].
train_df['abstract_vector'] = vecs

In [36]:
# Preview the first rows to check that abstract_vector was attached.
train_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography,abstract_vector
10486,ac93c3f7b9207fea7f888e2c53950c66e96b072f,Importance of Viral Disease in Dairy Cow Ferti...,"D Claire Wathes, Chike F Oguejiofor, Carole Th...","D Claire Wathes (Royal Veterinary College, AL9...",Many viral diseases are endemic in cattle popu...,Although viral disease remains a major cause o...,Potential applications for antiviral therapy a...,"[{'first': 'D', 'middle': ['Claire'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Potenti...","[-0.08560004, -0.09882225, -0.15302151, 0.1018..."
10111,118eb919e64fc55d82d5071ae893561a3f4af4b6,Engineering RNA for Targeted siRNA Delivery an...,"Peixuan Guo, Oana Coban, Nicholas M Snead, Joe...","Peixuan Guo (University of Cincinnati, 45221, ...",RNA engineering for nanotechnology and medical...,Cationic One research area in the emergent pop...,The developments of semisynthetic DNA-protein ...,"[{'first': 'Peixuan', 'middle': [], 'last': 'G...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The dev...","[0.19373953, -0.036176935, 0.15033433, -0.0580..."
10160,88830e061b50d27536ef609ba2306bacdee78d57,"Labouring geography: Negotiating scales, strat...","Steven Tufts, Lydia Savage","Steven Tufts (York University, 4700 Keele Stre...",,In our editorial introduction to this themed i...,"Geographies of the justice for janitors, L L M...","[{'first': 'Steven', 'middle': [], 'last': 'Tu...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Geograp...","[1.174044e-07, 1.1362632e-07, 1.2430547e-07, 1..."
3634,e10a3c96080f79b0406b3f938edd28236ab6b439,Immunogenicity of a killed Leishmania vaccine ...,"Rodolfo Cordeiro Giunchetti, Rodrigo Corrêa-Ol...",Rodolfo Cordeiro Giunchetti (Universidade Fede...,Cellular and humoral immune responses of dogs ...,Canine visceral leishmaniasis (CVL) is caused ...,Leishmaniasis: current situation and new persp...,"[{'first': 'Rodolfo', 'middle': ['Cordeiro'], ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Leishma...","[0.07127799, 0.11206261, -0.2715377, -0.019525..."
3265,cb91fc4ab4d7aff52357bfc8830954bcacdf5f30,In vitro and ex vivo analyses of co-infections...,"I Dobrescu, B Levast, K Lai, M Delgado-Ortega,...","I Dobrescu (University of Saskatchewan, 120 Ve...",,In vitro and ex vivo analyses of co-infections...,Differential sensitivity of well-differentiate...,"[{'first': 'I', 'middle': [], 'last': 'Dobresc...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Differe...","[1.174044e-07, 1.1362632e-07, 1.2430547e-07, 1..."


In [37]:
# task_1 ... task_9 are not defined in this notebook; presumably they come from
# the wildcard `from tasks import *` at the top — verify against tasks.py.
list_of_tasks = [task_1, task_2, task_3, task_4, task_5, task_6, task_7, task_8, task_9]

In [None]:
def get_sif_vector(string, model):
    """Embed a single piece of text with a trained sentence-embedding model.

    Args:
        string: Raw text; it is tokenised by splitting on whitespace.
        model: Any object exposing ``infer(sentences)``, where ``sentences`` is
            a list of ``(tokens, index)`` tuples.

    Returns:
        Whatever ``model.infer`` returns for the one-sentence batch.
    """
    # The sentence is wrapped as (tokens, 0): a single-item batch at index 0.
    return model.infer([(string.split(), 0)])

# One vector per task description.
# NOTE(review): get_sif_vector is defined in this same cell but never called;
# this line uses utils.get_doc_vector with the Word2Vec `model` instead of the
# SIF model `se`. Confirm which embedding is intended before comparing these
# vectors with the abstract_vector column.
array_of_tasks = [utils.get_doc_vector(task, model) for task in list_of_tasks]