In [1]:
import os
import re
import json
import utils
import spacy
import random
import gensim

import numpy as np
import pandas as pd

from tasks import *
from tqdm.notebook import tqdm

In [2]:
biorxiv_dir = 'data/biorxiv_medrxiv/biorxiv_medrxiv/'
# os.listdir returns entries in arbitrary order, so the names are sorted to
# give every run (and every machine) the same article order; the seeded
# sampling and embedding training later in the notebook depend on row order.
# Only .json entries are kept so stray files (e.g. .DS_Store) never reach
# json.load in the loading cell.
filenames = sorted(
    name for name in os.listdir(biorxiv_dir) if name.endswith('.json')
)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 885


In [3]:
# Parse every biorxiv/medrxiv article JSON into memory, one dict per file.
all_files = []

for filename in filenames:
    # The context manager closes each handle as soon as the file is parsed;
    # the previous json.load(open(...)) form left every handle open until
    # garbage collection.
    with open(os.path.join(biorxiv_dir, filename), 'rb') as handle:
        all_files.append(json.load(handle))

In [4]:
# Flatten each raw article dict into one row of cleaned fields.
# The order of the entries must match the column names used when the
# DataFrame is built from cleaned_files.
cleaned_files = []

for paper in tqdm(all_files):
    metadata = paper['metadata']
    raw_authors = metadata['authors']
    raw_bibliography = paper['bib_entries']

    cleaned_files.append([
        paper['paper_id'],
        metadata['title'],
        utils.format_authors(raw_authors),
        utils.format_authors(raw_authors, with_affiliation = True),
        utils.format_body(paper['abstract']),
        utils.format_body(paper['body_text']),
        utils.format_bib(raw_bibliography),
        raw_authors,        # unformatted author records, kept alongside the formatted ones
        raw_bibliography,   # unformatted bibliography entries
    ])

HBox(children=(FloatProgress(value=0.0, max=885.0), HTML(value='')))




In [5]:
# Column labels, listed in the same order as the fields of each row in
# cleaned_files.
col_names = [
    'paper_id', 'title',
    'authors', 'affiliations',
    'abstract', 'text', 'bibliography',
    'raw_authors', 'raw_bibliography',
]

# One row per biorxiv/medrxiv article.
clean_df = pd.DataFrame(data = cleaned_files, columns = col_names)

In [6]:
# custom_license subset: load the files under pmc_dir with utils.load_files,
# then turn them into a DataFrame with utils.generate_clean_df.
# NOTE(review): presumably yields the same columns as clean_df (the later
# concat still reports 9 columns) -- confirm against utils.
pmc_dir = 'data/custom_license/custom_license/'
pmc_files = utils.load_files(pmc_dir)
pmc_df = utils.generate_clean_df(pmc_files)

100%|██████████| 16959/16959 [04:45<00:00, 59.48it/s]
100%|██████████| 16959/16959 [00:40<00:00, 418.10it/s]


In [7]:
# comm_use_subset: load the files under comm_dir with utils.load_files,
# then turn them into a DataFrame with utils.generate_clean_df.
# NOTE(review): presumably yields the same columns as clean_df (the later
# concat still reports 9 columns) -- confirm against utils.
comm_dir = 'data/comm_use_subset/comm_use_subset/'
comm_files = utils.load_files(comm_dir)
comm_df = utils.generate_clean_df(comm_files)

100%|██████████| 9118/9118 [01:39<00:00, 91.85it/s] 
100%|██████████| 9118/9118 [00:24<00:00, 371.86it/s]


In [8]:
# noncomm_use_subset: load the files under noncomm_dir with utils.load_files,
# then turn them into a DataFrame with utils.generate_clean_df.
# NOTE(review): presumably yields the same columns as clean_df (the later
# concat still reports 9 columns) -- confirm against utils.
noncomm_dir = 'data/noncomm_use_subset/noncomm_use_subset/'
noncomm_files = utils.load_files(noncomm_dir)
noncomm_df = utils.generate_clean_df(noncomm_files)

100%|██████████| 2353/2353 [00:30<00:00, 76.11it/s] 
100%|██████████| 2353/2353 [00:04<00:00, 483.77it/s]


In [9]:
# Stack the four article sources into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source keeps its own labels, so the same label would appear up to four
# times and label-based lookups (.loc, joins, alignment) become ambiguous.
complete_df = pd.concat(
    [clean_df, pmc_df, comm_df, noncomm_df],
    ignore_index = True,
)
complete_df.shape

(29315, 9)

In [10]:
# Keep only articles whose body text contains more than 1000 purely
# alphabetic tokens (case-insensitive, delimited by word boundaries).
alpha_word_pattern = re.compile(r"(?i)\b[a-z]+\b")
alpha_word_counts = complete_df['text'].apply(
    lambda body: len(alpha_word_pattern.findall(body))
)
complete_df = complete_df[alpha_word_counts > 1000]
complete_df.shape

(27139, 9)

In [11]:
# Fraction of the filtered articles to draw for training (1 = all of them);
# the fixed random_state makes the draw repeatable.
frac_of_articles = 1
train_df = complete_df.sample(
    frac = frac_of_articles,
    random_state = 42,
)

# Whitespace-tokenise every abstract: one list of tokens per article.
train_corpus = [abstract.split() for abstract in train_df["abstract"]]

In [12]:
# Create the Word2Vec model without a corpus, then build the vocabulary and
# train as two explicit steps on the tokenised abstracts.
# NOTE(review): seed is fixed but workers = 6; multi-threaded training may
# still vary between runs -- confirm against the gensim documentation if
# exact reproducibility is required.
word2vec_params = {'min_count': 10, 'seed': 42, 'workers': 6}
model = gensim.models.Word2Vec(**word2vec_params)
model.build_vocab(train_corpus)
model.train(
    train_corpus,
    total_examples = model.corpus_count,
    epochs = model.epochs,
)

(16973856, 23391565)

In [13]:
# The nine task objects, in numeric order. The task_N names are not defined in
# this notebook; presumably they come from the `from tasks import *` wildcard import.
list_of_tasks = [
    task_1,
    task_2,
    task_3,
    task_4,
    task_5,
    task_6,
    task_7,
    task_8,
    task_9,
]

In [14]:
# Export the trained word vectors in word2vec format.
word2vec_txt_path = "data/word2vec.txt"
model.wv.save_word2vec_format(word2vec_txt_path)

In [15]:
# Prep data for spacy model
!gzip data/word2vec.txt

In [16]:
# Init spacy model: run spaCy's `init-model` CLI for language `en`, writing to
# models/spacy.word2vec.model and taking vectors from data/word2vec.txt.gz.
# NOTE(review): `!python` is resolved through the shell PATH and may not be the
# interpreter the kernel runs on -- confirm both use the same environment.
!python -m spacy init-model en models/spacy.word2vec.model --vectors-loc data/word2vec.txt.gz

[2K[38;5;2m✔ Successfully created model[0m
24973it [00:01, 19213.88it/s]ord2vec.txt.gz
[2K[38;5;2m✔ Loaded vectors from data/word2vec.txt.gz[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
25351 entries, 24973 vectors


In [17]:
# Load the spaCy model from the directory written by the init-model step.
spacy_model_dir = 'models/spacy.word2vec.model/'
nlp = spacy.load(spacy_model_dir)