In [1]:
import glob
import multiprocessing
import time
import csv
import pickle
import os
import pandas as pd
import shutil
import numpy as np
import warnings
import tqdm
import logging
import mxnet as mx
from bs4 import BeautifulSoup
from urllib.request import urlopen
from os.path import basename
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

%pylab inline
warnings.filterwarnings("ignore")

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the article table; later cells read its PMCID, PMCFILE and RCT columns.
trainingTable = pd.read_csv('trainingTable.csv')
# Directory name that is joined with each PMCFILE entry to locate the article file.
dest = 'RCTData'

In [3]:
# Display only the rows whose RCT column equals True.
trainingTable.loc[trainingTable['RCT'] == True]

Unnamed: 0,PMCID,RCT,PMCFILE
524,PMC1784628,True,PMC1784628.nxml
677,PMC1972548,True,PMC1972548.nxml
1519,PMC2150403,True,PMC2150403.nxml
1641,PMC2212824,True,PMC2212824.nxml
1654,PMC2222782,True,PMC2222782.nxml
2008,PMC2374335,True,PMC2374335.nxml
2045,PMC2396473,True,PMC2396473.nxml
2064,PMC2423861,True,PMC2423861.nxml
2296,PMC2615378,True,PMC2615378.nxml
2587,PMC2824823,True,PMC2824823.nxml


In [4]:
# Exploratory check on a single article: read the file, parse it, and pull
# text out of its <abstract> element(s).
fileName = 'RCTData/PMC5780715.nxml'
with open(fileName, 'r', encoding='ISO-8859-1') as data_file:
    rawdata = data_file.read()
soup = BeautifulSoup(rawdata)

# Start from an empty string so `abstract` is defined even when the document
# contains no <abstract> element.
abstract = ''
# find_all is the current method name; findAll is only kept as a legacy alias.
for hit in soup.find_all('abstract'):
    # Takes the text of the element's second child node; if several
    # <abstract> elements exist, the last one wins.
    abstract = hit.contents[1].text

In [5]:
# True when the parsed document has no element named 'Abstract'.
# `is None` tests identity directly instead of going through the element's __eq__.
soup.find('Abstract') is None

True

In [6]:
# Accumulators filled by the corpus-building loop: documents and their labels.
data_X, data_Y = [], []

# English stop-words from NLTK, extended with four extra tokens.
stop_words = set(stopwords.words('english')) | {'name', 'surname', 'given', 'first'}

# Tokenizer that keeps runs of ASCII letters and apostrophes.
regexp_tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# WordNet-based lemmatizer shared by tokenize().
wordnet_lemmatizer = WordNetLemmatizer()

In [7]:
# Turn a raw document into lemmatized, stop-word-free tokens
def tokenize(document, rebuild_document=True):
    """Tokenize, lower-case, filter and lemmatize a document.

    The document is split into sentences, each sentence is tokenized with the
    module-level ``regexp_tokenizer``, tokens whose lower-cased form is in
    ``stop_words`` are dropped, and the remaining lower-cased tokens are
    lemmatized with ``wordnet_lemmatizer``.

    Parameters
    ----------
    document : str
        Text to process.
    rebuild_document : bool, optional
        When True (default) return the tokens joined by single spaces;
        otherwise return the list of tokens.

    Returns
    -------
    str or list of str
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        for raw_token in regexp_tokenizer.tokenize(sentence):
            lowered = raw_token.lower()
            # Stop-words are filtered on the lower-cased form, before lemmatizing.
            if lowered in stop_words:
                continue
            lemmas.append(wordnet_lemmatizer.lemmatize(lowered))

    return ' '.join(lemmas).strip() if rebuild_document else lemmas

In [8]:
def getAbstract(fileName, parser=None):
    """Return the concatenated text of the abstract-like elements in a file.

    The file is read as ISO-8859-1 text and parsed with BeautifulSoup. For
    every candidate tag name, the text of the first matching element (if any)
    is appended to the result, each piece preceded by a single space.

    Parameters
    ----------
    fileName : str
        Path of the document to read.
    parser : str, optional
        Parser name forwarded to BeautifulSoup as its second argument. The
        default (None) leaves the choice to BeautifulSoup, which is what the
        function did before this parameter existed.

    Returns
    -------
    str
        The collected text, or '' when no candidate element is found.
    """
    # NOTE(review): Beautiful Soup's HTML parsers lower-case tag names, so the
    # mixed-case candidates can only match under an XML parser, and names
    # containing spaces look unlikely to match any element - confirm which
    # parser is actually in use.
    candidate_tags = ('Abstract', 'abstract', 'AbstractText', 'abstracttext', 'OtherAbstract',
                      'otherabstract', 'summary', 'summary and conclusions', 'conclusions and summary')

    with open(fileName, 'r', encoding='ISO-8859-1') as data_file:
        rawdata = data_file.read()
    soup = BeautifulSoup(rawdata, parser)

    pieces = []
    for tag_name in candidate_tags:
        element = soup.find(tag_name)
        # Identity test against None; `== None` would be routed through the
        # element's own equality method.
        if element is not None:
            pieces.append(' ' + element.text)
    return ''.join(pieces)

In [9]:
# Build the corpus: one tokenized abstract (data_X) and one label (data_Y)
# per row of trainingTable.
# The lists are rebuilt from scratch here so that re-running this cell does
# not append a second copy of the corpus to the existing lists.
data_X = []
data_Y = []

pmc_files = trainingTable['PMCFILE']
rct_labels = trainingTable['RCT']

for i in tqdm.tqdm(range(len(trainingTable))):
    # .iloc is positional, so it matches the 0..n-1 counter whatever the
    # frame's index labels are.
    fileName = os.path.join(dest, pmc_files.iloc[i])
    text = getAbstract(fileName)
    data_X.append(tokenize(text))
    data_Y.append(rct_labels.iloc[i])

100%|██████████| 19111/19111 [1:13:37<00:00,  4.33it/s]


In [10]:
# Persist the tokenized documents as a pickle file named 'data_X'.
p_file = 'data_X'
with open(p_file, mode='wb') as handle:
    pickle.dump(data_X, file=handle)

In [11]:
# Create tagged documents: every document is split into tokens once and
# tagged with its position in data_X. The same token lists are reused for
# inference below instead of splitting each document a second time.
# NOTE(review): ''.split(' ') yields [''], so an empty document becomes a
# single empty-string token - confirm that this is intended.
tokenized_docs = [document.split(' ') for document in data_X]
taggedDoc = [TaggedDocument(words, [i]) for i, words in enumerate(tokenized_docs)]

p_file = 'taggedDoc'
with open(p_file, 'wb') as fout:
    pickle.dump(taggedDoc, fout)

# Train Doc2Vec model
# https://arxiv.org/pdf/1405.4053v2.pdf
doc2vecSize = 32*32

# NOTE(review): `size` and `iter` are the keyword names used by older gensim
# releases (later renamed `vector_size` / `epochs`) - check the installed
# version before upgrading gensim.
doc2vecModel = Doc2Vec(documents=taggedDoc, size=doc2vecSize, window=3,
                        min_count=2, iter=30, workers=multiprocessing.cpu_count())
doc2vecModel.save('doc2vecModel')
doc2vecModel.init_sims(replace=False)

# Vectorize documents: infer one vector per document from the trained model.
vectors = [doc2vecModel.infer_vector(words) for words in tokenized_docs]

# Split corpus to training and testing. A fixed random_state makes the
# 75/25 partition identical on every run of the cell.
SPLIT_SEED = 42
d2v_X_train, d2v_X_test, d2v_Y_train, d2v_Y_test = train_test_split(
    vectors, data_Y, test_size=0.25, random_state=SPLIT_SEED)

In [12]:
# Convert the four train/test splits from Python lists to NumPy arrays.
X_train_array, Y_train_array, X_test_array, Y_test_array = (
    np.array(split_part)
    for split_part in (d2v_X_train, d2v_Y_train, d2v_X_test, d2v_Y_test)
)

In [13]:
# Persist the inferred document vectors as a pickle file named 'RCT_Vectors'.
p_file = 'RCT_Vectors'
with open(p_file, mode='wb') as handle:
    pickle.dump(vectors, file=handle)

In [14]:
# Persist the document labels as a pickle file named 'RCT_labels'.
p_file = 'RCT_labels'
with open(p_file, mode='wb') as handle:
    pickle.dump(data_Y, file=handle)