In [None]:
## Use random forest classifier to determine the article type(primary/secondary)
## of a medical paper
import json
import pandas as pd
import re
import nltk
import numpy as np
#nltk.download('all')
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import matplotlib.pyplot as plt

In [None]:
# Read the JSON article dump from disk into memory.
with open('pmid-pdf.json', mode='r', encoding='UTF-8') as f:
    data_full = json.load(f)

In [None]:
# Wrap the loaded JSON data in a DataFrame for column-wise access.
df = pd.DataFrame(data=data_full)

In [None]:
# Read the training table, decoding the file as ISO-8859-1.
train = pd.read_csv(
    'training.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Go through the training table row by row and read each row's pmid.
# NOTE(review): the value is only rebound, never used here -- this looks like
# a leftover stub; confirm whether it can be dropped.
for i in range(train.shape[0]):
    pmid = train.loc[i, 'pmid']

In [None]:
# Build a heuristically labelled frame `new_df` from the raw articles in `df`.
#
# Labelling rules (flag column):
#   0 -> the article has no non-empty section whose header mentions
#        "method" or "result"
#   1 -> it has such a section AND none of the `secondary` terms appears in
#        its keywords, abstract or title
# Articles that have a method/result section but do mention a secondary term
# are left out, as are articles without an abstract.
columns = ['pmid','flag','keywords','title','abstract','structure']
secondary = ['review','book','comment','news','meta analysis']

# One case-insensitive pattern compiled once: any secondary term located at
# the very start of the text or directly after a space.
secondary_re = re.compile(
    r"(?:^| )(?:" + "|".join(re.escape(term) for term in secondary) + r")",
    re.I,
)

records = []  # rows are collected here and turned into a frame once at the end
flag = 0
k = 0
for i in range(len(df)):
    raw_abstract = df['_abstract'][i]
    # Detect a missing abstract BEFORE calling str(): str(None) is 'None' and
    # str(nan) is 'nan', both truthy, so an emptiness test made after the
    # conversion can never skip such rows.
    abstract_missing = raw_abstract is None or (
        isinstance(raw_abstract, float) and raw_abstract != raw_abstract
    )
    if abstract_missing:
        continue
    abstract = str(raw_abstract)
    if not abstract:
        continue

    pmid = df['pmid'][i]
    body = df['body'][i]
    keywords = df['keywords'][i]
    title = df['title'][i]

    # Collect the text of method/result sections and the sequence of headers.
    method_text = ''
    result_text = ''
    structure = ''
    for section in body:
        header = section['header']
        if re.search('method', header, re.I):
            method_text += section['content']
        if re.search('result', header, re.I):
            result_text += section['content']
        structure += " " + header

    if not method_text and not result_text:
        flag = 0
    elif any(
        isinstance(text, str) and secondary_re.search(text)
        for text in (keywords, abstract, title)
    ):
        # Has method/result content but also mentions a secondary term:
        # neither rule applies, so the article is not labelled.
        continue
    else:
        flag = 1

    records.append([pmid, flag, keywords, title, abstract, structure])
    k += 1
    # Report progress only when a row was actually added; otherwise the same
    # count would be printed again for every skipped article.
    if k % 500 == 0:
        print("processed:", k)

new_df = pd.DataFrame(records, columns=columns)

In [None]:
# Display only the rows whose flag equals 1.
new_df[new_df['flag'].eq(1)]

In [None]:
def review_cleaner(reviews,lemmatize=False,stem=False):
    '''
    Clean and preprocess a collection of text documents.

    For every document:
    1. Replace each character that is not an ASCII letter or digit with a space
    2. Lower-case the text and split it on whitespace
    3. Drop English stopwords (NLTK list)
    4. Optionally normalise each remaining word: lemmatize when `lemmatize`
       is truthy, otherwise stem when `stem` is truthy
    5. Re-join the words into one space-separated string

    Parameters
    ----------
    reviews : iterable of str
        The documents to clean.
    lemmatize : bool, default False
        Apply WordNetLemmatizer to every kept word. Takes precedence over `stem`.
    stem : bool, default False
        Apply PorterStemmer to every kept word. In this mode the literal
        token 'oed' is discarded.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.

    Side effects
    ------------
    Prints a progress line after every 500 documents and, when more than one
    document was processed, plots the per-document token counts with plt.plot.
    '''
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    # Loop-invariant work: build the stopword set and the regex a single time
    # instead of once per document.
    eng_stopwords = set(stopwords.words("english"))
    non_alnum = re.compile("[^a-zA-Z0-9]")

    length = []
    cleaned_reviews=[]
    for i,review in enumerate(reviews):
        # print progress
        if( (i+1)%500 == 0 ):
            print("Done with %d reviews" %(i+1))

        # 1-2. Strip everything but letters/digits, lower-case, tokenize
        tokens = non_alnum.sub(" ", review).lower().split()

        # 3-4. Remove stopwords and optionally normalise the survivors
        clean_review=[]
        for word in tokens:
            if word in eng_stopwords:
                continue
            if lemmatize:
                word=wnl.lemmatize(word)
            elif stem:
                if word == 'oed':
                    continue
                word=ps.stem(word)
            clean_review.append(word)

        # 5. Join the tokens back into one string
        length.append(len(clean_review))
        cleaned_reviews.append(' '.join(clean_review))

    if len(length) > 1:
        plt.plot(length)
    return(cleaned_reviews)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# CountVectorizer can actually handle a lot of the preprocessing for us
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics # for confusion matrix, accuracy score etc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import pickle

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably intended to make the random forest below
# repeatable, since no random_state is passed to it -- confirm.
np.random.seed(0)

def train_predict_sentiment(cleaned_reviews,y=None,ngram=2,max_features=500):
    '''Train and evaluate a random forest on n-gram counts of the given texts.

    This function will:
    1. split data into train and test set (80/20, random_state=0).
    2. get n-gram counts from cleaned reviews (vocabulary learned on the
       training split only)
    3. train a random forest model using the train n-gram counts and y (labels)
    4. test the model on the test split
    5. pickle the fitted forest to 'primary.sav' and the fitted vectorizer
       to 'primary_vector.sav'
    6. print accuracy on the training and test data
    7. print the confusion matrix of the test predictions
    8. print the 50 most important features

    Parameters
    ----------
    cleaned_reviews : sequence of str
        The preprocessed documents.
    y : pandas Series, optional
        Labels aligned with `cleaned_reviews`; converted to int. When omitted,
        the notebook-level new_df['flag'] is looked up at call time.
    ngram : int, default 2
        Upper n-gram size; n-grams of length 1..ngram are counted.
    max_features : int, default 500
        Maximum vocabulary size kept by the CountVectorizer.

    Returns
    -------
    None
    '''
    # Resolve the default at call time: a default bound in the signature is
    # captured when the function is defined and goes stale if new_df is rebuilt.
    if y is None:
        y = new_df['flag']

    print("Creating the bag of words model!\n")
    # CountVectorizer is scikit-learn's bag of words tool
    y=y.astype('int') # the labels may arrive as an object column
    vectorizer = CountVectorizer(ngram_range=(1, ngram),
                                 analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = max_features)

    X_train, X_test, y_train, y_test = train_test_split(
        cleaned_reviews, y, random_state=0, test_size=.2)

    # fit_transform() learns the vocabulary on the training texts and turns
    # them into feature vectors; the test texts are only transformed.
    # .toarray() converts the sparse result to a dense numpy array.
    train_bag = vectorizer.fit_transform(X_train).toarray()
    test_bag = vectorizer.transform(X_test).toarray()

    print("Training the random forest classifier!\n")
    # Initialize a Random Forest classifier with 50 trees
    forest = RandomForestClassifier(n_estimators = 50)

    # Fit the forest to the training set, using the bag of words as
    # features and the labels as the target variable
    forest = forest.fit(train_bag, y_train)

    train_predictions = forest.predict(train_bag)
    test_predictions = forest.predict(test_bag)

    train_acc = metrics.accuracy_score(y_train, train_predictions)
    valid_acc = metrics.accuracy_score(y_test, test_predictions)

    # Persist model and vectorizer; `with` guarantees the files are flushed
    # and closed even if pickling fails.
    filename = 'primary.sav'
    vectorfile = 'primary_vector.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(forest, model_file)
    with open(vectorfile, 'wb') as vector_file:
        pickle.dump(vectorizer, vector_file)

    print(" The training accuracy is: ", train_acc, "\n", "The validation accuracy is: ", valid_acc)
    print()
    print('CONFUSION MATRIX:')
    print('         Predicted')
    print('          neg pos')
    print(' Actual')
    c=confusion_matrix(y_test, test_predictions)
    print('     neg  ',c[0])
    print('     pos  ',c[1])

    # Extract feature importance
    print('\nTOP IMPORTANT FEATURES:')
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    top_features = indices[:50]
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both, and fetch the names
    # once rather than once per index.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    print([feature_names[ind] for ind in top_features])

In [None]:
# Clean the section-header strings of the labelled articles
# (neither lemmatizing nor stemming).
original_clean_reviews = review_cleaner(
    new_df['structure'],
    lemmatize=False,
    stem=False,
)

In [None]:
# Train and evaluate the classifier on the cleaned header strings, using
# n-grams up to length 2 and a vocabulary limit of 500.
train_predict_sentiment(
    cleaned_reviews=original_clean_reviews,
    y=new_df['flag'],
    ngram=2,
    max_features=500,
)

In [None]:
# Build the prediction input: one row per article holding its pmid and the
# space-prefixed concatenation of its section headers. Articles whose body
# has no sections are skipped.
# NOTE: the frame is named `input` (shadowing the builtin) because the
# following cells refer to it by that name.
columns = ['pmid','structure']
records = []  # collected rows; building the frame once avoids quadratic row-wise growth
k=0
for i in range(len(df)):
    pmid = df['pmid'][i]
    body = df['body'][i]
    structure = ''.join(" " + section['header'] for section in body)
    if not structure:
        continue
    records.append([pmid, structure])
    k=k+1
    if (k % 500 == 0):
        print("Processed:", k)
input = pd.DataFrame(records, columns=columns)
print(input)

In [None]:
# Apply the same cleaning to the prediction input as was used for training
# (neither lemmatizing nor stemming).
input_cleaned = review_cleaner(
    input['structure'],
    lemmatize=False,
    stem=False,
)

In [None]:
# Reload the persisted vectorizer and classifier, vectorize the cleaned
# input texts and predict a label for each of them.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load .sav files that were produced by this notebook.
vectorfile='primary_vector.sav'
with open(vectorfile,'rb') as vector_file:  # `with` closes the handle after loading
    vectorizer = pickle.load(vector_file)
input_bag = vectorizer.transform(input_cleaned).toarray()
filename='primary.sav'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(input_bag)

In [None]:
# Tally how many predictions equal 1 and report the total.
j = sum(1 for label in result if label == 1)
print("predicted Research as Primary:",j)

In [None]:
# Combine each input article's pmid and header string with its predicted
# flag, preview the first rows and write the table to CSV.
columns = ['pmid','flag','structure']
if len(result) != len(input):
    # Every input row needs exactly one prediction; a mismatch means `result`
    # was computed from a different version of `input`.
    raise ValueError("result and input have different lengths; re-run the prediction cell")
# Assemble the frame column-wise in one step instead of growing it row by row.
output = pd.DataFrame(
    {
        'pmid': input['pmid'].values,
        'flag': result,
        'structure': input['structure'].values,
    },
    columns=columns,
)
print (output.head())
output.to_csv('primary_output.csv')

In [None]:
# Make this cell usable on its own: (re)load the article data and the
# persisted model/vectorizer only when they are not already in the kernel.
try:
    df
except NameError:
    # df was never created in this kernel; start from an empty frame so the
    # reload below runs instead of failing on the name lookup.
    df = pd.DataFrame()
if df.empty:
    with open('pmid-pdf.json','r',encoding='UTF-8') as f:
        data_full = json.load(f)
    df=pd.DataFrame(data_full)

try:
    # Both objects are needed together, so check both names.
    primary_model
    primary_vector
except NameError:
    filename='primary.sav'
    vectorfile='primary_vector.sav'
    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only load .sav files that were produced by this notebook.
    with open(filename, 'rb') as model_file:
        primary_model = pickle.load(model_file)
    with open(vectorfile,'rb') as vector_file:
        primary_vector = pickle.load(vector_file)

# Function to determine whether a research paper is primary or secondary.
# input: pmid
# output: 0 (Secondary), 1 (Primary), 'n_a' (if not applicable)
# requires:
# 1. def review_cleaner()
# 2. the classifier model: primary.sav
# 3. the training vectorizer: primary_vector.sav
# 4. pmid-pdf.json (original data file)

def pmid2primary(pmid):
    """Classify the article with the given pmid from its section headers.

    Looks up every row of the notebook-level `df` whose 'pmid' equals `pmid`,
    concatenates the section headers of the matching bodies, cleans that
    string with review_cleaner, vectorizes it with `primary_vector` and
    predicts with `primary_model`.

    Parameters
    ----------
    pmid
        Identifier compared with == against df['pmid'].

    Returns
    -------
    The model's prediction for the article (first element of predict()), or
    the string 'n_a' when no row matches or the article has no sections.

    Side effects
    ------------
    Prints the concatenated header string before predicting.
    """
    try:
        bodies = df.loc[df['pmid']==pmid]['body'].values
    except ValueError:
        return 'n_a'

    # Iterate explicitly rather than testing the array's truth value, which
    # is ambiguous for empty arrays and for more than one matching row.
    structure = ''
    for body in bodies:
        if not body:
            continue
        for section in body:
            structure = structure + " " + section['header']
    if not structure:
        return 'n_a'

    print(structure)
    text_cleaned = review_cleaner([structure],lemmatize=False,stem=False)
    input_bag = primary_vector.transform(text_cleaned).toarray()
    # Use the model loaded alongside primary_vector in this cell, not a
    # variable left over from another cell.
    result = primary_model.predict(input_bag)
    return result[0]

In [None]:
# Try the classifier on three sample pmids and print each outcome.
for pmid in (27679783, 16673520, 11111111):
    flag = pmid2primary(pmid)
    print(flag)