In [None]:
## import packages
import json
import re
import heapq
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
#nltk.download('all')
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams

In [None]:
## load the original dataset
# Parse the whole JSON file into memory. The `with` statement closes the
# file when the block exits, so no explicit close() call is needed.
with open('pmid-pdf.json', 'r', encoding='UTF-8') as f:
    data_full = json.load(f)

In [None]:
## extract the papers with results
# Keep each paper that has at least one non-empty section whose header is
# 'result' or 'results' (case-insensitive).
data_cleaned = []

for data in data_full:
    for chunk in data['body']:
        if chunk['header'].lower() in ('result', 'results') and len(chunk['content']) > 0:
            data_cleaned.append(data)
            # Stop at the first matching section so a paper with several
            # result sections is added only once.
            break

In [None]:
## get the title of a paper
def get_title(in_data):
    """Return the value stored under the 'title' key of a paper record."""
    paper_title = in_data['title']
    return paper_title

In [None]:
## get the authors of a paper
def get_authors(in_data):
    """Return the 'forename' of every entry in in_data['authors'], comma-joined.

    Only the 'forename' field of each author entry is read; the names are
    joined with ',' and no surrounding whitespace.
    """
    forenames = []
    for author in in_data['authors']:
        forenames.append(author['forename'])
    return ','.join(forenames)

In [None]:
## get the id (pmid) of a paper
def get_pmid(in_data):
    """Return the value stored under the 'id' key of a paper record."""
    return in_data['id']

In [None]:
## get abstract of the paper
def get_abstract(in_data):
    """Return the value stored under the '_abstract' key of a paper record."""
    return in_data['_abstract']

In [None]:
## get summarization of the results of a paper
def get_sum(in_data, top_n=5, max_sentence_words=35):
    """Build an extractive summary of a paper's results section.

    The first section of ``in_data['body']`` whose header is 'result' or
    'results' (case-insensitive) and whose content is non-empty is used.
    Each sentence is scored by summing the normalised frequencies of the
    non-stopword words it contains, and the highest-scoring sentences are
    joined with single spaces.

    Parameters
    ----------
    in_data : dict
        Paper record with a 'body' list of chunks, each having 'header' and
        'content' keys.
    top_n : int, optional
        Maximum number of sentences in the summary (default 5).
    max_sentence_words : int, optional
        Sentences with this many or more space-separated words are never
        selected (default 35).

    Returns
    -------
    str
        The results text itself when it is shorter than 30 characters; an
        empty string when there is no usable results section or nothing can
        be scored; otherwise the selected sentences in descending score
        order (not in document order).
    """

    ## extract the result part of the paper
    def results(in_data):
        for chunk in in_data['body']:
            if chunk['header'].lower() in ('result', 'results') and len(chunk['content']) > 0:
                return chunk['content']
        return None

    result = results(in_data)

    ## no non-empty results section: nothing to summarize
    if result is None:
        return ''

    ## if the result is short, we don't need to summarize it
    if len(result) < 30:
        return result

    ## clean the text; lower-case it so the frequency table uses the same
    ## case as the lower-cased sentence tokens looked up below, and so that
    ## capitalised stopwords ("The", "In", ...) are filtered as well
    result_cleaned = re.sub('[^a-zA-Z]', ' ', result)
    result_cleaned = re.sub(r'\s+', ' ', result_cleaned).lower()
    sentences = nltk.sent_tokenize(result)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    word_freq = {}
    for word in nltk.word_tokenize(result_cleaned):
        if word not in stop_words:
            word_freq[word] = word_freq.get(word, 0) + 1

    ## text made only of digits, symbols or stopwords has nothing to score
    if not word_freq:
        return ''

    ## normalise counts by the most frequent word
    max_freq = max(word_freq.values())
    for word in word_freq:
        word_freq[word] = word_freq[word] / max_freq

    ## score the sentences
    sentence_score = {}
    for sentence in sentences:
        ## overly long sentences are never candidates
        if len(sentence.split(' ')) >= max_sentence_words:
            continue
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_freq:
                sentence_score[sentence] = sentence_score.get(sentence, 0) + word_freq[word]

    ## choose the highest ranked sentences as summarization
    summary_sentence = heapq.nlargest(top_n, sentence_score, key=sentence_score.get)
    return ' '.join(summary_sentence)

In [None]:
# Table read from primary_output.csv; get_article_type reads its "pmid" and "flag" columns.
article_type = pd.read_csv("primary_output.csv")
def get_article_type(pmid):
    """Map a pmid to "primary", "secondary" or "na" using the article_type table.

    The first row of ``article_type`` whose "pmid" equals ``pmid`` decides the
    result: flag 0 gives "secondary", flag 1 gives "primary". A missing pmid
    or any other flag value gives "na".
    """
    matching_flags = article_type.loc[article_type["pmid"] == pmid, "flag"]
    if matching_flags.empty:
        return "na"

    first_flag = matching_flags.values[0]
    for code, label in ((0, "secondary"), (1, "primary")):
        if first_flag == code:
            return label
    return "na"

In [None]:
# Table read from design_output.csv; get_study_design reads its "pmid" and "flag" columns.
study_design = pd.read_csv("design_output.csv")
def get_study_design(pmid):
    """Map a pmid to "randomized", "non-randomized" or "na" using study_design.

    The first row of ``study_design`` whose "pmid" equals ``pmid`` decides the
    result: flag 0 gives "non-randomized", flag 1 gives "randomized". A
    missing pmid or any other flag value gives "na".
    """
    matching_flags = study_design.loc[study_design["pmid"] == pmid, "flag"]
    if matching_flags.empty:
        return "na"

    first_flag = matching_flags.values[0]
    for code, label in ((0, "non-randomized"), (1, "randomized")):
        if first_flag == code:
            return label
    return "na"

In [None]:
# Empty output table; the list fixes the column names and their order.
output_columns = [
    'title',
    'authors',
    'pmid',
    'summarization',
    'article type',
    'study design',
]
dataframe = pd.DataFrame(columns=output_columns)

In [None]:
# Collect one record per paper and build the frame in a single call.
# Enlarging a DataFrame one row at a time with .loc copies data on every
# insert and needs a hand-maintained row counter.
output_columns = ['title', 'authors', 'pmid', 'summarization', 'article type', 'study design']
records = []
for data in data_cleaned:
    # the lookup tables are queried with an integer pmid
    pmid = int(get_pmid(data))
    records.append([
        get_title(data),
        get_authors(data),
        pmid,
        get_sum(data),
        get_article_type(pmid),
        get_study_design(pmid),
    ])

# Rows get the default 0..n-1 index, matching the previous counter-based index.
dataframe = pd.DataFrame(records, columns=output_columns)

In [None]:
import os

# Write the table to <current working directory>/datax_proj/happy.csv.
# The path is assembled with os.path.join instead of string concatenation,
# and the target folder is created first so the write does not fail when it
# is missing.
current_path = os.getcwd()
output_dir = os.path.join(current_path, "datax_proj")
os.makedirs(output_dir, exist_ok=True)
dataframe.to_csv(os.path.join(output_dir, "happy.csv"))