### Data preprocessing
In the original dataset from Kaggle, the papers are organized in different directories based on their source biomedical database. In this notebook, we:

1) Aggregate all the papers in a single file.

2) Select only the textual data (title, abstract and body of paper).

3) Pre-process the text: lowercase it and remove stop words, punctuation and numeric tokens.

In [None]:
import os
import json
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [3]:
# Directory holding each source's paper JSON files, keyed by the label that
# ends up in every paper's "source" field.
data_dirs = dict([
    ("biorxiv", "data/biorxiv_medrxiv/biorxiv_medrxiv"),
    ("comm_use_subset", "data/comm_use_subset/comm_use_subset"),
    ("custom_license", "data/custom_license/custom_license"),
    ("noncomm_use_subset", "data/noncomm_use_subset/noncomm_use_subset"),
])

#### Step 1: Merge all papers into a single json file
#### Step 2: Save only text data

In [4]:
def extract_abstract(abstract_parts: list) -> str:
    """
        Input:
            abstract_parts: a list, the 'abstract' key of the paper's json.
                Each element is a dict whose 'text' entry holds one part of
                the abstract. May be empty or None when there is no abstract.
        Output:
            The full text of the abstract as a string ("" when there is none).

        The abstract section in the paper's json gives us a list of the ordered parts of the text from the abstract.
        This method combines the divided abstract text into one string for the whole abstract.
    """
    # Some papers are missing abstracts. Treat None exactly like an empty
    # list instead of failing when trying to iterate over it.
    if not abstract_parts:
        return ""

    return ' '.join(abstract_part['text'] for abstract_part in abstract_parts)

def extract_body(body_parts: list) -> str:
    """
        Input:
            body_parts: a list, the 'body_text' key of the paper's json.
                Each element is a dict whose 'text' entry holds one part of
                the body. May be empty or None when there is no body text.
        Output:
            The full text of the body as a string ("" when there is none).

        The body section in the paper's json gives us a list of the ordered parts of the text from the body.
        This method combines the divided body text into one string for the whole body text.
    """
    # Treat None exactly like an empty list instead of failing when trying
    # to iterate over it.
    if not body_parts:
        return ""

    return ' '.join(body_part['text'] for body_part in body_parts)
        
def combine_all_text(save_to='data/all_text.json', sources=None):
    """
        Combines the text data from all datasets into a single json object
        and writes it to disk.
        The structure of the json is:
        {
            "paper_id": {
                "title": title of paper,
                "source": database it comes from,
                "abstract": abstract of paper,
                "body": full text of the body
            }
        }

        Input:
            save_to: path of the json file to write.
            sources: optional dict mapping a source name to the directory
                that holds that source's paper json files. When omitted,
                the notebook-level `data_dirs` mapping is used.
        Output:
            None. The combined json is only written to `save_to`.
    """
    if sources is None:
        sources = data_dirs

    filtered_json = {}
    for source, data_dir in sources.items():
        # os.listdir returns entries in arbitrary order; sorting makes every
        # run visit the papers (and build the output) in the same order.
        filenames = sorted(os.listdir(data_dir))
        print("{} papers from source {}...".format(len(filenames), source))

        for filename in tqdm(filenames):
            # Read as UTF-8 explicitly rather than relying on the platform's
            # default text encoding.
            with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
                paper_json = json.load(f)

            text_json = {
                "title": paper_json['metadata']['title'],
                "source": source,
                # Tolerate papers whose json has no 'abstract' key at all;
                # they are stored with an empty abstract.
                "abstract": extract_abstract(paper_json.get('abstract', [])),
                "body": extract_body(paper_json['body_text'])
            }

            # If the same paper_id shows up more than once, the last one wins.
            paper_id = paper_json['paper_id']
            filtered_json[paper_id] = text_json

    with open(save_to, 'w', encoding='utf-8') as f:
        print("Saving json to {}".format(save_to))
        json.dump(filtered_json, f)

# Build the merged corpus once; the path is kept in a variable because the
# pre-processing step refers to it again.
all_text_file = 'data/all_text.json'
combine_all_text(all_text_file)

1053 papers from source biorxiv...


HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))


9315 papers from source comm_use_subset...


HBox(children=(FloatProgress(value=0.0, max=9315.0), HTML(value='')))


20657 papers from source custom_license...


HBox(children=(FloatProgress(value=0.0, max=20657.0), HTML(value='')))


2350 papers from source noncomm_use_subset...


HBox(children=(FloatProgress(value=0.0, max=2350.0), HTML(value='')))


Saving json to data/all_text.json


#### Step 3: Text pre-processing.
* Normalize all letters to lowercase.
* Remove stopwords.
* Remove punctuation.
* Remove numeric tokens. 

In [5]:
# Lookup data shared by the text-cleaning helpers.
# `punctuation` stays a plain str, so `token in punctuation` is a substring test.
punctuation = string.punctuation
# English stop words from NLTK, held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

# Characters allowed inside a numeric token: decimal point, thousands
# separator, signs (ASCII '+'/'-' and the Unicode minus '−') and slashes.
# The deletion table is built once so is_number() does not redo it per token.
_NUMBER_SYMBOLS = str.maketrans('', '', '.,+-/\\−')

def is_number(x: str) -> bool:
    """
        Helper method, check if string is a number(int or float)

        Input:
            x: a single token.
        Output:
            True if, once the symbols '.', ',', '+', '-', '/', '\\' and '−'
            are stripped, at least one character is left and all remaining
            characters are digits.

        This is a deliberately loose heuristic: "3.14", "-7", "1/2", "1,000"
        and also "12-15" or "1.2.3" count as numbers, while "", "." and
        anything containing a letter (e.g. "covid-19", "1e5") do not.
    """
    return x.translate(_NUMBER_SYMBOLS).isdigit()

def preprocess_text(text: str) -> str:
    """
        Input:
            text: the raw text to clean.
        Output:
            The cleaned text: the surviving tokens joined by single spaces.

        Lowercases the text, tokenizes it with word_tokenize and drops
        stopwords, punctuation-only tokens and numeric tokens (see is_number).
    """
    tokens = word_tokenize(text.lower())
    kept_tokens = [
        t for t in tokens
        if t not in stop_words
        # `punctuation` is a str, so `t in punctuation` would be a substring
        # test and multi-character punctuation tokens (e.g. "...", "--", "''")
        # would slip through. Check every character of the token instead.
        and not all(ch in punctuation for ch in t)
        and not is_number(t)
    ]

    return ' '.join(kept_tokens)

def preprocess_json(paper_json: dict) -> dict:
    """
        Input:
            paper_json: a paper entry with 'abstract' and 'body' text fields.
        Output:
            The same dict object, with both fields replaced by their
            preprocessed text.

        The entry is modified in place (no copy is made); any other keys
        are left untouched.
    """
    for field in ('abstract', 'body'):
        paper_json[field] = preprocess_text(paper_json[field])

    return paper_json


# Input (merged raw text) and output (cleaned text) of the pre-processing step.
raw_text_path = 'data/all_text.json'
preprocessed_text_path = 'data/preprocessed_text.json'

with open(raw_text_path, 'r', encoding='utf-8') as f:
    all_data_json = json.load(f)

# Report the path that was actually opened, not a variable from another cell.
print("{} papers found in file {}".format(len(all_data_json), raw_text_path))

# preprocess_json rewrites each paper dict in place and returns it, so the
# entries of all_data_json and preprocessed_json are the same objects.
preprocessed_json = {}
for paper_id, paper_json in tqdm(all_data_json.items()):
    preprocessed_json[paper_id] = preprocess_json(paper_json)

with open(preprocessed_text_path, 'w', encoding='utf-8') as f:
    # json.dump returns None; its result is deliberately not assigned, so
    # all_data_json is not overwritten with None.
    json.dump(preprocessed_json, f)

33375 papers found in file


HBox(children=(FloatProgress(value=0.0, max=33375.0), HTML(value='')))


