In [1]:
import numpy as np
import pandas as pd
import os
import json
import glob
import sys
from tqdm import tqdm
from IPython.display import FileLink

In [2]:
sys.path.insert(0, "../")

root_path = '/kaggle/input/CORD-19-research-challenge'

# Initialize a dictionary with the following 5 keys corresponding to the 5 features of the final dataframe.
covid_features = {"doc_id": [None], "title": [None], "abstract": [None], "text_body": [None]}

# Construct a dataframe from the dictionary. 
covid_df = pd.DataFrame.from_dict(covid_features)

# Import all files from the project directory which end with .json.
json_filenames = glob.glob(f'{root_path}/**/*.json', recursive=True)

# Dataset construction of 10000 documents.

In [3]:
def return_covid_df_10000(json_filenames, df):
    
    for file_name in tqdm(json_filenames[20000:30000]): # Use only 10000 documents.

        row = {}

        with open(file_name) as json_data:
            data = json.load(json_data)

            # Store the document id and title as values in the 'row' dictionary.
            doc_id = data['paper_id']
            row['doc_id'] = doc_id
            row['title'] = data['metadata']['title']

            # Store the document abstract as a value in the 'row' dictionary. 
            # Put it all in a list,then use str.join() to split it into paragraphs. 
            try:
                abstract_list = [abst['text'] for abst in data['abstract']]
                abstract = "\n ".join(abstract_list)
                row['abstract'] = abstract
                
            except: # Ignore documents which have nan as values for the abstract.
                row['abstract'] = np.nan
                
            # Store the document body text as a value in the 'row' dictionary.  
            # Perform the same operations as for the abstract.
            body_list = [bt['text'] for bt in data['body_text']]
            body = "\n ".join(body_list)
            row['text_body'] = body
            
            # Append each 'row' dictionary to the dataframe.
            df = df.append(row, ignore_index=True)
                 
    return df

In [4]:
covid_df_10000 = return_covid_df_10000(json_filenames,covid_df)
covid_df_10000

100%|██████████| 10000/10000 [01:34<00:00, 106.10it/s]


Unnamed: 0,doc_id,title,abstract,text_body
0,,,,
1,5ac635d21fdb1726f21d7090a233dd2b5ffdc1b5,"Long Term Predictors of Breathlessness, Exerci...",J o u r n a l P r e -p r o o f 2,With over 246 million Coronavirus disease 2019...
2,71d93a6452061c57ae1532991d4a20cd6fc6fcb3,Emergence of universality in the transmission ...,The complexities involved in modelling the tra...,• Even though the pattern of disease spread is...
3,635ba7eca764f7caaa82904197c40a20111ec940,Supporting families to protect child health: P...,Supportive parenting is critical for promoting...,a1111111111 a1111111111 a1111111111 a111111111...
4,0282d2f9eb65318e40db21efbe6172ce16c8acf5,Perspectives on the Early Quality of Evidence ...,Background: The severe acute respiratory syndr...,The severe acute respiratory syndrome coronavi...
...,...,...,...,...
9996,e6fe93e18fe7e74cc6d90086205e877c4245df35,Journal Pre-proof Targeted delivery of inhalab...,The coronavirus disease 2019 pandemic has led ...,Coronavirus disease 2019 (COVID-19) is highly ...
9997,a6f37939b6a069def17ff724d2d49bbb41ce8f82,,Governments around the globe have started to d...,"The novel coronavirus , first detected late 20..."
9998,8126a598911ae325a6ebf1e1ca2fb9ae08cc73a2,,,as a means of preventing transmission (Chu et ...
9999,cc3f4dc92499c431d05f71a9a4d0e74704f0a30b,Surveillance of acute SARS-CoV-2 infections in...,Background: Switzerland had one of the highest...,The role of schools in severe acute respirator...


In [5]:
covid_df_10000.isnull().sum(axis = 0) # Check if there any missing values in the columns. We are interested in the 'text_body' column.

doc_id       1
title        1
abstract     1
text_body    1
dtype: int64

In [6]:
covid_df_10000.dropna(subset=['text_body'],inplace=True) # Drop rows with missing values for the 'text_body' column.
covid_df_10000

Unnamed: 0,doc_id,title,abstract,text_body
1,5ac635d21fdb1726f21d7090a233dd2b5ffdc1b5,"Long Term Predictors of Breathlessness, Exerci...",J o u r n a l P r e -p r o o f 2,With over 246 million Coronavirus disease 2019...
2,71d93a6452061c57ae1532991d4a20cd6fc6fcb3,Emergence of universality in the transmission ...,The complexities involved in modelling the tra...,• Even though the pattern of disease spread is...
3,635ba7eca764f7caaa82904197c40a20111ec940,Supporting families to protect child health: P...,Supportive parenting is critical for promoting...,a1111111111 a1111111111 a1111111111 a111111111...
4,0282d2f9eb65318e40db21efbe6172ce16c8acf5,Perspectives on the Early Quality of Evidence ...,Background: The severe acute respiratory syndr...,The severe acute respiratory syndrome coronavi...
5,d96113a2d8691d3b1aee5fd1b5d30241f2b2a633,Quantify the role of superspreaders -opinion l...,Effective communication of accurate informatio...,We design a mathematical model to quantify the...
...,...,...,...,...
9996,e6fe93e18fe7e74cc6d90086205e877c4245df35,Journal Pre-proof Targeted delivery of inhalab...,The coronavirus disease 2019 pandemic has led ...,Coronavirus disease 2019 (COVID-19) is highly ...
9997,a6f37939b6a069def17ff724d2d49bbb41ce8f82,,Governments around the globe have started to d...,"The novel coronavirus , first detected late 20..."
9998,8126a598911ae325a6ebf1e1ca2fb9ae08cc73a2,,,as a means of preventing transmission (Chu et ...
9999,cc3f4dc92499c431d05f71a9a4d0e74704f0a30b,Surveillance of acute SARS-CoV-2 infections in...,Background: Switzerland had one of the highest...,The role of schools in severe acute respirator...


# Dataset construction of 20000 documents.

In [7]:
def return_covid_df_20000(json_filenames, df):
    
    for file_name in tqdm(json_filenames[20000:40000]): # Use only 20000 documents.

        row = {}

        with open(file_name) as json_data:
            data = json.load(json_data)

            # Store the document id and title as values in the 'row' dictionary.
            doc_id = data['paper_id']
            row['doc_id'] = doc_id
            row['title'] = data['metadata']['title']

            # Store the document abstract as a value in the 'row' dictionary. 
            # Put it all in a list,then use str.join() to split it into paragraphs. 
            try:
                abstract_list = [abst['text'] for abst in data['abstract']]
                abstract = "\n ".join(abstract_list)
                row['abstract'] = abstract
                
            except: # Ignore documents which have nan as values for the abstract.
                row['abstract'] = np.nan
                
            # Store the document body text as a value in the 'row' dictionary.  
            # Perform the same operations as for the abstract.
            body_list = [bt['text'] for bt in data['body_text']]
            body = "\n ".join(body_list)
            row['text_body'] = body
            
            # Append each 'row' dictionary to the dataframe.
            df = df.append(row, ignore_index=True)
                 
    return df

In [8]:
covid_df_20000 = return_covid_df_20000(json_filenames,covid_df)
covid_df_20000

100%|██████████| 20000/20000 [02:50<00:00, 117.63it/s]


Unnamed: 0,doc_id,title,abstract,text_body
0,,,,
1,5ac635d21fdb1726f21d7090a233dd2b5ffdc1b5,"Long Term Predictors of Breathlessness, Exerci...",J o u r n a l P r e -p r o o f 2,With over 246 million Coronavirus disease 2019...
2,71d93a6452061c57ae1532991d4a20cd6fc6fcb3,Emergence of universality in the transmission ...,The complexities involved in modelling the tra...,• Even though the pattern of disease spread is...
3,635ba7eca764f7caaa82904197c40a20111ec940,Supporting families to protect child health: P...,Supportive parenting is critical for promoting...,a1111111111 a1111111111 a1111111111 a111111111...
4,0282d2f9eb65318e40db21efbe6172ce16c8acf5,Perspectives on the Early Quality of Evidence ...,Background: The severe acute respiratory syndr...,The severe acute respiratory syndrome coronavi...
...,...,...,...,...
19996,ffdc50d239f8e531dc159993723c0cbc176a632e,Comparative analysis of antibody-and lipid-bas...,Multiplexing of samples in single-cell RNA-seq...,Recent advances in single-cell and single-nucl...
19997,b37bf3ab1d814f853a42cd8958d7390b42343bd2,,,To the Editor:\n We read with interest the art...
19998,2c70b8caf708c5c1b0c9b811a80179061274cff7,Nervensystem,,zeichen diagnostisch eine Rolle spielen. Am Ko...
19999,1831d9c801bcbc3388eb2aeb7d48aed3c2c690e9,Journal Pre-proof The sub-specialty of Foot an...,,The sub-specialty of Foot and Ankle is evolvin...


In [9]:
covid_df_20000.isnull().sum(axis = 0) # Check if there any missing values in the columns. We are interested in the 'text_body' column.

doc_id       1
title        1
abstract     1
text_body    1
dtype: int64

In [10]:
covid_df_20000.dropna(subset=['text_body'],inplace=True) # Drop rows with missing values for the 'text_body' column.
covid_df_20000

Unnamed: 0,doc_id,title,abstract,text_body
1,5ac635d21fdb1726f21d7090a233dd2b5ffdc1b5,"Long Term Predictors of Breathlessness, Exerci...",J o u r n a l P r e -p r o o f 2,With over 246 million Coronavirus disease 2019...
2,71d93a6452061c57ae1532991d4a20cd6fc6fcb3,Emergence of universality in the transmission ...,The complexities involved in modelling the tra...,• Even though the pattern of disease spread is...
3,635ba7eca764f7caaa82904197c40a20111ec940,Supporting families to protect child health: P...,Supportive parenting is critical for promoting...,a1111111111 a1111111111 a1111111111 a111111111...
4,0282d2f9eb65318e40db21efbe6172ce16c8acf5,Perspectives on the Early Quality of Evidence ...,Background: The severe acute respiratory syndr...,The severe acute respiratory syndrome coronavi...
5,d96113a2d8691d3b1aee5fd1b5d30241f2b2a633,Quantify the role of superspreaders -opinion l...,Effective communication of accurate informatio...,We design a mathematical model to quantify the...
...,...,...,...,...
19996,ffdc50d239f8e531dc159993723c0cbc176a632e,Comparative analysis of antibody-and lipid-bas...,Multiplexing of samples in single-cell RNA-seq...,Recent advances in single-cell and single-nucl...
19997,b37bf3ab1d814f853a42cd8958d7390b42343bd2,,,To the Editor:\n We read with interest the art...
19998,2c70b8caf708c5c1b0c9b811a80179061274cff7,Nervensystem,,zeichen diagnostisch eine Rolle spielen. Am Ko...
19999,1831d9c801bcbc3388eb2aeb7d48aed3c2c690e9,Journal Pre-proof The sub-specialty of Foot an...,,The sub-specialty of Foot and Ankle is evolvin...


# Download the datasets.

In [11]:
os.chdir("/kaggle/working/")

In [12]:
covid_df_10000.to_csv(r'covid_df_10000.csv')

In [13]:
FileLink(r'covid_df_10000.csv')

In [14]:
covid_df_20000.to_csv(r'covid_df_20000.csv')

In [15]:
FileLink(r'covid_df_20000.csv')