In [1]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Helper Functions

Unhide the cell below to find the definition of the following functions:
* `format_name(author)`
* `format_affiliation(affiliation)`
* `format_authors(authors, with_affiliation=False)`
* `format_body(body_text)`
* `format_bib(bibs)`

In [2]:
def format_name(author):
    """Return an author's display name as "first middle last".

    Args:
        author: Dict with a 'first' string, a 'middle' list of strings and
            a 'last' string.

    Returns:
        The non-empty name parts joined by single spaces. Empty parts are
        skipped, so a missing first or last name does not leave a stray
        leading or trailing space in the result.

    Raises:
        KeyError: If 'first', 'middle' or 'last' is missing from `author`.
    """
    parts = [author['first'], *author['middle'], author['last']]
    # Filtering keeps "Kcdc" from becoming " Kcdc" when the first name is "".
    return " ".join(part for part in parts if part)


def format_affiliation(affiliation):
    """Format an affiliation dict as "institution, location values...".

    Args:
        affiliation: Dict that may contain an 'institution' string and a
            'location' dict. Both are optional; other keys are ignored.

    Returns:
        The institution (if any) followed by the location values in dict
        order, joined by ", ". Empty values are dropped so blank location
        fields do not produce dangling commas. Returns "" when neither an
        institution nor any location value is available.
    """
    parts = []

    institution = affiliation.get('institution')
    if institution:
        parts.append(institution)

    location = affiliation.get('location')
    if location:
        # Skip blank fields so the result never contains ", , ".
        parts.extend(value for value in location.values() if value)

    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Render a list of author dicts as one ", "-separated string.

    Args:
        authors: Iterable of author dicts accepted by `format_name`. When
            `with_affiliation` is true each dict must also have an
            'affiliation' key accepted by `format_affiliation`.
        with_affiliation: If true, append "(affiliation)" after each name
            whose formatted affiliation is non-empty.

    Returns:
        The formatted authors joined by ", " ("" for an empty list).
    """
    def render(author):
        # One author -> "Name" or "Name (Affiliation)".
        label = format_name(author)
        if not with_affiliation:
            return label
        where = format_affiliation(author['affiliation'])
        return f"{label} ({where})" if where else label

    return ", ".join(map(render, authors))

def format_body(body_text, paragraph_sep=""):
    """Concatenate body paragraphs into one string, grouped by section.

    Args:
        body_text: List of dicts, each with a 'section' and a 'text' key.
            Entries sharing a section title are merged, and sections are
            emitted in order of first appearance.
        paragraph_sep: String inserted between paragraphs that belong to
            the same section. The default "" glues them together directly
            (e.g. "...recovery.In order..."); pass " " or "\n\n" to keep
            paragraphs visually separated.

    Returns:
        For each section: the title, a blank line, the merged text and
        another blank line. Returns "" for an empty `body_text`.

    Raises:
        KeyError: If an entry lacks 'section' or 'text'.
    """
    grouped = {}
    for entry in body_text:
        # dicts preserve insertion order, so first-seen section order is kept.
        grouped.setdefault(entry['section'], []).append(entry['text'])

    return "".join(
        section + "\n\n" + paragraph_sep.join(paragraphs) + "\n\n"
        for section, paragraphs in grouped.items()
    )

def format_bib(bibs):
    """Format bibliography entries as a single "; "-separated string.

    Args:
        bibs: Either a dict whose values are bibliography entries (any dict
            subclass is accepted) or an iterable of such entries. Each
            entry needs 'title', 'authors', 'venue' and 'year' keys. The
            input is only read, never modified.

    Returns:
        One "title, authors, venue, year" chunk per entry, joined by "; ".
        Every field goes through `str`, so a missing year is rendered as
        "None".
    """
    entries = bibs.values() if isinstance(bibs, dict) else bibs
    formatted = []

    for bib in entries:
        # Format into a local instead of overwriting bib['authors'], so no
        # defensive deep copy of the whole bibliography is needed.
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [bib['title'], authors, bib['venue'], bib['year']]
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)

Unhide the cell below to find the definition of the following functions:
* `load_files(dirname)`
* `generate_clean_df(all_files)`

In [3]:
def load_files(dirname):
    """Parse every file in a directory as JSON.

    Args:
        dirname: Path of the directory to read, with or without a trailing
            path separator.

    Returns:
        A list with one parsed JSON object per directory entry, in the
        order returned by `os.listdir`.
    """
    raw_files = []

    for filename in tqdm(os.listdir(dirname)):
        # os.path.join adds the separator only when dirname lacks one.
        path = os.path.join(dirname, filename)
        # The context manager closes each handle as soon as it is parsed.
        with open(path, 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files

def generate_clean_df(all_files):
    """Build a flat DataFrame with one row per parsed article.

    Args:
        all_files: Iterable of article dicts with 'paper_id', 'metadata'
            (containing 'title' and 'authors'), 'abstract', 'body_text'
            and 'bib_entries' keys.

    Returns:
        A DataFrame with the columns paper_id, title, authors,
        affiliations, abstract, text, bibliography, raw_authors and
        raw_bibliography. The first seven hold formatted strings; the last
        two hold the unformatted author list and bibliography dict.
    """
    col_names = ['paper_id', 'title', 'authors',
                 'affiliations', 'abstract', 'text',
                 'bibliography', 'raw_authors', 'raw_bibliography']
    cleaned_files = []

    for file in tqdm(all_files):
        metadata = file['metadata']
        raw_authors = metadata['authors']

        cleaned_files.append([
            file['paper_id'],
            metadata['title'],
            format_authors(raw_authors),
            format_authors(raw_authors, with_affiliation=True),
            format_body(file['abstract']),
            format_body(file['body_text']),
            format_bib(file['bib_entries']),
            raw_authors,
            file['bib_entries'],
        ])

    return pd.DataFrame(cleaned_files, columns=col_names)

## Biorxiv: Exploration

Let's first take a quick glance at the `biorxiv` subset of the data. We will also use this opportunity to load all of the json files into a list of **nested** dictionaries (each `dict` is an article).

In [4]:
# Directory holding one JSON file per biorxiv/medrxiv article; the trailing
# slash matters because later cells build paths by string concatenation.
biorxiv_dir = '/kaggle/input/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 1625


In [5]:
# Parse every biorxiv article into a dict. Each file handle is closed as soon
# as its JSON has been read instead of being left open.
all_files = []

for filename in filenames:
    filename = os.path.join(biorxiv_dir, filename)
    with open(filename, 'rb') as handle:
        file = json.load(handle)
    all_files.append(file)

In [6]:
# Bind the first article to `file`; the following cells inspect it.
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


## Biorxiv: Abstract

The abstract is fairly simple: it is a list of dictionaries, one per paragraph.

In [7]:
# Pretty-print the abstract entries of the example article.
pprint(file['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'We model the extent to which age targeted quarantine can be used to '
          'reduce ICU admissions caused by novel coronavirus COVID-19. Using '
          'demographic data from New Zealand, we demonstrate that lowering the '
          'age threshold for quarantine to 50 years of age reduces ICU '
          'admissions drastically, and show that for sufficiently strict '
          'isolation protocols, isolating one third of the countries '
          'population for a total of 6 months is sufficient to avoid '
          'overwhelming ICU capacity throughout the entire course of the '
          'epidemic. Similar results are expected to hold for other countries, '
          'though some minor adaption will be required based on local age '
          'demographics and hospital facilities.'},
 {'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': '. CC-BY 4.0 International license It is made a

## Biorxiv: body text

Let's first probe what `body_text` looks like:

In [8]:
# Report the container type, the number of entries, and the keys of the
# first entry of body_text.
print("body_text type:", type(file['body_text']))
print("body_text length:", len(file['body_text']))
print("body_text keys:", file['body_text'][0].keys())

body_text type: <class 'list'>
body_text length: 51
body_text keys: dict_keys(['text', 'cite_spans', 'ref_spans', 'section'])


We take a look at the first part of the `body_text` content. As you will notice, the body text is separated into a list of small subsections, each containing a `section` and a `text` key. Since multiple subsections can share the same section title, we first need to group the subsections by section before concatenating everything.

In [9]:
# Show only the first two entries; depth=3 collapses deeper nesting to {...}.
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [{...}, {...}],
  'ref_spans': [],
  'section': 'Introduction',
  'text': 'COVID-19, initially observed/detected in Hubei province of China '
          'during December 2019, has since spread to all but a handful '
          'countries, causing (as of the time of writing) an estimated 855,000 '
          'infections and 42,000 deaths ( [8] , March 31st). COVID-19 has a '
          'basic reproductive number, R 0 , currently estimated in the region '
          'of 2.5 -3 [5] . Social distance and general quarantine measures can '
          'reduce R 0 temporarily, but not permanently. For R 0 = 3, left '
          'unchecked COVID-19 can be expected to infect more than 90% of our '
          'community, with 30% of the population infected at the epidemic '
          'peak. Even with significant quarantine measures in place the '
          'population will not reach "herd immunity" to this virus until 2/3 '
          'of the population has gained resist

Let's see what the grouped section titles are for the example above:

In [10]:
# Merge the paragraph texts under their section title, keeping the order in
# which each section first appears.
texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {}
for section, text in texts:
    texts_di[section] = texts_di.get(section, "") + text

pprint(list(texts_di.keys()))

['Introduction',
 'Targeted Quarantine and Release',
 'The Model',
 'Results',
 '6',
 '7',
 'Logistics',
 'Assumptions to be Investigated',
 'Opportunities',
 'Conclusions',
 '9',
 '10',
 '11']


The following example shows what the final result looks like, after we format each section title with its content:

In [11]:
# Emit every section as: title, blank line, merged text, blank line.
pieces = []

for section, text in texts_di.items():
    pieces.extend([section, "\n\n", text, "\n\n"])

body = "".join(pieces)

print(body[:3000])

Introduction

COVID-19, initially observed/detected in Hubei province of China during December 2019, has since spread to all but a handful countries, causing (as of the time of writing) an estimated 855,000 infections and 42,000 deaths ( [8] , March 31st). COVID-19 has a basic reproductive number, R 0 , currently estimated in the region of 2.5 -3 [5] . Social distance and general quarantine measures can reduce R 0 temporarily, but not permanently. For R 0 = 3, left unchecked COVID-19 can be expected to infect more than 90% of our community, with 30% of the population infected at the epidemic peak. Even with significant quarantine measures in place the population will not reach "herd immunity" to this virus until 2/3 of the population has gained resistance-either through vaccination, or infection and subsequent recovery.In order to place these numbers in a concrete context, a recent survey in New Zealand indicated that the country had a total of 520 ventilator machines [7] . Given the c

The function below lets you display the body text in one line (unhide to see exactly the same as above):

In [12]:
# format_body does the grouping and concatenation; show the first 3000 characters.
print(format_body(file['body_text'])[:3000])

Introduction

COVID-19, initially observed/detected in Hubei province of China during December 2019, has since spread to all but a handful countries, causing (as of the time of writing) an estimated 855,000 infections and 42,000 deaths ( [8] , March 31st). COVID-19 has a basic reproductive number, R 0 , currently estimated in the region of 2.5 -3 [5] . Social distance and general quarantine measures can reduce R 0 temporarily, but not permanently. For R 0 = 3, left unchecked COVID-19 can be expected to infect more than 90% of our community, with 30% of the population infected at the epidemic peak. Even with significant quarantine measures in place the population will not reach "herd immunity" to this virus until 2/3 of the population has gained resistance-either through vaccination, or infection and subsequent recovery.In order to place these numbers in a concrete context, a recent survey in New Zealand indicated that the country had a total of 520 ventilator machines [7] . Given the c

## Biorxiv: Metadata

Let's first see what keys are contained in the `metadata` dictionary:

In [13]:
# List the keys of the first article's metadata.
print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


Let's take a look at each of the corresponding values:

In [14]:
# Title of the first article.
print(all_files[0]['metadata']['title'])

The Effectiveness of Targeted Quarantine for Minimising Impact of COVID-19


In [15]:
# Keep the first article's author list and preview at most three entries.
authors = all_files[0]['metadata']['authors']
pprint(authors[:3])

[{'affiliation': {'institution': 'Carl von Ossietzky Universität Oldenburg',
                  'laboratory': '',
                  'location': {'country': 'Germany'}},
  'email': '',
  'first': 'Alastair',
  'last': 'Jamieson-Lane',
  'middle': [],
  'suffix': ''},
 {'affiliation': {'institution': 'University of British Columbia',
                  'laboratory': '',
                  'location': {'country': 'Canada'}},
  'email': '',
  'first': 'Eric',
  'last': 'Cytrnbaum',
  'middle': [],
  'suffix': ''}]


The `format_name` and `format_affiliation` functions:

In [16]:
# Print the formatted name and affiliation of each author, followed by a
# blank line.
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Alastair Jamieson-Lane
Affiliation: Carl von Ossietzky Universität Oldenburg, Germany

Name: Eric Cytrnbaum
Affiliation: University of British Columbia, Canada



Now, let's take as an example a slightly longer list of authors:

In [17]:
# depth=4 truncates deeper nested values, which are displayed as {...}.
pprint(all_files[4]['metadata'], depth=4)

{'authors': [{'affiliation': {'institution': 'University of Oxford',
                              'laboratory': 'Li Ka Shing Centre for Health '
                                            'Information and Discovery',
                              'location': {...}},
              'email': '',
              'first': 'Luca',
              'last': 'Ferretti',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Oxford',
                              'laboratory': 'Li Ka Shing Centre for Health '
                                            'Information and Discovery',
                              'location': {...}},
              'email': '',
              'first': 'Chris',
              'last': 'Wymant',
              'middle': [],
              'suffix': ''},
             {'affiliation': {'institution': 'University of Oxford',
                              'laboratory': 'Li Ka Shing Centre for Health '
                   

Here, I provide the function `format_authors`, which lets you format a list of authors into a single string, with an optional argument to show each author's affiliation:

In [18]:
# Format the same author list twice: names only, then names with affiliations.
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Luca Ferretti, Chris Wymant, Michelle Kendall, Lele Zhao, Anel Nurtay, Lucie Abeler-Dörner, Michael Parker, David Bonsall, Christophe Fraser

Formatting with affiliation:
Luca Ferretti (University of Oxford, Oxford, UK), Chris Wymant (University of Oxford, Oxford, UK), Michelle Kendall (University of Oxford, Oxford, UK), Lele Zhao (University of Oxford, Oxford, UK), Anel Nurtay (University of Oxford, Oxford, UK), Lucie Abeler-Dörner (University of Oxford, Oxford, UK), Michael Parker (University of Oxford, UK), David Bonsall (University of Oxford, Oxford, UK), Christophe Fraser (University of Oxford, Oxford, UK)


## Biorxiv: bibliography

Let's take a look at the bibliography section. 

In [19]:
# Drop the bib_entries keys, keep the entries as a list, and preview two of them.
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b0',
  'title': 'Impact of non-pharmaceutical interventions (NPIs) to reduce '
           'COVID-19 mortality and healthcare demand',
  'venue': '',
  'volume': '',
  'year': None},
 {'authors': [],
  'issn': '',
  'other_ids': {},
  'pages': '',
  'ref_id': 'b1',
  'title': 'Place Summaries | New Zealand | Stats NZ',
  'venue': '',
  'volume': '',
  'year': None}]


You can reuse the `format_authors` function here:

In [20]:
# This entry has an empty author list, so the result is the empty string.
format_authors(bibs[1]['authors'], with_affiliation=False)

''

The following function lets you format the bibliography all at once. It extracts only the title, authors, venue, and year of each entry, and separates the entries with a `;`.

In [21]:
# Format the first five bibliography entries into one string.
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

Impact of non-pharmaceutical interventions (NPIs) to reduce COVID-19 mortality and healthcare demand, , , None; Place Summaries | New Zealand | Stats NZ, , , None; Report sulle caratteristiche dei pazienti deceduti positivia COVID-19 inItal iaIl presente reportè basato sui dati aggiornatial 17 Marzo 2020, , , 2020; Library Catalog: www.cdc.go.kr,  Kcdc,  Kcdc, , None; The reproductive number of COVID-19 is higher compared to SARS coronavirus, Ying Liu, Albert A Gayle, Annelies Wilder-Smith, Joacim Rocklöv, Journal of Travel Medicine, 2020


## Biorxiv: Generate CSV

In this section, I show you how to manually generate the CSV files. As you can see, it's now super simple because of the `format_` helper functions. In the next sections, I show you how to generate them in 3 lines using the `load_files` and `generate_clean_df` helper functions.

In [22]:
# Build one row of formatted and raw fields per article.
cleaned_files = []

for file in tqdm(all_files):
    metadata = file['metadata']
    raw_authors = metadata['authors']
    raw_bibliography = file['bib_entries']

    features = [
        file['paper_id'],
        metadata['title'],
        format_authors(raw_authors),
        format_authors(raw_authors, with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(raw_bibliography),
        raw_authors,
        raw_bibliography,
    ]

    cleaned_files.append(features)

HBox(children=(FloatProgress(value=0.0, max=1625.0), HTML(value='')))




In [23]:
# Column order must match the order of the values in each `features` row.
col_names = ['paper_id', 'title', 'authors',
             'affiliations', 'abstract', 'text',
             'bibliography', 'raw_authors', 'raw_bibliography']

clean_df = pd.DataFrame(data=cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,bbf09194127619f57b3ddf5daf684593a5831367,The Effectiveness of Targeted Quarantine for M...,"Alastair Jamieson-Lane, Eric Cytrnbaum",Alastair Jamieson-Lane (Carl von Ossietzky Uni...,Abstract\n\nWe model the extent to which age t...,"Introduction\n\nCOVID-19, initially observed/d...",Impact of non-pharmaceutical interventions (NP...,"[{'first': 'Alastair', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Impact ..."
1,2a21fdd15e07c89c88e8c2f6c6ab5692568876ec,Evaluation of Group Testing for SARS-CoV-2 RNA,"Nasa Sinnott-Armstrong, Daniel L Klein, Brenda...","Nasa Sinnott-Armstrong, Daniel L Klein, Brenda...",Abstract\n\nDuring the current COVID-19 pandem...,Introduction\n\nGroup testing was first descri...,"In one Italian town, we showed mass testing co...","[{'first': 'Nasa', 'middle': [], 'last': 'Sinn...","{'BIBREF0': {'ref_id': 'b0', 'title': 'In one ..."
2,e686d1ce1540026ecb100c09f99ed091c139b92c,Why estimating population-based case fatality ...,"Lucas Böttcher, Mingtao Xia, Tom Chou","Lucas Böttcher, Mingtao Xia (UCLA, 90095-1555,...",Abstract\n\nDifferent ways of calculating mort...,\n\nDifferent ways of calculating mortality ra...,"COVID-19 statistics, , , None; The Lancet, Z X...","[{'first': 'Lucas', 'middle': [], 'last': 'Böt...","{'BIBREF2': {'ref_id': 'b2', 'title': 'COVID-1..."
3,c6039f8933305c9f44a44c81a15b321b6c2848dc,Far-UVC light: A new tool to control the sprea...,"David Welch, Manuela Buonanno, Veljko Grilj, I...",David Welch (Columbia University Medical Cente...,Abstract\n\nAirborne-mediated microbial diseas...,3\n\nAirborne-mediated microbial diseases repr...,"Global, regional, and national life expectancy...","[{'first': 'David', 'middle': [], 'last': 'Wel...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Global,..."
4,073d74442e2655d79b0b3f764a627ec667ad422c,Quantifying SARS-CoV-2 transmission suggests e...,"Luca Ferretti, Chris Wymant, Michelle Kendall,...","Luca Ferretti (University of Oxford, Oxford, U...",Abstract\n\nThe newly emergent human virus SAR...,IV.\n\nEnvironmental transmission: transmissio...,"Early Transmission Dynamics in Wuhan, China, o...","[{'first': 'Luca', 'middle': [], 'last': 'Ferr...","{'BIBREF2': {'ref_id': 'b2', 'title': 'Early T..."


In [24]:
# Write the cleaned biorxiv table to CSV without the index column.
clean_df.to_csv('biorxiv_clean.csv', index=False)

## Generate CSV: Custom (PMC), Commercial, Non-commercial licenses

In [25]:
# Load, clean and export the custom_license subset, then preview the result.
pmc_dir = '/kaggle/input/CORD-19-research-challenge/custom_license/custom_license/pdf_json/'
pmc_files = load_files(pmc_dir)
pmc_df = generate_clean_df(pmc_files)
pmc_df.to_csv('clean_pmc.csv', index=False)
pmc_df.head()

HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,14572a7a9b3e92b960d92d9755979eb94c448bb5,Immune Parameters of Dry Cows Fed Mannan Oligo...,"S T Franklin, M C Newman, K E Newman, K I Meek","S T Franklin (University of Kentucky, 40546-02...",Abstract\n\nThe objective of this study was to...,INTRODUCTION\n\nThe periparturient period is a...,Immune response of pregnant heifers and cows t...,"[{'first': 'S', 'middle': ['T'], 'last': 'Fran...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Immune ..."
1,bb790e8366da63c4f5e2d64fa7bbd5673b93063c,Discontinuous Transcription or RNA Processing ...,"Beate Schwer, Paolo Vista, Jan C Vos, Hendrik ...","Beate Schwer, Paolo Vista, Jan C Vos, Hendrik ...",,Discontinuous\n\nTranscription or RNA Processi...,Poly (riboadenylic acid) preferentially inhibi...,"[{'first': 'Beate', 'middle': [], 'last': 'Sch...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Poly (r..."
2,24f204ce5a1a4d752dc9ea7525082d225caed8b3,,,,,Letter to the Editor\n\nThe non-contact handhe...,Novel coronavirus is putting the whole world o...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Novel c..."
3,f5bc62a289ef384131f592ec3a8852545304513a,Pediatric Natural Deaths 30,"Elizabeth C Burton, Nicole A Singer",Elizabeth C Burton (Johns Hopkins University S...,,"Introduction\n\nWorldwide, the leading causes ...",In athletes who experienced sudden death or in...,"[{'first': 'Elizabeth', 'middle': ['C'], 'last...","{'BIBREF0': {'ref_id': 'b0', 'title': 'In athl..."
4,ab78a42c688ac199a2d5669e42ee4c39ff0df2b8,A real-time convective PCR machine in a capill...,"Yi-Fan Hsieh, Da-Sheng Lee, Ping-Hei Chen, Sha...","Yi-Fan Hsieh (National Taiwan University, 106,...","Abstract\n\nThis research reports the design, ...",Introduction\n\nMullis et al. developed the po...,"The Polymerase Chain Reaction, K B Mullis, F F...","[{'first': 'Yi-Fan', 'middle': [], 'last': 'Hs...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The Pol..."


In [26]:
# Load, clean and export the comm_use_subset, then preview the result.
comm_dir = '/kaggle/input/CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/'
comm_files = load_files(comm_dir)
comm_df = generate_clean_df(comm_files)
comm_df.to_csv('clean_comm_use.csv', index=False)
comm_df.head()

HBox(children=(FloatProgress(value=0.0, max=9524.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9524.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,37cf1eb9be84f2b178b4d60de7ba9520deccf453,Processing of the SARS-CoV pp1a/ab nsp7-10 region,"Boris Krichel, Sven Falke, Rolf Hilgenfeld, La...",Boris Krichel (Leibniz Institute for Experimen...,Abstract\n\nSevere acute respiratory syndrome ...,Introduction\n\nThe discovery of severe acute ...,Identification of a novel coronavirus in patie...,"[{'first': 'Boris', 'middle': [], 'last': 'Kri...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Identif..."
1,8d095d0275e474dbb9d9b63a75591ff2c0667d73,Evidence of Recombination and Genetic Diversit...,"Ting Huang, Wei Wang, Mael Bessaud, Peijun Ren...","Ting Huang (Unit of Emerging Viruses, Shanghai...",Abstract\n\nBackground: Human rhinoviruses (HR...,Introduction\n\nHuman rhinoviruses (HRVs) are ...,"Rhinovirus and the lower respiratory tract, F ...","[{'first': 'Ting', 'middle': [], 'last': 'Huan...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Rhinovi..."
2,43918c75d7b7d3f9402b1342dfea1e19eb2bd909,Antibacterial Properties of Visible-Light-Resp...,"Der-Shan Sun, Jyh-Hwa Kau, Hsin-Hsien Huang, Y...","Der-Shan Sun (Tzu-Chi University, 97004, Huali...",Abstract\n\nThe bactericidal activity of conve...,Introduction\n\nAnthrax is a life-threatening ...,"Anthrax in humans and animals, , Anthrax in Hu...","[{'first': 'Der-Shan', 'middle': [], 'last': '...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Anthrax..."
3,4e65fbf4e2a747fb763730223d749a1a1288ca99,Revista da Sociedade Brasileira de Medicina Tr...,"Sebastian Vernal , Sebastian Vernal, Yuri Casa...","Sebastian Vernal , Sebastian Vernal, Yuri Casa...",Abstract\n\nTegumentary leishmaniasis (TL) dia...,INTRODUCTION\n\nDiagnosing American tegumentar...,Differential diagnosis of 86 cases with initia...,"[{'first': 'Sebastian', 'middle': ['Vernal'], ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Differe..."
4,daaa53a49dc0d23015e22a150495ac657aa993d8,Transcriptional and Translational Landscape of...,"Hazel Stewart, Katherine Brown, Adam M Dinan, ...","Hazel Stewart (University of Cambridge, Cambri...",Abstract\n\nThe genus Torovirus (subfamily Tor...,"\n\ncattle, goats, sheep, pigs, and horses, ca...",Mesoniviridae: a proposed new family in the or...,"[{'first': 'Hazel', 'middle': [], 'last': 'Ste...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Mesoniv..."


In [27]:
# Load, clean and export the noncomm_use_subset, then preview the result.
noncomm_dir = '/kaggle/input/CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/pdf_json/'
noncomm_files = load_files(noncomm_dir)
noncomm_df = generate_clean_df(noncomm_files)
noncomm_df.to_csv('clean_noncomm_use.csv', index=False)
noncomm_df.head()

HBox(children=(FloatProgress(value=0.0, max=2490.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2490.0), HTML(value='')))




Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,cd92f91038067e7a10aa27d676ce696e1e4d67ce,EXPERIMENTAL AND THERAPEUTIC MEDICINE Dimethyl...,"Zhen-Hong Zhu, Wen-Qi Song, Chang-Qing Zhang, ...","Zhen-Hong Zhu (Shanghai Jiao Tong University, ...",Abstract\n\nMesenchymal stem cells have been w...,Introduction\n\nOsteonecrosis of the femoral h...,Avascular necrosis of the femoral head: Vascul...,"[{'first': 'Zhen-Hong', 'middle': [], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Avascul..."
1,bab279da548d8bd363acd5033e9dc54e7dbb7107,Effects of school breaks on influenza- like il...,"Yanhui Chu, Zhenyu Wu, Jiayi Ji, Jingyi Sun, X...","Yanhui Chu, Zhenyu Wu (Fudan University, Shang...",,INTRODUCTION\n\nSchoolchildren play a major ro...,Estimating household and community transmissio...,"[{'first': 'Yanhui', 'middle': [], 'last': 'Ch...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Estimat..."
2,71edbd57cdd9af956a12054932e0cbdb87ce1fea,Social Network Characteristics and Body Mass I...,"Won Joon Lee, Yoosik Youm, Yumie Rhee, Yeong-R...",Won Joon Lee (Yonsei University College of Med...,Abstract\n\nResearch has shown that obesity ap...,INTRODUCTION\n\nThe study of the effects of so...,The contribution of the social environment to ...,"[{'first': 'Won', 'middle': ['Joon'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The con..."
3,2dfdbf2d6b77426866feaf93486327d372fd27c7,CLINICAL EXPERIMENTAL VACCINE RESEARCH,,,,\n\nThere may be many reasons for the signific...,"A short history of vaccination, S L Plotkin, S...",[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'A short..."
4,0afa3ea846396533c7ca515968abcfea3f895082,Bone Marrow Dendritic Cells from Mice with an ...,"Stacey L Burgess, Erica Buonomo, Maureen Carey...",Stacey L Burgess (Johns Hopkins Bloomberg Scho...,Abstract\n\nThere is an emerging paradigm that...,\n\nport neutrophil infiltration in inflammato...,WHO/PAHO informal consultation on intestinal p...,"[{'first': 'Stacey', 'middle': ['L'], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'WHO/PAH..."
