# COVID-19 Open Research Dataset Challenge

https://www.youtube.com/watch?v=S6GVXk6kbcs

In [2]:
import heapq
import json
import os
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi
from tqdm import tqdm


In [48]:
# Load the metadata table, keep only rows that carry a 'sha', and rename the
# columns used downstream ('sha' becomes the 'paper_id' join key).
metadata = (
    pd.read_csv("metadata.csv")
    .dropna(subset=['sha'])
    .rename(columns={"sha": "paper_id", "source_x": "source"})
    # title/abstract are removed from the metadata side of the data
    .drop(columns=['title', 'abstract'])
)

In [49]:
# Import text from papers.
# Each subset is read from <name>/<name>/, hence the doubled directory in the paths below.
# NOTE: the first entry used to be spelled 'biorxiV_medrxiv', which only resolves
# on case-insensitive filesystems; the lowercase spelling matches the
# `full_text_file` values that appear in the metadata.
dirs = ['biorxiv_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset']

# One [paper_id, title, abstract, fulltext] row per parsed JSON file.
docs = []
for d in dirs:
    print(d)
    for file in tqdm(os.listdir(f"{d}/{d}")):
        filepath = f"{d}/{d}/{file}"
        # Use a context manager so every file handle is closed promptly
        # instead of being left open until garbage collection.
        with open(filepath, 'rb') as fh:
            j = json.load(fh)
        title = j['metadata']['title']
        paper_id = j['paper_id']
        try:
            # Only the first abstract paragraph is kept.
            abstract = j['abstract'][0]['text']
        except (KeyError, IndexError, TypeError):
            # Missing, empty, or null abstract: fall back to an empty string
            # without also swallowing unrelated errors (e.g. KeyboardInterrupt).
            abstract = ''

        # Concatenate every body paragraph (no separator, as before).
        fulltext = ''.join(section['text'] for section in j['body_text'])
        docs.append([paper_id, title, abstract, fulltext])


  9%|▊         | 76/885 [00:00<00:01, 747.33it/s]

biorxiV_medrxiv


100%|██████████| 885/885 [00:01<00:00, 748.33it/s]
  1%|          | 53/9118 [00:00<00:17, 512.28it/s]

comm_use_subset


100%|██████████| 9118/9118 [00:17<00:00, 521.04it/s]
  0%|          | 0/16959 [00:00<?, ?it/s]

custom_license


100%|██████████| 16959/16959 [00:27<00:00, 611.60it/s]
  4%|▍         | 89/2353 [00:00<00:02, 884.96it/s]

noncomm_use_subset


100%|██████████| 2353/2353 [00:03<00:00, 646.33it/s]


In [50]:
# Collect the parsed papers into one table; the column order mirrors the
# order of the values in each row appended to `docs`.
df = pd.DataFrame(
    docs,
    columns=['paper_id', 'title', 'abstract', 'fulltext'],
)

29315

In [52]:
# Join metadata with paper text on paper_id. pd.merge defaults to an inner
# join, so papers without a matching metadata row are dropped.
allpapers_df = pd.merge(df, metadata, on="paper_id")

# Stringify the journal column: missing values become the literal 'nan',
# which is the sentinel the comparisons on this column rely on.
allpapers_df['journal'] = allpapers_df['journal'].astype(str)
peer_reviewed = allpapers_df['journal'] != 'nan'

# Put the flag directly after 'journal'. The position is looked up rather
# than hard-coded (it used to be 12) so a change in the metadata column
# layout cannot misplace the column or raise an out-of-bounds error.
journal_position = allpapers_df.columns.get_loc('journal')
allpapers_df.insert(journal_position + 1, "peer_reviewed", peer_reviewed, allow_duplicates=True)

In [53]:
# Split the merged table on whether a journal is recorded. 'nan' is the
# stringified missing value in the 'journal' column.
has_journal = allpapers_df['journal'] != 'nan'
# Data frame of all papers that list a journal
journals_df = allpapers_df[has_journal]
# Data frame of papers without a journal (treated as unpublished)
unpublished_df = allpapers_df[~has_journal]

Unnamed: 0,paper_id,title,abstract,fulltext,source,doi,pmcid,pubmed_id,license,publish_time,authors,journal,peer_reviewed,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,f905f78b32f63c6d14a79984dfb33f1b358b8ab4,Multimerization of HIV-1 integrase hinges on c...,New anti-AIDS treatments must be continually d...,"In the absence of a curative treatment, the hi...",biorxiv,10.1101/301721,,,biorxiv,2018-04-16,"Galilee, M.; Alian, A.",,False,,,True,biorxiv_medrxiv
1,abcfffafab399149d4adadd6bb458c4994e2025d,Time-varying transmission dynamics of Novel Co...,Rationale: Several studies have estimated basi...,"Eighteen years ago, severe acute respiratory s...",biorxiv,10.1101/2020.01.25.919787,,,biorxiv,2020-02-13,"Liu, T.; Hu, J.; Xiao, J.; He, G.; Kang, M.; R...",,False,,,True,biorxiv_medrxiv
2,0cb9c296684ca5e71462d825cab2827854a01544,p53 is not necessary for DUX4 pathology,Summary Statement: DUX4 is thought to mediate ...,Facioscapulohumeral muscular dystrophy (FSHD) ...,biorxiv,10.1101/118315,,,biorxiv,2017-03-19,"Bosnakovski, D.; Toso, E. A.; Recht, O. O.; Cu...",,False,,,True,biorxiv_medrxiv
3,9bbfd3d34ee18ea1b9f4669331a6cee9c5992893,Virological assessment of hospitalized cases o...,"emerged in late 2019 1,2 . Initial outbreaks i...",Pharyngeal virus shedding was very high during...,medrxiv,10.1101/2020.03.05.20030502,,,medrvix,2020-03-08,Roman Woelfel; Victor Max Corman; Wolfgang Gug...,,False,,,True,biorxiv_medrxiv
4,1218f278a4f8d83dac14b23c8f698062812ef9d5,Potential impact of seasonal forcing on a SARS...,A novel coronavirus (SARS-CoV-2) first detecte...,(2.2 with 90% high density interval 1.4-3.8 (R...,medrxiv,10.1101/2020.02.13.20022806,,,medrvix,2020-02-17,Richard A Neher; Robert Dyrdak; Valentin Druel...,,False,,,True,biorxiv_medrxiv


In [54]:
# Report the size of each table. Note that the "journals" figure counts
# papers that have a journal entry, not distinct journals.
print(
    f'Total number of papers including journals {len(allpapers_df)} \n\n'
    f'Total number of journals {len(journals_df)} \n\n'
    f'Number of unpublsihed papers {len(unpublished_df)}'
)

Total number of papers including journals 27690 

Total number of journals 26796 

Number of unpublsihed papers 894
