# COVID-19 Data Scrapper

In [1]:
%load_ext autoreload
%autoreload 2

## Introduction

## Libraries

In [2]:
# To install libraries, uncomment those lines:
#!pip install bioc

### Standard

In [3]:
import bioc
import numpy as np
import pandas as pd
import os
import json

### Custom

## Paths

In [4]:
pubmed_filename = "../data/covid/pubmed/litcovid2BioCXML.xml"
path_to_documents = "../data/covid/pubmed/documents/"
cordmed_filename = "../data/covid/pubmed/cord-19_2022-06-02/2022-06-02/"

## PubMed

In [12]:
def DocToText(document):
    filename = document.id
    if 'journal' in document.passages[0].infons and len(document.passages[0].infons['journal'].split(";")) > 1:
            filename = get_month_and_year(document.passages[0].infons['journal'].replace(".",";").split(";")[1].strip())
            if filename is None:
                return (None, None)
            filename += "-"+str(document.id)
    txt_to_save = ""
    for passage in document.passages:
        txt_to_save += passage.text + "\n"
    return (filename, txt_to_save)

In [13]:
import re
from dateutil import parser
def get_month_and_year(dt):
    date_str = None

    try:
        if len(dt) == 6:
            dt = (dt[:4]+' ' + dt[4:])
        dtre = re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", dt).strip()
        date = parser.parse(dtre.lower())
        year = date.strftime("%Y")   # Four-digit year
        month = date.month  # Full month name
        date_str = year+"-"+str(month)
    except ValueError:
        try:
            dtre = re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", dt[:9]).strip()
            date = parser.parse(dtre.lower())
            year = date.strftime("%Y")   # Four-digit year
            month = date.month  # Full month name
            date_str = year+"-"+str(month)
        except:
            pass  # Month doesn't exist in the datetime object

    return date_str

In [27]:
from bioc import biocxml
from tqdm import tqdm
# read from a file
count = 0
with biocxml.iterparse(pubmed_filename) as reader:
    collection_info = reader.get_collection_info()
    print(dir(reader))
    for document in tqdm(reader):
        count += 1
        try:
            to_save = DocToText(document)
            if to_save[0] is not None and len(to_save[0].split("-"))>1:
                with open(path_to_documents+str(to_save[0])+'.txt', 'w') as f:
                    f.write(to_save[1])
        except:
            print('Error in '+str(i))

['_BioCXMLDocumentReader__collection', '_BioCXMLDocumentReader__context', '_BioCXMLDocumentReader__document', '_BioCXMLDocumentReader__elem', '_BioCXMLDocumentReader__event', '_BioCXMLDocumentReader__has_next', '_BioCXMLDocumentReader__next_event', '_BioCXMLDocumentReader__passage', '_BioCXMLDocumentReader__read', '_BioCXMLDocumentReader__read_annotation', '_BioCXMLDocumentReader__read_relation', '_BioCXMLDocumentReader__state', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'file', 'get_collection_info']


1308it [00:02, 496.91it/s]


KeyboardInterrupt: 

In [4]:
import glob
import os
from tqdm import tqdm
count = 0
for filepath in tqdm(glob.iglob(path_to_documents+'/*.txt')):
    path_name = str(filepath)
    directory = path_to_documents + '-'.join(path_name.split('/')[-1].replace(".txt","").split("-")[:2]) +"/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.system("mv " + path_name + " " +directory+path_name.split('/')[-1])

17419it [01:06, 261.45it/s]


## PubMed from CORD-19

In [19]:
import csv
import os
import json
from collections import defaultdict
from dateutil import parser

In [6]:
def save_in_directory(filename, filecontent, directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory+filename+'.txt', 'w') as f:
        f.write(filecontent)

In [7]:
import glob
import os
from tqdm import tqdm
count = 0
def save_article(pubtime, abstract, full_text, folder, cord_uid):
    filename = pubtime+"-"+cord_uid
    save_in_directory(filename, abstract,folder+"abstract/"+pubtime+"/")
    save_in_directory(filename, full_text,folder+"fullpaper/"+pubtime+"/")

In [8]:
def get_published_time(timestr):
    try:
        date = parser.parse(timestr)
        year = date.strftime("%Y")   # Four-digit year
        month = date.month  # Full month name
        return str(year) + "-" + str(month)
    except:
        return None

In [24]:
# open the file
with open(cordmed_filename + "metadata.csv") as f_in:
    reader = csv.DictReader(f_in)
    for row in tqdm(reader):
        # access some metadata
        pubtime = get_published_time(row[ 'publish_time'])
        if pubtime is None:
            continue
        cord_uid = row['cord_uid']
        title = row['title']
        abstract = row['abstract']
        if(cord_uid == "h5hq6xmm"):
            print(get_published_time(row['publish_time']))
            break

969488it [01:04, 14973.64it/s]

2019-12





In [102]:
# open the file
with open(cordmed_filename + "metadata.csv") as f_in:
    reader = csv.DictReader(f_in)
    for row in reader:
        # access some metadata
        pubtime = get_published_time(row[ 'publish_time'])
        if pubtime is None:
            continue
        cord_uid = row['cord_uid']
        title = row['title']
        abstract = row['abstract']

        # access the full text (if available) for Intro
        full_text = ""
        if row['pdf_json_files']:
            for json_path in row['pdf_json_files'].split('; '):
                with open(cordmed_filename+json_path) as f_json:
                    full_text_dict = json.load(f_json)
                    if(len(full_text_dict['body_text']) > 0):
                        # grab full text from *some* version of the full text
                        for paragraph_dict in full_text_dict['body_text']:
                            paragraph_text = paragraph_dict['text']
                            section_name = paragraph_dict['section']
                            full_text += section_name + "\n" + paragraph_text + "\n"
                        break
        save_article(pubtime, abstract, full_text, path_to_documents, cord_uid)

In [136]:
from gatenlp import Document

In [137]:
from gatenlp import Document
from gatenlp.gateworker import GateWorker
gs = GateWorker(start=False, auth_token="1234")

In [141]:
import glob
import os
from tqdm import tqdm
count = 0
for filepath in tqdm(glob.iglob(path_to_documents+'/fullpaper/2019-12/*.txt')):
    filename = str(filepath).split("/")[-1].replace(".txt","")
    with open(filepath, "r") as f:
        pdoc = gs.worker.createDocument(f.read())
        pdoc.setName(filename)

728it [00:00, 1119.47it/s]


In [123]:
gs.worker

AttributeError: 'function' object has no attribute '_get_object_id'