# COVID19-related literature SQL database

In this notebook, we create a relational database dump of a set of COVID19-related publication datasets. These include:

* CORD19: https://pages.semanticscholar.org/coronavirus-research
* Dimensions: https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255
* WHO: https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov

In [157]:
# magics, warnings and imports

%load_ext autoreload
%autoreload 2
import warnings; warnings.simplefilter('ignore')

import os, random, codecs, json
import configparser
from urllib.parse import quote

import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine
from sqlalchemy import Integer,String,Boolean,DateTime
from sqlalchemy import Text
from tqdm.notebook import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load datasets

In [158]:
# Input locations: edit these to select the dataset releases to ingest.
dimensions_filename = "datasets_input/Dimensions_10_04_2020.csv"
who_filename = "datasets_input/WHO_10_04_2020.csv"
cord19_folder = "datasets_input/CORD19_2020_04_10"

# Every column of every input is read as a string (dtype=str).
df_dimensions, df_who, df_cord = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (
        dimensions_filename,
        who_filename,
        os.path.join(cord19_folder, "metadata.csv"),
    )
)

In [159]:
df_cord.shape

(51078, 18)

### Prepare dataframes for ingestion

#### Clean-up data frames

##### Dimensions

In [160]:
df_dimensions.head()

Unnamed: 0,Date added,Publication ID,DOI,PMID,PMCID,Title,Abstract,Source title,Source UID,Publisher,...,Research Organizations - standardized,GRID IDs,City of Research organization,Country of Research organization,Funder,UIDs of supporting grants,Times cited,Altmetric,Source Linkout,Dimensions URL
0,2020-04-07,pub.1126168922,10.21203/rs.3.rs-19507/v1,,,Therapeutic Preferences for Coronavirus 2(SARS...,"<title xmlns=""http://www.ncbi.nlm.nih.gov/JATS...",Research Square,jour.1380788,Research Square,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...
1,2020-04-07,pub.1126165761,10.2174/18742106020140111,,,Coronavirus Disease 2019 (COVID-19) Pandemic B...,,The Open Dentistry Journal,jour.1040388,Bentham Science Publishers,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...
2,2020-04-07,pub.1126166337,10.26434/chemrxiv.12061734,,,Computational design of ACE2-based short pepti...,<div>Peptide inhibitors against the SARS-CoV-2...,ChemRxiv,jour.1315496,American Chemical Society (ACS),...,,,,,,,0,3.0,,https://app.dimensions.ai/details/publication/...
3,2020-04-07,pub.1126169729,10.32388/rem7m2,,,"Review of ""Smoking, vaping and hospitalization...",,Qeios,jour.1336677,Qeios Ltd,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...
4,2020-04-07,pub.1126165682,10.21552/estal/2020/1/23,,,Spain COVID-19 ·Juan Jorge Piernas López,,European State Aid Law Quarterly,jour.1273450,Lexxion Verlag,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...


In [161]:
df_dimensions.columns

Index(['Date added', 'Publication ID', 'DOI', 'PMID', 'PMCID', 'Title',
       'Abstract', 'Source title', 'Source UID', 'Publisher', 'MeSH terms',
       'Publication Date', 'PubYear', 'Volume', 'Issue', 'Pagination',
       'Open Access', 'Publication Type', 'Authors', 'Corresponding Authors',
       'Authors Affiliations', 'Research Organizations - standardized',
       'GRID IDs', 'City of Research organization',
       'Country of Research organization', 'Funder',
       'UIDs of supporting grants', 'Times cited', 'Altmetric',
       'Source Linkout', 'Dimensions URL'],
      dtype='object')

In [162]:
# Dimensions columns that are not used in the rest of the notebook.
dimensions_unused_columns = [
    'Date added', 'Publisher', 'Authors', 'Corresponding Authors',
    'Authors Affiliations', 'Research Organizations - standardized',
    'GRID IDs', 'City of Research organization',
    'Country of Research organization', 'Funder',
    'UIDs of supporting grants', 'Times cited', 'Altmetric',
    'Source Linkout',
]
df_dimensions = df_dimensions.drop(columns=dimensions_unused_columns)

In [163]:
df_dimensions.columns

Index(['Publication ID', 'DOI', 'PMID', 'PMCID', 'Title', 'Abstract',
       'Source title', 'Source UID', 'MeSH terms', 'Publication Date',
       'PubYear', 'Volume', 'Issue', 'Pagination', 'Open Access',
       'Publication Type', 'Dimensions URL'],
      dtype='object')

In [164]:
# Map the Dimensions export headers onto the snake_case names used in the rest of the notebook.
dimensions_column_names = {
    'Publication ID': 'publication_id',
    'DOI': 'doi',
    'PMID': 'pmid',
    'PMCID': 'pmcid',
    'Title': 'title',
    'Abstract': 'abstract',
    'Source title': 'journal',
    'Source UID': 'source_uid',
    'MeSH terms': 'mesh_terms',
    'Publication Date': 'publication_date',
    'PubYear': 'publication_year',
    'Volume': 'volume',
    'Issue': 'issue',
    'Pagination': 'pages',
    'Open Access': 'open_access',
    'Publication Type': 'publication_type',
    'Dimensions URL': 'dimensions_url',
}
df_dimensions = df_dimensions.rename(columns=dimensions_column_names)

In [165]:
def get_year(date):
    """Return the four-digit year that starts ``date``, or "" when there is none.

    Parameters
    ----------
    date : str or missing value
        A date string whose first four characters are expected to be the year
        (e.g. "2020-04-07" or "2020"). Non-string values such as NaN or None,
        which pandas uses for missing cells, are treated as "no date".

    Returns
    -------
    str
        The first four characters when they are all digits, otherwise "".
    """
    # Missing cells arrive as float NaN / None; len() on them would raise TypeError.
    if not isinstance(date, str):
        return ""
    year = date[:4]
    if len(year) == 4 and year.isdigit():
        return year
    return ""

# English three-letter month abbreviations mapped to their month number (as a string).
month_to_number = {"Jan":"1","Feb":"2","Mar":"3","Apr":"4","May":"5","Jun":"6","Jul":"7","Aug":"8","Sep":"9","Oct":"10","Nov":"11","Dec":"12"}

def get_month(date):
    """Return the month of ``date`` as a string without leading zero, or "" when unknown.

    Two layouts are recognised, and only for strings longer than six characters:

    * dash-separated with a numeric second field, e.g. "2020-04-07" -> "4";
    * whitespace-separated with a month abbreviation as second token,
      e.g. "2020 Apr 7" -> "4".

    Parameters
    ----------
    date : str or missing value
        The raw date. Non-string values (NaN, None) are treated as "no date".

    Returns
    -------
    str
        "1" .. "12", or "" when no month can be extracted.
    """
    # Missing cells arrive as float NaN / None; len() on them would raise TypeError.
    if not isinstance(date, str) or len(date) <= 6:
        return ""
    if "-" in date:
        month_part = date.split("-")[1]
        if month_part.isdigit():
            return str(int(month_part))
    # Fall back to the "YYYY Mon DD" layout; unknown or absent tokens yield "".
    tokens = date.split()
    if len(tokens) > 1:
        return month_to_number.get(tokens[1], "")
    return ""

def sanitize_string(s):
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    tokens = s.split()
    return " ".join(tokens)

In [166]:
# Normalise the year and derive the month from the raw Dimensions date strings.
df_dimensions = df_dimensions.assign(
    publication_year=df_dimensions["publication_year"].map(get_year),
    publication_month=df_dimensions["publication_date"].map(get_month),
)

In [167]:
# The raw date is no longer needed; missing cells become empty strings.
df_dimensions = df_dimensions.drop(columns="publication_date").fillna('')

In [168]:
df_dimensions.head()

Unnamed: 0,publication_id,doi,pmid,pmcid,title,abstract,journal,source_uid,mesh_terms,publication_year,volume,issue,pages,open_access,publication_type,dimensions_url,publication_month
0,pub.1126168922,10.21203/rs.3.rs-19507/v1,,,Therapeutic Preferences for Coronavirus 2(SARS...,"<title xmlns=""http://www.ncbi.nlm.nih.gov/JATS...",Research Square,jour.1380788,,2020,,,,"All OA; Green, Submitted",preprint,https://app.dimensions.ai/details/publication/...,4
1,pub.1126165761,10.2174/18742106020140111,,,Coronavirus Disease 2019 (COVID-19) Pandemic B...,,The Open Dentistry Journal,jour.1040388,,2020,14.0,1.0,111-112,Closed,article,https://app.dimensions.ai/details/publication/...,4
2,pub.1126166337,10.26434/chemrxiv.12061734,,,Computational design of ACE2-based short pepti...,<div>Peptide inhibitors against the SARS-CoV-2...,ChemRxiv,jour.1315496,,2020,,,,"All OA; Green, Submitted",preprint,https://app.dimensions.ai/details/publication/...,4
3,pub.1126169729,10.32388/rem7m2,,,"Review of ""Smoking, vaping and hospitalization...",,Qeios,jour.1336677,,2020,,,,Closed,article,https://app.dimensions.ai/details/publication/...,4
4,pub.1126165682,10.21552/estal/2020/1/23,,,Spain COVID-19 ·Juan Jorge Piernas López,,European State Aid Law Quarterly,jour.1273450,,2020,19.0,1.0,96-97,Closed,article,https://app.dimensions.ai/details/publication/...,1


In [169]:
df_dimensions[df_dimensions.doi==""].shape

(335, 17)

##### WHO

In [170]:
df_who.head()

Unnamed: 0,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags
0,COVID-19 causes more delay in managed care lau...,,The state of Illinois is further delaying the ...,2020,,Mental Health Weekly,30.0,12.0,8-8,,10.1002/mhw.32288,12343,#11606,,,"* Opinion piece; Ethics, social science, econo..."
1,Daily Situation Report on Coronavirus disease ...,,After detection of the first confirmed cases o...,2020,,Arch Acad Emerg Med,,,,32259119.0,,50417,#45346,,,
2,Daily Situation Report on Coronavirus disease ...,,After detection of the first confirmed cases o...,2020,,Arch Acad Emerg Med,,,,32259120.0,,50416,#45347,,,
3,Daily Situation Report on Coronavirus disease ...,,After detection of the first confirmed cases o...,2020,,Arch Acad Emerg Med,,,,32259121.0,,50415,#45348,,,
4,Daily Situation Report on Coronavirus disease ...,,The main strategy of the Ministry of Health (M...,2020,,Arch Acad Emerg Med,,,,32259123.0,,50414,#45349,,,


In [171]:
df_who.columns

Index(['Title', 'Authors', 'Abstract', 'Published Year', 'Published Month',
       'Journal', 'Volume', 'Issue', 'Pages', 'Accession Number', 'DOI', 'Ref',
       'Covidence #', 'Study', 'Notes', 'Tags'],
      dtype='object')

In [172]:
df_who.drop(columns="Authors", inplace=True)

In [173]:
# Map the WHO export headers onto the snake_case names used in the rest of the notebook.
who_column_names = {
    'Title': 'title',
    'Abstract': 'abstract',
    'Published Year': 'publication_year',
    'Published Month': 'publication_month',
    'Journal': 'journal',
    'Volume': 'volume',
    'Issue': 'issue',
    'Pages': 'pages',
    'Accession Number': 'accession_number',
    'DOI': 'doi',
    'Ref': 'ref',
    'Covidence #': 'covidence',
    'Study': 'study',
    'Notes': 'notes',
    'Tags': 'tags',
}
df_who = df_who.rename(columns=who_column_names)

In [174]:
# Add empty pmid/pmcid columns and turn missing cells into empty strings.
df_who = df_who.assign(pmid="", pmcid="").fillna('')

In [175]:
df_who.head()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,accession_number,doi,ref,covidence,study,notes,tags,pmid,pmcid
0,COVID-19 causes more delay in managed care lau...,The state of Illinois is further delaying the ...,2020,,Mental Health Weekly,30.0,12.0,8-8,,10.1002/mhw.32288,12343,#11606,,,"* Opinion piece; Ethics, social science, econo...",,
1,Daily Situation Report on Coronavirus disease ...,After detection of the first confirmed cases o...,2020,,Arch Acad Emerg Med,,,,32259119.0,,50417,#45346,,,,,
2,Daily Situation Report on Coronavirus disease ...,After detection of the first confirmed cases o...,2020,,Arch Acad Emerg Med,,,,32259120.0,,50416,#45347,,,,,
3,Daily Situation Report on Coronavirus disease ...,After detection of the first confirmed cases o...,2020,,Arch Acad Emerg Med,,,,32259121.0,,50415,#45348,,,,,
4,Daily Situation Report on Coronavirus disease ...,The main strategy of the Ministry of Health (M...,2020,,Arch Acad Emerg Med,,,,32259123.0,,50414,#45349,,,,,


In [176]:
df_who[df_who.doi==""].shape

(620, 17)

##### CORD19

In [177]:
df_cord.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [178]:
# NEW columns (for now, we drop)
cord_new_columns = ["cord_uid", "url", "has_pmc_xml_parse"]
df_cord = df_cord.drop(columns=cord_new_columns)

In [179]:
df_cord.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_pdf_parse',
       'full_text_file'],
      dtype='object')

In [180]:
# Author lists are dropped; missing cells become empty strings.
df_cord = df_cord.drop(columns='authors').fillna('')

In [181]:
# Rename the CORD19 headers that differ from the names used in the rest of the notebook.
cord_column_names = {
    'source_x': 'source',
    'pubmed_id': 'pmid',
    'Microsoft Academic Paper ID': 'ms_academic_id',
    'WHO #Covidence': 'who_covidence',
    'has_pdf_parse': 'has_full_text',
}
df_cord = df_cord.rename(columns=cord_column_names)

In [182]:
# Derive year and month strings from the CORD19 publish_time column.
df_cord = df_cord.assign(
    publication_year=df_cord["publish_time"].map(get_year),
    publication_month=df_cord["publish_time"].map(get_month),
)

In [183]:
df_cord.drop(columns='publish_time', inplace=True)

In [184]:
# Add empty pages/volume/issue columns so CORD19 has the same columns as the other sources.
df_cord = df_cord.assign(pages="", volume="", issue="")

In [185]:
df_cord.head()

Unnamed: 0,sha,source,title,doi,pmcid,pmid,license,abstract,journal,ms_academic_id,who_covidence,has_full_text,full_text_file,publication_year,publication_month,pages,volume,issue
0,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",BMC Public Health,,,True,custom_license,2003,1,,,
1,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001,no-cc,Recent analyses of human pathogens have reveal...,Genome Biol,,,True,custom_license,2003,4,,,
2,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350,no-cc,"The army of the men of death, in John Bunyan's...",Genome Biol,,,False,custom_license,2003,6,,,
3,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,BMC Med Genet,,,True,custom_license,2003,9,,,
4,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,BMC Infect Dis,,,True,custom_license,2003,9,,,


In [186]:
df_cord[(df_cord.doi=="") & ((df_cord.sha!="") | (df_cord.pmid!="") | (df_cord.pmcid!=""))].shape

(3038, 18)

In [187]:
df_dimensions.shape

(9332, 17)

In [188]:
df_who.shape

(5362, 17)

In [189]:
df_cord.shape

(51078, 18)

### Prepare tables

In [190]:
# the main table: pub

In [253]:
# Columns shared by all three sources; together they form the main `pub` table.
pub_table_columns = ['title','abstract','publication_year','publication_month','journal','volume','issue','pages','doi','pmid','pmcid']

# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
df_pub = pd.concat([df_dimensions[pub_table_columns], df_who[pub_table_columns]], ignore_index=True)

In [254]:
# Stack the CORD19 rows below the Dimensions and WHO rows.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
df_pub = pd.concat([df_pub[pub_table_columns], df_cord[pub_table_columns]], ignore_index=True)

In [256]:
# Collapse whitespace in the free-text columns.
for text_column in ("title", "abstract"):
    df_pub[text_column] = df_pub[text_column].map(sanitize_string)
# Lower-case the identifiers.
for id_column in ("doi", "pmid", "pmcid"):
    df_pub[id_column] = df_pub[id_column].map(str.lower)

In [257]:
df_pub.shape

(65772, 11)

In [258]:
df_pub[(df_pub.doi=="") & (df_pub.pmid=="") & (df_pub.pmcid=="")].shape

(1193, 11)

In [259]:
# Keep only publications carrying at least one identifier (DOI, PMID or PMCID).
# Rows without any are dropped: hopefully, they will be equipped with an identifier in future releases.
has_identifier = (df_pub.doi!="") | (df_pub.pmid!="") | (df_pub.pmcid!="")
df_pub = df_pub[has_identifier]

In [260]:
# Deduplicate in three passes (DOI, then PMID, then PMCID). Each pass only groups rows
# whose key is non-empty, so rows with an empty identifier are never merged together.
df_tmp = df_pub[df_pub.doi==""]      # rows without a DOI
df_tmp2 = df_tmp[df_tmp.pmid==""]    # rows without a DOI and without a PMID
df_pub = df_pub[df_pub.doi!=""].groupby('doi').first().reset_index()
df_pub2 = df_tmp[df_tmp.pmid!=""].groupby('pmid').first().reset_index()
df_pub3 = df_tmp2[df_tmp2.pmcid!=""].groupby('pmcid').first().reset_index()

In [261]:
# Stack the three deduplicated groups (keyed by DOI, by PMID, by PMCID) back into one frame.
df_pub = pd.concat([df_pub,df_pub2,df_pub3])

In [262]:
# Renumber the rows from 0 and expose that position as the primary key column.
df_pub = df_pub.reset_index(drop=True)
df_pub["pub_id"] = df_pub.index.values

In [263]:
df_pub.shape

(57066, 12)

In [264]:
df_pub.tail()

Unnamed: 0,doi,title,abstract,publication_year,publication_month,journal,volume,issue,pages,pmid,pmcid,pub_id
57061,,Daily Situation Report on Coronavirus disease ...,After detection of the first confirmed cases o...,2020,3,Arch Acad Emerg Med,,,,,pmc7114930,57061
57062,,Daily Situation Report on Coronavirus disease ...,The main strategy of the Ministry of Health (M...,2020,3,Arch Acad Emerg Med,,,,,pmc7114933,57062
57063,,Epidemiological and Clinical Aspects of COVID-...,There are significant misconceptions and many ...,2020,4,Arch Acad Emerg Med,,,,,pmc7117787,57063
57064,,Covid 19 pandemic and gynaecological laparosco...,"<p xmlns=""https://jats.nlm.nih.gov/ns/archivin...",2020,4,"Facts, views & vision in ObGyn",12.0,1.0,3-7,,pmc7117791,57064
57065,,Laboratory Parameters in Detection of COVID-19...,INTRODUCTION: The role of laboratory parameter...,2020,4,Arch Acad Emerg Med,,,,,pmc7130449,57065


In [265]:
df_pub.dtypes

doi                  object
title                object
abstract             object
publication_year     object
publication_month    object
journal              object
volume               object
issue                object
pages                object
pmid                 object
pmcid                object
pub_id                int64
dtype: object

In [269]:
# create other tables via joins

# One row per data source; the row index later serves as the datasource id.
df_datasource = pd.DataFrame.from_dict({
    "source":["CORD19","Dimensions","WHO"],
    "url":[
        "https://pages.semanticscholar.org/coronavirus-research",
        "https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255",
        "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov",
    ],
})

# Source-specific columns plus the identifiers needed to link back to df_pub.
# Explicit copies are taken because these frames are assigned to afterwards; writing
# into a bare column selection triggers pandas' SettingWithCopyWarning.
df_cord_metadata = df_cord[['source','license','full_text_file','ms_academic_id','who_covidence','doi','pmid','pmcid','sha']].copy()
df_who_metadata = df_who[['accession_number', 'doi', 'ref',
       'covidence', 'study', 'notes', 'tags', 'pmid', 'pmcid']].copy()
df_dimensions_metadata = df_dimensions[['publication_id', 'doi', 'pmid', 'pmcid', 'source_uid', 'mesh_terms',
       'open_access', 'publication_type', 'dimensions_url']].copy()

In [270]:
# Lower-case the identifiers of every metadata frame, as was done for df_pub.
for metadata_frame in (df_cord_metadata, df_who_metadata, df_dimensions_metadata):
    for id_column in ("doi", "pmid", "pmcid"):
        metadata_frame[id_column] = metadata_frame[id_column].apply(str.lower)

In [271]:
df_datasource.head()

Unnamed: 0,source,url
0,CORD19,https://pages.semanticscholar.org/coronavirus-...
1,Dimensions,https://docs.google.com/spreadsheets/d/1-kTZJZ...
2,WHO,https://www.who.int/emergencies/diseases/novel...


In [272]:
# CORD19 metadata

In [273]:
# Link each CORD19 row to a publication through the first identifier it carries
# (DOI, then PMID, then PMCID), so that empty identifiers are never used as join keys.
df_tmp = df_cord_metadata[df_cord_metadata.doi==""]    # rows without a DOI
df_tmp2 = df_tmp[df_tmp.pmid==""]                      # rows without a DOI and without a PMID
df_cord_metadata1 = df_cord_metadata[df_cord_metadata.doi!=""].merge(df_pub[['pub_id','doi']], how='inner', on='doi')
df_cord_metadata2 = df_tmp[df_tmp.pmid!=""].merge(df_pub[['pub_id','pmid']], how='inner', on='pmid')
df_cord_metadata3 = df_tmp2[df_tmp2.pmcid!=""].merge(df_pub[['pub_id','pmcid']], how='inner', on='pmcid')

In [274]:
# Keep one metadata row per identifier value within each group.
df_cord_metadata1 = df_cord_metadata1.groupby("doi").first().reset_index()
df_cord_metadata2 = df_cord_metadata2.groupby("pmid").first().reset_index()
df_cord_metadata3 = df_cord_metadata3.groupby("pmcid").first().reset_index()

In [275]:
# Stack the DOI-, PMID- and PMCID-linked groups back into one CORD19 metadata frame.
df_cord_metadata = pd.concat([df_cord_metadata1,df_cord_metadata2,df_cord_metadata3])

In [276]:
df_cord_metadata.shape

(50759, 10)

In [210]:
# Read the parsed full texts (one JSON file per paper) from the CORD19 release.
folders = ['biorxiv_medrxiv/pdf_json','comm_use_subset/pdf_json','custom_license/pdf_json','noncomm_use_subset/pdf_json']
shas = list()
full_texts = list()

for folder in folders:
    for root, dirs, files in os.walk(os.path.join(cord19_folder,folder)):
        for file in tqdm(files):
            # Test the extension itself: a substring check would also accept e.g. "x.json.bak".
            if not file.endswith(".json"):
                continue
            # Read as UTF-8 regardless of the platform default; the context manager
            # closes each handle as soon as the file has been parsed.
            with open(os.path.join(root,file), encoding="utf-8") as json_file:
                data = json.load(json_file)
            sha = data["paper_id"]
            # One whitespace-normalised string per paper: body sections joined by a space.
            full_text = " ".join(sanitize_string(section["text"]) for section in data["body_text"])
            shas.append(sha)
            full_texts.append(full_text)

HBox(children=(FloatProgress(value=0.0, max=1625.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9524.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26505.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2490.0), HTML(value='')))




In [211]:
# Two-column frame pairing each paper id (sha) with its concatenated body text.
fulltext_columns = {"sha": shas, "full_text": full_texts}
df_cord_fulltext = pd.DataFrame.from_dict(fulltext_columns)

In [277]:
df_cord_fulltext.shape

(40144, 2)

In [278]:
# Attach the full text to each CORD19 row by sha; rows without a parsed file get "".
df_cord_metadata = (
    df_cord_metadata
    .merge(df_cord_fulltext, how='left', on='sha')
    .fillna('')
    .rename(columns={"id":"pub_id"})
)

In [279]:
df_cord_metadata.head()

Unnamed: 0,doi,source,license,full_text_file,ms_academic_id,who_covidence,pmid,pmcid,sha,pub_id,full_text
0,0.1126/science.abb7331,WHO,unk,,,#8463,,,,0,
1,10.0376/cma.j.issn.0376-2491.2020.0002,WHO,unk,,3003451419.0,#615,32036640.0,,,1,
2,10.1001/archinte.168.22.2489,PMC,unk,,,,19064834.0,pmc2783624,,2,
3,10.1001/jama.2010.675,PMC,unk,,,,20501927.0,pmc2968755,,3,
4,10.1001/jama.2014.2116,PMC,unk,,,,24566924.0,pmc6689404,,4,


In [280]:
# WHO and Dimensions metadata

In [281]:
# Link each WHO row to a publication through the first identifier it carries
# (DOI, then PMID, then PMCID); empty identifiers are never used as join keys.
df_tmp = df_who_metadata[df_who_metadata.doi==""]    # rows without a DOI
df_tmp2 = df_tmp[df_tmp.pmid==""]                    # rows without a DOI and without a PMID
df_who_metadata1 = df_who_metadata[df_who_metadata.doi!=""].merge(df_pub[['pub_id','doi']], how='inner', on='doi')
df_who_metadata2 = df_tmp[df_tmp.pmid!=""].merge(df_pub[['pub_id','pmid']], how='inner', on='pmid')
df_who_metadata3 = df_tmp2[df_tmp2.pmcid!=""].merge(df_pub[['pub_id','pmcid']], how='inner', on='pmcid')

In [282]:
# Keep one metadata row per identifier value within each group.
df_who_metadata1 = df_who_metadata1.groupby("doi").first().reset_index()
df_who_metadata2 = df_who_metadata2.groupby("pmid").first().reset_index()
df_who_metadata3 = df_who_metadata3.groupby("pmcid").first().reset_index()

In [283]:
# Stack the DOI-, PMID- and PMCID-linked groups back into one WHO metadata frame.
df_who_metadata = pd.concat([df_who_metadata1,df_who_metadata2,df_who_metadata3])

In [284]:
df_who_metadata.shape

(4525, 10)

In [285]:
# NOTE(review): the merges above already bring the key in as "pub_id", so there appears to be
# no "id" column left to rename and this call looks like a no-op — confirm before removing.
df_who_metadata.rename(columns={"id":"pub_id"},inplace=True)

In [286]:
# Link each Dimensions row to a publication through the first identifier it carries
# (DOI, then PMID, then PMCID); empty identifiers are never used as join keys.
df_tmp = df_dimensions_metadata[df_dimensions_metadata.doi==""]    # rows without a DOI
df_tmp2 = df_tmp[df_tmp.pmid==""]                                  # rows without a DOI and without a PMID
df_dimensions_metadata1 = df_dimensions_metadata[df_dimensions_metadata.doi!=""].merge(df_pub[['pub_id','doi']], how='inner', on='doi')
df_dimensions_metadata2 = df_tmp[df_tmp.pmid!=""].merge(df_pub[['pub_id','pmid']], how='inner', on='pmid')
df_dimensions_metadata3 = df_tmp2[df_tmp2.pmcid!=""].merge(df_pub[['pub_id','pmcid']], how='inner', on='pmcid')

In [287]:
# Keep one metadata row per identifier value within each group.
df_dimensions_metadata1 = df_dimensions_metadata1.groupby("doi").first().reset_index()
df_dimensions_metadata2 = df_dimensions_metadata2.groupby("pmid").first().reset_index()
df_dimensions_metadata3 = df_dimensions_metadata3.groupby("pmcid").first().reset_index()

In [288]:
# Stack the DOI-, PMID- and PMCID-linked groups back into one Dimensions metadata frame.
df_dimensions_metadata = pd.concat([df_dimensions_metadata1,df_dimensions_metadata2,df_dimensions_metadata3])

In [289]:
df_dimensions_metadata.shape

(9029, 10)

In [290]:
# NOTE(review): the merges above already bring the key in as "pub_id", so there appears to be
# no "id" column left to rename and this call looks like a no-op — confirm before removing.
df_dimensions_metadata.rename(columns={"id":"pub_id"},inplace=True)

In [291]:
# Create datasource tables

In [292]:
def _lookup_source_id(source_name):
    """Return the df_datasource index label of the first row whose source equals ``source_name``."""
    matching_rows = df_datasource[df_datasource.source==source_name]
    return matching_rows.index.values[0]

cord_source_id = _lookup_source_id("CORD19")
who_source_id = _lookup_source_id("WHO")
dimensions_source_id = _lookup_source_id("Dimensions")

In [293]:
# Tag every metadata row with the id of the data source it came from.
for metadata_frame, source_id in (
    (df_cord_metadata, cord_source_id),
    (df_who_metadata, who_source_id),
    (df_dimensions_metadata, dimensions_source_id),
):
    metadata_frame["source_id"] = source_id

In [294]:
# Association table: one (pub_id, source_id) pair per metadata row, CORD19 first, then WHO, then Dimensions.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
df_pub_to_datasource = pd.concat(
    [
        metadata_frame[["pub_id","source_id"]]
        for metadata_frame in (df_cord_metadata, df_who_metadata, df_dimensions_metadata)
    ],
    ignore_index=True,
)

In [295]:
# One row per distinct (publication, source) pair; the source column is renamed to datasource_id.
df_pub_to_datasource = (
    df_pub_to_datasource
    .drop_duplicates()
    .rename(columns={"source_id":"datasource_id"})
)

In [296]:
df_pub_to_datasource.shape

(64297, 2)

In [297]:
df_pub_to_datasource[df_pub_to_datasource.pub_id==22787]

Unnamed: 0,pub_id,datasource_id
21801,22787,0


In [298]:
# The identifiers and source_id were only needed for linking; pub_id is kept.
link_only_columns = ['doi','pmid','pmcid','source_id']
for metadata_frame in (df_cord_metadata, df_who_metadata, df_dimensions_metadata):
    metadata_frame.drop(columns=link_only_columns, inplace=True)

In [299]:
# Renumber every table from 0 and expose that position as its primary key column.
for table_frame, pk_column in (
    (df_cord_metadata, "cord19_metadata_id"),
    (df_who_metadata, "who_metadata_id"),
    (df_dimensions_metadata, "dimensions_metadata_id"),
    (df_datasource, "datasource_metadata_id"),
):
    table_frame.reset_index(drop=True, inplace=True)
    table_frame[pk_column] = table_frame.index.values

In [300]:
# Convert the string columns that hold numbers into numeric dtypes.
for numeric_column in ("publication_year", "publication_month", "pmid"):
    df_pub[numeric_column] = pd.to_numeric(df_pub[numeric_column])

In [301]:
# add timestamp
# pd.Timestamp.now() is evaluated once, so every row receives the same value.
df_pub["timestamp"] = pd.Timestamp.now()

In [302]:
# clean-up text (optional)
# Substrings that clean_up() removes from a text.
# NOTE(review): as written the only entry is the empty string, and str.replace("", "") leaves
# the text unchanged — confirm whether a (possibly non-printable) character was intended here.
replaces = [""]

def clean_up(txt):
    """Remove every substring listed in ``replaces`` from ``txt``, then round-trip it through UTF-8.

    Characters that cannot be encoded as UTF-8 are dropped by the ``'ignore'`` error handler.
    """
    for r in replaces:
        txt = txt.replace(r,"")
    return txt.encode('utf8', 'ignore').decode('utf8')
# Apply the clean-up to every abstract of the publication table.
df_pub["abstract"] = [clean_up(a) for a in df_pub["abstract"].values]

In [303]:
df_pub.head()

Unnamed: 0,doi,title,abstract,publication_year,publication_month,journal,volume,issue,pages,pmid,pmcid,pub_id,timestamp
0,0.1126/science.abb7331,‘A ticking time bomb’: Scientists worry about ...,"CAPE TOWN, SOUTH AFRICA—Late on Sunday evening...",2020.0,,Science,,,,,,0,2020-04-13 08:51:40.021610
1,10.0376/cma.j.issn.0376-2491.2020.0002,[Ten hot issues of breast cancer under the nov...,,2020.0,2.0,Chinese medical journal,100.0,0.0,e002,32036640.0,,1,2020-04-13 08:51:40.021610
2,10.1001/archinte.168.22.2489,Another Piece of the Puzzle: Human Metapneumov...,BACKGROUND: Each winter respiratory viruses ac...,2008.0,12.0,Archives of Internal Medicine,,,,19064834.0,pmc2783624,2,2020-04-13 08:51:40.021610
3,10.1001/jama.2010.675,Viral etiology of severe pneumonia among Kenya...,CONTEXT: Pneumonia is the leading cause of chi...,2010.0,5.0,JAMA,,,,20501927.0,pmc2968755,3,2020-04-13 08:51:40.021610
4,10.1001/jama.2014.2116,Critically Ill Patients With Influenza A(H1N1)...,,2014.0,4.0,JAMA,,,,24566924.0,pmc6689404,4,2020-04-13 08:51:40.021610


In [304]:
# reorder the columns to match the SQL schema

df_datasource.columns

Index(['source', 'url', 'datasource_metadata_id'], dtype='object')

In [305]:
# Column order of each exported table, primary key first for the metadata tables.
pub_columns = ['pub_id', 'title', 'abstract', 'publication_year', 'publication_month', 'journal',
               'volume', 'issue', 'pages', 'doi', 'pmid', 'pmcid', 'timestamp']
who_metadata_columns = ['who_metadata_id', 'accession_number', 'ref', 'covidence', 'study', 'notes',
                        'tags', 'pub_id']
dimensions_metadata_columns = ['dimensions_metadata_id', 'publication_id', 'source_uid', 'open_access',
                               'publication_type', 'dimensions_url', 'mesh_terms', 'pub_id']
cord_metadata_columns = ['cord19_metadata_id', 'source', 'license', 'full_text_file', 'ms_academic_id',
                         'who_covidence', 'sha', 'full_text', 'pub_id']
datasource_columns = ['datasource_metadata_id', 'source', 'url']

df_pub = df_pub[pub_columns]
df_who_metadata = df_who_metadata[who_metadata_columns]
df_dimensions_metadata = df_dimensions_metadata[dimensions_metadata_columns]
df_cord_metadata = df_cord_metadata[cord_metadata_columns]
df_datasource = df_datasource[datasource_columns]

In [306]:
df_pub.doi.value_counts()

                                              3081
10.1007/978-3-319-54093-1_1                      1
10.1038/425915a                                  1
10.1016/s0924-8579(05)80083-4                    1
10.1186/1471-2334-10-82                          1
                                              ... 
10.1186/1471-2334-5-87                           1
https://doi.org/10.1016/j.jmii.2020.03.004       1
10.1016/j.hrtlng.2015.02.007                     1
10.1186/s12939-016-0358-0                        1
10.1016/j.prevetmed.2015.04.009                  1
Name: doi, Length: 53986, dtype: int64

In [307]:
df_pub[df_pub.doi == "10.1016/s0140-6736(20)30607-3"].doi.to_string()

'22681    10.1016/s0140-6736(20)30607-3'

### Dump to CSV

In [309]:
### Export the df_pub dataframe for further use
# compression="gzip" is passed explicitly, so the file is gzip-compressed even though its
# name ends in ".csv"; readers must pass compression="gzip" as well.

df_pub.to_csv("dataset_output/df_pub.csv", compression="gzip", index=False)

In [310]:
# export TSV for ingestion
# Each table is written tab-separated, without header row and without index column.
tsv_exports = (
    (df_pub, "dataset_output/sql_tables/pub.csv"),
    (df_cord_metadata, "dataset_output/sql_tables/cord19_metadata.csv"),
    (df_dimensions_metadata, "dataset_output/sql_tables/dimensions_metadata.csv"),
    (df_who_metadata, "dataset_output/sql_tables/who_metadata.csv"),
    (df_datasource, "dataset_output/sql_tables/datasource.csv"),
    (df_pub_to_datasource, "dataset_output/sql_tables/pub_datasource.csv"),
)
for table_frame, tsv_path in tsv_exports:
    table_frame.to_csv(tsv_path, index=False, sep="\t", header=False)

### Dump to MySQL

Use this if you want to create a MySQL db.

In [None]:
# Column types pandas uses when it has to create the `pub` table itself
# (they are not applied when appending to a table that already exists).
# Text is used for the string columns: a String type without a length cannot be
# compiled to a MySQL VARCHAR, so table creation would fail with the bare String class.
dtype_dict = {
    'pub_id': Integer,
    'title': Text,
    'abstract': Text,
    'publication_year': Integer,
    'publication_month': Integer,
    'journal': Text,
    'volume': Text,
    'issue': Text,
    'pages': Text,
    'doi': Text,
    'pmid': Integer,
    'pmcid': Text,
    'timestamp': DateTime,
}

In [None]:
# read the MySQL credentials from the local configuration file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open, which would otherwise only
# surface below as an unexplained KeyError on "MYSQL"; fail early with the file name.
if not config.read("credentials/conf.ini"):
    raise FileNotFoundError("credentials/conf.ini")
mysql_username = config["MYSQL"]["username"]
mysql_password = config["MYSQL"]["password"]
mysql_database = config["MYSQL"]["database"]

In [None]:
# Percent-encode the credentials: characters such as "@", ":" or "/" in the user name or
# password would otherwise be parsed as URL delimiters and corrupt the connection string.
sqlEngine = create_engine('mysql+pymysql://%s:%s@127.0.0.1/%s'%(quote(mysql_username, safe=""),quote(mysql_password, safe=""),mysql_database), pool_recycle=3600)
dbConnection = sqlEngine.connect()

In [None]:
# main table
# Rows are appended to `pub`; any error is printed rather than raised, and the
# connection is closed in every case.
table_name = "pub"
try:
    frame = df_pub.to_sql(
        table_name,
        dbConnection,
        if_exists='append',
        index=False,
        index_label="pub_id",
        dtype=dtype_dict,
    )
except Exception as ex:
    print(ex)
else:
    print("Table %s created successfully."%table_name)
finally:
    dbConnection.close()

In [None]:
# Percent-encode the credentials: characters such as "@", ":" or "/" in the user name or
# password would otherwise be parsed as URL delimiters and corrupt the connection string.
sqlEngine = create_engine('mysql+pymysql://%s:%s@127.0.0.1/%s'%(quote(mysql_username, safe=""),quote(mysql_password, safe=""),mysql_database), pool_recycle=3600)
dbConnection = sqlEngine.connect()

In [None]:
# other tables
# The three metadata frames already carry their primary key as a regular column
# (cord19_metadata_id, who_metadata_id, dimensions_metadata_id). Writing the index under
# the same label made to_sql fail with "duplicate name in index/columns", so the index
# is no longer written for them.
try:
    frame = df_cord_metadata.to_sql("cord19_metadata", dbConnection, if_exists='append', index=False)
    frame = df_who_metadata.to_sql("who_metadata", dbConnection, if_exists='append', index=False)
    frame = df_dimensions_metadata.to_sql("dimensions_metadata", dbConnection, if_exists='append', index=False)
    # NOTE(review): this writes the index as "datasource_id" in addition to the frame's own
    # "datasource_metadata_id" column, i.e. four columns versus the three of the CSV dump —
    # confirm which key name the SQL schema expects.
    frame = df_datasource.to_sql("datasource", dbConnection, if_exists='append', index=True, index_label="datasource_id")
except ValueError as vx:
    print(vx)
except Exception as ex:
    print(ex)
else:
    print("Tables created successfully.")
finally:
    dbConnection.close()

In [None]:
# Percent-encode the credentials: characters such as "@", ":" or "/" in the user name or
# password would otherwise be parsed as URL delimiters and corrupt the connection string.
sqlEngine = create_engine('mysql+pymysql://%s:%s@127.0.0.1/%s'%(quote(mysql_username, safe=""),quote(mysql_password, safe=""),mysql_database), pool_recycle=3600)
dbConnection = sqlEngine.connect()

In [None]:
# last table
# Rows are appended to `pub_datasource`; any error is printed rather than raised, and the
# connection is closed in every case.
try:
    frame = df_pub_to_datasource.to_sql(
        "pub_datasource",
        dbConnection,
        if_exists='append',
        index=False,
        index_label=["pub_id","datasource_id"],
    )
except Exception as ex:
    print(ex)
else:
    print("Table created successfully.")
finally:
    dbConnection.close()