# COVID19-related literature SQL database

In this notebook, we create a relational database dump of a set of COVID19-related publication datasets. These include:

* CORD19: https://pages.semanticscholar.org/coronavirus-research
* Dimensions: https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255
* WHO: https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov

In [1]:
# magics, warnings and inports

%load_ext autoreload
%autoreload 2
import warnings; warnings.simplefilter('ignore')

import os, random, codecs, json
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pymysql
from sqlalchemy import create_engine
from sqlalchemy import Integer,String,Boolean,DateTime

#### Load datasets

In [2]:
# point here to the versions of the datasets you want to use
dimensions_filename = "datasets_input/Dimensions_28_03_2020.csv"
who_filename = "datasets_input/WHO_28_03_2020.csv"
cord19_folder = "datasets_input/CORD19_2020_03_28"

df_dimensions = pd.read_csv(dimensions_filename, dtype=str)
df_who = pd.read_csv(who_filename, dtype=str)
df_cord = pd.read_csv(os.path.join(cord19_folder,"metadata.csv"), dtype=str)

### Prepare dataframes for ingestion

#### Clean-up data frames

##### Dimensions

In [3]:
df_dimensions.head()

Unnamed: 0,Date added,Publication ID,DOI,PMID,PMCID,Title,Abstract,Source title,Source UID,Publisher,...,Research Organizations - standardized,GRID IDs,City of Research organization,Country of Research organization,Funder,UIDs of supporting grants,Times cited,Altmetric,Source Linkout,Dimensions URL
0,2020-03-23,pub.1125820003,10.29333/ejgm/7850,,,The Possible Immunological Pathways for the Va...,,Electronic Journal of General Medicine,jour.1371209,Modestum,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...
1,2020-03-23,pub.1125819565,10.21203/rs.3.rs-18190/v1,,,A Method of Estimating Time-to-Recovery for a ...,,Research Square,jour.1380788,Research Square,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...
2,2020-03-23,pub.1125819661,10.2196/preprints.18821,,,Preparation for the quarantine of the cruise s...,,JMIR Preprints,jour.1345647,JMIR Publications,...,,,,,,,0,,http://dx.doi.org/10.2196/preprints.18821,https://app.dimensions.ai/details/publication/...
3,2020-03-23,pub.1125818201,10.26355/eurrev_202003_20551,32196628.0,,Differences and similarities between Severe Ac...,,European review for medical and pharmacologica...,jour.1092032,,...,University of Catania,grid.8158.4,Catania,Italy,,,0,1.0,,https://app.dimensions.ai/details/publication/...
4,2020-03-23,pub.1125818269,10.12116/j.issn.1004-5619.2020.01.001,32198983.0,,From SARS-CoV to SARS-CoV-2: The response and ...,Abstract:,Fa yi xue za zhi,jour.1108124,,...,Southern Medical University,grid.284723.8,Guangzhou,China,,,0,,,https://app.dimensions.ai/details/publication/...


In [4]:
df_dimensions.columns

Index(['Date added', 'Publication ID', 'DOI', 'PMID', 'PMCID', 'Title',
       'Abstract', 'Source title', 'Source UID', 'Publisher', 'MeSH terms',
       'Publication Date', 'PubYear', 'Volume', 'Issue', 'Pagination',
       'Open Access', 'Publication Type', 'Authors', 'Corresponding Authors',
       'Authors Affiliations', 'Research Organizations - standardized',
       'GRID IDs', 'City of Research organization',
       'Country of Research organization', 'Funder',
       'UIDs of supporting grants', 'Times cited', 'Altmetric',
       'Source Linkout', 'Dimensions URL'],
      dtype='object')

In [5]:
df_dimensions.drop(columns=['Date added', 'Publisher', 'Authors', 'Corresponding Authors',
       'Authors Affiliations', 'Research Organizations - standardized',
       'GRID IDs', 'City of Research organization',
       'Country of Research organization', 'Funder',
       'UIDs of supporting grants', 'Times cited', 'Altmetric',
       'Source Linkout'], inplace=True)

In [6]:
df_dimensions.columns

Index(['Publication ID', 'DOI', 'PMID', 'PMCID', 'Title', 'Abstract',
       'Source title', 'Source UID', 'MeSH terms', 'Publication Date',
       'PubYear', 'Volume', 'Issue', 'Pagination', 'Open Access',
       'Publication Type', 'Dimensions URL'],
      dtype='object')

In [7]:
df_dimensions.rename(columns={'Publication ID':'publication_id', 'DOI':'doi', 'PMID':'pmid', 'PMCID':'pmcid', 'Title':'title', 'Abstract':'abstract',
       'Source title':'journal', 'Source UID':'source_uid', 'MeSH terms':'mesh_terms', 'Publication Date':'publication_date',
       'PubYear':'publication_year', 'Volume':'volume', 'Issue':'issue', 'Pagination':'pages', 'Open Access':'open_access',
       'Publication Type':'publication_type', 'Dimensions URL':'dimensions_url'}, inplace=True)

In [8]:
def get_year(date):
    if len(date)>3 and date[:4].isdigit():
        return date[:4]
    return ""

month_to_number = {"Jan":"1","Feb":"2","Mar":"3","Apr":"4","May":"5","Jun":"6","Jul":"7","Aug":"8","Sep":"9","Oct":"10","Nov":"11","Dec":"12"}

def get_month(date):
    if len(date)>6:
        if "-" in date and date.split("-")[1].isdigit():
            return str(int(date.split("-")[1]))
        else:
            try:
                return month_to_number[date.split()[1]]
            except:
                return ""
    return ""

In [9]:
df_dimensions["publication_year"] = df_dimensions["publication_year"].apply(get_year)
df_dimensions["publication_month"] = df_dimensions["publication_date"].apply(get_month)

In [10]:
df_dimensions.drop(columns="publication_date", inplace=True)
df_dimensions = df_dimensions.fillna('')

In [11]:
df_dimensions.head()

Unnamed: 0,publication_id,doi,pmid,pmcid,title,abstract,journal,source_uid,mesh_terms,publication_year,volume,issue,pages,open_access,publication_type,dimensions_url,publication_month
0,pub.1125820003,10.29333/ejgm/7850,,,The Possible Immunological Pathways for the Va...,,Electronic Journal of General Medicine,jour.1371209,,2020,17.0,4.0,,Closed,article,https://app.dimensions.ai/details/publication/...,3
1,pub.1125819565,10.21203/rs.3.rs-18190/v1,,,A Method of Estimating Time-to-Recovery for a ...,,Research Square,jour.1380788,,2020,,,,"All OA; Green, Submitted",preprint,https://app.dimensions.ai/details/publication/...,3
2,pub.1125819661,10.2196/preprints.18821,,,Preparation for the quarantine of the cruise s...,,JMIR Preprints,jour.1345647,,2020,,,,"All OA; Green, Submitted",preprint,https://app.dimensions.ai/details/publication/...,3
3,pub.1125818201,10.26355/eurrev_202003_20551,32196628.0,,Differences and similarities between Severe Ac...,,European review for medical and pharmacologica...,jour.1092032,,2020,24.0,5.0,2781-2783,Closed,article,https://app.dimensions.ai/details/publication/...,3
4,pub.1125818269,10.12116/j.issn.1004-5619.2020.01.001,32198983.0,,From SARS-CoV to SARS-CoV-2: The response and ...,Abstract:,Fa yi xue za zhi,jour.1108124,,2020,36.0,1.0,1-3,Closed,article,https://app.dimensions.ai/details/publication/...,2


In [12]:
df_dimensions[df_dimensions.doi==""].shape

(120, 17)

##### WHO

In [13]:
df_who.head()

Unnamed: 0,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags
0,Disseminated intravascular coagulation in pati...,"Lillicrap, David",,2020,,Journal of thrombosis and haemostasis : JTH,,,,,10.1111/jth.14781,16680,#15962,Lillicrap 2020,,"* Case study/series; Clinical aspects, diagnos..."
1,Chest CT Findings in a Pregnant Patient with 2...,"Liao, Xinggui; Yang, Huan; Kong, Junfeng; Yang...",,2020,,Balkan medical journal,,,,,10.4274/balkanmedj.galenos.2020.2020.3.89,16639,#15947,Liao 2020,,"* Case study/series; Clinical aspects, diagnos..."
2,A novel bacterium-like particle vaccine displa...,"Li, E.; Chi, H.; Huang, P.; Yan, F.; Zhang, Y....",Middle East respiratory syndrome coronavirus (...,2019,,Viruses,11.0,9.0,799,,http://dx.doi.org/10.3390/v11090799,17043,#15503,Li 2019,,* Epidemiological study; Other related disease...
3,Covid-19: Don’t forget the impact on US family...,"Kamerow, Douglas",As covid-19 continues its exponential growth i...,2020,,BMJ,368.0,,m1260-m1260,,10.1136/bmj.m1260,16622,#15923,Kamerow 2020,,"* Opinion piece; Epidemiology; Ethics, social ..."
4,Covid-19: risk factors for severe disease and ...,"Jordan, Rachel E.; Adab, Peymane; Cheng, K. K.",A long list is emerging from largely unadjuste...,2020,,BMJ,368.0,,m1198-m1198,,10.1136/bmj.m1198,16621,#15924,Jordan 2020,,* Epidemiological study; Epidemiology


In [14]:
df_who.columns

Index(['Title', 'Authors', 'Abstract', 'Published Year', 'Published Month',
       'Journal', 'Volume', 'Issue', 'Pages', 'Accession Number', 'DOI', 'Ref',
       'Covidence #', 'Study', 'Notes', 'Tags'],
      dtype='object')

In [15]:
df_who.drop(columns="Authors", inplace=True)

In [16]:
df_who.rename(columns={'Title':'title', 'Abstract':'abstract', 'Published Year':'publication_year', 'Published Month':'publication_month',
       'Journal':'journal', 'Volume':'volume', 'Issue':'issue', 'Pages':'pages', 'Accession Number':'accession_number', 'DOI':'doi', 'Ref':'ref',
       'Covidence #':'covidence', 'Study':'study', 'Notes':'notes', 'Tags':'tags'}, inplace=True)

In [17]:
df_who["pmid"] = ""
df_who["pmcid"] = ""
df_who = df_who.fillna('')

In [18]:
df_who.head()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,accession_number,doi,ref,covidence,study,notes,tags,pmid,pmcid
0,Disseminated intravascular coagulation in pati...,,2020,,Journal of thrombosis and haemostasis : JTH,,,,,10.1111/jth.14781,16680,#15962,Lillicrap 2020,,"* Case study/series; Clinical aspects, diagnos...",,
1,Chest CT Findings in a Pregnant Patient with 2...,,2020,,Balkan medical journal,,,,,10.4274/balkanmedj.galenos.2020.2020.3.89,16639,#15947,Liao 2020,,"* Case study/series; Clinical aspects, diagnos...",,
2,A novel bacterium-like particle vaccine displa...,Middle East respiratory syndrome coronavirus (...,2019,,Viruses,11.0,9.0,799,,http://dx.doi.org/10.3390/v11090799,17043,#15503,Li 2019,,* Epidemiological study; Other related disease...,,
3,Covid-19: Don’t forget the impact on US family...,As covid-19 continues its exponential growth i...,2020,,BMJ,368.0,,m1260-m1260,,10.1136/bmj.m1260,16622,#15923,Kamerow 2020,,"* Opinion piece; Epidemiology; Ethics, social ...",,
4,Covid-19: risk factors for severe disease and ...,A long list is emerging from largely unadjuste...,2020,,BMJ,368.0,,m1198-m1198,,10.1136/bmj.m1198,16621,#15924,Jordan 2020,,* Epidemiological study; Epidemiology,,


In [19]:
df_who[df_who.doi==""].shape

(440, 17)

##### CORD19

In [20]:
df_cord.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,Samuel Minot; Stephen D Turner; Krista L Ternu...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/001727
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,Lin WANG; Xiang Li,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/003889
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/006866
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,Sofia Morfopoulou; Vincent Plagnol,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/007476
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,Stephen P Velsko; Jonathan E Allen,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/010389


In [21]:
# NEW columns (for now, we drop)
df_cord.drop(columns=["cord_uid","url"],inplace=True)

In [22]:
df_cord.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text',
       'full_text_file'],
      dtype='object')

In [23]:
df_cord.drop(columns='authors', inplace=True)
df_cord = df_cord.fillna('')

In [24]:
df_cord.rename(columns={'source_x':'source', 'pubmed_id': 'pmid',
       'Microsoft Academic Paper ID': 'ms_academic_id', 'WHO #Covidence': 'who_covidence'}, inplace=True)

In [25]:
df_cord["publication_year"] = df_cord["publish_time"].apply(get_year)
df_cord["publication_month"] = df_cord["publish_time"].apply(get_month)

In [26]:
df_cord.drop(columns='publish_time', inplace=True)

In [27]:
df_cord['pages'] = ""
df_cord['volume'] = ""
df_cord['issue'] = ""

In [28]:
df_cord.head()

Unnamed: 0,sha,source,title,doi,pmcid,pmid,license,abstract,journal,ms_academic_id,who_covidence,has_full_text,full_text_file,publication_year,publication_month,pages,volume,issue
0,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,,,,True,biorxiv_medrxiv,2014,1,,,
1,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,,,,True,biorxiv_medrxiv,2014,6,,,
2,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,,,,True,biorxiv_medrxiv,2014,7,,,
3,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,,,,True,biorxiv_medrxiv,2014,7,,,
4,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,,,,True,biorxiv_medrxiv,2014,11,,,


In [29]:
df_cord[(df_cord.doi=="") & ((df_cord.sha!="") | (df_cord.pmid!="") | (df_cord.pmcid!=""))].shape

(3035, 18)

In [30]:
df_dimensions.shape

(4797, 17)

In [31]:
df_who.shape

(2912, 17)

In [32]:
df_cord.shape

(45774, 18)

### Prepare tables

In [33]:
# the main table: pub

In [34]:
pub_table_columns = ['title','abstract','publication_year','publication_month','journal','volume','issue','pages','doi','pmid','pmcid']

df_pub = df_dimensions[pub_table_columns].append(df_who[pub_table_columns], ignore_index = True)

In [35]:
df_pub = df_pub[pub_table_columns].append(df_cord[pub_table_columns], ignore_index=True)

In [36]:
df_pub.shape

(53483, 11)

In [37]:
df_pub[(df_pub.doi=="") & (df_pub.pmid=="") & (df_pub.pmcid=="")].shape

(841, 11)

In [38]:
# check to have at least one valid identifier per publication
# we drop publications which do not: hopefully, they will be equipped with an identifier in future releases

df_pub = df_pub[~((df_pub.doi=="") & (df_pub.pmid=="") & (df_pub.pmcid==""))]

In [39]:
# drop duplicates, first on dois then pmids then pmcids. We need this to keep empty values!
df_tmp = df_pub[df_pub.doi==""]
df_pub = df_pub[df_pub.doi!=""].drop_duplicates(['doi'])
df_tmp2 = df_tmp[df_tmp.pmid==""]
df_pub2 = df_tmp[df_tmp.pmid!=""].drop_duplicates(['pmid'])
df_pub3 = df_tmp2[df_tmp2.pmcid!=""].drop_duplicates(['pmcid'])

In [40]:
df_pub = pd.concat([df_pub,df_pub2,df_pub3])

In [41]:
# add PK and reset index
df_pub.reset_index(drop=True,inplace=True)
df_pub["pub_id"] = df_pub.index.values

In [42]:
df_pub.shape

(48428, 12)

In [43]:
df_pub.tail()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,doi,pmid,pmcid,pub_id
48423,eCMAJ's top 10 — May,,2003,7.0,,,,,,,PMC164992,48423
48424,WHO Scientific activities,,1982,,,,,,,,PMC2535994,48424
48425,Chinese government detains doctor who criticis...,,2004,7.0,,,,,,,PMC478255,48425
48426,Viruses and Demyelinating Diseases,,1986,3.0,,,,,,,PMC1680146,48426
48427,The Back Pages,,2003,6.0,,,,,,,PMC1314638,48427


In [44]:
# create other tables via joins

df_datasource = pd.DataFrame.from_dict({"source":["CORD19","Dimensions","WHO"],"url":["https://pages.semanticscholar.org/coronavirus-research","https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255",
"https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov"]})
df_cord_metadata = df_cord[['source','license','full_text_file','ms_academic_id','who_covidence','doi','pmid','pmcid','sha']]
df_who_metadata = df_who[['accession_number', 'doi', 'ref',
       'covidence', 'study', 'notes', 'tags', 'pmid', 'pmcid']]
df_dimensions_metadata = df_dimensions[['publication_id', 'doi', 'pmid', 'pmcid', 'source_uid', 'mesh_terms',
       'open_access', 'publication_type', 'dimensions_url']]

In [45]:
df_datasource.head()

Unnamed: 0,source,url
0,CORD19,https://pages.semanticscholar.org/coronavirus-...
1,Dimensions,https://docs.google.com/spreadsheets/d/1-kTZJZ...
2,WHO,https://www.who.int/emergencies/diseases/novel...


In [46]:
# CORD19 metadata

In [47]:
#We need this to keep empty values!

df_tmp = df_cord_metadata[df_cord_metadata.doi==""]
df_cord_metadata1 = pd.merge(df_cord_metadata[df_cord_metadata.doi!=""], df_pub[['pub_id','doi']],  how='inner', left_on=['doi'], right_on=['doi'])
df_tmp2 = df_tmp[df_tmp.pmid==""]
df_cord_metadata2 = pd.merge(df_tmp[df_tmp.pmid!=""], df_pub[['pub_id','pmid']],  how='inner', left_on=['pmid'], right_on=['pmid'])
df_cord_metadata3 = pd.merge(df_tmp2[df_tmp2.pmcid!=""], df_pub[['pub_id','pmcid']],  how='inner', left_on=['pmcid'], right_on=['pmcid'])

In [48]:
df_cord_metadata1.drop_duplicates("doi",inplace=True)
df_cord_metadata2.drop_duplicates("pmid",inplace=True)
df_cord_metadata3.drop_duplicates("pmcid",inplace=True)

In [49]:
df_cord_metadata1.head()

Unnamed: 0,source,license,full_text_file,ms_academic_id,who_covidence,doi,pmid,pmcid,sha,pub_id
0,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/001727,,,f056da9c64fbf00a4645ae326e8a4339d015d155,5814
1,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/003889,,,daf32e013d325a6feb80e83d15aabc64a48fae33,5815
2,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/006866,,,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,5816
3,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/007476,,,4da8a87e614373d56070ed272487451266dce919,5817
4,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/010389,,,eccef80cfbe078235df22398f195d5db462d8000,5818


In [50]:
df_cord_metadata = pd.concat([df_cord_metadata1,df_cord_metadata2,df_cord_metadata3])

In [51]:
df_cord_metadata.shape

(45471, 10)

In [52]:
# read full texts in
folders = ['biorxiv_medrxiv/biorxiv_medrxiv','comm_use_subset/comm_use_subset','custom_license/custom_license','noncomm_use_subset/noncomm_use_subset']
shas = list()
full_texts = list()

for folder in folders:
    for root, dirs, files in os.walk(os.path.join(cord19_folder,folder)):
        for file in tqdm(files):
            if ".json" in file: # read
                data = json.loads(codecs.open(os.path.join(root,file)).read())
                sha = data["paper_id"]
                full_text = "\n".join(section["text"] for section in data["body_text"])
                shas.append(sha)
                full_texts.append(full_text)

HBox(children=(FloatProgress(value=0.0, max=1053.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9315.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20657.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2350.0), HTML(value='')))




In [53]:
df_cord_fulltext = pd.DataFrame.from_dict({"sha":shas,"full_text":full_texts})

In [54]:
df_cord_metadata = pd.merge(df_cord_metadata, df_cord_fulltext,  how='left', left_on=['sha'], right_on=['sha'])
df_cord_metadata = df_cord_metadata.fillna('')
df_cord_metadata.rename(columns={"id":"pub_id"},inplace=True)

In [55]:
df_cord_metadata.head()

Unnamed: 0,source,license,full_text_file,ms_academic_id,who_covidence,doi,pmid,pmcid,sha,pub_id,full_text
0,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/001727,,,f056da9c64fbf00a4645ae326e8a4339d015d155,5814,There are many different methods that characte...
1,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/003889,,,daf32e013d325a6feb80e83d15aabc64a48fae33,5815,The term metapopulation was coined by Levins [...
2,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/006866,,,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,5816,Antibodies are essential components of the imm...
3,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/007476,,,4da8a87e614373d56070ed272487451266dce919,5817,Metagenomics can be defined as the analysis of...
4,biorxiv,biorxiv,biorxiv_medrxiv,,,10.1101/010389,,,eccef80cfbe078235df22398f195d5db462d8000,5818,: Schematic of transmission links overlaid on ...


In [56]:
# WHO and Dimensions metadata

In [57]:
df_tmp = df_who_metadata[df_who_metadata.doi==""]
df_who_metadata1 = pd.merge(df_who_metadata[df_who_metadata.doi!=""], df_pub[['pub_id','doi']],  how='inner', left_on=['doi'], right_on=['doi'])
df_tmp2 = df_tmp[df_tmp.pmid==""]
df_who_metadata2 = pd.merge(df_tmp[df_tmp.pmid!=""], df_pub[['pub_id','pmid']],  how='inner', left_on=['pmid'], right_on=['pmid'])
df_who_metadata3 = pd.merge(df_tmp2[df_tmp2.pmcid!=""], df_pub[['pub_id','pmcid']],  how='inner', left_on=['pmcid'], right_on=['pmcid'])

In [58]:
df_who_metadata1.drop_duplicates("doi",inplace=True)
df_who_metadata2.drop_duplicates("pmid",inplace=True)
df_who_metadata3.drop_duplicates("pmcid",inplace=True)

Unnamed: 0,accession_number,doi,ref,covidence,study,notes,tags,pmid,pub_id,pmcid


In [59]:
df_who_metadata = pd.concat([df_who_metadata1,df_who_metadata2,df_who_metadata3])

In [60]:
df_who_metadata.shape

(2359, 10)

In [61]:
df_who_metadata.rename(columns={"id":"pub_id"},inplace=True)

In [62]:
df_tmp = df_dimensions_metadata[df_dimensions_metadata.doi==""]
df_dimensions_metadata1 = pd.merge(df_dimensions_metadata[df_dimensions_metadata.doi!=""], df_pub[['pub_id','doi']],  how='inner', left_on=['doi'], right_on=['doi'])
df_tmp2 = df_tmp[df_tmp.pmid==""]
df_dimensions_metadata2 = pd.merge(df_tmp[df_tmp.pmid!=""], df_pub[['pub_id','pmid']],  how='inner', left_on=['pmid'], right_on=['pmid'])
df_dimensions_metadata3 = pd.merge(df_tmp2[df_tmp2.pmcid!=""], df_pub[['pub_id','pmcid']],  how='inner', left_on=['pmcid'], right_on=['pmcid'])

In [63]:
df_dimensions_metadata1.drop_duplicates("doi",inplace=True)
df_dimensions_metadata2.drop_duplicates("pmid",inplace=True)
df_dimensions_metadata3.drop_duplicates("pmcid",inplace=True)

Unnamed: 0,publication_id,doi,pmid,source_uid,mesh_terms,open_access,publication_type,dimensions_url,pub_id,pmcid


In [64]:
df_dimensions_metadata = pd.concat([df_dimensions_metadata1,df_dimensions_metadata2,df_dimensions_metadata3])

In [65]:
df_dimensions_metadata.shape

(4693, 10)

In [66]:
df_dimensions_metadata.rename(columns={"id":"pub_id"},inplace=True)

In [67]:
# Create datasource tables

In [68]:
cord_source_id = df_datasource[df_datasource.source=="CORD19"].index.values[0]
who_source_id = df_datasource[df_datasource.source=="WHO"].index.values[0]
dimensions_source_id = df_datasource[df_datasource.source=="Dimensions"].index.values[0]

In [69]:
df_cord_metadata["source_id"] = cord_source_id
df_who_metadata["source_id"] = who_source_id
df_dimensions_metadata["source_id"] = dimensions_source_id

In [70]:
df_pub_to_datasource = df_cord_metadata[["pub_id","source_id"]]
df_pub_to_datasource = df_pub_to_datasource.append(df_who_metadata[["pub_id","source_id"]],ignore_index=True)
df_pub_to_datasource = df_pub_to_datasource.append(df_dimensions_metadata[["pub_id","source_id"]],ignore_index=True)

In [71]:
df_pub_to_datasource.drop_duplicates(inplace=True)
df_pub_to_datasource.rename(columns={"source_id":"datasource_id"},inplace=True)

In [72]:
df_pub_to_datasource.shape

(52522, 2)

In [73]:
df_pub_to_datasource[df_pub_to_datasource.pub_id==22787]

Unnamed: 0,pub_id,datasource_id
18728,22787,0


In [74]:
# remove unnecessary columns
df_cord_metadata.drop(columns=['doi','pmid','pmcid','source_id'],inplace=True)
df_who_metadata.drop(columns=['doi','pmid','pmcid','source_id'],inplace=True)
df_dimensions_metadata.drop(columns=['doi','pmid','pmcid','source_id'],inplace=True)

In [75]:
# reset all indexes which will become PKs
df_cord_metadata.reset_index(drop=True,inplace=True)
df_who_metadata.reset_index(drop=True,inplace=True)
df_dimensions_metadata.reset_index(drop=True,inplace=True)
df_datasource.reset_index(drop=True,inplace=True)
df_cord_metadata["cord19_metadata_id"] = df_cord_metadata.index.values
df_who_metadata["who_metadata_id"] = df_who_metadata.index.values
df_dimensions_metadata["dimensions_metadata_id"] = df_dimensions_metadata.index.values
df_datasource["datasource_metadata_id"] = df_datasource.index.values

In [76]:
# make numeric where needed
df_pub["publication_year"] = pd.to_numeric(df_pub["publication_year"])
df_pub["publication_month"] = pd.to_numeric(df_pub["publication_month"])
df_pub["pmid"] = pd.to_numeric(df_pub["pmid"])

In [77]:
# add timestamp
df_pub["timestamp"] = pd.Timestamp.now()

In [78]:
# clean-up text (optional)
replaces = [""]

def clean_up(txt):
    for r in replaces:
        txt = txt.replace(r,"")
    return txt.encode('utf8', 'ignore').decode('utf8')
df_pub["abstract"] = [clean_up(a) for a in df_pub["abstract"].values]

In [79]:
df_pub.head()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,doi,pmid,pmcid,pub_id,timestamp
0,The Possible Immunological Pathways for the Va...,,2020.0,3.0,Electronic Journal of General Medicine,17.0,4.0,,10.29333/ejgm/7850,,,0,2020-03-28 08:55:45.059511
1,A Method of Estimating Time-to-Recovery for a ...,,2020.0,3.0,Research Square,,,,10.21203/rs.3.rs-18190/v1,,,1,2020-03-28 08:55:45.059511
2,Preparation for the quarantine of the cruise s...,,2020.0,3.0,JMIR Preprints,,,,10.2196/preprints.18821,,,2,2020-03-28 08:55:45.059511
3,Differences and similarities between Severe Ac...,,2020.0,3.0,European review for medical and pharmacologica...,24.0,5.0,2781-2783,10.26355/eurrev_202003_20551,32196628.0,,3,2020-03-28 08:55:45.059511
4,From SARS-CoV to SARS-CoV-2: The response and ...,Abstract:,2020.0,2.0,Fa yi xue za zhi,36.0,1.0,1-3,10.12116/j.issn.1004-5619.2020.01.001,32198983.0,,4,2020-03-28 08:55:45.059511


In [80]:
# reorder the columns to match the SQL schema

df_datasource.columns

Index(['source', 'url', 'datasource_metadata_id'], dtype='object')

In [81]:
df_pub = df_pub[['pub_id', 'title', 'abstract', 'publication_year', 'publication_month', 'journal',
       'volume', 'issue', 'pages', 'doi', 'pmid', 'pmcid',
       'timestamp']]
df_who_metadata = df_who_metadata[['who_metadata_id', 'accession_number', 'ref', 'covidence', 'study', 'notes', 'tags',
       'pub_id']]
df_dimensions_metadata = df_dimensions_metadata[['dimensions_metadata_id', 'publication_id', 'source_uid', 'open_access',
       'publication_type', 'dimensions_url', 'mesh_terms', 'pub_id']]
df_cord_metadata = df_cord_metadata[[ 'cord19_metadata_id', 'source', 'license', 'full_text_file', 'ms_academic_id',
       'who_covidence', 'sha', 'full_text', 'pub_id']]
df_datasource = df_datasource[['datasource_metadata_id', 'source', 'url']]

### Dump to MySQL

Use this if you want to create a MySQL db.

In [None]:
dtype_dict = {'pub_id':Integer, 'title':String, 'abstract':String, 'publication_year':Integer, 'publication_month':Integer, 'journal':String,
       'volume':String, 'issue':String, 'pages':String, 'doi':String, 'pmid':Integer, 'pmcid':String, 'timestamp':DateTime}

In [None]:
# get API key
import configparser
config = configparser.ConfigParser()
config.read("credentials/conf.ini")
mysql_username = config["MYSQL"]["username"]
mysql_password = config["MYSQL"]["password"]
mysql_database = config["MYSQL"]["database"]

In [None]:
sqlEngine = create_engine('mysql+pymysql://%s:%s@127.0.0.1/%s'%(mysql_username,mysql_password,mysql_database), pool_recycle=3600)
dbConnection = sqlEngine.connect()

In [None]:
# main table
table_name = "pub"
try:
    frame = df_pub.to_sql(table_name, dbConnection, if_exists='append', index=False, index_label="pub_id", dtype=dtype_dict);
except ValueError as vx:
    print(vx)
except Exception as ex:   
    print(ex)
else:
    print("Table %s created successfully."%table_name);   
finally:
    dbConnection.close()

In [None]:
sqlEngine = create_engine('mysql+pymysql://%s:%s@127.0.0.1/%s'%(mysql_username,mysql_password,mysql_database), pool_recycle=3600)
dbConnection = sqlEngine.connect()

In [None]:
# other tables
try:
    frame = df_cord_metadata.to_sql("cord19_metadata", dbConnection, if_exists='append', index=True, index_label="cord19_metadata_id")
    frame = df_who_metadata.to_sql("who_metadata", dbConnection, if_exists='append', index=True, index_label="who_metadata_id")
    frame = df_dimensions_metadata.to_sql("dimensions_metadata", dbConnection, if_exists='append', index=True, index_label="dimensions_metadata_id")
    frame = df_datasource.to_sql("datasource", dbConnection, if_exists='append', index=True, index_label="datasource_id")
except ValueError as vx:
    print(vx)
except Exception as ex:   
    print(ex)
else:
    print("Tables created successfully.");   
finally:
    dbConnection.close()

In [None]:
sqlEngine = create_engine('mysql+pymysql://%s:%s@127.0.0.1/%s'%(mysql_username,mysql_password,mysql_database), pool_recycle=3600)
dbConnection = sqlEngine.connect()

In [None]:
# last table
try:
    frame = df_pub_to_datasource.to_sql("pub_datasource", dbConnection, if_exists='append', index=False, index_label=["pub_id","datasource_id"])
except ValueError as vx:
    print(vx)
except Exception as ex:   
    print(ex)
else:
    print("Table created successfully.");   
finally:
    dbConnection.close()

In [None]:
### Export the df_pub dataframe for further use

df_pub.to_csv("datasets_output/df_pub.csv", compression="gzip")

In [None]:
# export TSV for ingestion

df_pub.to_csv("datasets_output/sql_tables/pub.csv",index=False,sep="\t",header=False)
df_cord_metadata.to_csv("datasets_output/sql_tables/cord19_metadata.csv",index=False,sep="\t",header=False)
df_dimensions_metadata.to_csv("datasets_output/sql_tables/dimensions_metadata.csv",index=False,sep="\t",header=False)
df_who_metadata.to_csv("datasets_output/sql_tables/who_metadata.csv",index=False,sep="\t",header=False)
df_datasource.to_csv("datasets_output/sql_tables/datasource.csv",index=False,sep="\t",header=False)
df_pub_to_datasource.to_csv("datasets_output/sql_tables/pub_datasource.csv",index=False,sep="\t",header=False)