# COVID19-related literature SQL database

In this notebook, we merge several COVID-19-related publication datasets into a single relational database dump (tab-separated table files and, optionally, a MySQL database). The datasets are:

* CORD19: https://pages.semanticscholar.org/coronavirus-research
* Dimensions: https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255
* WHO: https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov

In [1]:
# magics, warnings and imports

%load_ext autoreload
%autoreload 2
import warnings; warnings.simplefilter('ignore')

import os, random, codecs, json
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pymysql
from sqlalchemy import create_engine
from sqlalchemy import Integer,String,Boolean,DateTime

#### Load datasets

In [2]:
# Snapshot locations: point these at the dataset versions you want to ingest.
dimensions_filename = "datasets_input/Dimensions_24_04_2020.csv"
who_filename = "datasets_input/WHO_24_04_2020.csv"
cord19_folder = "datasets_input/CORD19_2020_04_24"

# dtype=str disables pandas' type inference, so every column is loaded as text.
df_dimensions, df_who, df_cord = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (
        dimensions_filename,
        who_filename,
        os.path.join(cord19_folder, "metadata.csv"),
    )
)

In [3]:
df_cord.shape

(57366, 18)

### Prepare dataframes for ingestion

#### Clean-up data frames

##### Dimensions

In [4]:
df_dimensions.head()

Unnamed: 0,Date added,Publication ID,DOI,PMID,PMCID,Title,Abstract,Source title,Source UID,Publisher,...,Research Organizations - standardized,GRID IDs,City of Research organization,Country of Research organization,Funder,UIDs of supporting grants,Times cited,Altmetric,Source Linkout,Dimensions URL
0,2020-04-23,pub.1126872409,10.2139/ssrn.3572849,,,Digestive Symptoms of COVID-19 and High Expres...,,SSRN Electronic Journal,jour.1276748,Elsevier,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...
1,2020-04-23,pub.1126872857,10.2196/preprints.19433,,,Developing a mobile application (iGAM) to prom...,"<sec xmlns=""http://www.ncbi.nlm.nih.gov/JATS1""...",JMIR Preprints,jour.1345647,JMIR Publications,...,,,,,,,0,,http://dx.doi.org/10.2196/preprints.19433,https://app.dimensions.ai/details/publication/...
2,2020-04-23,pub.1126870571,10.14744/ejmi.2020.38479,,,Novel Coronavirus Disease ( COVID-19) Pandemic...,,Eurasian Journal of Medical Investigation,jour.1300430,Kare Publishing,...,,,,,,,0,,,https://app.dimensions.ai/details/publication/...
3,2020-04-23,pub.1126870635,10.1503/cmaj.200609,,,SARS-CoV-2 infection associated with spontaneo...,,Canadian Medical Association Journal,jour.1019308,Joule,...,,,,,,,0,9.0,https://www.cmaj.ca/content/cmaj/early/2020/04...,https://app.dimensions.ai/details/publication/...
4,2020-04-23,pub.1126862773,10.1007/s00259-020-04780-4,,,Key elements of preparedness for pandemic coro...,,European Journal of Nuclear Medicine and Molec...,jour.1297401,Springer Nature,...,,,,,,,0,1.0,https://link.springer.com/content/pdf/10.1007/...,https://app.dimensions.ai/details/publication/...


In [5]:
df_dimensions.columns

Index(['Date added', 'Publication ID', 'DOI', 'PMID', 'PMCID', 'Title',
       'Abstract', 'Source title', 'Source UID', 'Publisher', 'MeSH terms',
       'Publication Date', 'PubYear', 'Volume', 'Issue', 'Pagination',
       'Open Access', 'Publication Type', 'Authors', 'Corresponding Authors',
       'Authors Affiliations', 'Research Organizations - standardized',
       'GRID IDs', 'City of Research organization',
       'Country of Research organization', 'Funder',
       'UIDs of supporting grants', 'Times cited', 'Altmetric',
       'Source Linkout', 'Dimensions URL'],
      dtype='object')

In [6]:
# Dimensions columns that are not carried over into the database tables.
dimensions_unused_columns = [
    'Date added', 'Publisher', 'Authors', 'Corresponding Authors',
    'Authors Affiliations', 'Research Organizations - standardized',
    'GRID IDs', 'City of Research organization',
    'Country of Research organization', 'Funder',
    'UIDs of supporting grants', 'Times cited', 'Altmetric',
    'Source Linkout',
]
df_dimensions = df_dimensions.drop(columns=dimensions_unused_columns)

In [7]:
df_dimensions.columns

Index(['Publication ID', 'DOI', 'PMID', 'PMCID', 'Title', 'Abstract',
       'Source title', 'Source UID', 'MeSH terms', 'Publication Date',
       'PubYear', 'Volume', 'Issue', 'Pagination', 'Open Access',
       'Publication Type', 'Dimensions URL'],
      dtype='object')

In [8]:
# Dimensions export header -> column name used in the database.
dimensions_column_names = {
    'Publication ID': 'dimensions_id',
    'DOI': 'doi',
    'PMID': 'pmid',
    'PMCID': 'pmcid',
    'Title': 'title',
    'Abstract': 'abstract',
    'Source title': 'journal',
    'Source UID': 'source_uid',
    'MeSH terms': 'mesh_terms',
    'Publication Date': 'publication_date',
    'PubYear': 'publication_year',
    'Volume': 'volume',
    'Issue': 'issue',
    'Pagination': 'pages',
    'Open Access': 'open_access',
    'Publication Type': 'publication_type',
    'Dimensions URL': 'dimensions_url',
}
df_dimensions = df_dimensions.rename(columns=dimensions_column_names)

In [9]:
def get_year(date):
    """Return the four-digit year that starts a date string.

    Parameters
    ----------
    date : str
        A date such as "2020-04-23" or "2020". Missing values (None, NaN or
        any other non-string) are accepted and treated as "no year".

    Returns
    -------
    str
        The first four characters of `date` when they are all digits,
        otherwise the empty string.
    """
    # Columns read with dtype=str still hold float NaN for missing cells, and
    # len() would raise TypeError on those.
    if not isinstance(date, str):
        return ""
    candidate = date[:4]
    if len(candidate) == 4 and candidate.isdigit():
        return candidate
    return ""

# Three-letter English month abbreviations mapped to their month number (as text).
month_to_number = {"Jan":"1","Feb":"2","Mar":"3","Apr":"4","May":"5","Jun":"6","Jul":"7","Aug":"8","Sep":"9","Oct":"10","Nov":"11","Dec":"12"}

def get_month(date):
    """Extract the month number (as text, without leading zero) from a date string.

    Two layouts are recognised, and only for strings longer than six characters:
    a dash-separated date whose second field is numeric (e.g. "2020-04-23"),
    and a whitespace-separated date whose second token is a three-letter
    month abbreviation listed in `month_to_number` (e.g. "2020 Apr 23").

    Parameters
    ----------
    date : str
        The date to parse. Missing values (None, NaN or any other non-string)
        are accepted and treated as "no month".

    Returns
    -------
    str
        The month number as a string, or "" when no month can be extracted.
    """
    # Missing cells are float NaN even in dtype=str columns; len() would fail on them.
    if not isinstance(date, str) or len(date) <= 6:
        return ""
    if "-" in date:
        month_field = date.split("-")[1]
        # isdecimal (not isdigit) so that int() below cannot fail on characters
        # such as superscript digits.
        if month_field.isdecimal():
            return str(int(month_field))
    # Fall back to the "<year> <Mon> <day>" layout; unknown or missing tokens give "".
    tokens = date.split()
    if len(tokens) > 1:
        return month_to_number.get(tokens[1], "")
    return ""

def sanitize_string(s):
    """Collapse every run of whitespace in `s` to one space and trim both ends."""
    words = s.split()
    return " ".join(words)

In [10]:
# Normalise the year text and derive the month from the full publication date.
df_dimensions = df_dimensions.assign(
    publication_year=df_dimensions["publication_year"].map(get_year),
    publication_month=df_dimensions["publication_date"].map(get_month),
)

In [11]:
# The date has been reduced to year/month; drop it and turn missing cells into "".
df_dimensions = df_dimensions.drop(columns=["publication_date"]).fillna('')

In [12]:
df_dimensions.head()

Unnamed: 0,dimensions_id,doi,pmid,pmcid,title,abstract,journal,source_uid,mesh_terms,publication_year,volume,issue,pages,open_access,publication_type,dimensions_url,publication_month
0,pub.1126872409,10.2139/ssrn.3572849,,,Digestive Symptoms of COVID-19 and High Expres...,,SSRN Electronic Journal,jour.1276748,,2020,,,,"All OA; Green, Submitted",preprint,https://app.dimensions.ai/details/publication/...,1
1,pub.1126872857,10.2196/preprints.19433,,,Developing a mobile application (iGAM) to prom...,"<sec xmlns=""http://www.ncbi.nlm.nih.gov/JATS1""...",JMIR Preprints,jour.1345647,,2020,,,,"All OA; Green, Submitted",preprint,https://app.dimensions.ai/details/publication/...,4
2,pub.1126870571,10.14744/ejmi.2020.38479,,,Novel Coronavirus Disease ( COVID-19) Pandemic...,,Eurasian Journal of Medical Investigation,jour.1300430,,2020,,,,Closed,article,https://app.dimensions.ai/details/publication/...,1
3,pub.1126870635,10.1503/cmaj.200609,,,SARS-CoV-2 infection associated with spontaneo...,,Canadian Medical Association Journal,jour.1019308,,2020,,,cmaj.200609,All OA; Bronze,article,https://app.dimensions.ai/details/publication/...,4
4,pub.1126862773,10.1007/s00259-020-04780-4,,,Key elements of preparedness for pandemic coro...,,European Journal of Nuclear Medicine and Molec...,jour.1297401,,2020,,,,All OA; Bronze,article,https://app.dimensions.ai/details/publication/...,4


In [13]:
df_dimensions[df_dimensions.doi==""].shape

(710, 17)

##### WHO

In [14]:
df_who.head()

Unnamed: 0,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags
0,Analysis of the susceptibility to COVID-19 in ...,"Zhao, Xiaoxuan; Jiang, Yuepeng; Zhao, Yang; Xi...",To analyze the susceptibility of SARS-CoV-2 in...,2020.0,,European Journal of Clinical Microbiology & In...,,,1-12,,10.1007/S10096-020-03897-6,46248,#109007,Zhao 2020,,
1,[Clinical Characteristics and Coping Strategie...,"Zhao, Nannan; Shi, Jie; Zeng, Lizhong; Yang, S...","Since mid-December 2019, severe acute respirat...",2020.0,,Zhongguo Fei Ai Za Zhi,,,,32316713.0,10.3779/j.issn.1009-3419.2020.102.15,46703,#109649,Zhao 2020,,
2,Anti-SARS-CoV-2 virus antibody levels in conva...,"Zhang, Libo; Pang, Rongrong; Xue, Xiang; Bao, ...",BACKGROUND: Anti-SARS-CoV-2 virus antibody lev...,2020.0,,Aging,12.0,,,,10.18632/aging.103102,46245,#109004,Zhang 2020,,
3,Positive rectal swabs in young patients recove...,"Zhang, Bin; Liu, Shuyi; Dong, Yuhao; Zhang, Lu...",ObjectivesTo investigate the widely concerned ...,,,Journal of Infection,,,,,10.1016/j.jinf.2020.04.023,46454,#108733,,,
4,Does high cardiorespiratory fitness confer som...,"Zbinden-Foncea, Hermann; Francaux, Marc; Deldi...",Abstract Severe acute respiratory syndrome cor...,2020.0,,Obesity,,,,,10.1002/oby.22849,46243,#109002,Zbinden-Foncea 2020,,


In [15]:
df_who.columns

Index(['Title', 'Authors', 'Abstract', 'Published Year', 'Published Month',
       'Journal', 'Volume', 'Issue', 'Pages', 'Accession Number', 'DOI', 'Ref',
       'Covidence #', 'Study', 'Notes', 'Tags'],
      dtype='object')

In [16]:
# Author names are not stored in the database.
df_who = df_who.drop(columns=["Authors"])

In [17]:
# WHO export header -> column name used in the database.
who_column_names = {
    'Title': 'title',
    'Abstract': 'abstract',
    'Published Year': 'publication_year',
    'Published Month': 'publication_month',
    'Journal': 'journal',
    'Volume': 'volume',
    'Issue': 'issue',
    'Pages': 'pages',
    'Accession Number': 'accession_number',
    'DOI': 'doi',
    'Ref': 'ref',
    'Covidence #': 'covidence',
    'Study': 'study',
    'Notes': 'notes',
    'Tags': 'tags',
}
df_who = df_who.rename(columns=who_column_names)

In [18]:
# Add empty pmid / pmcid / dimensions_id columns (needed later when the pub
# table columns are selected from this frame), then replace missing cells with "".
df_who = df_who.assign(pmid="", pmcid="", dimensions_id="").fillna('')

In [19]:
df_who.head()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,accession_number,doi,ref,covidence,study,notes,tags,pmid,pmcid,dimensions_id
0,Analysis of the susceptibility to COVID-19 in ...,To analyze the susceptibility of SARS-CoV-2 in...,2020.0,,European Journal of Clinical Microbiology & In...,,,1-12,,10.1007/S10096-020-03897-6,46248,#109007,Zhao 2020,,,,,
1,[Clinical Characteristics and Coping Strategie...,"Since mid-December 2019, severe acute respirat...",2020.0,,Zhongguo Fei Ai Za Zhi,,,,32316713.0,10.3779/j.issn.1009-3419.2020.102.15,46703,#109649,Zhao 2020,,,,,
2,Anti-SARS-CoV-2 virus antibody levels in conva...,BACKGROUND: Anti-SARS-CoV-2 virus antibody lev...,2020.0,,Aging,12.0,,,,10.18632/aging.103102,46245,#109004,Zhang 2020,,,,,
3,Positive rectal swabs in young patients recove...,ObjectivesTo investigate the widely concerned ...,,,Journal of Infection,,,,,10.1016/j.jinf.2020.04.023,46454,#108733,,,,,,
4,Does high cardiorespiratory fitness confer som...,Abstract Severe acute respiratory syndrome cor...,2020.0,,Obesity,,,,,10.1002/oby.22849,46243,#109002,Zbinden-Foncea 2020,,,,,


In [20]:
df_who[df_who.doi==""].shape

(929, 18)

##### CORD19

In [21]:
df_cord.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...
2,wzj2glte,00b1d99e70f779eb4ede50059db469c65e8c1469,PMC,Synthesis of a novel hepatitis C virus protein...,10.1093/emboj/20.14.3840,PMC125543,11447125,no-cc,Hepatitis C virus (HCV) is an important human ...,2001-07-16,"Xu, Zhenming; Choi, Jinah; Yen, T.S.Benedict; ...",EMBO J,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,2sfqsfm1,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,PMC,Structure of coronavirus main proteinase revea...,10.1093/emboj/cdf327,PMC126080,12093723,unk,The key enzyme in coronavirus polyprotein proc...,2002-07-01,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc126080?pdf=re...
4,i0zym7iq,dde02f11923815e6a16a31dd6298c46b109c5dfa,PMC,Discontinuous and non-discontinuous subgenomic...,10.1093/emboj/cdf635,PMC136939,12456663,unk,"Arteri-, corona-, toro- and roniviruses are ev...",2002-12-01,"van Vliet, A.L.W.; Smits, S.L.; Rottier, P.J.M...",The EMBO Journal,,,True,True,custom_license,http://europepmc.org/articles/pmc136939?pdf=re...


In [22]:
# Columns that are new in this CORD19 release; they are dropped for now.
cord_new_columns = ["cord_uid", "url", "has_pmc_xml_parse"]
df_cord = df_cord.drop(columns=cord_new_columns)

In [23]:
df_cord.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_pdf_parse',
       'full_text_file'],
      dtype='object')

In [24]:
# Author names are not stored; missing cells become empty strings.
df_cord = df_cord.drop(columns=['authors']).fillna('')

In [25]:
# CORD19 metadata header -> column name used in the database.
cord_column_names = {
    'source_x': 'source',
    'pubmed_id': 'pmid',
    'Microsoft Academic Paper ID': 'ms_academic_id',
    'WHO #Covidence': 'who_covidence',
    'has_pdf_parse': 'has_full_text',
}
df_cord = df_cord.rename(columns=cord_column_names)

In [26]:
# Split publish_time into the year and month columns used by the pub table.
df_cord = df_cord.assign(
    publication_year=df_cord["publish_time"].map(get_year),
    publication_month=df_cord["publish_time"].map(get_month),
)

In [27]:
# publish_time is now redundant with publication_year / publication_month.
df_cord = df_cord.drop(columns=['publish_time'])

In [28]:
# Add empty pages / volume / issue / dimensions_id columns (needed later when
# the pub table columns are selected from this frame).
df_cord = df_cord.assign(pages="", volume="", issue="", dimensions_id="")

In [29]:
df_cord.head()

Unnamed: 0,sha,source,title,doi,pmcid,pmid,license,abstract,journal,ms_academic_id,who_covidence,has_full_text,full_text_file,publication_year,publication_month,pages,volume,issue,dimensions_id
0,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998,unk,Nidovirus subgenomic mRNAs contain a leader se...,The EMBO Journal,,,True,custom_license,2001,12,,,,
1,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704,unk,CEACAM1 is a member of the carcinoembryonic an...,The EMBO Journal,,,True,custom_license,2002,5,,,,
2,00b1d99e70f779eb4ede50059db469c65e8c1469,PMC,Synthesis of a novel hepatitis C virus protein...,10.1093/emboj/20.14.3840,PMC125543,11447125,no-cc,Hepatitis C virus (HCV) is an important human ...,EMBO J,,,True,custom_license,2001,7,,,,
3,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,PMC,Structure of coronavirus main proteinase revea...,10.1093/emboj/cdf327,PMC126080,12093723,unk,The key enzyme in coronavirus polyprotein proc...,The EMBO Journal,,,True,custom_license,2002,7,,,,
4,dde02f11923815e6a16a31dd6298c46b109c5dfa,PMC,Discontinuous and non-discontinuous subgenomic...,10.1093/emboj/cdf635,PMC136939,12456663,unk,"Arteri-, corona-, toro- and roniviruses are ev...",The EMBO Journal,,,True,custom_license,2002,12,,,,


In [30]:
df_cord[(df_cord.doi=="") & ((df_cord.sha!="") | (df_cord.pmid!="") | (df_cord.pmcid!=""))].shape

(3043, 19)

In [31]:
df_cord[(df_cord.doi=="") & (df_cord.pmid=="") & (df_cord.pmcid=="")].shape

(306, 19)

In [32]:
df_dimensions.shape

(17208, 17)

In [33]:
df_who.shape

(9940, 18)

In [34]:
df_cord.shape

(57366, 19)

In [35]:
df_cord[df_cord.doi=="0.1126/science.abb7331"]

Unnamed: 0,sha,source,title,doi,pmcid,pmid,license,abstract,journal,ms_academic_id,who_covidence,has_full_text,full_text_file,publication_year,publication_month,pages,volume,issue,dimensions_id
37774,,WHO,‘A ticking time bomb’: Scientists worry about ...,0.1126/science.abb7331,,,unk,"CAPE TOWN, SOUTH AFRICA—Late on Sunday evening...",Science,,#8463,False,,2020,,,,,


### Prepare tables

In [36]:
# the main table: pub

In [37]:
# Columns of the main `pub` table; each source frame exposes all of them.
pub_table_columns = ['title','abstract','publication_year','publication_month','journal','volume','issue','pages','doi','pmid','pmcid','dimensions_id']

# Stack Dimensions on top of WHO. pd.concat replaces DataFrame.append, which
# was deprecated in pandas 1.4 and removed in pandas 2.0.
df_pub = pd.concat(
    [df_dimensions[pub_table_columns], df_who[pub_table_columns]],
    ignore_index=True,
)

In [38]:
# Add the CORD19 rows below the Dimensions and WHO rows. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df_pub = pd.concat(
    [df_pub[pub_table_columns], df_cord[pub_table_columns]],
    ignore_index=True,
)

In [39]:
# Collapse whitespace in the free-text columns.
for text_column in ("title", "abstract"):
    df_pub[text_column] = df_pub[text_column].apply(sanitize_string)

# Lower-case every identifier so that the same identifier compares equal across sources.
for id_column in ("doi", "pmid", "pmcid", "dimensions_id"):
    df_pub[id_column] = df_pub[id_column].apply(str.lower)

In [40]:
df_pub.shape

(84514, 12)

In [41]:
df_pub[(df_pub.doi=="") & (df_pub.pmid=="") & (df_pub.pmcid=="") & (df_pub.dimensions_id=="")].shape

(1235, 12)

In [42]:
# Keep only publications that carry at least one identifier (DOI, PMID, PMCID
# or Dimensions ID). The others are dropped: hopefully, they will be equipped
# with an identifier in future releases.
has_identifier = (
    (df_pub.doi != "")
    | (df_pub.pmid != "")
    | (df_pub.pmcid != "")
    | (df_pub.dimensions_id != "")
)
df_pub = df_pub[has_identifier]

In [43]:
df_pub[df_pub.doi=="0.1126/science.abb7331"]

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,doi,pmid,pmcid,dimensions_id
24898,‘A ticking time bomb’: Scientists worry about ...,"CAPE TOWN, SOUTH AFRICA—Late on Sunday evening...",2020,,Science,,,,0.1126/science.abb7331,,,
64922,‘A ticking time bomb’: Scientists worry about ...,"CAPE TOWN, SOUTH AFRICA—Late on Sunday evening...",2020,,Science,,,,0.1126/science.abb7331,,,


In [44]:
# De-duplicate in identifier priority order: DOI, then PMID, then PMCID, then
# Dimensions ID. At each level only rows with a non-empty key are grouped, so
# "" never becomes a group key; rows whose key is empty fall through to the
# next identifier.

# Level 1: rows that have a DOI.
df_pub1 = df_pub[df_pub.doi != ""].groupby('doi').first().reset_index()
df_tmp = df_pub[df_pub.doi == ""]

# Level 2: no DOI, but a PMID.
df_pub2 = df_tmp[df_tmp.pmid != ""].groupby('pmid').first().reset_index()
df_tmp2 = df_tmp[df_tmp.pmid == ""]

# Level 3: no DOI and no PMID, but a PMCID.
df_pub3 = df_tmp2[df_tmp2.pmcid != ""].groupby('pmcid').first().reset_index()
df_tmp3 = df_tmp2[df_tmp2.pmcid == ""]

# Level 4: only a Dimensions ID is left.
df_pub4 = df_tmp3[df_tmp3.dimensions_id != ""].groupby('dimensions_id').first().reset_index()

In [45]:
df_pub1[df_pub1.doi=="0.1126/science.abb7331"]

Unnamed: 0,doi,title,abstract,publication_year,publication_month,journal,volume,issue,pages,pmid,pmcid,dimensions_id
0,0.1126/science.abb7331,‘A ticking time bomb’: Scientists worry about ...,"CAPE TOWN, SOUTH AFRICA—Late on Sunday evening...",2020,,Science,,,,,,


In [46]:
df_pub = pd.concat([df_pub1,df_pub2,df_pub3,df_pub4])

In [47]:
# Renumber the rows 0..n-1 and expose that number as the primary key column.
df_pub = df_pub.reset_index(drop=True)
df_pub["pub_id"] = df_pub.index.values

In [48]:
df_pub.shape

(69969, 13)

In [49]:
# create other tables via joins

# Lookup table of the three data sources and their landing pages.
df_datasource = pd.DataFrame.from_dict({"source":["CORD19","Dimensions","WHO"],"url":["https://pages.semanticscholar.org/coronavirus-research","https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255",
"https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov"]})

# Per-source metadata: the source-specific columns plus the identifiers needed
# to join back to df_pub. The explicit .copy() matters because columns of these
# frames are assigned to afterwards; writing into a bare column selection is
# chained assignment (SettingWithCopyWarning, hidden here by the global
# warnings filter).
df_cord_metadata = df_cord[['source','license','full_text_file','ms_academic_id','who_covidence','doi','pmid','pmcid','sha']].copy()
df_who_metadata = df_who[['accession_number', 'doi', 'ref',
       'covidence', 'study', 'notes', 'tags', 'pmid', 'pmcid']].copy()
df_dimensions_metadata = df_dimensions[['dimensions_id', 'doi', 'pmid', 'pmcid', 'source_uid', 'mesh_terms',
       'open_access', 'publication_type', 'dimensions_url']].copy()

In [50]:
# Lower-case the identifiers used as join keys so that they compare equal to
# the lower-cased identifiers stored in df_pub.
for metadata_frame in (df_cord_metadata, df_who_metadata, df_dimensions_metadata):
    for id_column in ("doi", "pmid", "pmcid"):
        metadata_frame[id_column] = metadata_frame[id_column].apply(str.lower)

# dimensions_id is the key of the Dimensions join and is lower-cased in df_pub,
# so it has to be normalised the same way here or mixed-case ids would not match.
df_dimensions_metadata["dimensions_id"] = df_dimensions_metadata["dimensions_id"].apply(str.lower)

In [51]:
df_datasource.head()

Unnamed: 0,source,url
0,CORD19,https://pages.semanticscholar.org/coronavirus-...
1,Dimensions,https://docs.google.com/spreadsheets/d/1-kTZJZ...
2,WHO,https://www.who.int/emergencies/diseases/novel...


In [52]:
# CORD19 metadata

In [53]:
df_cord_metadata.shape

(57366, 9)

In [54]:
df_pub[df_pub.doi=="0.1126/science.abb7331"]

Unnamed: 0,doi,title,abstract,publication_year,publication_month,journal,volume,issue,pages,pmid,pmcid,dimensions_id,pub_id
0,0.1126/science.abb7331,‘A ticking time bomb’: Scientists worry about ...,"CAPE TOWN, SOUTH AFRICA—Late on Sunday evening...",2020,,Science,,,,,,,0


In [55]:
# Attach pub_id to the CORD19 metadata, trying DOI first, then PMID, then PMCID.
# Each join only uses rows whose key is non-empty, so an empty identifier is
# never matched against another empty identifier.
pub_keys = df_pub[['pub_id', 'doi', 'pmid', 'pmcid']]

df_tmp = df_cord_metadata[df_cord_metadata.doi == ""]   # no DOI
df_tmp2 = df_tmp[df_tmp.pmid == ""]                     # no DOI and no PMID

df_cord_metadata1 = df_cord_metadata[df_cord_metadata.doi != ""].merge(
    pub_keys[['pub_id', 'doi']], how='inner', on='doi')
df_cord_metadata2 = df_tmp[df_tmp.pmid != ""].merge(
    pub_keys[['pub_id', 'pmid']], how='inner', on='pmid')
df_cord_metadata3 = df_tmp2[df_tmp2.pmcid != ""].merge(
    pub_keys[['pub_id', 'pmcid']], how='inner', on='pmcid')

In [56]:
# Keep one metadata row per identifier (the first one encountered).
df_cord_metadata1 = df_cord_metadata1.groupby("doi").first().reset_index()
df_cord_metadata2 = df_cord_metadata2.groupby("pmid").first().reset_index()
df_cord_metadata3 = df_cord_metadata3.groupby("pmcid").first().reset_index()

In [57]:
df_cord_metadata = pd.concat([df_cord_metadata1,df_cord_metadata2,df_cord_metadata3])

In [58]:
df_cord_metadata.shape

(57041, 10)

In [59]:
# Read the parsed full texts: one JSON file per paper, in four sub-folders of
# the CORD19 release. For each file we keep its paper_id (stored in `shas`) and
# the body text sections joined into one whitespace-normalised string.
folders = ['biorxiv_medrxiv/pdf_json','comm_use_subset/pdf_json','custom_license/pdf_json','noncomm_use_subset/pdf_json']
shas = list()
full_texts = list()

for folder in folders:
    for root, dirs, files in os.walk(os.path.join(cord19_folder,folder)):
        for file in tqdm(files):
            # Match on the extension, not on a substring of the file name.
            if not file.endswith(".json"):
                continue
            # The with-block closes the handle (the previous codecs.open(...).read()
            # never did); the explicit encoding avoids decoding with the
            # platform's locale default.
            with open(os.path.join(root, file), encoding="utf-8") as json_file:
                data = json.load(json_file)
            sha = data["paper_id"]
            full_text = " ".join(sanitize_string(section["text"]) for section in data["body_text"])
            shas.append(sha)
            full_texts.append(full_text)

HBox(children=(FloatProgress(value=0.0, max=2278.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9769.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=31376.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2518.0), HTML(value='')))




In [60]:
# One row per parsed JSON file: paper id and its concatenated body text.
df_cord_fulltext = pd.DataFrame({"sha": shas, "full_text": full_texts})

In [61]:
df_cord_fulltext.shape

(45941, 2)

In [62]:
# Left-join the full texts on sha so that metadata rows without a parsed file
# are kept; their full_text becomes "" through fillna.
# NOTE(review): a sha that occurs more than once in df_cord_fulltext would
# duplicate the matching metadata row - confirm paper ids are unique across folders.
df_cord_metadata = (
    df_cord_metadata
    .merge(df_cord_fulltext, how='left', on='sha')
    .fillna('')
    # Has no effect unless an "id" column is present.
    .rename(columns={"id":"pub_id"})
)

In [63]:
df_cord_metadata.head()

Unnamed: 0,doi,source,license,full_text_file,ms_academic_id,who_covidence,pmid,pmcid,sha,pub_id,full_text
0,0.1126/science.abb7331,WHO,unk,,,#8463,,,,0,
1,10.0376/cma.j.issn.0376-2491.2020.0002,WHO,unk,,3003451419.0,#615,32036640.0,,,1,
2,10.1001/archinte.168.22.2489,PMC,unk,,,,19064834.0,pmc2783624,,2,
3,10.1001/jama.2010.675,PMC,unk,,,,20501927.0,pmc2968755,,3,
4,10.1001/jama.2014.2116,PMC,unk,,,,24566924.0,pmc6689404,,4,


In [64]:
# WHO and Dimensions metadata

In [65]:
# Attach pub_id to the WHO metadata, trying DOI first, then PMID, then PMCID.
# Each join only uses rows whose key is non-empty.
pub_keys = df_pub[['pub_id', 'doi', 'pmid', 'pmcid']]

df_tmp = df_who_metadata[df_who_metadata.doi == ""]   # no DOI
df_tmp2 = df_tmp[df_tmp.pmid == ""]                   # no DOI and no PMID

df_who_metadata1 = df_who_metadata[df_who_metadata.doi != ""].merge(
    pub_keys[['pub_id', 'doi']], how='inner', on='doi')
df_who_metadata2 = df_tmp[df_tmp.pmid != ""].merge(
    pub_keys[['pub_id', 'pmid']], how='inner', on='pmid')
df_who_metadata3 = df_tmp2[df_tmp2.pmcid != ""].merge(
    pub_keys[['pub_id', 'pmcid']], how='inner', on='pmcid')

In [66]:
# Keep one metadata row per identifier (the first one encountered).
df_who_metadata1 = df_who_metadata1.groupby("doi").first().reset_index()
df_who_metadata2 = df_who_metadata2.groupby("pmid").first().reset_index()
df_who_metadata3 = df_who_metadata3.groupby("pmcid").first().reset_index()

In [67]:
df_who_metadata = pd.concat([df_who_metadata1,df_who_metadata2,df_who_metadata3])

In [68]:
df_who_metadata.shape

(8609, 10)

In [69]:
df_who_metadata.rename(columns={"id":"pub_id"},inplace=True)

In [70]:
# Dimensions metadata is joined to df_pub on dimensions_id only; rows with an
# empty id are set aside in df_tmp and take no part in the join.
has_dimensions_id = df_dimensions_metadata.dimensions_id != ""
df_tmp = df_dimensions_metadata[~has_dimensions_id]
df_dimensions_metadata1 = df_dimensions_metadata[has_dimensions_id].merge(
    df_pub[['pub_id', 'dimensions_id']], how='inner', on='dimensions_id')

In [71]:
# Keep one metadata row per Dimensions id (the first one encountered).
df_dimensions_metadata1 = df_dimensions_metadata1.groupby("dimensions_id").first().reset_index()

In [72]:
df_dimensions_metadata = pd.concat([df_dimensions_metadata1])

In [73]:
df_dimensions_metadata.shape

(17128, 10)

In [74]:
df_dimensions_metadata.rename(columns={"id":"pub_id"},inplace=True)

In [75]:
# Create datasource tables

In [76]:
# The row index of each source in df_datasource is used as its id.
source_names = df_datasource.source
cord_source_id = df_datasource.loc[source_names == "CORD19"].index.values[0]
who_source_id = df_datasource.loc[source_names == "WHO"].index.values[0]
dimensions_source_id = df_datasource.loc[source_names == "Dimensions"].index.values[0]

In [77]:
df_cord_metadata["source_id"] = cord_source_id
df_who_metadata["source_id"] = who_source_id
df_dimensions_metadata["source_id"] = dimensions_source_id

In [78]:
# Link table between publications and data sources: the (pub_id, source_id)
# pairs of the three metadata frames stacked in the order CORD19, WHO,
# Dimensions. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_pub_to_datasource = pd.concat(
    [
        metadata_frame[["pub_id", "source_id"]]
        for metadata_frame in (df_cord_metadata, df_who_metadata, df_dimensions_metadata)
    ],
    ignore_index=True,
)

In [79]:
# One row per (publication, source) pair; the column takes its final name.
df_pub_to_datasource = (
    df_pub_to_datasource
    .drop_duplicates()
    .rename(columns={"source_id":"datasource_id"})
)

In [80]:
df_pub_to_datasource.shape

(82761, 2)

In [81]:
df_pub_to_datasource[df_pub_to_datasource.pub_id==22787]

Unnamed: 0,pub_id,datasource_id
68703,22787,1


In [82]:
# The identifiers and source_id were only needed for the joins above; the
# metadata tables refer to the publication through pub_id.
join_only_columns = ['doi','pmid','pmcid','source_id']
for metadata_frame in (df_cord_metadata, df_who_metadata, df_dimensions_metadata):
    metadata_frame.drop(columns=join_only_columns, inplace=True)

In [83]:
# Renumber every table 0..n-1 and store that number as its primary key column.
for table_frame, pk_column in (
    (df_cord_metadata, "cord19_metadata_id"),
    (df_who_metadata, "who_metadata_id"),
    (df_dimensions_metadata, "dimensions_metadata_id"),
    (df_datasource, "datasource_metadata_id"),
):
    table_frame.reset_index(drop=True, inplace=True)
    table_frame[pk_column] = table_frame.index.values

In [84]:
# Convert the columns that hold numbers from text to numeric values.
for numeric_column in ("publication_year", "publication_month", "pmid"):
    df_pub[numeric_column] = pd.to_numeric(df_pub[numeric_column])

In [85]:
# add timestamp
df_pub["timestamp"] = pd.Timestamp.now()

In [86]:
# clean-up text (optional)
# Substrings to strip from the text.
# NOTE(review): the only entry is an empty string, for which str.replace is a
# no-op - possibly a special character was lost here; confirm the intended value.
replaces = [""]

def clean_up(txt):
    """Remove every entry of `replaces` from `txt` and drop characters that
    cannot be encoded as UTF-8."""
    cleaned = txt
    for unwanted in replaces:
        cleaned = cleaned.replace(unwanted, "")
    # The encode/decode round trip with 'ignore' discards unencodable characters.
    utf8_bytes = cleaned.encode('utf8', 'ignore')
    return utf8_bytes.decode('utf8')
df_pub["abstract"] = [clean_up(a) for a in df_pub["abstract"].values]

In [87]:
df_pub.head()

Unnamed: 0,doi,title,abstract,publication_year,publication_month,journal,volume,issue,pages,pmid,pmcid,dimensions_id,pub_id,timestamp
0,0.1126/science.abb7331,‘A ticking time bomb’: Scientists worry about ...,"CAPE TOWN, SOUTH AFRICA—Late on Sunday evening...",2020.0,,Science,,,,,,,0,2020-04-25 09:08:51.611715
1,10.0376/cma.j.issn.0376-2491.2020.0002,[Ten hot issues of breast cancer under the nov...,,2020.0,2.0,Chinese medical journal,100.0,0.0,e002,32036640.0,,pub.1124777091,1,2020-04-25 09:08:51.611715
2,10.1001/archinte.168.22.2489,Another Piece of the Puzzle: Human Metapneumov...,BACKGROUND: Each winter respiratory viruses ac...,2008.0,12.0,Archives of Internal Medicine,,,,19064834.0,pmc2783624,,2,2020-04-25 09:08:51.611715
3,10.1001/jama.2010.675,Viral etiology of severe pneumonia among Kenya...,CONTEXT: Pneumonia is the leading cause of chi...,2010.0,5.0,JAMA,,,,20501927.0,pmc2968755,,3,2020-04-25 09:08:51.611715
4,10.1001/jama.2014.2116,Critically Ill Patients With Influenza A(H1N1)...,,2014.0,4.0,JAMA,,,,24566924.0,pmc6689404,,4,2020-04-25 09:08:51.611715


In [88]:
# reorder the columns to match the SQL schema

df_datasource.columns

Index(['source', 'url', 'datasource_metadata_id'], dtype='object')

In [89]:
# Put the columns of every table in the order of the SQL schema (the TSV files
# are written without a header, so position is all the loader has to go by).
df_pub = df_pub.loc[:, [
    'pub_id', 'title', 'abstract', 'publication_year', 'publication_month',
    'journal', 'volume', 'issue', 'pages', 'doi', 'pmid', 'pmcid',
    'dimensions_id', 'timestamp',
]]
df_who_metadata = df_who_metadata.loc[:, [
    'who_metadata_id', 'accession_number', 'ref', 'covidence', 'study',
    'notes', 'tags', 'pub_id',
]]
df_dimensions_metadata = df_dimensions_metadata.loc[:, [
    'dimensions_metadata_id', 'dimensions_id', 'source_uid', 'open_access',
    'publication_type', 'dimensions_url', 'mesh_terms', 'pub_id',
]]
df_cord_metadata = df_cord_metadata.loc[:, [
    'cord19_metadata_id', 'source', 'license', 'full_text_file',
    'ms_academic_id', 'who_covidence', 'sha', 'full_text', 'pub_id',
]]
df_datasource = df_datasource.loc[:, ['datasource_metadata_id', 'source', 'url']]

In [90]:
df_pub.doi.value_counts()

                                      3733
10.1016/s1294-5501(07)88761-9            1
10.1016/j.dsx.2020.04.028                1
10.1016/j.diagmicrobio.2020.114988       1
10.1136/vr.m1233                         1
                                      ... 
10.1111/irv.12391                        1
10.1007/978-0-85729-883-6_8              1
10.1016/b978-1-4160-3949-5.50084-4       1
10.1007/978-3-319-21596-9_1              1
10.1111/j.1469-0691.2011.03672.x         1
Name: doi, Length: 66237, dtype: int64

In [91]:
df_pub[df_pub.doi == "10.1016/s0140-6736(20)30607-3"].doi.to_string()

'26972    10.1016/s0140-6736(20)30607-3'

### Dump to CSV

In [92]:
### Export the df_pub dataframe for further use

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("datasets_output", exist_ok=True)
# NOTE(review): the file is gzip-compressed although its name ends in ".csv";
# readers must pass compression="gzip" explicitly because it cannot be
# inferred from the extension.
df_pub.to_csv("datasets_output/df_pub.csv", compression="gzip", index=False)

In [93]:
# Export one tab-separated, header-less file per SQL table for ingestion.
sql_tables_dir = "datasets_output/sql_tables"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(sql_tables_dir, exist_ok=True)

for tsv_name, table_frame in (
    ("pub.csv", df_pub),
    ("cord19_metadata.csv", df_cord_metadata),
    ("dimensions_metadata.csv", df_dimensions_metadata),
    ("who_metadata.csv", df_who_metadata),
    ("datasource.csv", df_datasource),
    ("pub_datasource.csv", df_pub_to_datasource),
):
    table_frame.to_csv(sql_tables_dir + "/" + tsv_name, index=False, sep="\t", header=False)

### Dump to MySQL

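Run the cells below only if you want to load the tables into a MySQL database. They read the connection credentials from `credentials/conf.ini` and connect to a server on `127.0.0.1`.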

In [None]:
# SQL column types handed to DataFrame.to_sql for the pub table.
# NOTE(review): 'dimensions_id' has no entry, so its type is left to pandas'
# inference; also, a String without a length cannot be used to CREATE a
# VARCHAR column on MySQL - confirm the table already exists with the intended schema.
dtype_dict = {
    'pub_id': Integer,
    'title': String,
    'abstract': String,
    'publication_year': Integer,
    'publication_month': Integer,
    'journal': String,
    'volume': String,
    'issue': String,
    'pages': String,
    'doi': String,
    'pmid': Integer,
    'pmcid': String,
    'timestamp': DateTime,
}

In [None]:
# Read the MySQL credentials from the local configuration file.
import configparser
config = configparser.ConfigParser()
config_path = "credentials/conf.ini"
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did read; fail here with a clear message instead of a KeyError below.
if not config.read(config_path):
    raise FileNotFoundError("Cannot read MySQL credentials file: %s" % config_path)
mysql_username = config["MYSQL"]["username"]
mysql_password = config["MYSQL"]["password"]
mysql_database = config["MYSQL"]["database"]

In [None]:
# Percent-encode the credentials: characters such as "@", ":", "/" or "%" in a
# user name or password would otherwise corrupt the connection URL.
from urllib.parse import quote_plus
sqlEngine = create_engine(
    'mysql+pymysql://%s:%s@127.0.0.1/%s' % (quote_plus(mysql_username), quote_plus(mysql_password), mysql_database),
    pool_recycle=3600,
)
dbConnection = sqlEngine.connect()

In [None]:
# Append the publications to the main `pub` table. Errors are reported, not
# raised, and the connection is closed in every case.
table_name = "pub"
try:
    frame = df_pub.to_sql(
        table_name,
        dbConnection,
        if_exists='append',
        index=False,
        index_label="pub_id",
        dtype=dtype_dict,
    )
except Exception as ex:  # also covers the ValueError raised by to_sql
    print(ex)
else:
    print("Table %s created successfully."%table_name)
finally:
    dbConnection.close()

In [None]:
# Percent-encode the credentials: characters such as "@", ":", "/" or "%" in a
# user name or password would otherwise corrupt the connection URL.
from urllib.parse import quote_plus
sqlEngine = create_engine(
    'mysql+pymysql://%s:%s@127.0.0.1/%s' % (quote_plus(mysql_username), quote_plus(mysql_password), mysql_database),
    pool_recycle=3600,
)
dbConnection = sqlEngine.connect()

In [None]:
# Append the metadata and datasource tables.
# Their primary keys (cord19_metadata_id, who_metadata_id,
# dimensions_metadata_id, datasource_metadata_id) are already regular columns
# of the frames, so the index must not be written as well: index=True with an
# index_label equal to an existing column makes to_sql fail with
# "duplicate name in index/columns", and for `datasource` it would add an
# extra `datasource_id` column that the TSV export does not contain.
metadata_tables = (
    ("cord19_metadata", df_cord_metadata),
    ("who_metadata", df_who_metadata),
    ("dimensions_metadata", df_dimensions_metadata),
    ("datasource", df_datasource),
)
try:
    for metadata_table_name, metadata_table in metadata_tables:
        frame = metadata_table.to_sql(metadata_table_name, dbConnection, if_exists='append', index=False)
except Exception as ex:  # report the problem without raising, as before
    print(ex)
else:
    print("Tables created successfully.")
finally:
    dbConnection.close()

In [None]:
# Percent-encode the credentials: characters such as "@", ":", "/" or "%" in a
# user name or password would otherwise corrupt the connection URL.
from urllib.parse import quote_plus
sqlEngine = create_engine(
    'mysql+pymysql://%s:%s@127.0.0.1/%s' % (quote_plus(mysql_username), quote_plus(mysql_password), mysql_database),
    pool_recycle=3600,
)
dbConnection = sqlEngine.connect()

In [None]:
# Append the publication <-> datasource link table. Errors are reported, not
# raised, and the connection is closed in every case.
link_key_columns = ["pub_id","datasource_id"]
try:
    frame = df_pub_to_datasource.to_sql(
        "pub_datasource",
        dbConnection,
        if_exists='append',
        index=False,
        index_label=link_key_columns,
    )
except Exception as ex:  # also covers the ValueError raised by to_sql
    print(ex)
else:
    print("Table created successfully.")
finally:
    dbConnection.close()