# COVID19-related literature SQL database

In [1192]:
# magics and warnings
%load_ext autoreload
%autoreload 2
import warnings; warnings.simplefilter('ignore')

import os, random, codecs, json
from urllib.parse import quote

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pymysql
from sqlalchemy import create_engine
from sqlalchemy import Integer,String,Boolean,DateTime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load datasets

In [1193]:
# Input snapshots: edit these three paths to switch dataset versions.
dimensions_filename = "datasets_input/Dimensions_18_03_2020.csv"
who_filename = "datasets_input/WHO_18_03_2020.csv"
cord19_folder = "datasets_input/CORD19_2020_03_20"

# Every column is read as text (dtype=str) so identifiers are not coerced to numbers.
cord19_metadata_filename = os.path.join(cord19_folder, "metadata.csv")
df_dimensions = pd.read_csv(dimensions_filename, dtype=str)
df_who = pd.read_csv(who_filename, dtype=str)
df_cord = pd.read_csv(cord19_metadata_filename, dtype=str)

### Prepare dataframes for ingestion

#### Clean-up data frames

##### Dimensions

In [1194]:
df_dimensions.head()

Unnamed: 0,Date added,Publication ID,DOI,PMID,PMCID,Title,Abstract,Source title,Source UID,Publisher,...,Research Organizations - standardized,GRID IDs,City of Research organization,Country of Research organization,Funder,UIDs of supporting grants,Times cited,Altmetric,Source Linkout,Dimensions URL
0,2020-03-16,pub.1125672140,10.5812/iji.102184,,,COVID-19: The New Threat,,International Journal of Infection,jour.1051749,Kowsar Medical Institute,...,,,,,,,,,,https://app.dimensions.ai/details/publication/...
1,2020-03-16,pub.1125670218,10.1515/cclm-2020-0188,,,Prominent changes in blood coagulation of pati...,Abstract Background As the number of patients...,Clinical Chemistry and Laboratory Medicine,jour.1294896,De Gruyter,...,,,,,National Natural Science Foundation of China,grant.8360207,,2.0,,https://app.dimensions.ai/details/publication/...
2,2020-03-16,pub.1125671401,10.3348/kjr.2020.0163,,,What Is Needed to Make Interventional Radiolog...,,Korean Journal of Radiology,jour.1023226,The Korean Society of Radiology (KAMJE),...,National University of Singapore; Tan Tock Sen...,grid.4280.e; grid.240988.f,Singapore; Singapore,Singapore; Singapore,,,,,,https://app.dimensions.ai/details/publication/...
3,2020-03-16,pub.1125671336,10.32598/jpr.8.2.139,,,COVID-19 Infection in Iranian Children: A Case...,,Journal of Pediatrics Review,jour.1154967,Negah Scientific Publisher,...,,,,,,,,,,https://app.dimensions.ai/details/publication/...
4,2020-03-16,pub.1125671402,10.3348/kjr.2020.0164,,,Computed Tomographic Findings in COVID-19,,Korean Journal of Radiology,jour.1023226,The Korean Society of Radiology (KAMJE),...,Hainan Medical University,grid.443397.e,Haikou,China,,,,,,https://app.dimensions.ai/details/publication/...


In [1195]:
df_dimensions.columns

Index(['Date added', 'Publication ID', 'DOI', 'PMID', 'PMCID', 'Title',
       'Abstract', 'Source title', 'Source UID', 'Publisher', 'MeSH terms',
       'Publication Date', 'PubYear', 'Volume', 'Issue', 'Pagination',
       'Open Access', 'Publication Type', 'Authors', 'Corresponding Authors',
       'Authors Affiliations', 'Research Organizations - standardized',
       'GRID IDs', 'City of Research organization',
       'Country of Research organization', 'Funder',
       'UIDs of supporting grants', 'Times cited', 'Altmetric',
       'Source Linkout', 'Dimensions URL'],
      dtype='object')

In [1196]:
# Dimensions columns that are not used in the rest of this notebook.
dimensions_unused_columns = [
    'Date added', 'Publisher', 'Authors', 'Corresponding Authors',
    'Authors Affiliations', 'Research Organizations - standardized',
    'GRID IDs', 'City of Research organization',
    'Country of Research organization', 'Funder',
    'UIDs of supporting grants', 'Times cited', 'Altmetric',
    'Source Linkout',
]
df_dimensions = df_dimensions.drop(columns=dimensions_unused_columns)

In [1197]:
df_dimensions.columns

Index(['Publication ID', 'DOI', 'PMID', 'PMCID', 'Title', 'Abstract',
       'Source title', 'Source UID', 'MeSH terms', 'Publication Date',
       'PubYear', 'Volume', 'Issue', 'Pagination', 'Open Access',
       'Publication Type', 'Dimensions URL'],
      dtype='object')

In [1198]:
# Map the Dimensions headers onto the snake_case column names used below.
dimensions_column_names = {
    'Publication ID': 'publication_id',
    'DOI': 'doi',
    'PMID': 'pmid',
    'PMCID': 'pmcid',
    'Title': 'title',
    'Abstract': 'abstract',
    'Source title': 'journal',
    'Source UID': 'source_uid',
    'MeSH terms': 'mesh_terms',
    'Publication Date': 'publication_date',
    'PubYear': 'publication_year',
    'Volume': 'volume',
    'Issue': 'issue',
    'Pagination': 'pages',
    'Open Access': 'open_access',
    'Publication Type': 'publication_type',
    'Dimensions URL': 'dimensions_url',
}
df_dimensions = df_dimensions.rename(columns=dimensions_column_names)

In [1199]:
def get_year(date):
    """Return the leading four-digit year of a date string.

    Parameters
    ----------
    date : str
        A date whose first four characters are expected to be the year,
        e.g. "2020-03-16" or "2020".

    Returns
    -------
    str
        The four leading characters when they are all digits, otherwise "".
        Non-string input (None, or the float NaN pandas uses for missing
        cells) also yields "" instead of raising a TypeError.
    """
    if not isinstance(date, str):
        return ""
    year = date[:4]
    if len(year) == 4 and year.isdigit():
        return year
    return ""

# English three-letter month abbreviations mapped to month numbers (as strings).
month_to_number = {"Jan":"1","Feb":"2","Mar":"3","Apr":"4","May":"5","Jun":"6","Jul":"7","Aug":"8","Sep":"9","Oct":"10","Nov":"11","Dec":"12"}

def get_month(date):
    """Extract the month from a date string as a number without leading zero.

    Two layouts are recognised, and only for strings longer than six
    characters:

    * dash-separated with a numeric second field, e.g. "2020-03-16" -> "3";
    * whitespace-separated with a month abbreviation as the second token,
      e.g. "2020 Mar 16" -> "3".

    Parameters
    ----------
    date : str
        The date to parse.

    Returns
    -------
    str
        The month number as a string, or "" when no month can be extracted.
        Non-string input (None, or the float NaN pandas uses for missing
        cells) also yields "" instead of raising a TypeError.
    """
    if not isinstance(date, str) or len(date) <= 6:
        return ""
    if "-" in date:
        # "-" is present, so the split always has a second field.
        month_field = date.split("-")[1]
        if month_field.isdigit():
            return str(int(month_field))
    # Fall back to the "<year> <Mon> ..." layout; a missing token or an unknown
    # abbreviation simply means there is no usable month.
    tokens = date.split()
    if len(tokens) > 1:
        return month_to_number.get(tokens[1], "")
    return ""

In [1200]:
# Missing cells are still NaN here (the frame is only fillna'd in the next cell)
# and the date helpers work on strings, so blank them out before parsing.
df_dimensions["publication_year"] = df_dimensions["publication_year"].fillna("").apply(get_year)
df_dimensions["publication_month"] = df_dimensions["publication_date"].fillna("").apply(get_month)

In [1201]:
# The raw date has been split into year and month; remove it and replace
# every remaining missing value with an empty string.
df_dimensions = df_dimensions.drop(columns=["publication_date"]).fillna('')

In [1202]:
df_dimensions.head()

Unnamed: 0,publication_id,doi,pmid,pmcid,title,abstract,journal,source_uid,mesh_terms,publication_year,volume,issue,pages,open_access,publication_type,dimensions_url,publication_month
0,pub.1125672140,10.5812/iji.102184,,,COVID-19: The New Threat,,International Journal of Infection,jour.1051749,,2020,7.0,1.0,,Closed,article,https://app.dimensions.ai/details/publication/...,3
1,pub.1125670218,10.1515/cclm-2020-0188,,,Prominent changes in blood coagulation of pati...,Abstract Background As the number of patients...,Clinical Chemistry and Laboratory Medicine,jour.1294896,,2020,0.0,0.0,,Closed,article,https://app.dimensions.ai/details/publication/...,3
2,pub.1125671401,10.3348/kjr.2020.0163,,,What Is Needed to Make Interventional Radiolog...,,Korean Journal of Radiology,jour.1023226,,2020,21.0,,,Closed,article,https://app.dimensions.ai/details/publication/...,1
3,pub.1125671336,10.32598/jpr.8.2.139,,,COVID-19 Infection in Iranian Children: A Case...,,Journal of Pediatrics Review,jour.1154967,,2020,,,139-144,Closed,article,https://app.dimensions.ai/details/publication/...,4
4,pub.1125671402,10.3348/kjr.2020.0164,,,Computed Tomographic Findings in COVID-19,,Korean Journal of Radiology,jour.1023226,,2020,21.0,,,Closed,article,https://app.dimensions.ai/details/publication/...,1


In [1203]:
df_dimensions[df_dimensions.doi==""].shape

(68, 17)

##### WHO

In [1204]:
df_who.head()

Unnamed: 0,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags
0,Platelet-to-lymphocyte ratio is associated wit...,"Qu, Rong; Ling, Yun; Zhang, Yi-Huizhi; Wei, Li...","INTRODUCTION: SinceDecember 2019, novelcoronav...",2020,,Journal of medical virology,,,,,10.1002/jmv.25767,9121,#8973,Qu 2020,,* Case study/series; * Opinion piece; Clinical...
1,Epidemiologic and Clinical Characteristics of ...,"Qian, Guo-Qing; Yang, Nai-Bin; Ding, Feng; Ma,...",BACKGROUND: Recent studies have focused initia...,2020,,QJM : monthly journal of the Association of Ph...,,,,,10.1093/qjmed/hcaa089,9120,#9128,Qian 2020,,* Epidemiological study; * Opinion piece; Epid...
2,Correlation between travellers departing from ...,"Ping Zhong, M. D. Songxue Guo M. D. Ting Chen ...",Highlight We found a strong correlation betwee...,2020,,Journal of Travel Medicine,,,,,,9099,#9222,PingZhong 2020,,* Epidemiological study; * Opinion piece; Epid...
3,On the front lines of coronavirus: the Italian...,"Paterlini, Marta",Italy has rapidly become the country hit secon...,2020,,BMJ,368.0,,m1065-m1065,,10.1136/bmj.m1065,9119,#8989,Paterlini 2020,,"* Opinion piece; Epidemiology; Ethics, social ..."
4,Coronavirus cases have dropped sharply in Sout...,"Normile, Dennis",Europe is now the epicenter of the COVID-19 pa...,2020,,Science,,,,,10.1126/science.abb7566,9620,#9246,Normile 2020,,* Opinion piece; Epidemiology; Infection preve...


In [1205]:
df_who.columns

Index(['Title', 'Authors', 'Abstract', 'Published Year', 'Published Month',
       'Journal', 'Volume', 'Issue', 'Pages', 'Accession Number', 'DOI', 'Ref',
       'Covidence #', 'Study', 'Notes', 'Tags'],
      dtype='object')

In [1206]:
# Author names are not used in the rest of this notebook.
df_who = df_who.drop(columns=["Authors"])

In [1207]:
# Map the WHO headers onto the snake_case column names used below.
who_column_names = {
    'Title': 'title',
    'Abstract': 'abstract',
    'Published Year': 'publication_year',
    'Published Month': 'publication_month',
    'Journal': 'journal',
    'Volume': 'volume',
    'Issue': 'issue',
    'Pages': 'pages',
    'Accession Number': 'accession_number',
    'DOI': 'doi',
    'Ref': 'ref',
    'Covidence #': 'covidence',
    'Study': 'study',
    'Notes': 'notes',
    'Tags': 'tags',
}
df_who = df_who.rename(columns=who_column_names)

In [1208]:
# Add empty pmid / pmcid columns so the frame exposes the same identifier
# columns as the other sources, then blank out all missing values.
df_who = df_who.assign(pmid="", pmcid="").fillna('')

In [1209]:
df_who.head()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,accession_number,doi,ref,covidence,study,notes,tags,pmid,pmcid
0,Platelet-to-lymphocyte ratio is associated wit...,"INTRODUCTION: SinceDecember 2019, novelcoronav...",2020,,Journal of medical virology,,,,,10.1002/jmv.25767,9121,#8973,Qu 2020,,* Case study/series; * Opinion piece; Clinical...,,
1,Epidemiologic and Clinical Characteristics of ...,BACKGROUND: Recent studies have focused initia...,2020,,QJM : monthly journal of the Association of Ph...,,,,,10.1093/qjmed/hcaa089,9120,#9128,Qian 2020,,* Epidemiological study; * Opinion piece; Epid...,,
2,Correlation between travellers departing from ...,Highlight We found a strong correlation betwee...,2020,,Journal of Travel Medicine,,,,,,9099,#9222,PingZhong 2020,,* Epidemiological study; * Opinion piece; Epid...,,
3,On the front lines of coronavirus: the Italian...,Italy has rapidly become the country hit secon...,2020,,BMJ,368.0,,m1065-m1065,,10.1136/bmj.m1065,9119,#8989,Paterlini 2020,,"* Opinion piece; Epidemiology; Ethics, social ...",,
4,Coronavirus cases have dropped sharply in Sout...,Europe is now the epicenter of the COVID-19 pa...,2020,,Science,,,,,10.1126/science.abb7566,9620,#9246,Normile 2020,,* Opinion piece; Epidemiology; Infection preve...,,


In [1210]:
df_who[df_who.doi==""].shape

(349, 17)

##### CORD19

In [1211]:
df_cord.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [1212]:
df_cord.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text',
       'full_text_file'],
      dtype='object')

In [1213]:
# Remove the author list and replace missing values with empty strings.
df_cord = df_cord.drop(columns=['authors']).fillna('')

In [1214]:
# Map the CORD19 headers onto the column names used below.
cord_column_names = {
    'source_x': 'source',
    'pubmed_id': 'pmid',
    'Microsoft Academic Paper ID': 'ms_academic_id',
    'WHO #Covidence': 'who_covidence',
}
df_cord = df_cord.rename(columns=cord_column_names)

In [1215]:
# Derive year and month columns from the raw publish_time string.
df_cord = df_cord.assign(
    publication_year=df_cord["publish_time"].apply(get_year),
    publication_month=df_cord["publish_time"].apply(get_month),
)

In [1216]:
# publish_time has been split into year and month above.
df_cord = df_cord.drop(columns=['publish_time'])

In [1217]:
# Add empty pages / volume / issue columns so df_cord can be selected with the
# same column list as the other sources.
df_cord = df_cord.assign(pages="", volume="", issue="")

In [1218]:
df_cord.head()

Unnamed: 0,sha,source,title,doi,pmcid,pmid,license,abstract,journal,ms_academic_id,who_covidence,has_full_text,full_text_file,publication_year,publication_month,pages,volume,issue
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535,els-covid,Abstract The etiologic basis for the vast majo...,American Heart Journal,,,False,custom_license,1972,12,,,
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850,els-covid,,American Heart Journal,,,False,custom_license,1980,3,,,
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701,els-covid,,American Heart Journal,,,False,custom_license,1980,3,,,
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077,els-covid,"Abstract Middle-aged female identical twins, o...",The American Journal of Medicine,,,True,custom_license,1973,8,,,
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285,els-covid,Abstract Upper respiratory tract infections ar...,The American Journal of Medicine,,,False,custom_license,1985,6,,,


In [1220]:
df_cord[(df_cord.doi=="") & ((df_cord.sha!="") | (df_cord.pmid!="") | (df_cord.pmcid!=""))].shape

(3118, 18)

In [1221]:
df_dimensions.shape

(2956, 17)

In [1222]:
df_who.shape

(2048, 17)

In [1223]:
df_cord.shape

(44220, 18)

### Prepare tables

In [1276]:
# the main table: pub

In [1224]:
# Columns shared by all three sources; they make up the main publication table.
pub_table_columns = ['title','abstract','publication_year','publication_month','journal','volume','issue','pages','doi','pmid','pmcid']

# Stack Dimensions and WHO rows. pd.concat replaces DataFrame.append, which is
# deprecated and no longer exists in pandas 2.0.
df_pub = pd.concat([df_dimensions[pub_table_columns], df_who[pub_table_columns]], ignore_index=True)

In [1225]:
# Add the CORD19 rows. pd.concat replaces DataFrame.append, which is deprecated
# and no longer exists in pandas 2.0.
df_pub = pd.concat([df_pub[pub_table_columns], df_cord[pub_table_columns]], ignore_index=True)

In [1226]:
df_pub.shape

(49224, 11)

In [1227]:
df_pub[(df_pub.doi=="") & (df_pub.pmid=="") & (df_pub.pmcid=="")].shape

(764, 11)

In [1228]:
# Keep only publications that carry at least one identifier (DOI, PMID or PMCID).
# Rows without any are dropped: hopefully, they will be equipped with an
# identifier in future releases.

has_any_identifier = (df_pub.doi != "") | (df_pub.pmid != "") | (df_pub.pmcid != "")
df_pub = df_pub[has_any_identifier]

In [1229]:
# De-duplicate in three tiers: by DOI, then by PMID among rows without a DOI,
# then by PMCID among rows with neither. Each tier is filtered to non-empty
# values first, so rows sharing an empty identifier are not collapsed together.
missing_doi = df_pub.doi == ""
df_tmp = df_pub[missing_doi]
df_pub = df_pub[~missing_doi].drop_duplicates(subset=['doi'])

missing_pmid = df_tmp.pmid == ""
df_tmp2 = df_tmp[missing_pmid]
df_pub2 = df_tmp[~missing_pmid].drop_duplicates(subset=['pmid'])

df_pub3 = df_tmp2[df_tmp2.pmcid != ""].drop_duplicates(subset=['pmcid'])

In [1230]:
# Stack the DOI-, PMID- and PMCID-deduplicated parts back into one frame.
deduplicated_parts = (df_pub, df_pub2, df_pub3)
df_pub = pd.concat(deduplicated_parts)

In [1231]:
# Renumber the rows from 0 and expose that number as the primary-key column "id".
df_pub = df_pub.reset_index(drop=True).assign(id=lambda frame: frame.index.values)

In [1232]:
df_pub.shape

(45646, 12)

In [1233]:
df_pub.tail()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,doi,pmid,pmcid,id
45641,Liver and Kidney Injuries in COVID-19 and Thei...,,2020.0,3.0,Arch Acad Emerg Med,,,,,,PMC7075271,45641
45642,Information Typology in Coronavirus (COVID-19)...,,2020.0,3.0,Arch Acad Emerg Med,,,,,,PMC7075270,45642
45643,Coronavirus Pandemic and Worries during Pregna...,,2020.0,3.0,Arch Acad Emerg Med,,,,,,PMC7075675,45643
45644,A Novel Coronavirus Outbreak from Wuhan City i...,,2020.0,2.0,Arch Acad Emerg Med,,,,,,PMC7075272,45644
45645,,,,,,,,,,,PMC7064018,45645


In [1234]:
# Side tables: the list of data sources, plus one per-source metadata frame that
# keeps the identifiers needed to join it to df_pub.

df_datasource = pd.DataFrame({
    "source": ["CORD19", "Dimensions", "WHO"],
    "url": [
        "https://pages.semanticscholar.org/coronavirus-research",
        "https://docs.google.com/spreadsheets/d/1-kTZJZ1GAhJ2m4GAIhw1ZdlgO46JpvX0ZQa232VWRmw/edit#gid=2034285255",
        "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/global-research-on-novel-coronavirus-2019-ncov",
    ],
})

cord_metadata_columns = ['source', 'license', 'full_text_file', 'ms_academic_id', 'who_covidence',
                         'doi', 'pmid', 'pmcid', 'sha']
who_metadata_columns = ['accession_number', 'doi', 'ref', 'covidence', 'study', 'notes', 'tags',
                        'pmid', 'pmcid']
dimensions_metadata_columns = ['publication_id', 'doi', 'pmid', 'pmcid', 'source_uid', 'mesh_terms',
                               'open_access', 'publication_type', 'dimensions_url']

df_cord_metadata = df_cord[cord_metadata_columns]
df_who_metadata = df_who[who_metadata_columns]
df_dimensions_metadata = df_dimensions[dimensions_metadata_columns]

In [1235]:
df_datasource.head()

Unnamed: 0,source,url
0,CORD19,https://pages.semanticscholar.org/coronavirus-...
1,Dimensions,https://docs.google.com/spreadsheets/d/1-kTZJZ...
2,WHO,https://www.who.int/emergencies/diseases/novel...


In [1236]:
# CORD19 metadata

In [1237]:
# Attach the df_pub id to each CORD19 row, joining on the first identifier
# available: DOI, else PMID, else PMCID. Empty identifiers are filtered out
# before each join so they never match one another.

missing_doi = df_cord_metadata.doi == ""
df_tmp = df_cord_metadata[missing_doi]
df_cord_metadata1 = df_cord_metadata[~missing_doi].merge(df_pub[['id', 'doi']], how='inner', on='doi')

missing_pmid = df_tmp.pmid == ""
df_tmp2 = df_tmp[missing_pmid]
df_cord_metadata2 = df_tmp[~missing_pmid].merge(df_pub[['id', 'pmid']], how='inner', on='pmid')

df_cord_metadata3 = df_tmp2[df_tmp2.pmcid != ""].merge(df_pub[['id', 'pmcid']], how='inner', on='pmcid')

In [1238]:
# Keep one metadata row per identifier within each join tier.
df_cord_metadata1 = df_cord_metadata1.drop_duplicates(subset="doi")
df_cord_metadata2 = df_cord_metadata2.drop_duplicates(subset="pmid")
df_cord_metadata3 = df_cord_metadata3.drop_duplicates(subset="pmcid")

In [1239]:
df_cord_metadata1.head()

Unnamed: 0,source,license,full_text_file,ms_academic_id,who_covidence,doi,pmid,pmcid,sha,id
0,Elsevier,els-covid,custom_license,,,10.1016/0002-8703(72)90077-4,4361535,,,3766
1,Elsevier,els-covid,custom_license,,,10.1016/0002-8703(80)90355-5,6243850,,,3767
2,Elsevier,els-covid,custom_license,,,10.1016/0002-8703(80)90356-7,7355701,,,3768
3,Elsevier,els-covid,custom_license,,,10.1016/0002-9343(73)90176-9,4579077,,aecbc613ebdab36753235197ffb4f35734b5ca63,3769
4,Elsevier,els-covid,custom_license,,,10.1016/0002-9343(85)90361-4,4014285,,,3770


In [1240]:
# Stack the DOI-, PMID- and PMCID-matched CORD19 rows into one frame.
cord_metadata_parts = (df_cord_metadata1, df_cord_metadata2, df_cord_metadata3)
df_cord_metadata = pd.concat(cord_metadata_parts)

In [1241]:
df_cord_metadata.shape

(43856, 10)

In [721]:
# Read the CORD19 full texts: collect the paper id and the concatenated body
# text of every JSON file found under the four sub-folders.
folders = ['biorxiv_medrxiv/biorxiv_medrxiv','comm_use_subset/comm_use_subset','custom_license/custom_license','noncomm_use_subset/noncomm_use_subset']
shas = list()
full_texts = list()

for folder in folders:
    for root, _dirs, files in os.walk(os.path.join(cord19_folder, folder)):
        for file in tqdm(files):
            # Match on the extension only, so names that merely contain
            # ".json" (e.g. backup files) are skipped.
            if not file.endswith(".json"):
                continue
            # The context manager closes each handle; decoding is pinned to
            # UTF-8 instead of depending on the platform's default encoding.
            with open(os.path.join(root, file), encoding="utf-8") as json_file:
                data = json.load(json_file)
            shas.append(data["paper_id"])
            full_texts.append("\n".join(section["text"] for section in data["body_text"]))

HBox(children=(FloatProgress(value=0.0, max=885.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9118.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16959.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2353.0), HTML(value='')))




In [1242]:
# One row per parsed JSON file: paper id ("sha") and its body text.
df_cord_fulltext = pd.DataFrame({"sha": shas, "full_text": full_texts})

In [1243]:
# Left-join the full texts on sha (rows without a match get an empty string)
# and rename the df_pub key to pub_id.
df_cord_metadata = (
    df_cord_metadata
    .merge(df_cord_fulltext, how='left', on='sha')
    .fillna('')
    .rename(columns={"id": "pub_id"})
)

In [1244]:
df_cord_metadata.head()

Unnamed: 0,source,license,full_text_file,ms_academic_id,who_covidence,doi,pmid,pmcid,sha,pub_id,full_text
0,Elsevier,els-covid,custom_license,,,10.1016/0002-8703(72)90077-4,4361535,,,3766,
1,Elsevier,els-covid,custom_license,,,10.1016/0002-8703(80)90355-5,6243850,,,3767,
2,Elsevier,els-covid,custom_license,,,10.1016/0002-8703(80)90356-7,7355701,,,3768,
3,Elsevier,els-covid,custom_license,,,10.1016/0002-9343(73)90176-9,4579077,,aecbc613ebdab36753235197ffb4f35734b5ca63,3769,"The patient (Fo, ) was a 58 year old mentally ..."
4,Elsevier,els-covid,custom_license,,,10.1016/0002-9343(85)90361-4,4014285,,,3770,


In [1245]:
# WHO and Dimensions metadata

In [1246]:
# Attach the df_pub id to each WHO row, joining on the first identifier
# available: DOI, else PMID, else PMCID. Empty identifiers are filtered out
# before each join so they never match one another.
missing_doi = df_who_metadata.doi == ""
df_tmp = df_who_metadata[missing_doi]
df_who_metadata1 = df_who_metadata[~missing_doi].merge(df_pub[['id', 'doi']], how='inner', on='doi')

missing_pmid = df_tmp.pmid == ""
df_tmp2 = df_tmp[missing_pmid]
df_who_metadata2 = df_tmp[~missing_pmid].merge(df_pub[['id', 'pmid']], how='inner', on='pmid')

df_who_metadata3 = df_tmp2[df_tmp2.pmcid != ""].merge(df_pub[['id', 'pmcid']], how='inner', on='pmcid')

In [1247]:
# Keep one metadata row per identifier within each join tier.
df_who_metadata1 = df_who_metadata1.drop_duplicates(subset="doi")
df_who_metadata2 = df_who_metadata2.drop_duplicates(subset="pmid")
df_who_metadata3 = df_who_metadata3.drop_duplicates(subset="pmcid")

Unnamed: 0,accession_number,doi,ref,covidence,study,notes,tags,pmid,id,pmcid


In [1248]:
# Stack the DOI-, PMID- and PMCID-matched WHO rows into one frame.
who_metadata_parts = (df_who_metadata1, df_who_metadata2, df_who_metadata3)
df_who_metadata = pd.concat(who_metadata_parts)

In [1249]:
df_who_metadata.shape

(1630, 10)

In [1250]:
# The joined df_pub id becomes the pub_id column.
df_who_metadata = df_who_metadata.rename(columns={"id": "pub_id"})

In [1251]:
# Attach the df_pub id to each Dimensions row, joining on the first identifier
# available: DOI, else PMID, else PMCID. Empty identifiers are filtered out
# before each join so they never match one another.
missing_doi = df_dimensions_metadata.doi == ""
df_tmp = df_dimensions_metadata[missing_doi]
df_dimensions_metadata1 = df_dimensions_metadata[~missing_doi].merge(df_pub[['id', 'doi']], how='inner', on='doi')

missing_pmid = df_tmp.pmid == ""
df_tmp2 = df_tmp[missing_pmid]
df_dimensions_metadata2 = df_tmp[~missing_pmid].merge(df_pub[['id', 'pmid']], how='inner', on='pmid')

df_dimensions_metadata3 = df_tmp2[df_tmp2.pmcid != ""].merge(df_pub[['id', 'pmcid']], how='inner', on='pmcid')

In [1252]:
# Keep one metadata row per identifier within each join tier.
df_dimensions_metadata1 = df_dimensions_metadata1.drop_duplicates(subset="doi")
df_dimensions_metadata2 = df_dimensions_metadata2.drop_duplicates(subset="pmid")
df_dimensions_metadata3 = df_dimensions_metadata3.drop_duplicates(subset="pmcid")

Unnamed: 0,publication_id,doi,pmid,source_uid,mesh_terms,open_access,publication_type,dimensions_url,id,pmcid


In [1253]:
# Stack the DOI-, PMID- and PMCID-matched Dimensions rows into one frame.
dimensions_metadata_parts = (df_dimensions_metadata1, df_dimensions_metadata2, df_dimensions_metadata3)
df_dimensions_metadata = pd.concat(dimensions_metadata_parts)

In [1254]:
df_dimensions_metadata.shape

(2901, 10)

In [1255]:
# The joined df_pub id becomes the pub_id column.
df_dimensions_metadata = df_dimensions_metadata.rename(columns={"id": "pub_id"})

In [1256]:
# Create datasource tables

In [1257]:
# The index label of each row in df_datasource serves as that source's id.
source_names = df_datasource.source
cord_source_id = df_datasource[source_names == "CORD19"].index.values[0]
who_source_id = df_datasource[source_names == "WHO"].index.values[0]
dimensions_source_id = df_datasource[source_names == "Dimensions"].index.values[0]

In [1258]:
# Tag every metadata row with the id of the source it came from.
df_cord_metadata = df_cord_metadata.assign(source_id=cord_source_id)
df_who_metadata = df_who_metadata.assign(source_id=who_source_id)
df_dimensions_metadata = df_dimensions_metadata.assign(source_id=dimensions_source_id)

In [1259]:
# Link table between publications and data sources: the (pub_id, source_id)
# pairs of all three metadata frames, stacked in CORD19, WHO, Dimensions order.
# pd.concat replaces DataFrame.append, which is deprecated and no longer exists
# in pandas 2.0.
link_columns = ["pub_id", "source_id"]
df_pub_to_datasource = pd.concat(
    [
        df_cord_metadata[link_columns],
        df_who_metadata[link_columns],
        df_dimensions_metadata[link_columns],
    ],
    ignore_index=True,
)

In [1260]:
# Keep each (pub_id, source_id) pair only once.
df_pub_to_datasource = df_pub_to_datasource.drop_duplicates()

In [1261]:
df_pub_to_datasource.shape

(48387, 2)

In [1262]:
df_pub_to_datasource[df_pub_to_datasource.pub_id==22787]

Unnamed: 0,pub_id,source_id
20031,22787,0


In [1263]:
# The identifiers and source_id were only needed for the joins above; remove
# them from the per-source metadata tables.
join_key_columns = ['doi', 'pmid', 'pmcid', 'source_id']
df_cord_metadata = df_cord_metadata.drop(columns=join_key_columns)
df_who_metadata = df_who_metadata.drop(columns=join_key_columns)
df_dimensions_metadata = df_dimensions_metadata.drop(columns=join_key_columns)

In [1264]:
# Renumber every table from 0: these indexes will become the primary keys.
df_cord_metadata = df_cord_metadata.reset_index(drop=True)
df_who_metadata = df_who_metadata.reset_index(drop=True)
df_dimensions_metadata = df_dimensions_metadata.reset_index(drop=True)
df_datasource = df_datasource.reset_index(drop=True)

In [1265]:
# Convert the columns that hold numbers from text to a numeric dtype.
for numeric_column in ("publication_year", "publication_month", "pmid"):
    df_pub[numeric_column] = pd.to_numeric(df_pub[numeric_column])

In [1266]:
# Stamp every row with the time at which this cell ran.
df_pub = df_pub.assign(timestamp=pd.Timestamp.now())

In [1267]:
df_pub.head()

Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,doi,pmid,pmcid,id,timestamp
0,COVID-19: The New Threat,,2020.0,3.0,International Journal of Infection,7.0,1.0,,10.5812/iji.102184,,,0,2020-03-25 18:16:28.508540
1,Prominent changes in blood coagulation of pati...,Abstract Background As the number of patients...,2020.0,3.0,Clinical Chemistry and Laboratory Medicine,0.0,0.0,,10.1515/cclm-2020-0188,,,1,2020-03-25 18:16:28.508540
2,What Is Needed to Make Interventional Radiolog...,,2020.0,1.0,Korean Journal of Radiology,21.0,,,10.3348/kjr.2020.0163,,,2,2020-03-25 18:16:28.508540
3,COVID-19 Infection in Iranian Children: A Case...,,2020.0,4.0,Journal of Pediatrics Review,,,139-144,10.32598/jpr.8.2.139,,,3,2020-03-25 18:16:28.508540
4,Computed Tomographic Findings in COVID-19,,2020.0,1.0,Korean Journal of Radiology,21.0,,,10.3348/kjr.2020.0164,,,4,2020-03-25 18:16:28.508540


### Dump to MySQL

In [1268]:
# SQLAlchemy column types passed to to_sql for the "pub" table, by column name.
dtype_dict = dict(
    id=Integer,
    title=String,
    abstract=String,
    publication_year=Integer,
    publication_month=Integer,
    journal=String,
    volume=String,
    issue=String,
    pages=String,
    doi=String,
    pmid=Integer,
    pmcid=String,
    timestamp=DateTime,
)

In [1269]:
# read the MySQL credentials and database name from the local configuration file
import configparser
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed, so fail here with a clear message rather than with a
# KeyError on the "MYSQL" section below.
if not config.read("credentials/conf.ini"):
    raise FileNotFoundError("Could not read credentials/conf.ini")
mysql_username = config["MYSQL"]["username"]
mysql_password = config["MYSQL"]["password"]
mysql_database = config["MYSQL"]["database"]

In [1270]:
# Percent-encode the credentials so that characters such as "@", "/", ":" or
# "%" in the username or password cannot corrupt the connection URL.
sqlEngine = create_engine(
    'mysql+pymysql://%s:%s@127.0.0.1/%s' % (
        quote(mysql_username, safe=""),
        quote(mysql_password, safe=""),
        mysql_database,
    ),
    pool_recycle=3600,
)
dbConnection = sqlEngine.connect()

In [1271]:
# main table
table_name = "pub"
try:
    # index=False: the DataFrame index is not written, so no index_label is
    # passed (pandas ignores it in that case). Rows are appended if the table
    # already exists.
    df_pub.to_sql(table_name, dbConnection, if_exists='append', index=False, dtype=dtype_dict)
except Exception as ex:
    # NOTE(review): the error is only printed, so the following cells still run
    # after a failed insert.
    print(ex)
else:
    print("Table %s created successfully."%table_name)
finally:
    dbConnection.close()

Table pub created successfully.


In [1272]:
# Percent-encode the credentials so that characters such as "@", "/", ":" or
# "%" in the username or password cannot corrupt the connection URL.
sqlEngine = create_engine(
    'mysql+pymysql://%s:%s@127.0.0.1/%s' % (
        quote(mysql_username, safe=""),
        quote(mysql_password, safe=""),
        mysql_database,
    ),
    pool_recycle=3600,
)
dbConnection = sqlEngine.connect()

In [1273]:
# other tables: each frame's index is written as the "id" column
metadata_tables = [
    ("cord19_metadata", df_cord_metadata),
    ("who_metadata", df_who_metadata),
    ("dimensions_metadata", df_dimensions_metadata),
    ("datasource", df_datasource),
]
try:
    for sql_table_name, table_frame in metadata_tables:
        table_frame.to_sql(sql_table_name, dbConnection, if_exists='append', index=True, index_label="id")
except Exception as ex:
    # NOTE(review): the error is only printed; tables written before the
    # failing one stay in the database.
    print(ex)
else:
    print("Tables created successfully.")
finally:
    dbConnection.close()

Tables created successfully.


In [1274]:
# Percent-encode the credentials so that characters such as "@", "/", ":" or
# "%" in the username or password cannot corrupt the connection URL.
sqlEngine = create_engine(
    'mysql+pymysql://%s:%s@127.0.0.1/%s' % (
        quote(mysql_username, safe=""),
        quote(mysql_password, safe=""),
        mysql_database,
    ),
    pool_recycle=3600,
)
dbConnection = sqlEngine.connect()

In [1275]:
# last table: the publication <-> data source link table
try:
    # index=False: only the pub_id and source_id columns are written, so no
    # index_label is passed (pandas ignores it in that case).
    df_pub_to_datasource.to_sql("pub_to_datasource", dbConnection, if_exists='append', index=False)
except Exception as ex:
    # NOTE(review): the error is only printed, not re-raised.
    print(ex)
else:
    print("Table created successfully.")
finally:
    dbConnection.close()

Table created successfully.


In [1277]:
### Export the df_pub dataframe for further use

# NOTE(review): the file is written gzip-compressed although its name ends in
# ".csv"; readers must pass compression="gzip" explicitly. The DataFrame index
# is written as an extra leading column.
df_pub_output_path = "datasets_output/df_pub.csv"
df_pub.to_csv(df_pub_output_path, compression="gzip")