In [None]:
"""
Pipeline for filtering Kaggle CORD-19 metadata.csv into papers relevant to Covid-19.

"""

In [13]:
import pandas as pd
from datetime import datetime

In [9]:
"""
Input

metadata_csv_path - Path to latest metadata.csv file from Kaggle's CORD-19 dataset.
covid_19_term_list_path - Path to text file containing a list of Covid-19 synonyms (1 per line).

"""

metadata_csv_path = 'C://Users//SuresMal//Documents//Coronawhy//data//cord19//metadata.csv'
covid_19_term_list_path = 'C:/Users/SuresMal/Documents/Coronawhy/data/virus_words.txt'

pub_date_cutoff = '2019-10-01'

"""
Output

Filtered metadata dataframe is saved to csv file.

"""
covid_date_filtered_outpath = 'C:/Users/SuresMal/Documents/Coronawhy/data/cord19/processed/covid19_date_filt_metadata_180720.csv'
filt_metadata_outpath = '../data/v31_processed/metadata_covid19_df_180720.csv'

In [18]:
def filter_metadata_for_covid19(metadata_path: str, virus_lex_path: str, pub_date_cutoff: str):
    """
    Filter metadata to publications containing a COVID-19 synonym in title or abstract and published after cut-off date.

    :param metadata_path: path to CORD-19 metadata.csv file
    :param virus_lex_path: path to COVID-19 lexicon
    :param pub_date_cutoff: cut-off for paper publication date in the format 'yyyy-mm-dd'
    :return: Metadata for publications containing a COVID-19 synonym in title or abstract and published after cut-off date
    """
    metadata_df = pd.read_csv(metadata_path)
        
    # Concatenate title and abstract text into a single, lower-cased column
    metadata_df = metadata_df.fillna('')
    metadata_df.loc[:, 'title_abstract'] = metadata_df.loc[:, 'title'].str.lower() + ' ' + metadata_df.loc[:, 'abstract'].str.lower()
    metadata_df.loc[:, 'title_abstract'] = metadata_df.loc[:, 'title_abstract'].fillna('')

    # Load file with COVID-19 lexicon (1 per line) and generate a search pattern
    with open(virus_lex_path) as f:
        covid_19_terms = f.read().splitlines()
        covid_19_term_pattern = '|'.join([i.lower() for i in covid_19_terms])
    
    covid19_df = metadata_df.loc[metadata_df.title_abstract.str.contains(covid_19_term_pattern)]\
                            .copy().reset_index(drop=True)
    
    covid19_df['publish_time'] = pd.to_datetime(covid19_df['publish_time'])
    covid19_df['publish_time'] = covid19_df['publish_time'].dt.strftime('%Y-%m-%d')
    covid19_df['publish_time'] = pd.to_datetime(covid19_df['publish_time'])
    covid19_df = covid19_df.loc[covid19_df['publish_time'] > datetime.strptime(pub_date_cutoff, "%Y-%m-%d")]\
                            .copy().reset_index(drop=True)
    
    return covid19_df

In [3]:
# def filter_metadata_df_by_title_abstract_terms(metadata_df, covid_19_term_list_path):
#     """
#     Filter metadata dataframe to publications containing a Covid-19 synonym in title or abstract.
#     """
        
#     #Concatenate title and abstract text into a single, lower-cased column
    
#     metadata_df = metadata_df.fillna('')
#     metadata_df.loc[:, 'title_abstract'] = metadata_df.loc[:, 'title'].str.lower() + ' ' + metadata_df.loc[:, 'abstract'].str.lower()
#     metadata_df.loc[:, 'title_abstract'] = metadata_df.loc[:, 'title_abstract'].fillna('')

#     #Load text file containing Covid-19 synonyms (1 per line) and generate a search pattern
#     with open(covid_19_term_list_path) as f:
#         covid_19_terms = f.read().splitlines()
#         covid_19_term_pattern = '|'.join([i.lower() for i in covid_19_terms])

#     covid19_df = metadata_df.loc[metadata_df.title_abstract.str.contains(covid_19_term_pattern)]
    
#     return covid19_df

def filter_metadata_df_by_publ_date_cutoff(metadata_df, pub_date_cutoff):
    
    date_filtered_df = metadata_df.loc[metadata_df['publish_time'] > pub_date_cutoff]

    return date_filtered_df


#QC functions
def filter_metadata_df_to_null_value_in_col(metadata_df, col_name):
    """
    Filter metadata_df to rows containing null values for specified column.
    """    
    na_mask = metadata_df[col_name].isna()
    return metadata_df[na_mask]

def count_df_rows_with_null_values_in_cols(metadata_df, col_names):
    """
    Identify rows with null values in specified columns and return as dict: 
    
    {cord_uid : [columns with null values]}
    """
    cord_uid_null_col_dict = {}
    
    for col_name in col_names:
        null_value_df = filter_metadata_df_to_null_value_in_col(metadata_df, col_name)
        null_value_cord_uids = null_value_df.cord_uid.tolist()
        
        print("%d rows have null values in column %s" % (len(null_value_cord_uids, col_name)))
        
        for cord_uid in null_value_cord_uids:
            cord_uid_null_col_dict.setdefault(cord_uid, []).append(col_name)
            
    return cord_uid_null_col_dict



In [19]:
#Main

#Load metadata.csv as dataframe and filter to 
metadata_df = pd.read_csv(metadata_csv_path)
covid19_df = filter_metadata_for_covid19(metadata_csv_path, covid_19_term_list_path, pub_date_cutoff)
# covid19_df = filter_metadata_df_by_title_abstract_terms(metadata_df, covid_19_term_list_path)

# #Filter covid19_df by publication date cutoff
# covid19_date_filtered_df = filter_metadata_df_by_publ_date_cutoff(covid19_df, pub_date_cutoff)

#covid19_date_filtered_df.to_csv(covid_date_filtered_outpath)
covid19_df.to_csv(covid_date_filtered_outpath)

NameError: name 'covid19_date_filtered_df' is not defined

In [24]:
print('Total number of papers in current cord19 version:',len(metadata_df.cord_uid.unique()))
#print('Number of papers extracted:',len(covid19_date_filtered_df.cord_uid.unique()))
print('Number of papers extracted:',len(covid19_df.cord_uid.unique()))

Total number of papers in current cord19 version: 191175
Number of papers extracted: 77739


In [14]:
#v31 output

# print('Total number of papers in current cord19 version:',len(metadata_df.cord_uid.unique()))
# print('Number of papers extracted:',len(covid19_date_filtered_df.cord_uid.unique()))

Total number of papers in current cord19 version: 157254
Number of papers extracted: 52610


In [6]:
"""
Duplicated cord_uids.

Some cord_uids are non-unique.  This appears to be the result of the same paper being provided by two different sources?
"""

dup_metadata_df_mask =  metadata_df.cord_uid.duplicated(keep=False)
dup_metadata_df = metadata_df[dup_metadata_df_mask]

dup_cord_uids = set(dup_metadata_df.cord_uid.tolist())
print("Number of duplicated cord uids: %d" % len(dup_cord_uids))

print("Examples of duplicated cord uids:")
display(dup_metadata_df.sort_values(by='cord_uid'))

Number of duplicated cord uids: 389
Examples of duplicated cord uids:


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
25635,028avudf,,Medline,Burden and prevalence of prognostic factors fo...,10.1007/s10654-020-00646-z,,32425318.0,unk,The World Health Organization and European Cen...,2020-05-18,"Gémes, Katalin; Talbäck, Mats; Modig, Karin; A...",European journal of epidemiology,,,,,,https://doi.org/10.1007/s10654-020-00646-z; ht...,218675407.0
137704,028avudf,dff1eca0bb8c62e1c957c8038f3cdd0aaf624afe; 4e11...,Medline; PMC,Burden and prevalence of prognostic factors fo...,10.1007/s10654-020-00646-z,PMC7233678,32424571.0,cc-by,The World Health Organization and European Cen...,2020-05-18,"Gémes, Katalin; Talbäck, Mats; Modig, Karin; A...",Eur J Epidemiol,,,,document_parses/pdf_json/dff1eca0bb8c62e1c957c...,document_parses/pmc_json/PMC7233678.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/32424571/;...,218689939.0
148542,04jbodmf,,Medline; WHO,Epidemiologic characteristics of early cases w...,10.4178/epih.e2020007,,32035431.0,unk,Since the first case of 2019 novel coronavirus...,2020,"Ki, Moran; nCoV, Task Force For",Epidemiol Health,,#567,,,,https://www.ncbi.nlm.nih.gov/pubmed/32035431/;...,211072168.0
68156,04jbodmf,,WHO,Epidemiologic characteristics of early cases w...,10.4178/epih.e2020007,,,unk,In about 20 days since the diagnosis of the fi...,2020,"Ki, Moran; Task Force for -nCo, V.",Epidemiology and health,,#10145,,,,https://doi.org/10.4178/epih.e2020007,211072168.0
53562,05my504t,,Medline,COVID-19 and Paediatric Inflammatory Bowel Dis...,10.1097/mpg.0000000000002729,,32235161.0,unk,INTRODUCTION With the current COVID-19 pandemi...,2020-03-31,"Turner, Dan; Huang, Ying; Martín-de-Carpi, Jav...",Journal of pediatric gastroenterology and nutr...,,,,,,https://doi.org/10.1097/mpg.0000000000002729; ...,214772386.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137172,zu1xmmec,febe489672e0e1d997ac09c0a89973907d0ec885,Elsevier; PMC; WHO,Reply: Introducing special cutaneous “sign” tr...,10.1016/j.clindermatol.2020.04.012,PMC7166102,,no-cc,,2020-04-18,"Darlenski, Razvigor; Tsankov, Nikolai",Clin Dermatol,,#72328,,document_parses/pdf_json/febe489672e0e1d997ac0...,document_parses/pmc_json/PMC7166102.xml.json,https://doi.org/10.1016/j.clindermatol.2020.04...,215808232.0
2847,zwkl1ywk,5a2d0804fc8a4c5d6c661deca4e3150e5cbc77d5,PMC,The lifecycle of the Ebola virus in host cells,10.18632/oncotarget.18498,PMC5589696,28903457.0,cc-by,Ebola haemorrhagic fever causes deadly disease...,2017-06-15,"Yu, Dong-Shan; Weng, Tian-Hao; Wu, Xiao-Xin; W...",Oncotarget,,,,document_parses/pdf_json/5a2d0804fc8a4c5d6c661...,document_parses/pmc_json/PMC5589696.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
33356,zwkl1ywk,,Medline,The lifecycle of the Ebola virus in host cells.,10.18632/oncotarget.18498,,28653957.0,unk,Ebola haemorrhagic fever causes deadly disease...,2017,"Yu, Dong-Shan; Weng, Tian-Hao; Wu, Xiao-Xin; W...",Oncotarget,,,,,,https://doi.org/10.18632/oncotarget.18498; htt...,10664454.0
154549,zzpw375i,,Medline; WHO,Discussing the ABCs of Health Security-Antibio...,10.1001/jama.2019.21022,,32074261.0,unk,According to the World Health Organization (WH...,2020-02-19,"Desai, Angel N",JAMA,,#32150360,,,,https://www.ncbi.nlm.nih.gov/pubmed/32074261/;...,211213857.0


In [7]:
"""
Date filtered publications.
"""

covid19_cord_uids = covid19_df.cord_uid.tolist()
date_incl_cord_uids = covid19_date_filtered_df.cord_uid.tolist()
date_excl_cord_uids = set(covid19_cord_uids) - set(date_incl_cord_uids)

print("Covid-19 cord_uids: %d" % len(set(covid19_cord_uids)))
print("Covid-19 cord_uids published after date cutoff %s : %d" % (pub_date_cutoff, len(set(date_incl_cord_uids))))
print("Covid-19 cord_uids published before date cutoff %s : %d" % (pub_date_cutoff, len(set(date_excl_cord_uids))))

Covid-19 cord_uids: 52666
Covid-19 cord_uids published after date cutoff 2019-10-01 : 52610
Covid-19 cord_uids published before date cutoff 2019-10-01 : 56
