In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
import glob
import json

from pandas.io.json import json_normalize

from zipfile import ZipFile

from sklearn.metrics.pairwise import cosine_similarity

from numba import jit

In [3]:
"""
Preprocessing Cord-19 text files

Goal: Filter text and vector files containing publications (1 sentence per row) to publications 
relevant to Covid-19 and specified columns.  Merge into a single file for extraction of sentences/vectors
belonging to specified papers/sections.

Steps:
    1. Filter metadata.csv (from Kaggle Cord-19 dataset) to paper cord_uids for papers where:
        -Title or Abstract contains 1 or more Covid-19 synonyms.
        -Publication date is after 2019-10-01.
        This has been moved to notebook covid_vt_contra_metadata_filtering.ipynb
        
    2.  Load all full text files (json format) as pandas dataframes and truncate them to:
            -Rows containing cord_uids from step 1
            -Columns cord_uid, sentence, and section

        Write truncated text dataframe to json.
    
    3.  Filter to title, abstract and discussion sections
    
    4.  Filter out noisy sentences (with <3 words, etc.)
"""

'\nPreprocessing Cord-19 text files\n\nGoal: Filter text and vector files containing publications (1 sentence per row) to publications \nrelevant to Covid-19 and specified columns.  Merge into a single file for extraction of sentences/vectors\nbelonging to specified papers/sections.\n\nSteps:\n    1. Filter metadata.csv (from Kaggle Cord-19 dataset) to paper cord_uids for papers where:\n        -Title or Abstract contains 1 or more Covid-19 synonyms.\n        -Publication date is after 2019-10-01.\n        This has been moved to notebook covid_vt_contra_metadata_filtering.ipynb\n        \n    2.  Load all full text files (parquet format) as pandas dataframes and truncate them to:\n            -Rows containing cord_uids from step 1\n            -Columns cord_uid, sentence_id, sentence, and section\n\n        Write truncated text dataframes to csv files.\n\n    3.  Merge truncated text dataframes and extract list of sentence_ids for truncating vector files.\n    \n    4.  Load all vector

In [85]:
"""
Input and Output file paths.  This will be turned into a config file that can be passed as an argument.
"""

#Input
resources_path = '../data/kaggle_cord19/'

# Output of covid_vt_contra_metadata_filtering.ipynb (metadata already filtered).
metadata_csv_path = '../data/v31_processed/covid19_date_filt_metadata_200620.csv'
covid_19_term_list_file = '../data/kaggle_cord19/virus_words.txt'

# Zip archive holding the full-text JSON files, and the directory its members
# are extracted into.
json_text_file_dir = '../data/kaggle_cord19/json_v31.zip'
json_temp_path_fmt = '../data/kaggle_cord19/Extracted/'
# Forward slashes are accepted on Windows as well as POSIX; the previous
# backslash-only spelling named a different directory on non-Windows systems,
# so files were extracted to one place and read from another.
json_temp_path = json_temp_path_fmt

pub_date_cutoff = '2019-10-01'

#Output

output_path = '../data/v31_processed/'

# All outputs live directly under output_path. The full-text path used to
# prepend output_path to a second relative Windows path, which pointed at a
# nested '..\data\v31_processed' location instead of output_path itself.
full_text_df_outfile_path = '%sfull_text_200620.json' % output_path
filt_text_df_outfile_path = '%sfilt_text_df_200620.csv' % output_path
processed_text_df_outfile_path = '%scord_titles_abstracts_conclusions_200620.csv' % output_path

In [71]:
#Main

print("Step 1: Filtering metadata csv by Covid-19 synoynms and publication date...")
# The filtering itself is done in covid_vt_contra_metadata_filtering.ipynb; this
# cell only loads that notebook's output.
covid19_df = pd.read_csv(metadata_csv_path)
uid_list = covid19_df.cord_uid.tolist()

pdf_filenames = list(covid19_df.pdf_json_files)
pmc_filenames = list(covid19_df.pmc_json_files)

# Map each JSON file name to the cord_uid of the first metadata row listing it
# (in either the pdf or the pmc column). Built once so that the per-file check
# and the uid lookup are dictionary hits instead of a list scan plus a
# DataFrame mask for every paragraph.
uid_by_filename = {}
for row_uid, pdf_name, pmc_name in zip(uid_list, pdf_filenames, pmc_filenames):
    for json_name in (pdf_name, pmc_name):
        # Non-string entries (e.g. NaN for a missing value) can never equal a
        # zip member name, so they are skipped.
        if isinstance(json_name, str):
            uid_by_filename.setdefault(json_name, row_uid)

# Row number -> {'cord_uid', 'sentence', 'section'}; one row per paragraph.
covid19_dict = {}

print("Step 2: Loading full text dataframes, filtering to extracted uids...")
with ZipFile(json_text_file_dir, 'r') as zipObj:
    listOfFileNames = zipObj.namelist()
    print('Number of files to iterate over:',len(listOfFileNames))
    k = 0
    for iter_num, fileName in enumerate(listOfFileNames, start=1):
        # Only process archive members that belong to a paper in the filtered metadata.
        if fileName in uid_by_filename:
            cord_uid = uid_by_filename[fileName]

            # extract() returns the path it actually wrote, so reading from that
            # path cannot drift from the extraction directory.
            extracted_path = zipObj.extract(fileName, json_temp_path)
            with open(extracted_path, 'r', encoding='utf8') as f:
                # Raw newlines/tabs are stripped before parsing, as before.
                raw_json = f.read().replace('\n', '').replace('\r', '').replace('\t', '')
            # json.loads() no longer accepts an `encoding` argument (removed in
            # Python 3.9); the text is already decoded by open().
            json_dict = json.loads(raw_json)

            # Abstract: files without a usable abstract entry are skipped for
            # this part only, without hiding unrelated errors.
            # NOTE(review): only the first abstract paragraph is captured, while
            # every body_text paragraph is - confirm this is intended.
            try:
                first_abstract = json_dict['abstract'][0]
                abstract_row = {'cord_uid': cord_uid,
                                'sentence': first_abstract['text'],
                                'section': first_abstract['section']}
            except (KeyError, IndexError, TypeError):
                abstract_row = None
            if abstract_row is not None:
                covid19_dict[k] = abstract_row
                k = k + 1

            # NOTE(review): no row with section 'title' is ever created here,
            # although later cells filter on a 'title' section - confirm whether
            # titles should be added from the metadata.
            for paragraph in json_dict['body_text']:
                covid19_dict[k] = {'cord_uid': cord_uid,
                                   'sentence': paragraph['text'],
                                   'section': paragraph['section']}
                k = k + 1

        if iter_num%100==0:
            print('Number of files read:',iter_num)
print('Number of rows added:',k)

Step 1: Filtering metadata csv by Covid-19 synoynms and publication date...
Step 2: Loading full text dataframes, filtering to extracted uids, and writing to csv files...
Number of files to iterate over: 129922




Number of files read: 100
Number of files read: 200
Number of files read: 300
Number of files read: 400
Number of files read: 500
Number of files read: 600
Number of files read: 700
Number of files read: 800
Number of files read: 900
Number of files read: 1000
Number of files read: 1100
Number of files read: 1200
Number of files read: 1300
Number of files read: 1400
Number of files read: 1500
Number of files read: 1600
Number of files read: 1700
Number of files read: 1800
Number of files read: 1900
Number of files read: 2000
Number of files read: 2100
Number of files read: 2200
Number of files read: 2300
Number of files read: 2400
Number of files read: 2500
Number of files read: 2600
Number of files read: 2700
Number of files read: 2800
Number of files read: 2900
Number of files read: 3000
Number of files read: 3100
Number of files read: 3200
Number of files read: 3300
Number of files read: 3400
Number of files read: 3500
Number of files read: 3600
Number of files read: 3700
Number of 

Number of files read: 29800
Number of files read: 29900
Number of files read: 30000
Number of files read: 30100
Number of files read: 30200
Number of files read: 30300
Number of files read: 30400
Number of files read: 30500
Number of files read: 30600
Number of files read: 30700
Number of files read: 30800
Number of files read: 30900
Number of files read: 31000
Number of files read: 31100
Number of files read: 31200
Number of files read: 31300
Number of files read: 31400
Number of files read: 31500
Number of files read: 31600
Number of files read: 31700
Number of files read: 31800
Number of files read: 31900
Number of files read: 32000
Number of files read: 32100
Number of files read: 32200
Number of files read: 32300
Number of files read: 32400
Number of files read: 32500
Number of files read: 32600
Number of files read: 32700
Number of files read: 32800
Number of files read: 32900
Number of files read: 33000
Number of files read: 33100
Number of files read: 33200
Number of files read

Number of files read: 59100
Number of files read: 59200
Number of files read: 59300
Number of files read: 59400
Number of files read: 59500
Number of files read: 59600
Number of files read: 59700
Number of files read: 59800
Number of files read: 59900
Number of files read: 60000
Number of files read: 60100
Number of files read: 60200
Number of files read: 60300
Number of files read: 60400
Number of files read: 60500
Number of files read: 60600
Number of files read: 60700
Number of files read: 60800
Number of files read: 60900
Number of files read: 61000
Number of files read: 61100
Number of files read: 61200
Number of files read: 61300
Number of files read: 61400
Number of files read: 61500
Number of files read: 61600
Number of files read: 61700
Number of files read: 61800
Number of files read: 61900
Number of files read: 62000
Number of files read: 62100
Number of files read: 62200
Number of files read: 62300
Number of files read: 62400
Number of files read: 62500
Number of files read

Number of files read: 88400
Number of files read: 88500
Number of files read: 88600
Number of files read: 88700
Number of files read: 88800
Number of files read: 88900
Number of files read: 89000
Number of files read: 89100
Number of files read: 89200
Number of files read: 89300
Number of files read: 89400
Number of files read: 89500
Number of files read: 89600
Number of files read: 89700
Number of files read: 89800
Number of files read: 89900
Number of files read: 90000
Number of files read: 90100
Number of files read: 90200
Number of files read: 90300
Number of files read: 90400
Number of files read: 90500
Number of files read: 90600
Number of files read: 90700
Number of files read: 90800
Number of files read: 90900
Number of files read: 91000
Number of files read: 91100
Number of files read: 91200
Number of files read: 91300
Number of files read: 91400
Number of files read: 91500
Number of files read: 91600
Number of files read: 91700
Number of files read: 91800
Number of files read

Number of files read: 117100
Number of files read: 117200
Number of files read: 117300
Number of files read: 117400
Number of files read: 117500
Number of files read: 117600
Number of files read: 117700
Number of files read: 117800
Number of files read: 117900
Number of files read: 118000
Number of files read: 118100
Number of files read: 118200
Number of files read: 118300
Number of files read: 118400
Number of files read: 118500
Number of files read: 118600
Number of files read: 118700
Number of files read: 118800
Number of files read: 118900
Number of files read: 119000
Number of files read: 119100
Number of files read: 119200
Number of files read: 119300
Number of files read: 119400
Number of files read: 119500
Number of files read: 119600
Number of files read: 119700
Number of files read: 119800
Number of files read: 119900
Number of files read: 120000
Number of files read: 120100
Number of files read: 120200
Number of files read: 120300
Number of files read: 120400
Number of file

In [72]:
# One row per extracted paragraph, indexed by the running row number.
# Naming the columns explicitly keeps cord_uid / sentence / section present even
# when nothing was extracted and covid19_dict is empty, so later cells that
# access these columns do not fail with an AttributeError.
concat_text_trunc_df = pd.DataFrame.from_dict(
    covid19_dict,
    orient='index',
    columns=['cord_uid', 'sentence', 'section'],
)
concat_text_trunc_df

Unnamed: 0,cord_uid,sentence,section
0,o81b9htu,How do we protect our 'high-risk' patient popu...,
1,o81b9htu,A hospital-wide high-risk patient working grou...,
2,o81b9htu,The inclusion of immunocompromised patients in...,
3,o81b9htu,"With the current outbreak, reports of 2 heart ...",
4,o81b9htu,Not only has evidence that immunosuppression c...,
...,...,...,...
619224,kuvfdl4q,"In addition to the known pulmonary findings, i...",Two-Dimensional Carbides and Nitrides (MXenes)...
619225,kuvfdl4q,"When the kidneys fail, hemodialysis offers a l...",Two-Dimensional Carbides and Nitrides (MXenes)...
619226,kuvfdl4q,The COVID-19 global emergency is making humans...,"Rethinking the Future: One Health, Contaminati..."
619227,kuvfdl4q,"To address such a complex challenge, cooperati...","Rethinking the Future: One Health, Contaminati..."


In [74]:
# Persist every extracted paragraph (all sections) as JSON.
concat_text_trunc_df.to_json(path_or_buf=full_text_df_outfile_path)

In [78]:
# Count distinct papers that contributed at least one paragraph.
n_papers_extracted = len(pd.unique(concat_text_trunc_df['cord_uid']))
print('Number of papers extracted:', n_papers_extracted)

Number of papers extracted: 19883


In [75]:
"""
Functions to generate regex match patterns from synonymous words/phrases for filtering subject headers
"""

import re

def extract_regex_pattern(section_list, pattern):
    """Split section headers into those matching ``pattern`` and the rest.

    Matching is case-insensitive and uses ``re.match``, i.e. the pattern is
    anchored at the start of each header.

    Parameters
    ----------
    section_list : iterable
        Section headers. Entries that are not strings (e.g. ``None`` or NaN
        from a pandas column) are treated as non-matching instead of raising
        ``TypeError``.
    pattern : str
        Regular expression to match against each header.

    Returns
    -------
    (remaining_list, extracted_list) : tuple of lists
        ``extracted_list`` holds the matching headers in iteration order.
        ``remaining_list`` holds the distinct non-matching entries, in
        arbitrary (set) order.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    extracted_list = [
        header for header in section_list
        if isinstance(header, str) and matcher.match(header)
    ]
    remaining_list = list(set(section_list) - set(extracted_list))

    return remaining_list, extracted_list

def construct_regex_match_pattern(terms_dict):
    """Build one alternation regex from fuzzy and exact header terms.

    Parameters
    ----------
    terms_dict : dict
        ``'fuzzy'``: terms that may appear anywhere in a header. Each is
        wrapped as ``.*term.*`` and inserted verbatim, so a term may itself be
        a regex fragment.
        ``'exact'``: terms that must equal the whole header. Each is escaped
        and anchored with ``\\A`` / ``\\Z``. These were previously read but
        silently ignored.
        Either key may be omitted.

    Returns
    -------
    str
        The alternatives joined with ``|``, fuzzy terms first. With no terms
        at all the result is ``'(?!)'``, which never matches; an empty pattern
        would match every header.
    """
    fuzzy_terms = ['.*%s.*' % term for term in terms_dict.get('fuzzy', [])]
    exact_terms = ['\\A%s\\Z' % re.escape(term) for term in terms_dict.get('exact', [])]

    alternatives = fuzzy_terms + exact_terms
    if not alternatives:
        return '(?!)'

    full_pattern = '|'.join(alternatives)

    return full_pattern

In [99]:
"""
Extract putative discussion headers.
"""

# Terms whose presence anywhere in a section header marks the section as a
# discussion/conclusion-type section.
disc_fuzzy_terms = [
    'conclusion',
    'discussion',
    'interpretation',
    'added value of this study',
    'research in context',
    'concluding',
    'closing remarks',
    'summary of findings',
    'outcome',
]
disc_terms_dict = {'exact': [], 'fuzzy': disc_fuzzy_terms}

# Single regex covering all of the terms above; printed for inspection.
conc_pattern = construct_regex_match_pattern(disc_terms_dict)
print(conc_pattern)

.*conclusion.*|.*discussion.*|.*interpretation.*|.*added value of this study.*|.*research in context.*|.*concluding.*|.*closing remarks.*|.*summary of findings.*|.*outcome.*


In [76]:
import pprint

# Distinct section headers across all extracted paragraphs.
unique_sections = set(concat_text_trunc_df['section'])

# Split the headers into discussion-like ones (ext_header_list) and the rest.
rem_header_list, ext_header_list = extract_regex_pattern(unique_sections, conc_pattern)

print("Number of discussion headers: %d" % len(ext_header_list))
print("Example discussion headers:")
example_headers = ext_header_list[:20]
pprint.pprint(example_headers)



Number of discussion headers: 3912
Example discussion headers:
['IV -Discussion',
 'Additional considerations for large-scale manufacturing and dissemination '
 '::: Discussion',
 'Discussion of case reports',
 'Food purchase and preparation behaviors and attitudes toward COVID-19 '
 'lockdown ::: Results and discussion',
 'Survival ::: DISCUSSION',
 'Clinical characteristics and outcomes of COVID-19 stratified by the number '
 'of comorbidities',
 'Determining the utility of LFAs for SARS-CoV-2 antibody detection 2.1 Visual '
 'interpretation of LFA testing results',
 'MS/MS Data Interpretation and label free protein quantification',
 'Considerations of dosing regimen for special population ::: Discussion',
 '5. Discussion',
 'Limitations and conclusion',
 'IV. CONCLUSIONS:',
 'Limitation ::: Discussions',
 '3.3.9. Republic of Congo ::: 3.3. Central Africa ::: 3. Results and '
 'Discussion',
 'Experimental Protocols, Classification Results and Discussion',
 'Clinical Characteristics a

In [77]:
# Section headers are compared in lower case for now: the matched discussion
# headers are lower-cased and 'abstract' / 'title' are added to the set.
section_exact_match_list = {header.lower() for header in ext_header_list} | {'abstract', 'title'}

print("Number of sections filtering to: %d" % len(section_exact_match_list))

Number of sections filtering to: 3635


In [79]:
print('Step 3: Filtering to title, abstract and discussion sections...')
# Keep only rows whose lower-cased section header is one of the selected headers.
section_is_of_interest = concat_text_trunc_df['section'].str.lower().isin(section_exact_match_list)
merged_df_filtered_to_sections_oi = concat_text_trunc_df[section_is_of_interest]

# Written without the row index.
merged_df_filtered_to_sections_oi.to_csv(path_or_buf=filt_text_df_outfile_path, index=False)

In [80]:
# Distinct papers that still have at least one row after the section filter.
n_papers_in_sections = len(pd.unique(merged_df_filtered_to_sections_oi['cord_uid']))
print('Number of papers after filtering to sections:', n_papers_in_sections)

Number of papers after filtering to sections: 13465


In [81]:
"""
Filtering out sentences with less than three non-header words.

Author: Malavika Suresh

"""
# Alias for the section-filtered frame (same object, not a copy); the sentence
# filter below reads from this name.
input_data = merged_df_filtered_to_sections_oi

In [82]:
#Keep only sentences containing at least 3 words other than those defined below
#This also removes any sentences that do not contain any words at all

print('Step 4: Filtering our noisy sentences...')
# Boilerplate tokens that are removed before counting words (every token maps
# to the empty string).
rep = {"text": "", "cite_spans": "", "ref_spans": "", "section": "", "Abstract": "",
       "bioRxiv preprint": "", "medRxiv preprint": "", "doi:": ""}
boilerplate_pattern = re.compile("|".join(re.escape(token) for token in rep))
sentences_temp = [boilerplate_pattern.sub(lambda m: rep[m.group(0)], s) for s in input_data.sentence]

# A sentence is kept when, after removing the boilerplate, it still contains a
# letter and at least 3 words. str.split() without an argument splits on runs
# of whitespace and drops empty strings; split(' ') counted every gap left by a
# removed token (or a double space) as a word, so sentences with fewer than 3
# real words slipped through.
pattern = re.compile("[A-Za-z]")
sentences_to_keep = [bool(pattern.search(s)) and len(s.split()) > 2 for s in sentences_temp]
input_processed = input_data.loc[sentences_to_keep,:]
sentences_to_drop = [not keep for keep in sentences_to_keep]
input_excluded = input_data.loc[sentences_to_drop,:]



In [84]:
# Distinct papers that still have at least one sentence after the noise filter.
n_papers_after_sentences = len(pd.unique(input_processed['cord_uid']))
print('Number of papers after filtering sentences:', n_papers_after_sentences)

Number of papers after filtering sentences: 13390


In [86]:
# Persist the sentence-filtered rows; the row index is written as the first column.
input_processed.to_csv(path_or_buf=processed_text_df_outfile_path)
# Writing the excluded rows is currently disabled:
# input_excluded.to_csv('%scord_titles_abstracts_conclusions_excluded.csv' % output_path)

In [83]:
# Display the rows that survived the sentence filter (notebook rich output).
input_processed

Unnamed: 0,cord_uid,sentence,section
17,dwrazfli,The crisis generated by the COVID-19 pandemic ...,Conclusions
22,5k0ktboh,Asymptomatic and atypical presentations of COV...,Discussion
23,5k0ktboh,The patient's radiological findings were enoug...,Discussion
24,5k0ktboh,The atypical presentation and lack of communic...,Discussion
25,5k0ktboh,Asymptomatic and atypical presentations of COV...,Conclusion
...,...,...,...
619135,pgluidj8,The CT values of both ORF1ab and N\ngenes betw...,CT Value Analysis ::: Results and Discussion
619136,pgluidj8,A total of 181 samples (patients) were tested....,Characterization of the Samples ::: Results an...
619137,pgluidj8,"Nevertheless, as the clinical information on t...",Characterization of the Samples ::: Results an...
619138,pgluidj8,"The distinctive features of high sensitivity, ...",Applicable Scene ::: Results and Discussion


In [97]:
# Lower-case the section header once and derive the masks from it.
section_lower = input_processed['section'].str.lower()
is_title = section_lower == 'title'
is_abstract = section_lower == 'abstract'

title_data = input_processed.loc[is_title, :]
abstract_data = input_processed.loc[is_abstract, :]
# Everything that is neither a title nor an abstract row counts as conclusion/discussion.
conclusion_data = input_processed.loc[~(is_title | is_abstract), :]

In [98]:
# Distinct paper counts overall and per section type.
paper_count_report = [
    ('Number of papers:', input_processed),
    ('Number of papers with title:', title_data),
    ('Number of papers with abstract:', abstract_data),
    ('Number of papers with conclusion:', conclusion_data),
]
for label, frame in paper_count_report:
    print(label, frame['cord_uid'].nunique())

Number of papers: 13390
Number of papers with title: 2
Number of papers with abstract: 10539
Number of papers with conclusion: 10408
