In [7]:
import os
import re
import ast
import numpy as np
import pandas as pd
import glob
import json

import tarfile

from datetime import datetime

from pandas.io.json import json_normalize

from zipfile import ZipFile

from sklearn.metrics.pairwise import cosine_similarity

from numba import jit

In [3]:
"""
Preprocessing Cord-19 text files

Goal: Filter text and vector files containing publications (1 sentence per row) to publications 
relevant to Covid-19 and specified columns.  Merge into a single file for extraction of sentences/vectors
belonging to specified papers/sections.

Steps:
    1. Filter metadata.csv (from Kaggle Cord-19 dataset) to paper cord_uids for papers where:
        -Title or Abstract contains 1 or more Covid-19 synonyms.
        -Publication date is after 2019-10-01.
        This has been moved to notebook covid_vt_contra_metadata_filtering.ipynb
        
    2.  Load all full text files (json format) as pandas dataframes and truncate them to:
            -Rows containing cord_uids from step 1
            -Columns cord_uid, sentence, and section

        Write truncated text dataframe to json.
    
    5.  Filter to title, abstract and discussion sections
    
    6.  Filter out noisy sentences (with <3 words, etc.)
"""

'\nPreprocessing Cord-19 text files\n\nGoal: Filter text and vector files containing publications (1 sentence per row) to publications \nrelevant to Covid-19 and specified columns.  Merge into a single file for extraction of sentences/vectors\nbelonging to specified papers/sections.\n\nSteps:\n    1. Filter metadata.csv (from Kaggle Cord-19 dataset) to paper cord_uids for papers where:\n        -Title or Abstract contains 1 or more Covid-19 synonyms.\n        -Publication date is after 2019-10-01.\n        This has been moved to notebook covid_vt_contra_metadata_filtering.ipynb\n        \n    2.  Load all full text files (parquet format) as pandas dataframes and truncate them to:\n            -Rows containing cord_uids from step 1\n            -Columns cord_uid, sentence_id, sentence, and section\n\n        Write truncated text dataframes to csv files.\n\n    3.  Merge truncated text dataframes and extract list of sentence_ids for truncating vector files.\n    \n    4.  Load all vector

In [26]:
json_text_file_dir = 'C:\\Users\\SuresMal\\Documents\\Coronawhy\\data\\cord19\\2020-08-15\\document_parses.tar.gz'
json_temp_path = 'C:\\Users\\SuresMal\\Documents\\Coronawhy\\data\\cord19\\extracted\\'
processed_text_df_outfile_path = 'C:\\Users\\SuresMal\\Documents\\Coronawhy\\data\\cord19\\2020-08-15\\processed\\cord_titles_abstracts_conclusions_150820.csv'

In [9]:
covid19_metadata = pd.read_csv("C:\\Users\\SuresMal\\Documents\\Coronawhy\\data\\cord19\\2020-08-15\\processed\\covid19_date_filt_metadata_150820.csv")
pdf_filenames = list(covid19_metadata.pdf_json_files)
pmc_filenames = list(covid19_metadata.pmc_json_files)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
covid19_dict = {}
replace_dict = {'â€œ' : '“',
                'â€' : '”',
                'â€™' : '’',
                'â€˜' : '‘',
                'â€”' : '–',
                'â€“' : '—',
                'â€¢' : '-',
                'â€¦' : '…'}

if '.zip' in json_text_file_dir:
    zipobj = ZipFile(json_text_file_dir, 'r')
    list_of_filenames = zipobj.namelist()
elif 'tar.gz' in json_text_file_dir:
    tarf = tarfile.open(json_text_file_dir, 'r:gz')
    list_of_filenames = tarf.getnames()
else:
    raise Exception("Incorrcet file extension. Must be '.zip' or '.tar.gz'")
print('Number of files to iterate over:',len(list_of_filenames))
k = 0
# TODO: Parallelize the code below
for iter_num, filename in enumerate(list_of_filenames):
    # Check filename ends with json and file exists in filtered list of cord papers
    if (filename in pdf_filenames) or (filename in pmc_filenames):
        if '.zip' in json_text_file_dir:
            zipobj.extract(filename, json_temp_path)
        elif 'tar.gz' in json_text_file_dir:
            tarf.extract(tarf.getmembers()[iter_num], json_temp_path)
        with open(json_temp_path + filename, 'r', encoding='utf8') as f:
            # Read each line in the file separately, remove tabs, spaces and newlines
            # and concatenate all lines together for further parsing
            json_str = "".join([" ".join(line.split()) for line in f])
            # Parse the json string into the json dictionary format
            json_dict = json.loads(json_str, encoding='utf8')
            # Convert the json dictionary object to a pandas dataframe
            paper_df = json_normalize(json_dict)
            # In the covid19 metadata dataframe,
            # filter to the row representing the current json file being processed
            # and extract the cord_uid
            check_file_name = ((filename == covid19_metadata.pdf_json_files)
                               | (filename == covid19_metadata.pmc_json_files))  # noqa: W503
            cord_uid = list(covid19_metadata.loc[check_file_name, 'cord_uid'])[0]
            # If an abstract section exists, extract the text
            try:
                text = paper_df['abstract'][0][0]['text']
                section = paper_df['abstract'][0][0]['section']
                # Replace characters with their readable format
                for key, v in replace_dict.items():
                    text = text.replace(key, v)
                    section = section.replace(key, v)
                covid19_dict[k] = {'cord_uid': cord_uid,
                                   'sentence': text,
                                   'section': section}
                k = k + 1
            # If an abstract section does not exist, skip
            except KeyError:
                pass
            except IndexError:
                pass

            for temp_dict in paper_df['body_text'][0]:
                text = temp_dict['text']
                section = temp_dict['section']
                # Replace characters with their readable format
                for key, v in replace_dict.items():
                    text = text.replace(key, v)
                    section = section.replace(key, v)
                covid19_dict[k] = {'cord_uid': cord_uid,
                                   'sentence': text,
                                   'section': section}
                k = k + 1

    if iter_num%100==0:
        print('Number of files read:', iter_num+1)

Number of files to iterate over: 165488
Number of files read: 1
Number of files read: 101
Number of files read: 201
Number of files read: 301
Number of files read: 401
Number of files read: 501
Number of files read: 601
Number of files read: 701
Number of files read: 801
Number of files read: 901
Number of files read: 1001
Number of files read: 1101
Number of files read: 1201
Number of files read: 1301
Number of files read: 1401
Number of files read: 1501
Number of files read: 1601
Number of files read: 1701
Number of files read: 1801
Number of files read: 1901
Number of files read: 2001
Number of files read: 2101
Number of files read: 2201
Number of files read: 2301
Number of files read: 2401
Number of files read: 2501
Number of files read: 2601
Number of files read: 2701
Number of files read: 2801
Number of files read: 2901
Number of files read: 3001
Number of files read: 3101
Number of files read: 3201
Number of files read: 3301
Number of files read: 3401
Number of files read: 3501




Number of files read: 9001
Number of files read: 9101
Number of files read: 9201
Number of files read: 9301
Number of files read: 9401
Number of files read: 9501
Number of files read: 9601
Number of files read: 9701
Number of files read: 9801
Number of files read: 9901
Number of files read: 10001
Number of files read: 10101
Number of files read: 10201
Number of files read: 10301
Number of files read: 10401
Number of files read: 10501
Number of files read: 10601
Number of files read: 10701
Number of files read: 10801
Number of files read: 10901
Number of files read: 11001
Number of files read: 11101
Number of files read: 11201
Number of files read: 11301
Number of files read: 11401
Number of files read: 11501
Number of files read: 11601
Number of files read: 11701
Number of files read: 11801
Number of files read: 11901
Number of files read: 12001
Number of files read: 12101
Number of files read: 12201
Number of files read: 12301
Number of files read: 12401
Number of files read: 12501
Nu

Number of files read: 38301
Number of files read: 38401
Number of files read: 38501
Number of files read: 38601
Number of files read: 38701
Number of files read: 38801
Number of files read: 38901
Number of files read: 39001
Number of files read: 39101
Number of files read: 39201
Number of files read: 39301
Number of files read: 39401
Number of files read: 39501
Number of files read: 39601
Number of files read: 39701
Number of files read: 39801
Number of files read: 39901
Number of files read: 40001
Number of files read: 40101
Number of files read: 40201
Number of files read: 40301
Number of files read: 40401
Number of files read: 40501
Number of files read: 40601
Number of files read: 40701
Number of files read: 40801
Number of files read: 40901
Number of files read: 41001
Number of files read: 41101
Number of files read: 41201
Number of files read: 41301
Number of files read: 41401
Number of files read: 41501
Number of files read: 41601
Number of files read: 41701
Number of files read

Number of files read: 67601
Number of files read: 67701
Number of files read: 67801
Number of files read: 67901
Number of files read: 68001
Number of files read: 68101
Number of files read: 68201
Number of files read: 68301
Number of files read: 68401
Number of files read: 68501
Number of files read: 68601
Number of files read: 68701
Number of files read: 68801
Number of files read: 68901
Number of files read: 69001
Number of files read: 69101
Number of files read: 69201
Number of files read: 69301
Number of files read: 69401
Number of files read: 69501
Number of files read: 69601
Number of files read: 69701
Number of files read: 69801
Number of files read: 69901
Number of files read: 70001
Number of files read: 70101
Number of files read: 70201
Number of files read: 70301
Number of files read: 70401
Number of files read: 70501
Number of files read: 70601
Number of files read: 70701
Number of files read: 70801
Number of files read: 70901
Number of files read: 71001
Number of files read

Number of files read: 96901
Number of files read: 97001
Number of files read: 97101
Number of files read: 97201
Number of files read: 97301
Number of files read: 97401
Number of files read: 97501
Number of files read: 97601
Number of files read: 97701
Number of files read: 97801
Number of files read: 97901
Number of files read: 98001
Number of files read: 98101
Number of files read: 98201
Number of files read: 98301
Number of files read: 98401
Number of files read: 98501
Number of files read: 98601
Number of files read: 98701
Number of files read: 98801
Number of files read: 98901
Number of files read: 99001
Number of files read: 99101
Number of files read: 99201
Number of files read: 99301
Number of files read: 99401
Number of files read: 99501
Number of files read: 99601
Number of files read: 99701
Number of files read: 99801
Number of files read: 99901
Number of files read: 100001
Number of files read: 100101
Number of files read: 100201
Number of files read: 100301
Number of files 

Number of files read: 125301
Number of files read: 125401
Number of files read: 125501
Number of files read: 125601
Number of files read: 125701
Number of files read: 125801
Number of files read: 125901
Number of files read: 126001
Number of files read: 126101
Number of files read: 126201
Number of files read: 126301
Number of files read: 126401
Number of files read: 126501
Number of files read: 126601
Number of files read: 126701
Number of files read: 126801
Number of files read: 126901
Number of files read: 127001
Number of files read: 127101
Number of files read: 127201
Number of files read: 127301
Number of files read: 127401
Number of files read: 127501
Number of files read: 127601
Number of files read: 127701
Number of files read: 127801
Number of files read: 127901
Number of files read: 128001
Number of files read: 128101
Number of files read: 128201
Number of files read: 128301
Number of files read: 128401
Number of files read: 128501
Number of files read: 128601
Number of file

Number of files read: 153601
Number of files read: 153701
Number of files read: 153801
Number of files read: 153901
Number of files read: 154001
Number of files read: 154101
Number of files read: 154201
Number of files read: 154301
Number of files read: 154401
Number of files read: 154501
Number of files read: 154601
Number of files read: 154701
Number of files read: 154801
Number of files read: 154901
Number of files read: 155001
Number of files read: 155101
Number of files read: 155201
Number of files read: 155301
Number of files read: 155401
Number of files read: 155501
Number of files read: 155601
Number of files read: 155701
Number of files read: 155801
Number of files read: 155901
Number of files read: 156001
Number of files read: 156101
Number of files read: 156201
Number of files read: 156301
Number of files read: 156401
Number of files read: 156501
Number of files read: 156601
Number of files read: 156701
Number of files read: 156801
Number of files read: 156901
Number of file

In [15]:
concat_text_trunc_df = pd.DataFrame.from_dict(covid19_dict,orient='index')
concat_text_trunc_df

Unnamed: 0,cord_uid,sentence,section
0,wvx6q999,The situation has continued to evolve rapidly ...,
1,wvx6q999,"In this issue of Eurosurveillance, we are publ...",
2,wvx6q999,"Today, the WHO Director-General Tedros Adhanom...",
3,wvx6q999,International health organisations such as the...,
4,wvx6q999,"At the end of 2019, on 31 December, the World ...",
...,...,...,...
1109048,byehf0wn,The molecular mechanisms underlying the differ...,Hypotheses
1109049,byehf0wn,The discussed data show the importance of macr...,Conclusions
1109050,byehf0wn,None.,Disclosures
1109051,byehf0wn,The author is supported by MIUR and University...,Funding


In [16]:
concat_text_trunc_df.to_json(full_text_df_outfile_path)

NameError: name 'full_text_df_outfile_path' is not defined

In [18]:
print('Number of papers extracted:',len(concat_text_trunc_df.cord_uid.unique()))

Number of papers extracted: 33218


In [17]:
"""
Functions to generate regex match patterns from synonymous words/phrases for filtering subject headers
"""

import re

def extract_regex_pattern(section_list, pattern):
    r = re.compile(pattern, re.IGNORECASE)
    extracted_list = list(filter(r.match, section_list))
    remaining_list = list(set(section_list) - set(extracted_list))
    
    return remaining_list, extracted_list

def construct_regex_match_pattern(terms_dict):
    fuzzy_terms = ['.*%s.*' % i for i in terms_dict['fuzzy']]
    exact_terms = terms_dict['exact']
    
    fuzzy_pattern = '|'.join(fuzzy_terms)
    #exact_pattern = '|'.join(exact_terms)
    
    full_pattern = fuzzy_pattern
    
    return full_pattern

In [19]:
"""
Extract putative discussion headers.
"""

#Construct regex pattern for discussion header terms and extract list of matching headers.

disc_terms_dict = {
    'exact': [],
    'fuzzy' : [
        'conclusion',
        'discussion',
        'interpretation',
        'added value of this study',
        'research in context',
        'concluding',
        'closing remarks',
        'summary of findings',
        'outcome'
    ]
}
conc_pattern = construct_regex_match_pattern(disc_terms_dict)
print(conc_pattern)

.*conclusion.*|.*discussion.*|.*interpretation.*|.*added value of this study.*|.*research in context.*|.*concluding.*|.*closing remarks.*|.*summary of findings.*|.*outcome.*


In [20]:
import pprint

#unique_sections = set(merged_text_vec_df.section.tolist())
unique_sections = set(concat_text_trunc_df.section.tolist())

rem_header_list, ext_header_list = extract_regex_pattern(unique_sections, conc_pattern)

print("Number of discussion headers: %d" % len(ext_header_list))
print("Example discussion headers:")
pprint.pprint(ext_header_list[:20])


Number of discussion headers: 7873
Example discussion headers:
['Outcome measures and definitions ::: MATERIAL AND METHODS',
 '4 Discussion and limitation',
 'Public health expenditure and Covid-19 outcomes ::: Conceptual background '
 'and hypotheses',
 'CONCLUSIONS AND IMPLICATIONS',
 'Docking results of Lonafarnib against SARS‐CoV‐2 and SARS‐CoV NSP12‐NSP7 ::: '
 'RESULTS AND DISCUSSION',
 'Treatment ::: Treatment and Outcome',
 'Discussion and suggestions of safety for urban public transportation',
 '4.1. Temporal Trend ::: 4. Discussion',
 'Conclusion: Further Regulatory Action is Needed',
 'Types of outcome measures ::: Methods and analysis',
 'Limitations. ::: DISCUSSION',
 'Case Discussion',
 '“Parallel pleura sign” ::: Other accompanying signs ::: Discussion',
 'Safety outcome ::: Results',
 'Secondary outcomes ::: Outcome measures ::: Methods',
 'Conclusion and Future Scopes',
 'Outcomes after COVID-19',
 'Discussion of East Asian prejudice vs. Neutral',
 'Cyclic amide deriva

In [21]:
#For now, lower-casing all section headers for matching

section_exact_match_list = set([i.lower() for i in ext_header_list])
section_exact_match_list.add('abstract')
section_exact_match_list.add('title')

print("Number of sections filtering to: %d" % len(section_exact_match_list))

Number of sections filtering to: 7379


In [22]:
print('Step 3: Filtering to title, abstract and discussion sections...')
merged_df_filtered_to_sections_oi = concat_text_trunc_df.loc[concat_text_trunc_df.section.str.lower().isin(section_exact_match_list)]

#merged_df_filtered_to_sections_oi.to_csv(filt_text_df_outfile_path, index=False)

Step 3: Filtering to title, abstract and discussion sections...


In [15]:
#version 20200718
print('Number of papers after filtering to sections:',len(merged_df_filtered_to_sections_oi.cord_uid.unique()))

Number of papers after filtering to sections: 18759


In [80]:
#v31 output
print('Number of papers after filtering to sections:',len(merged_df_filtered_to_sections_oi.cord_uid.unique()))

Number of papers after filtering to sections: 13465


In [23]:
"""
Filtering out sentences with less than three non-header words.

Author: Malavika Suresh

"""
input_data = merged_df_filtered_to_sections_oi

In [24]:
#Keep only sentences containing at least 3 words other than those defined below
#This also removes any sentences that do not contain any words at all

print('Step 4: Filtering our noisy sentences...')
rep = {"text": "", "cite_spans": "", "ref_spans": "", "section": "", "abstract": "",\
       "biorxiv preprint": "", "medrxiv preprint": "", "doi:": ""}
rep = dict((re.escape(k), v) for k, v in rep.items())
pattern = re.compile("|".join(rep.keys()))
sentences_temp = [pattern.sub(lambda m: rep[re.escape(m.group(0))], s) for s in input_data.sentence.str.lower()]
pattern = re.compile(".*[A-Za-z].*")
sentences_to_keep = [(bool(re.search(pattern,s))) & (len(s.split(' '))>2) for s in sentences_temp]
input_processed = input_data.loc[sentences_to_keep,:]
sentences_to_drop = [not i for i in sentences_to_keep]
input_excluded = input_data.loc[sentences_to_drop,:]


Step 4: Filtering our noisy sentences...


In [25]:
print('Number of papers after filtering sentences:',len(input_processed.cord_uid.unique()))

Number of papers after filtering sentences: 23435


In [18]:
#version 20200718
print('Number of papers after filtering sentences:',len(input_processed.cord_uid.unique()))

Number of papers after filtering sentences: 18662


In [84]:
#v31 output
print('Number of papers after filtering sentences:',len(input_processed.cord_uid.unique()))

Number of papers after filtering sentences: 13390


In [27]:
input_processed.to_csv(processed_text_df_outfile_path)
# input_excluded.to_csv('%scord_titles_abstracts_conclusions_excluded.csv' % output_path)

In [20]:
input_processed

Unnamed: 0,cord_uid,sentence,section
17,dwrazfli,The crisis generated by the COVID-19 pandemic ...,Conclusions
22,5k0ktboh,Asymptomatic and atypical presentations of COV...,Discussion
23,5k0ktboh,The patient's radiological findings were enoug...,Discussion
24,5k0ktboh,The atypical presentation and lack of communic...,Discussion
25,5k0ktboh,Asymptomatic and atypical presentations of COV...,Conclusion
...,...,...,...
867265,6lr8i54a,"Currently, there is a Phase III trial (ChiCTR2...",Discussion
867266,6lr8i54a,Although there is good evidence of tocilizumab...,Discussion
867267,6lr8i54a,Although tocilizumab seems like a good option ...,Discussion
867268,6lr8i54a,"To summarize, we report the usage of tocilizum...",Discussion


In [28]:
title_data = input_processed.loc[input_processed.section.str.lower()=='title',:]
abstract_data = input_processed.loc[input_processed.section.str.lower()=='abstract',:]
conclusion_data = input_processed.loc[(input_processed.section.str.lower()!='title') & (input_processed.section.str.lower()!='abstract'),:]

In [29]:
print('Number of papers:', input_processed.cord_uid.nunique())
print('Number of papers with title:', title_data.cord_uid.nunique())
print('Number of papers with abstract:', abstract_data.cord_uid.nunique())
print('Number of papers with conclusion:', conclusion_data.cord_uid.nunique())

Number of papers: 23435
Number of papers with title: 3
Number of papers with abstract: 18149
Number of papers with conclusion: 18621


In [22]:
#version 20200718
print('Number of papers:', input_processed.cord_uid.nunique())
print('Number of papers with title:', title_data.cord_uid.nunique())
print('Number of papers with abstract:', abstract_data.cord_uid.nunique())
print('Number of papers with conclusion:', conclusion_data.cord_uid.nunique())

Number of papers: 18662
Number of papers with title: 3
Number of papers with abstract: 14467
Number of papers with conclusion: 14717


In [98]:
#v31 output
print('Number of papers:', input_processed.cord_uid.nunique())
print('Number of papers with title:', title_data.cord_uid.nunique())
print('Number of papers with abstract:', abstract_data.cord_uid.nunique())
print('Number of papers with conclusion:', conclusion_data.cord_uid.nunique())

Number of papers: 13390
Number of papers with title: 2
Number of papers with abstract: 10539
Number of papers with conclusion: 10408
