In [3]:
import json
#import spacy
#import en_core_sci_lg
#from spacy.pipeline import EntityRuler
import re
import pandas as pd

In [4]:
"""
Input
"""

input_path = '../data/v31_processed/'
merged_text_vec_csv_path = '%scord_titles_abstracts_conclusions_200620.csv' % input_path

drug_terms_path = '../data/DrugNames.txt'


"""
Output
"""

output_path = '../data/v31_processed/'

section_text_with_drugs_outpath = '%ssection_text_with_drug_mentions_200620.csv' %output_path
section_text_with_drugs_ann_outpath = '%ssection_text_with_drug_mentions_ann_200620.csv' %output_path

In [5]:
#nlp = en_core_sci_lg.load()

merged_text_vec_df = pd.read_csv(merged_text_vec_csv_path, index_col=0)

In [6]:
"""
Merge all sentences belonging to each section of each paper into contiguous text passages.
"""

merged_text_vec_df

concat_sent_df = merged_text_vec_df.groupby(['cord_uid', 'section'], as_index=False).agg({'sentence': ' '.join})
concat_sent_df.loc[:, 'sentence'] = concat_sent_df.loc[:, 'sentence'].str.lower()

concat_sent_df= concat_sent_df.rename(columns={'sentence' :'text'})

In [7]:
"""
Read drug terms from reference file and generate regex pattern for matching.
"""

with open(drug_terms_path) as f:
    drug_terms = ['%s' % i.lower() for i in f.read().splitlines()]
    
drug_terms_pattern =  '\W' + '\W|\W'.join(drug_terms) + '\W'

In [8]:
"""
Filter to rows where section text contains drug term regex pattern.
"""

contain_drug_mask = concat_sent_df['text'].str.contains(drug_terms_pattern, case=False)
section_text_with_drugs_df = concat_sent_df[contain_drug_mask]

In [9]:
section_text_with_drugs_df.to_csv(section_text_with_drugs_outpath)

In [11]:
print('Number of papers after filtering for drug mentions:',len(section_text_with_drugs_df.cord_uid.unique()))

Number of papers after filtering for drug mentions: 2564


In [13]:
title_data = section_text_with_drugs_df.loc[section_text_with_drugs_df.section.str.lower()=='title',:]
abstract_data = section_text_with_drugs_df.loc[section_text_with_drugs_df.section.str.lower()=='abstract',:]
conclusion_data = section_text_with_drugs_df.loc[(section_text_with_drugs_df.section.str.lower()!='title') & (section_text_with_drugs_df.section.str.lower()!='abstract'),:]

In [14]:
print('Number of papers:', section_text_with_drugs_df.cord_uid.nunique())
print('Number of papers with title:', title_data.cord_uid.nunique())
print('Number of papers with abstract:', abstract_data.cord_uid.nunique())
print('Number of papers with conclusion:', conclusion_data.cord_uid.nunique())

Number of papers: 2564
Number of papers with title: 0
Number of papers with abstract: 978
Number of papers with conclusion: 2116


In [15]:
"""
Manual annotation of drug terms used.

"""

section_text_with_drugs_ann_df = section_text_with_drugs_df.copy()

section_text_with_drugs_ann_df['drug_terms_used'] = ''

for index, row in section_text_with_drugs_ann_df.iterrows():
    
    drugs_used = []
    for drug in drug_terms:
        if drug in row.text:
            drugs_used.append(drug)
    
    section_text_with_drugs_ann_df.at[index, 'drug_terms_used'] = ','.join(drugs_used)
    
section_text_with_drugs_ann_df.to_csv(section_text_with_drugs_ann_outpath)

In [16]:
section_text_with_drugs_ann_df

Unnamed: 0,cord_uid,section,text,drug_terms_used
26,019rcbpg,Potential biological mechanisms of SARS-CoV-2 ...,there are indications in the literature of a n...,ifn-gamma
30,01es0zv4,Abstract,coronavirus disease 2019 has become a global p...,"chloroquine,lopinavir,remdesivir,ritonavir,azi..."
31,01es0zv4,CONCLUSION,covid-19 is a pandemic with high morbidity and...,"chloroquine,lopinavir,remdesivir,ritonavir,azi..."
32,01es0zv4,CONCLUSION:,covid-19 is a pandemic with high morbidity and...,"chloroquine,lopinavir,remdesivir,ritonavir,azi..."
39,01lyavy2,Abstract,"then, the really positive treatment could be t...",protein s
...,...,...,...,...
28040,zrg2z17n,Abstract,"the covid-19 pandemic has claimed over 150,000...",vitamin d
28070,zsz9xnxt,Discussion,we demonstrate that mechanically ventilated pa...,carbon dioxide
28094,zv0ysi8m,Abstract,sars-cov-2 is a novel strain of coronavirus th...,"chloroquine,favipiravir,remdesivir,chloroquine"
28095,zv0ysi8m,Conclusions,sars-cov-2 has been declared a pandemic that c...,"chloroquine,favipiravir,remdesivir,chloroquine"


In [20]:
"""
Manual checking for drug mentions excluded by regex search, using direct string matches.

"""

sections_texts = concat_sent_df.text.tolist()

cord_uids_contain_drugs_manual_search_dict = {}

idx = 0
for idx, row in concat_sent_df.iterrows():
    text = row.text
    for drug in drug_terms:
        if drug in text:
            cord_uids_contain_drugs_manual_search_dict.setdefault(row.cord_uid, []).append(drug)
            
    idx +=1
    
cord_uids_with_manual_search_drugs = set(cord_uids_contain_drugs_manual_search_dict.keys())

#Difference in cord_uid lists
diff_cord_uids = cord_uids_with_manual_search_drugs - set(section_text_with_drugs_df.cord_uid)

In [23]:
len(diff_cord_uids)

3517

In [22]:
diff_cord = 'px0twvhs'

drugs_used = cord_uids_contain_drugs_manual_search_dict[diff_cord]
test_df = concat_sent_df.loc[concat_sent_df.cord_uid == diff_cord]

for i,sent in enumerate(test_df.text.tolist()):
    #if 'lustra' in sent:
    print(i,sent)


0 the covid-19 outbreak has been a serious public health threat worldwide. we use individually documented case descriptions of covid-19 from china (excluding hubei province) to estimate the distributions of the generation time, incubation period, and periods from symptom onset to isolation and to diagnosis. the recommended 14-day quarantine period may lead to a 6.7% failure for quarantine. we recommend a 22-day quarantine period. the mean generation time is 3.3 days and the mean incubation period is 7.2 days. it took 3.7 days to isolate and 6.6 days to diagnose a patient after his/her symptom onset. patients may become infectious on average 3.9 days before showing major symptoms. this makes contact tracing and quarantine ineffective. the basic reproduction number is estimated to be 1.54 with contact tracing, quarantine and isolation, mostly driven by super spreaders.
