In [5]:
"""
Using a spaCy EntityRuler pipeline component to identify entities (virus terms, drug terms, etc.) in text passages.

Edit - Not sure if this is the direction we're going, so just used regex and pandas to filter the section texts for now.
"""

'\nUsing a spaCy EntityRuler pipeline component to identify entities (virus terms, drug terms, etc.) in text passages.\n\n'

In [6]:
import json
import spacy
import en_core_sci_lg
from spacy.pipeline import EntityRuler
import re
import pandas as pd

In [280]:
"""
Input
"""

input_path = '../resources/'
ent_ruler_jsonl_path = '%sspacy_entity_rulers/vt_task_entity_ruler_200501.jsonl' % input_path
merged_text_vec_csv_path = '%scord_titles_abstracts_conclusions.csv' % input_path

drug_terms_path = '%sspacy_entity_rulers/input/DrugNames.txt' % input_path


"""
Output
"""

output_path = '../output/'

section_text_with_drugs_outpath = '%ssection_text_with_drug_mentions_200501.csv'
section_text_with_drugs_ann_outpath = '%ssection_text_with_drug_mentions_ann_200501.csv'

In [281]:
nlp = en_core_sci_lg.load()

merged_text_vec_df = pd.read_csv(merged_text_vec_csv_path, index_col=0)

In [9]:
disabled = nlp.disable_pipes("tagger", "parser", "ner")
entity_ruler = EntityRuler(nlp)
entity_ruler.from_disk(ent_ruler_jsonl_path)
disabled.restore()

nlp.add_pipe(entity_ruler, before='ner')
nlp.remove_pipe('ner')


('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7eff784ade80>)

In [10]:
def annotate_text_with_named_entities(text):
    """
    Using an EntityRuler component, identify named entities and merge their component tokens.
    :param passage (str): a text passage.
    :return (Spacy.doc): a doc object that includes the identified named entities.
    """

    doc = nlp(text.lower())
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)

    return doc

In [257]:
"""
Merge all sentences belonging to each section of each paper into contiguous text passages.
"""

merged_text_vec_df

concat_sent_df = merged_text_vec_df.groupby(['cord_uid', 'section'], as_index=False).agg({'sentence': ' '.join})
concat_sent_df.loc[:, 'sentence'] = concat_sent_df.loc[:, 'sentence'].str.lower()

concat_sent_df= concat_sent_df.rename(columns={'sentence' :'text'})

In [215]:
"""
Read drug terms from reference file and generate regex pattern for matching.
"""

with open(drug_terms_path) as f:
    drug_terms = ['%s' % i.lower() for i in f.read().splitlines()]
    
drug_terms_pattern =  '\W' + '\W|\W'.join(drug_terms) + '\W'

In [217]:
"""
Filter to rows where section text contains drug term regex pattern.
"""

contain_drug_mask = concat_sent_df['text'].str.contains(drug_terms_pattern, case=False)
section_text_with_drugs_df = concat_sent_df[contain_drug_mask]

In [249]:
section_text_with_drugs_df.to_csv(se)

In [None]:
section_text_with_drugs_df.at

In [273]:
"""
Manual annotation of drug terms used.

"""

section_text_with_drugs_ann_df = section_text_with_drugs_df.copy()

section_text_with_drugs_ann_df['drug_terms_used'] = ''

for index, row in section_text_with_drugs_ann_df.iterrows():
    
    drugs_used = []
    for drug in drug_terms:
        if drug in row.text:
            drugs_used.append(drug)
    
    section_text_with_drugs_ann_df.at[index, 'drug_terms_used'] = ','.join(drugs_used)
    
section_text_with_drugs_ann_df.to_csv('../output/section_texts_with_drug_mentions_ann_200501.csv')

In [232]:
"""
Manual checking for drug mentions excluded by regex search, using direct string matches.

"""

sections_texts = concat_sent_df.sentence.tolist()

cord_uids_contain_drugs_manual_search_dict = {}

idx = 0
for idx, row in concat_sent_df.iterrows():
    text = row.sentence
    for drug in drug_terms:
        if drug in text:
            cord_uids_contain_drugs_manual_search_dict.setdefault(row.cord_uid, []).append(drug)
            
    idx +=1
    
cord_uids_with_manual_search_drugs = set(cord_uids_contain_drugs_manual_search_dict.keys())

#Difference in cord_uid lists
diff_cord_uids = cord_uids_with_manual_search_drugs - set(cord_uids_with_drugs)

In [267]:
diff_cord = 'ela022bo'

drugs_used = cord_uids_contain_drugs_manual_search_dict[diff_cord]
test_df = concat_sent_df.loc[concat_sent_df.cord_uid == diff_cord]

for sent in test_df.text.tolist():
    if 'lustra' in sent:
        print(sent)


the sars-cov-2 epidemic is one of the biggest challenges healthcare systems worldwide have ever had to face. to curb transmission many countries have adopted social distancing measures and travel restrictions. estimating the effect of these measures in each context is challenging and requires mathematical models of the transmission dynamics. projections for the future course of the epidemic strongly rely on model predictions and accurate representation of real-time data as they accumulate. here i develop an seir modeling framework for covid-19, to evaluate reported cases and fatalities, and to enable forecasting using evidence-based bayesian parameter estimation. this bayesian framework offers a tool to parametrize real-time dynamics of covid-19 cases, and explore the effect of control as it unfolds in any setting. i apply the model to covid-19 data from albania, where drastic control measures were put in place already on the day of the first confirmed case. evaluating the dynamics of 

In [248]:
test_df = concat_sent_df.loc[concat_sent_df.cord_uid == 'ela022bo']

for sent in test_df.sentence.tolist():
    if 'lustra' in sent:
        print(sent)

the sars-cov-2 epidemic is one of the biggest challenges healthcare systems worldwide have ever had to face. to curb transmission many countries have adopted social distancing measures and travel restrictions. estimating the effect of these measures in each context is challenging and requires mathematical models of the transmission dynamics. projections for the future course of the epidemic strongly rely on model predictions and accurate representation of real-time data as they accumulate. here i develop an seir modeling framework for covid-19, to evaluate reported cases and fatalities, and to enable forecasting using evidence-based bayesian parameter estimation. this bayesian framework offers a tool to parametrize real-time dynamics of covid-19 cases, and explore the effect of control as it unfolds in any setting. i apply the model to covid-19 data from albania, where drastic control measures were put in place already on the day of the first confirmed case. evaluating the dynamics of 

In [276]:
len(section_text_with_drugs_df.cord_uid.unique())

544

In [272]:
concat_sent_df.loc[concat_sent_df.cord_uid == 'zph6r4il'].loc[concat_sent_df.section == 'abstract'].text.tolist()[0]

'objective: the sars-cov-2-infected disease (covid-19) outbreak is a major threat to human beings. previous studies mainly focused on wuhan and typical symptoms. we analysed 74 confirmed covid-19 cases with gi symptoms in the zhejiang province to determine epidemiological, clinical and virological characteristics. design: covid-19 hospital patients were admitted in the zhejiang province from 17 january 2020 to 8 february 2020. epidemiological, demographic, clinical, laboratory, management and outcome data of patients with gi symptoms were analysed using multivariate analysis for risk of severe/critical type. bioinformatics were used to analyse features of sars-cov-2 from zhejiang province. results: among enrolled 651 patients, 74 (11.4%) presented with at least one gi symptom (nausea, vomiting or diarrhoea), average age of 46.14 years, 4-day incubation period and 10.8% had pre-existing liver disease. of patients with covid-19 with gi symptoms, 17 (22.97%) and 23 (31.08%) had severe/cri

In [278]:
section_text_with_drugs_ann_df

Unnamed: 0,cord_uid,section,text,drug_terms_used
14,02q9y011,RESULTS AND DISCUSSION,hypervariability in the nsp3 macro x domain-th...,"amino acids,glycine,protein c,tandem"
56,08vsaov7,DISCUSSION,the predominant pathological features of covid...,angiotensin ii
66,0a49okho,Discussion,"the term 'flatten the curve', originating from...","stimate,tandem"
82,0d77ojnb,Discussion and Conclusion,the rapid spread of sars-cov-2 represents a si...,"heparin,cosamin,heparin,protein s"
83,0d77ojnb,abstract,[{'text': 'many pathogens take advantage of th...,"heparin,cosamin,heparin"
...,...,...,...,...
9828,ziepfnpz,abstract,the epidemiological and clinical characteristi...,tacrolimus
9872,zn87f1lk,title,"hydroxychloroquine, a less toxic derivative of...","chloroquine,chloroquine,hydroxychloroquine"
9893,zph6r4il,abstract,objective: the sars-cov-2-infected disease (co...,"lactate,ultiva"
9949,zwqci59h,Discussion,although recent studies have reported epidemio...,"compro,creatinine,cyclosporine,methylprednisol..."


True