# Covid-19 NLP Pipeline


In [None]:
#| hide
from nbdev.showdoc import show_doc
from IPython.display import display, HTML
%load_ext autoreload
%autoreload 2

In [None]:
#| output: false
# ! pip install spacy
# ! python -m spacy download en_core_web_sm

In [None]:
import spacy

```python
! pip install spannerlib
```

In [None]:
import re
import csv
import pandas as pd
from pandas import DataFrame
from pathlib import Path

from spannerlib import get_magic_session,Session,Span
sess = get_magic_session()

## IE function

Defining some generic IE functions that will be used in every stage of the pipeline:

In [None]:
def rewrite(text,span_label_pairs):
    """Rewrite `text` by replacing the given spans with their labels.

    The spans are assumed to belong to `text`, i.e. their ``start``/``end``
    offsets index directly into it.

    Args:
        text (str or Span): string to rewrite. Anything exposing ``as_str()``
            (such as a ``Span``) is converted to a plain string first.
        span_label_pairs (pd.DataFrame): dataframe with two columns, the first
            holds the spans in the text to rewrite, the second holds the string
            each span is rewritten to.

    Returns:
        str: the rewritten string.

    Note:
        Replacements are applied left to right. A span that overlaps a region
        that has already been replaced is skipped, and when several spans start
        at the same offset the longest one wins. Without this, overlapping or
        duplicated matches would emit labels twice and repeat parts of the text.
    """
    as_str = getattr(text, 'as_str', None)
    if callable(as_str):
        text = as_str()

    # Order by start offset; for equal starts put the longest span first so it
    # is the one that gets applied.
    pairs = sorted(
        span_label_pairs.itertuples(index=False, name=None),
        key=lambda pair: (pair[0].start, -pair[0].end),
    )

    pieces = []
    current_pos = 0
    for span, label in pairs:
        if span.start < current_pos:
            # Overlaps the previously replaced region: applying it would move
            # current_pos backwards and duplicate output.
            continue
        pieces.append(text[current_pos:span.start])
        pieces.append(label)
        current_pos = span.end

    # Whatever follows the last replaced span is copied unchanged.
    pieces.append(text[current_pos:])
    return ''.join(pieces)


In [None]:
doc = Span('hello darkness my old friend, I come ...',name='doc')
doc

[@doc,0,40) "hello dark..."

In [None]:
spans_to_replace = pd.DataFrame([
    [doc.slice(18,21),'young'],
    [doc.slice(22,28),'nemesis'],
])
spans_to_replace.map(repr)


Unnamed: 0,0,1
0,"[@doc,18,21) ""old""",'young'
1,"[@doc,22,28) ""friend""",'nemesis'


In [None]:
rewritten_doc=rewrite(doc,spans_to_replace)
assert rewritten_doc == 'hello darkness my young nemesis, I come ...'
rewritten_doc

'hello darkness my young nemesis, I come ...'

In [None]:
def split_sentence(text):
    """
    Split a text into sentences using spaCy's English language model and
    yield each sentence as a Span over the input text.

    Parameters:
        text (str or Span): The text to split into sentences.

    Yields:
        Span: One span of `text` per detected sentence, using the character
            offsets reported by spaCy. Consecutive sentences are therefore not
            guaranteed to be exactly one character apart (e.g. when they are
            separated by several spaces or by newlines).
    """
    # Loading the spaCy model is expensive, so it is loaded on first use and
    # kept on the function object for subsequent calls.
    nlp = getattr(split_sentence, '_nlp', None)
    if nlp is None:
        nlp = split_sentence._nlp = spacy.load("en_core_web_sm")
    doc = nlp(str(text))

    for sentence in doc.sents:
        # Take the offsets from spaCy rather than accumulating sentence lengths:
        # assuming a single separator character between sentences makes every
        # later span drift whenever the real gap is different.
        yield Span(text,sentence.start_char,sentence.end_char)

sess.register('split_sentence',split_sentence,[(str,Span)],[Span])

In [None]:
text = Path('sample1.txt').read_text()

In [None]:
assert list(split_sentence(text)) == ['Patient presents to be tested for COVID-19.',
 'His wife recently tested positive for novel coronavirus.',
 'SARS-COV-2 results came back positive.']
list(split_sentence(text))

[[@a6c01c,0,43) "Patient pr...",
 [@a6c01c,44,100) "His wife r...",
 [@a6c01c,101,139) "SARS-COV-2..."]

In [None]:
class LemmaFromList():
    """Callable that tags the words of a text whose lemma appears in a given list.

    Calling an instance on a text yields ``(Span, tag)`` pairs, where the span
    covers one token of the text and the tag is either the token's lemma (when
    the lemma is in `lemma_list`) or the literal ``'like_num'`` (when spaCy
    flags the token as number-like). All other tokens are skipped.
    """

    # spaCy pipeline; loaded on the first call and then reused by the instance.
    _nlp = None

    def __init__(self,lemma_list):
        """Store the collection of lemmas that should be reported."""
        self.lemma_list = lemma_list

    def __call__(self,text):
        """Yield ``(Span, tag)`` for every matching token of `text` (str or Span)."""
        if self._nlp is None:
            # Loading the model is expensive; do it once instead of per call.
            self._nlp = spacy.load("en_core_web_sm")
        doc = self._nlp(str(text))
        for word in doc:
            # word.idx is the token's character offset inside the text
            start = word.idx
            end = start + len(word.text)
            if word.lemma_ in self.lemma_list:
                yield (Span(text,start,end),word.lemma_)
            elif word.like_num:
                yield (Span(text,start,end),'like_num')

lemma_list = Path('lemma_words.txt').read_text().split()
lemmatizer = LemmaFromList(lemma_list)


sess.register('lemma',lemmatizer,[(Span,str)],[Span,str])

In [None]:
assert list(lemmatizer('the boy was sick')) == [("was","be")]

In [None]:
text = 'the boy was sick'
res = rewrite(text,pd.DataFrame(lemmatizer(text))) 
assert res == 'the boy be sick' 

In [None]:
class PosFromList():
    """Callable that tags the words of a text whose part of speech is in a given list.

    Calling an instance on a text yields ``(Span, pos)`` pairs, where the span
    covers one token of the text and ``pos`` is spaCy's coarse part-of-speech
    tag of that token. Tokens whose tag is not in `pos_list` are skipped.
    """

    # spaCy pipeline; loaded on the first call and then reused by the instance.
    _nlp = None

    def __init__(self,pos_list):
        """Store the collection of part-of-speech tags that should be reported."""
        self.pos_list = pos_list

    def __call__(self,text):
        """Yield ``(Span, pos)`` for every matching token of `text` (str or Span)."""
        if self._nlp is None:
            # Loading the model is expensive; do it once instead of per call.
            self._nlp = spacy.load("en_core_web_sm")
        doc = self._nlp(str(text))
        for word in doc:
            # word.idx is the token's character offset inside the text
            start = word.idx
            end = start + len(word.text)
            if word.pos_ in self.pos_list:
                yield (Span(text,start,end),word.pos_)


In [None]:
pos_annotator = PosFromList(["NOUN", "PROPN", "PRON", "ADJ"])
assert list(pos_annotator('sick boy')) == [('sick','ADJ'),('boy','NOUN')]
list(pos_annotator('sick boy'))

[([@01e12d,0,4) "sick", 'ADJ'), ([@01e12d,5,8) "boy", 'NOUN')]

In [None]:
sess.register('pos',pos_annotator,[(Span,str)],[Span,str])

## Pipeline

In [None]:
# text versions:
# raw version
# lemmatized version
# Lemma concept tagged version
# POS annotated version on top of lemma concepts
# Pos annotated only on top of lemma concepts


In [None]:
from glob import glob

In [None]:
file_paths = pd.DataFrame(sorted(glob('sample*.txt')))
sess.import_rel('Files',file_paths)
sess.export('?Files(P)')

Unnamed: 0,P
0,sample1.txt
1,sample10.txt
2,sample2.txt
3,sample3.txt
4,sample4.txt
5,sample5.txt
6,sample6.txt
7,sample7.txt
8,sample8.txt
9,sample9.txt


In [None]:
%%spannerlog 
?Files(P)

'?Files(P)'

Unnamed: 0,P
0,'sample1.txt'
1,'sample10.txt'
2,'sample2.txt'
3,'sample3.txt'
4,'sample4.txt'
5,'sample5.txt'
6,'sample6.txt'
7,'sample7.txt'
8,'sample8.txt'
9,'sample9.txt'


In [None]:
%%spannerlog
Docs2(D,'raw_text')<-Files(P),read(P)->(D)
?Docs2(D,Ver)

#TODO from here, avoid carrying path everywhere
#TODO add a deconstruct span ie function for getting the document name back


'?Docs2(D,Ver)'

Unnamed: 0,D,Ver
0,"[@sample7.txt,0,83) ""Elevated c...""",'raw_text'
1,"[@sample9.txt,0,77) ""Patient ha...""",'raw_text'
2,"[@sample1.txt,0,139) ""Patient pr...""",'raw_text'
3,"[@sample8.txt,0,63) ""Patient wa...""",'raw_text'
4,"[@sample3.txt,0,53) ""Problem Li...""",'raw_text'
5,"[@sample6.txt,0,45) ""The patien...""",'raw_text'
6,"[@sample2.txt,0,115) ""The patien...""",'raw_text'
7,"[@sample4.txt,0,21) ""neg covid ...""",'raw_text'
8,"[@sample10.txt,0,60) ""patient wa...""",'raw_text'
9,"[@sample5.txt,0,26) ""positive c...""",'raw_text'


In [None]:
# One row per sample file: (path, file contents, version tag "raw_text").
input_paths = pd.DataFrame(
    [(path,Path(path).read_text(),"raw_text") for path in sorted(glob('sample*.txt'))]
    )
input_paths



Unnamed: 0,0,1,2
0,sample1.txt,Patient presents to be tested for COVID-19. Hi...,raw_text
1,sample10.txt,patient was screened for cov-19. results came ...,raw_text
2,sample2.txt,The patient was tested for Coronavirus 2019. R...,raw_text
3,sample3.txt,Problem List: 1. Pneumonia 2. Novel Coronaviru...,raw_text
4,sample4.txt,neg covid education.,raw_text
5,sample5.txt,positive covid precaution.,raw_text
6,sample6.txt,The patient have reported novel coronavirus.,raw_text
7,sample7.txt,Elevated cholesterol levels require further as...,raw_text
8,sample8.txt,Patient was sent for a covid test. Someone was...,raw_text
9,sample9.txt,Patient had contact patient with coronavirus. ...,raw_text


In [None]:
sess.import_rel('Docs',input_paths)

In [None]:
%%spannerlog
Lemmas(P,D,Word,Lem)<-Docs(P,D,"raw_text"),lemma(D)->(Word,Lem)
?Lemmas(P,D,Word,Lem)

'?Lemmas(P,D,Word,Lem)'

Unnamed: 0,P,D,Word,Lem
0,'sample1.txt','Patient presents to be tested for COVID-19. H...,"[@a6c01c,0,7) ""Patient""",'patient'
1,'sample1.txt','Patient presents to be tested for COVID-19. H...,"[@a6c01c,20,22) ""be""",'be'
2,'sample10.txt','patient was screened for cov-19. results came...,"[@9f417c,0,7) ""patient""",'patient'
3,'sample10.txt','patient was screened for cov-19. results came...,"[@9f417c,8,11) ""was""",'be'
4,'sample2.txt','The patient was tested for Coronavirus 2019. ...,"[@591f89,4,11) ""patient""",'patient'
...,...,...,...,...
15,'sample8.txt','Patient was sent for a covid test. Someone wa...,"[@aad8ff,8,11) ""was""",'be'
16,'sample8.txt','Patient was sent for a covid test. Someone wa...,"[@aad8ff,43,46) ""was""",'be'
17,'sample9.txt','Patient had contact patient with coronavirus....,"[@0e1178,8,11) ""had""",'have'
18,'sample9.txt','Patient had contact patient with coronavirus....,"[@0e1178,12,19) ""contact""",'contact'


In [None]:
lemma_tags = sess.export('?Lemmas(P,D,W,L)')
lemma_tags.map(repr)

Unnamed: 0,P,D,W,L
0,'sample1.txt','Patient presents to be tested for COVID-19. H...,"[@a6c01c,0,7) ""Patient""",'patient'
1,'sample1.txt','Patient presents to be tested for COVID-19. H...,"[@a6c01c,20,22) ""be""",'be'
2,'sample10.txt','patient was screened for cov-19. results came...,"[@9f417c,0,7) ""patient""",'patient'
3,'sample10.txt','patient was screened for cov-19. results came...,"[@9f417c,8,11) ""was""",'be'
4,'sample2.txt','The patient was tested for Coronavirus 2019. ...,"[@591f89,4,11) ""patient""",'patient'
5,'sample2.txt','The patient was tested for Coronavirus 2019. ...,"[@591f89,12,15) ""was""",'be'
6,'sample2.txt','The patient was tested for Coronavirus 2019. ...,"[@591f89,39,43) ""2019""",'like_num'
7,'sample2.txt','The patient was tested for Coronavirus 2019. ...,"[@591f89,53,56) ""are""",'be'
8,'sample2.txt','The patient was tested for Coronavirus 2019. ...,"[@591f89,67,74) ""Patient""",'patient'
9,'sample2.txt','The patient was tested for Coronavirus 2019. ...,"[@591f89,100,104) ""2019""",'like_num'


In [None]:
input_paths

Unnamed: 0,0,1,2
0,sample1.txt,Patient presents to be tested for COVID-19. Hi...,raw_text
1,sample10.txt,patient was screened for cov-19. results came ...,raw_text
2,sample2.txt,The patient was tested for Coronavirus 2019. R...,raw_text
3,sample3.txt,Problem List: 1. Pneumonia 2. Novel Coronaviru...,raw_text
4,sample4.txt,neg covid education.,raw_text
5,sample5.txt,positive covid precaution.,raw_text
6,sample6.txt,The patient have reported novel coronavirus.,raw_text
7,sample7.txt,Elevated cholesterol levels require further as...,raw_text
8,sample8.txt,Patient was sent for a covid test. Someone was...,raw_text
9,sample9.txt,Patient had contact patient with coronavirus. ...,raw_text


In [None]:
def rewrite_docs(docs,span_label,new_version):
    """Rewrite every document with its matching spans and tag the result as a new version.

    Args:
        docs (pd.DataFrame): three columns read positionally as path, document
            text and version; the version value is ignored.
        span_label (pd.DataFrame): four columns read positionally as path,
            document, span to replace and replacement label. The dataframe is
            not modified.
        new_version (str): version tag stored with every rewritten document.

    Returns:
        pd.DataFrame: one row per input document with columns ``['P','D','V']``
            holding the path, the rewritten text and `new_version`.
    """
    match_columns = ['P','D','W','L']
    if span_label.empty:
        # No matches at all: every document is passed through unchanged.
        matches = pd.DataFrame(columns=match_columns)
    else:
        # Rename on a copy so the caller's dataframe keeps its own column names.
        matches = span_label.copy()
        matches.columns = match_columns

    new_tuples = []
    for path,doc,_ in docs.itertuples(index=False,name=None):
        # Only the spans found in this document's path apply to it.
        span_label_per_doc = matches[matches['P'] == path][['W','L']]
        new_text = rewrite(doc,span_label_per_doc)
        new_tuples.append((path,new_text,new_version))
    return pd.DataFrame(new_tuples,columns=['P','D','V'])
    

In [None]:
lemma_docs = rewrite_docs(input_paths,lemma_tags,'lemma')
sess.import_rel('Docs',lemma_docs)

In [None]:
%%spannerlog
?Docs(P,D,V)

'?Docs(P,D,V)'

Unnamed: 0,P,D,V
0,'sample1.txt','Patient presents to be tested for COVID-19. H...,'raw_text'
1,'sample1.txt','patient presents to be tested for COVID-19. H...,'lemma'
2,'sample10.txt','patient be screened for cov-19. results came ...,'lemma'
3,'sample10.txt','patient was screened for cov-19. results came...,'raw_text'
4,'sample2.txt','The patient be tested for Coronavirus like_nu...,'lemma'
...,...,...,...
15,'sample7.txt','Elevated cholesterol levels require further a...,'raw_text'
16,'sample8.txt','Patient be sent for a covid test. Someone be ...,'lemma'
17,'sample8.txt','Patient was sent for a covid test. Someone wa...,'raw_text'
18,'sample9.txt','Patient had contact patient with coronavirus....,'raw_text'


### [Concept Tagger](https://github.com/abchapman93/VA_COVID-19_NLP_BSV/blob/master/cov_bsv/knowledge_base/concept_tag_rules.py):


In [None]:
sess.import_rel("ConceptTagRules","concept_tags_rules.csv" , delim=",")

In [None]:
%%spannerlog
?ConceptTagRules(Rule,Tag,TextType)

'?ConceptTagRules(Rule,Tag,TextType)'

Unnamed: 0,Rule,Tag,TextType
0,'(?i)(?:(?:229(?:e)?|oc(?:-)?(?:43)?|o43|0c43|...,'OTHER_CORONAVIRUS','lemma'
1,'(?i)(?:(?:antibody|antibodies|ab) test)','antibody test','lemma'
2,'(?i)(?:(?:coronavirus|hcovs?|ncovs?|covs?)(?:...,'OTHER_CORONAVIRUS','lemma'
3,'(?i)(?:(?:diagnos(?:is|ed)|dx(?:\\.)?)(?:of|w...,'diagnosis','lemma'
4,'(?i)(?:\\+(?: ve)?|\\(\\+\\)|positive|\\bpos\...,'positive','lemma'
...,...,...,...
13,'(?i)(?:patient|pt(?:\\.)?|vt|veteran)','patient','lemma'
14,'(?i)(?:pneum(?:onia)?|pna|hypoxia|septic shoc...,'associated_diagnosis','lemma'
15,'(?i)(?:resident|pts|patients|coworker|coworke...,'other_experiencer','pos'
16,'(?i)(?:someone|somebody|person|anyone|anybody...,'other_experiencer','pos'


### Lemma Rules:

In [None]:
%%spannerlog
LemmaConceptMatches(Path,Doc,Span,Label) <- \
    Docs(Path,Doc,"lemma"),\
    ConceptTagRules(Pattern, Label, "lemma"),\
    rgx(Pattern,Doc) -> (Span)
?LemmaConceptMatches(Path,Doc,Span,Label)

'?LemmaConceptMatches(Path,Doc,Span,Label)'

Unnamed: 0,Path,Doc,Span,Label
0,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,0,7) ""patient""",'patient'
1,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,69,77) ""positive""",'positive'
2,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,82,99) ""novel coro...""",'COVID-19'
3,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,101,111) ""SARS-COV-2""",'COVID-19'
4,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,130,138) ""positive""",'positive'
...,...,...,...,...
18,'sample9.txt','Patient have contact patient with coronavirus...,"[@539a7c,0,7) ""Patient""",'patient'
19,'sample9.txt','Patient have contact patient with coronavirus...,"[@539a7c,21,28) ""patient""",'patient'
20,'sample9.txt','Patient have contact patient with coronavirus...,"[@539a7c,34,45) ""coronaviru...""",'COVID-19'
21,'sample9.txt','Patient have contact patient with coronavirus...,"[@539a7c,57,65) ""positive""",'positive'


In [None]:
lemma_concept_matches = sess.export('?LemmaConceptMatches(Path,Doc,Span,Label)')
#TODO ask mahmoud why the covid in sample 5 doesnt work
lemma_concept_matches.map(repr)

Unnamed: 0,Path,Doc,Span,Label
0,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,0,7) ""patient""",'patient'
1,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,69,77) ""positive""",'positive'
2,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,82,99) ""novel coro...""",'COVID-19'
3,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,101,111) ""SARS-COV-2""",'COVID-19'
4,'sample1.txt','patient presents to be tested for COVID-19. H...,"[@4d073b,130,138) ""positive""",'positive'
5,'sample10.txt','patient be screened for cov-19. results came ...,"[@f3a9fd,0,7) ""patient""",'patient'
6,'sample10.txt','patient be screened for cov-19. results came ...,"[@f3a9fd,50,58) ""positive""",'positive'
7,'sample2.txt','The patient be tested for Coronavirus like_nu...,"[@a5d37d,4,11) ""patient""",'patient'
8,'sample2.txt','The patient be tested for Coronavirus like_nu...,"[@a5d37d,26,37) ""Coronaviru...""",'COVID-19'
9,'sample2.txt','The patient be tested for Coronavirus like_nu...,"[@a5d37d,59,67) ""positive""",'positive'


In [None]:
lemma_concepts = rewrite_docs(lemma_docs,lemma_concept_matches,'lemma_concept')
sess.import_rel('Docs',lemma_concepts)
lemma_concepts

Unnamed: 0,P,D,V
0,sample1.txt,patient presents to be tested for COVID-19. Hi...,lemma_concept
1,sample10.txt,patient be screened for cov-19. results came b...,lemma_concept
2,sample2.txt,The patient be tested for COVID-19 like_num. R...,lemma_concept
3,sample3.txt,Problem List: like_num. Pneumonia like_num. CO...,lemma_concept
4,sample4.txt,neg covid education.,lemma_concept
5,sample5.txt,positive covid precaution.,lemma_concept
6,sample6.txt,The patient have reported COVID-19.,lemma_concept
7,sample7.txt,Elevated cholesterol levels require further as...,lemma_concept
8,sample8.txt,patient be sent for a covid test. Someone be t...,lemma_concept
9,sample9.txt,patient have contact patient with COVID-19. sc...,lemma_concept


In [None]:
sess.export('?Docs("sample2.txt",D,V)')

Unnamed: 0,D,V
0,The patient be tested for COVID-19 like_num. R...,lemma_concept
1,The patient be tested for Coronavirus like_num...,lemma
2,The patient was tested for Coronavirus 2019. R...,raw_text


### POS Rules:

In [None]:
%%spannerlog
?ConceptTagRules(Rule,Tag,"pos")

'?ConceptTagRules(Rule,Tag,"pos")'

Unnamed: 0,Rule,Tag
0,'(?i)(?:boyfriend|persons|person|church|conven...,'other_experiencer'
1,'(?i)(?:cashier|landlord|worked|works|^mate|no...,'other_experiencer'
2,'(?i)(?:grandchild|grandson|cousin|grandmother...,'family'
3,'(?i)(?:resident|pts|patients|coworker|coworke...,'other_experiencer'
4,'(?i)(?:someone|somebody|person|anyone|anybody...,'other_experiencer'
5,'(?i)(?:wife|husband|spouse|family|member|girl...,'family'


In [None]:
sess.export('?Docs("sample8.txt",D,"lemma_concept")').iloc[0,0]

'patient be sent for a covid test. Someone be tested positive.'

In [None]:
%%spannerlog

# here we get the spans of all POS
Pos(P,D,Word,Lem)<-Docs(P,D,"lemma_concept"),pos(D)->(Word,Lem)
# small debugging print helps in building new rules
?Pos("sample8.txt",D,Word,Lem)
# here we look for concept rule matches where the matched word is also tagged via POS
PosConceptMatches(Path,Doc,Span,Label) <- \
    Docs(Path,Doc,"lemma_concept"),\
    ConceptTagRules(Pattern, Label, "pos"),\
    rgx(Pattern,Doc) -> (Span),\
    Pos(Path,Doc,Span,POSLabel)



'?Pos("sample8.txt",D,Word,Lem)'

Unnamed: 0,D,Word,Lem
0,'patient be sent for a covid test. Someone be ...,"[@7b39f8,0,7) ""patient""",'NOUN'
1,'patient be sent for a covid test. Someone be ...,"[@7b39f8,22,27) ""covid""",'ADJ'
2,'patient be sent for a covid test. Someone be ...,"[@7b39f8,28,32) ""test""",'NOUN'
3,'patient be sent for a covid test. Someone be ...,"[@7b39f8,34,41) ""Someone""",'PRON'
4,'patient be sent for a covid test. Someone be ...,"[@7b39f8,52,60) ""positive""",'ADJ'


In [None]:
pos_concept_matches = sess.export('?PosConceptMatches(P,D,W,L)')
pos_concept_matches

Unnamed: 0,P,D,W,L
0,sample1.txt,patient presents to be tested for COVID-19. Hi...,"(w, i, f, e)",family
1,sample8.txt,patient be sent for a covid test. Someone be t...,"(S, o, m, e, o, n, e)",other_experiencer


In [None]:
pos_concept_docs = rewrite_docs(lemma_concepts,pos_concept_matches,'pos_concept')
sess.import_rel('Docs',pos_concept_docs)
sess.export('?Docs("sample8.txt",D,V)')

Unnamed: 0,D,V
0,Patient be sent for a covid test. Someone be t...,lemma
1,Patient was sent for a covid test. Someone was...,raw_text
2,patient be sent for a covid test. Someone be t...,lemma_concept
3,patient be sent for a covid test. other_experi...,pos_concept


As we can see, for example in sample8.txt, `Someone` was rewritten to `other_experiencer`.

### [Target Rules](https://github.com/abchapman93/VA_COVID-19_NLP_BSV/blob/master/cov_bsv/knowledge_base/target_rules.py):


In [None]:
sess.import_rel("TargetTagRules","target_rules.csv",delim=",")

In [None]:
%%spannerlog
?TargetTagRules(Rule,Tag)

'?TargetTagRules(Rule,Tag)'

Unnamed: 0,Rule,Tag
0,'(?i)((?:person|patient) with confirm COVID-19)','1 2 3 4'
1,'(?i)(?:(?:(?:contact|exposure) (?:with|to)? )...,'OTHER_PERSON'
2,'(?i)(?:(?:patient|person) (?:who|that) test (...,'OTHER_PERSON'
3,'(?i)(?:COVID-19 (?:restriction|emergency|epid...,'1 2'
4,'(?i)(?:COVID-19 positive (?:patient|person|pe...,'OTHER_PERSON'
...,...,...
14,'(?i)(?:in order to decrease the spread of the...,'1 2 3 4 5 6 7 8 9 10'
15,'(?i)(?:known(?: positive)? COVID-19(?: positi...,'COVID-19'
16,'(?i)(?:positive COVID-19 (?:tested )?other_ex...,'COVID-19'
17,'(?i)(?:results confirm|(?:neg|pos)\\S+ pressu...,'1 2'


In [None]:
%%spannerlog
TargetMatches(Path,Doc, Span, Label) <- \
    Docs(Path,Doc,"pos_concept"),\
    TargetTagRules(Pattern, Label), rgx(Pattern,Doc) -> (Span)

In [None]:
target_matches = sess.export('?TargetMatches(P,D,W,L)')
target_matches.map(repr)

Unnamed: 0,P,D,W,L
0,'sample9.txt','patient have contact patient with COVID-19. s...,"[@e00245,44,71) ""screening ...""",'positive coronavirus screening'


In [None]:
target_rule_docs = rewrite_docs(pos_concept_docs,target_matches,'target_concept')
sess.import_rel('Docs',target_rule_docs)

In [None]:
for doc,doc_type in sess.export('?Docs("sample9.txt",D,V)').itertuples(index=False,name=None):
    print(doc_type)
    print(doc)
    print("="*80)

raw_text
Patient had contact patient with coronavirus. screening positive coronavirus.
lemma
Patient have contact patient with coronavirus. screening positive coronavirus.
target_concept
patient have contact patient with COVID-19. positive coronavirus screening.
lemma_concept
patient have contact patient with COVID-19. screening positive COVID-19.
pos_concept
patient have contact patient with COVID-19. screening positive COVID-19.


### Breaking text into sections

In [None]:
section_tags = pd.read_csv('section_tags.csv',names=['literal','tag'])
section_tags.head()

Unnamed: 0,literal,tag
0,Lab results:,labs
1,ADDENDUM:,addendum
2,Addendum:,addendum
3,ALLERGIC REACTIONS:,allergies
4,ALLERGIES:,allergies


In [None]:
# Join every section-header literal into a single alternation pattern.
# NOTE(review): the literals are joined as-is, without regex escaping — this assumes
# none of them contains regex metacharacters; confirm against section_tags.csv.
section_delimeter_pattern = section_tags['literal'].str.cat(sep='|')
# Hand the pattern to the session; the spannerlog rules below refer to it as
# $section_delimeter_pattern.
sess.import_var('section_delimeter_pattern',section_delimeter_pattern)
section_delimeter_pattern

'Lab results:|ADDENDUM:|Addendum:|ALLERGIC REACTIONS:|ALLERGIES:|CC:|CHIEF COMPLAINT:|Chief Complaint:|COMMENTS:|ADMISSION DIAGNOSES:|DIAGNOSES:|Diagnosis:|Primary Diagnosis:|Primary:|SECONDARY DIAGNOSES:|Secondary Diagnoses:|Secondary Diagnosis:|Secondary:|Family History:|Brief Hospital Course:|CONCISE SUMMARY OF HOSPITAL COURSE BY ISSUE/SYSTEM:|HOSPITAL COURSE:|SUMMARY OF HOSPITAL COURSE:|IMAGING:|INTERPRETATION:|Imaging:|MRI:|Radiology:|ADMISSION LABS:|Admission Labs:|Discharge Labs:|ECHO:|FINDINGS:|Findings:|INDICATION:|LABS:|Labs:|MICRO:|Micro:|Microbiology:|Pertinent Results:|STUDIES:|Studies:|ACTIVE MEDICATIONS LIST:|ACTIVE MEDICATIONS:|ADMISSION MEDICATIONS:|CURRENT MEDICATIONS:|DISCHARGE MEDICATIONS:|Discharge Medications:|HOME MEDICATIONS:|MEDICATIONS AT HOME:|MEDICATIONS LIST:|MEDICATIONS ON ADMISSION:|MEDICATIONS ON DISCHARGE:|MEDICATIONS ON TRANSFER:|MEDICATIONS PRIOR TO ADMISSION:|MEDICATIONS:|MEDICATIONS:|Neuro:|A/P:|ASSESSMENT/PLAN:|ASSESSMENT:|Assessment/Plan:|Clinical

In [None]:
sess.import_rel("SectionTags","section_tags.csv",delim=",")
sess.import_rel("PositiveSectionTags","positive_section_tags.csv",delim=",")


In [None]:
%%spannerlog
Sections(P,D,Sec,Content)<-Docs(P,D,"target_concept"),\
    rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),\
    as_str(SecSpan)->(Sec)
?Sections(P,D,Sec,Content)

PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag)
?PositiveSections(P,D,Sec,Content)

'?Sections(P,D,Sec,Content)'

Unnamed: 0,P,D,Sec,Content
0,'sample3.txt','Problem List: like_num. Pneumonia like_num. C...,'Problem List:',"[@882253,13,62) "" like_num...."""


'?PositiveSections(P,D,Sec,Content)'

Unnamed: 0,P,D,Sec,Content
0,'sample3.txt','Problem List: like_num. Pneumonia like_num. C...,'Problem List:',"[@882253,13,62) "" like_num...."""


### Breaking texts into sentences


In [None]:
%%spannerlog
Sents(P,S)<-Docs(P,D,"target_concept"),split_sentence(D)->(S)
?Sents(P,S)

'?Sents(P,S)'

Unnamed: 0,P,S
0,'sample1.txt',"[@931cb5,0,43) ""patient pr..."""
1,'sample1.txt',"[@931cb5,44,93) ""His family..."""
2,'sample1.txt',"[@931cb5,94,130) ""COVID-19 r..."""
3,'sample10.txt',"[@f3a9fd,0,31) ""patient be..."""
4,'sample10.txt',"[@f3a9fd,32,59) ""results ca..."""
...,...,...
14,'sample7.txt',"[@a2c41c,0,82) ""Elevated c..."""
15,'sample8.txt',"[@8ec8c6,0,33) ""patient be..."""
16,'sample8.txt',"[@8ec8c6,34,71) ""other_expe..."""
17,'sample9.txt',"[@6d2862,0,43) ""patient ha..."""


#### Pair of sentences

We will show three ways of getting pairs of adjacent sentences.
The first is simply to write an IE function that yields them.

In [None]:
from itertools import pairwise

def sentence_pairs(text):
    """Yield each pair of consecutive sentences of `text` as a (first, second) tuple."""
    sentences = split_sentence(text)
    for first, second in pairwise(sentences):
        yield first, second

sess.register('sentence_pairs',sentence_pairs,[(str,Span)],[Span,Span])

In [None]:
%%spannerlog
SentPairs_ver1(P,S1,S2)<-Docs(P,D,"target_concept"),sentence_pairs(D)->(S1,S2)
?SentPairs_ver1(P,S1,S2)

'?SentPairs_ver1(P,S1,S2)'

Unnamed: 0,P,S1,S2
0,'sample1.txt',"[@931cb5,0,43) ""patient pr...""","[@931cb5,44,93) ""His family..."""
1,'sample1.txt',"[@931cb5,44,93) ""His family...""","[@931cb5,94,130) ""COVID-19 r..."""
2,'sample10.txt',"[@f3a9fd,0,31) ""patient be...""","[@f3a9fd,32,59) ""results ca..."""
3,'sample2.txt',"[@e4b074,0,44) ""The patien...""","[@e4b074,45,65) ""Results be..."""
4,'sample2.txt',"[@e4b074,45,65) ""Results be...""","[@e4b074,66,115) ""patient un..."""
5,'sample3.txt',"[@882253,0,23) ""Problem Li...""","[@882253,24,43) ""Pneumonia ..."""
6,'sample3.txt',"[@882253,24,43) ""Pneumonia ...""","[@882253,44,61) ""COVID-19 l..."""
7,'sample8.txt',"[@8ec8c6,0,33) ""patient be...""","[@8ec8c6,34,71) ""other_expe..."""
8,'sample9.txt',"[@6d2862,0,43) ""patient ha...""","[@6d2862,44,75) ""positive c..."""


We can also do it like this

In [None]:
# TODO from here implement is adjacent

# TODO make the output magic arg not print queries to file

In [None]:
def is_adjacent(span1,span2):
    """Yield a single bool: True when `span2` starts exactly one position after `span1` ends."""
    expected_start = span1.end + 1
    follows_directly = expected_start == span2.start
    yield follows_directly

sess.register('is_adjacent',is_adjacent,[Span,Span],[bool])

In [None]:
%%spannerlog
SentPairs_ver2(P,S1,S2)<-Sents(P,S1),Sents(P,S2),is_adjacent(S1,S2)->(True)
?SentPairs_ver2(P,S1,S2)

'?SentPairs_ver2(P,S1,S2)'

Unnamed: 0,P,S1,S2
0,'sample1.txt',"[@931cb5,0,43) ""patient pr...""","[@931cb5,44,93) ""His family..."""
1,'sample1.txt',"[@931cb5,44,93) ""His family...""","[@931cb5,94,130) ""COVID-19 r..."""
2,'sample10.txt',"[@f3a9fd,0,31) ""patient be...""","[@f3a9fd,32,59) ""results ca..."""
3,'sample2.txt',"[@e4b074,0,44) ""The patien...""","[@e4b074,45,65) ""Results be..."""
4,'sample2.txt',"[@e4b074,45,65) ""Results be...""","[@e4b074,66,115) ""patient un..."""
5,'sample3.txt',"[@882253,0,23) ""Problem Li...""","[@882253,24,43) ""Pneumonia ..."""
6,'sample3.txt',"[@882253,24,43) ""Pneumonia ...""","[@882253,44,61) ""COVID-19 l..."""
7,'sample8.txt',"[@8ec8c6,0,33) ""patient be...""","[@8ec8c6,34,71) ""other_expe..."""
8,'sample9.txt',"[@6d2862,0,43) ""patient ha...""","[@6d2862,44,75) ""positive c..."""


Or like this

In [None]:
%%spannerlog
SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),expr_eval("{0}.end +1 == {1}.start",S1,S2)->(True)
?SentPairs(P,S1,S2)

'?SentPairs(P,S1,S2)'

Unnamed: 0,P,S1,S2
0,'sample1.txt',"[@931cb5,0,43) ""patient pr...""","[@931cb5,44,93) ""His family..."""
1,'sample1.txt',"[@931cb5,44,93) ""His family...""","[@931cb5,94,130) ""COVID-19 r..."""
2,'sample10.txt',"[@f3a9fd,0,31) ""patient be...""","[@f3a9fd,32,59) ""results ca..."""
3,'sample2.txt',"[@e4b074,0,44) ""The patien...""","[@e4b074,45,65) ""Results be..."""
4,'sample2.txt',"[@e4b074,45,65) ""Results be...""","[@e4b074,66,115) ""patient un..."""
5,'sample3.txt',"[@882253,0,23) ""Problem Li...""","[@882253,24,43) ""Pneumonia ..."""
6,'sample3.txt',"[@882253,24,43) ""Pneumonia ...""","[@882253,44,61) ""COVID-19 l..."""
7,'sample8.txt',"[@8ec8c6,0,33) ""patient be...""","[@8ec8c6,34,71) ""other_expe..."""
8,'sample9.txt',"[@6d2862,0,43) ""patient ha...""","[@6d2862,44,75) ""positive c..."""


### Attribute Assertion:

Next, we will explore how to assert attributes indicating whether a mention of COVID-19 is positive or not. In our project, we have created a table named 'CovidAttributes' that contains all attributes for each COVID-19 mention. This table will be used for classifying documents.

In [None]:
df = sess.export('?Docs(P,D,V)')
df

Unnamed: 0,P,D,V
0,sample1.txt,Patient presents to be tested for COVID-19. Hi...,raw_text
1,sample1.txt,patient presents to be tested for COVID-19. Hi...,pos_concept
2,sample1.txt,patient presents to be tested for COVID-19. Hi...,target_concept
3,sample1.txt,patient presents to be tested for COVID-19. Hi...,lemma_concept
4,sample1.txt,patient presents to be tested for COVID-19. Hi...,lemma
5,sample10.txt,patient be screened for cov-19. results came b...,lemma
6,sample10.txt,patient be screened for cov-19. results came b...,lemma_concept
7,sample10.txt,patient be screened for cov-19. results came b...,pos_concept
8,sample10.txt,patient be screened for cov-19. results came b...,target_concept
9,sample10.txt,patient was screened for cov-19. results came ...,raw_text


In [None]:
%%spannerlog
CovidMentions(Path, Span) <- Docs(Path,D,"target_concept"), rgx("COVID-19",D) -> (Span)
CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),span_contained(Mention,Sent)->(True)

?CovidMentions(Path, Span)
?CovidMentionSents(P,Mention,Sent)

'?CovidMentions(Path,Span)'

Unnamed: 0,Path,Span
0,'sample1.txt',"[@931cb5,34,42) ""COVID-19"""
1,'sample1.txt',"[@931cb5,84,92) ""COVID-19"""
2,'sample1.txt',"[@931cb5,94,102) ""COVID-19"""
3,'sample2.txt',"[@e4b074,26,34) ""COVID-19"""
4,'sample2.txt',"[@e4b074,87,95) ""COVID-19"""
5,'sample3.txt',"[@882253,44,52) ""COVID-19"""
6,'sample6.txt',"[@b2612f,26,34) ""COVID-19"""
7,'sample9.txt',"[@6d2862,34,42) ""COVID-19"""


'?CovidMentionSents(P,Mention,Sent)'

Unnamed: 0,P,Mention,Sent
0,'sample1.txt',"[@931cb5,34,42) ""COVID-19""","[@931cb5,0,43) ""patient pr..."""
1,'sample1.txt',"[@931cb5,84,92) ""COVID-19""","[@931cb5,44,93) ""His family..."""
2,'sample1.txt',"[@931cb5,94,102) ""COVID-19""","[@931cb5,94,130) ""COVID-19 r..."""
3,'sample2.txt',"[@e4b074,26,34) ""COVID-19""","[@e4b074,0,44) ""The patien..."""
4,'sample2.txt',"[@e4b074,87,95) ""COVID-19""","[@e4b074,66,115) ""patient un..."""
5,'sample3.txt',"[@882253,44,52) ""COVID-19""","[@882253,44,61) ""COVID-19 l..."""
6,'sample6.txt',"[@b2612f,26,34) ""COVID-19""","[@b2612f,0,35) ""The patien..."""
7,'sample9.txt',"[@6d2862,34,42) ""COVID-19""","[@6d2862,0,43) ""patient ha..."""


In [None]:
%%spannerlog

CovidTags(Path,Mention,'positive','section')<-\
    PositiveSections(Path,D,Title,Section),\
    CovidMentions(Path,Mention),\
    span_contained(Mention,Section)->(True)
?CovidTags(Path, Mention,Tag,DerivedFrom)

'?CovidTags(Path,Mention,Tag,DerivedFrom)'

Unnamed: 0,Path,Mention,Tag,DerivedFrom
0,'sample3.txt',"[@882253,44,52) ""COVID-19""",'positive','section'


### [Context Rules](https://github.com/abchapman93/VA_COVID-19_NLP_BSV/blob/master/cov_bsv/knowledge_base/context_rules.py):

These rules assign an attribute to each COVID-19 label based on its context. These attributes will be used later to classify each text.

An example of such a rule is:

    ConTextRule(
        literal="Not Detected",
        category="NEGATED_EXISTENCE",
        direction="BACKWARD",
        pattern=[
            {"LOWER": {"IN": ["not", "non"]}},
            {"IS_SPACE": True, "OP": "*"},
            {"TEXT": "-", "OP": "?"},
            {"LOWER": {"REGEX": "detecte?d"}},
        ],
        allowed_types={"COVID-19"},
    ),
   **direction** specifies whether the allowed_types should appear before or after the pattern,
   **allowed_types** specifies which labels the rule is applied to.

In [None]:
context_rules = pd.read_csv('sentence_context_rules.csv',delimiter='#',header=None,names=['pattern','tag'])
context_rules

Unnamed: 0,pattern,tag
0,(?i)(?:positive COVID-19|COVID-19 (?:\([^)]*\)...,positive
1,(?i)(?:COVID-19 status : positive),positive
2,(?i)(?:associated_diagnosis COVID-19|associate...,positive
3,(?i)(?:COVID-19 positive(?: patient| precautio...,positive
4,(?i)(?:(?:current|recent) COVID-19 diagnosis),positive
...,...,...
167,(?i)(?:COVID-19(?: (?!<IGNORE>)\S+)*? (?:(?:so...,negated
168,(?i)(?:(?:(?:someone|person) who (?:has|have) ...,negated
169,"(?i)(?:COVID-19(?: (?!<IGNORE>)\S+){0,0} (?:\(...",positive
170,(?i)(?:COVID-19(?: (?!<IGNORE>)\S+)*? (?:socia...,IGNORE


In [None]:
sess.import_rel("SentenceContextRules",context_rules)

In [None]:
%%spannerlog
#covid_attributes: negated, other_experiencer, is_future, not_relevant, uncertain, positive
# TODO make syntax and semantic hide their traceback
# TODO make ie function show traceback only in the function itself


# For each (mention, sentence) pair, run every context-rule regex over the sentence;
# when a match span contains the mention, the mention receives the rule's tag,
# recorded as derived from 'sentence context'.
CovidTags(Path,Mention,Tag,'sentence context')<-\
    CovidMentionSents(Path,Mention,Sent),\
    SentenceContextRules(Pattern,Tag),\
    rgx(Pattern,Sent)->(ContextSpan),\
    span_contained(Mention,ContextSpan)->(True)

?CovidTags(Path, Mention,Tag,DerivedFrom)

'?CovidTags(Path,Mention,Tag,DerivedFrom)'

Unnamed: 0,Path,Mention,Tag,DerivedFrom
0,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'negated','sentence context'
1,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'positive','sentence context'
2,'sample1.txt',"[@931cb5,94,102) ""COVID-19""",'positive','sentence context'
3,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'future','sentence context'
4,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'negated','sentence context'
5,'sample3.txt',"[@882253,44,52) ""COVID-19""",'positive','section'
6,'sample6.txt',"[@b2612f,26,34) ""COVID-19""",'patient_experiencer','sentence context'


### [Postprocessor](https://github.com/abchapman93/VA_COVID-19_NLP_BSV/blob/master/cov_bsv/knowledge_base/postprocess_rules.py):

The postprocessor is designed to apply extra adjustments to the processed text using custom logic or specific requirements not addressed by the spaCy pipeline. These rules modify, remove, or change attributes for each mention of COVID-19 based on either their existing attributes, the context of the sentences in which they appear, or a combination of both. This flexibility allows us to address data issues and implement targeted improvements. For instance, it proves useful in identifying and rectifying incorrectly labeled positive cases, thereby enhancing the accuracy of our classification.

**How we implemented it:**  
As mentioned earlier, postprocess rules are responsible for modifying, removing, or changing attributes for each mention of COVID-19. In the original project, these attributes are represented as boolean variables stored in an object class for each COVID-19 mention. The rules simply switch the corresponding boolean variable to assign or remove the attribute. However, in spannerlog, we don't have the luxury of creating classes. In our project, when we want to remove a specific attribute, we introduce an additional attribute that acts as its negation. For instance, for the attribute 'positive,' we add 'no_positive,' causing the document classifier to behave as if there is no positive attribute.

Additionally, in some cases, the entire COVID-19 mention is removed by eliminating its object. In our project, we introduce an 'IGNORE' attribute, which results in the exclusion of the mention from consideration in the document classifier stage.
<br>

**In the subsequent cells, we will explore three types of postprocess rules:**
1) Rules based on patterns
2) Rules utilizing existing attributes and patterns
3) Rules applied to the next sentence.

#### 1 - Postprocess rules based on patterns:

Example rule in the original project:

```
PostprocessingRule(
        patterns=[
            PostprocessingPattern(lambda ent: ent.label_ == "COVID-19"),
            PostprocessingPattern(
                postprocessing_functions.sentence_contains,
                condition_args=({"deny", "denies", "denied"},),
            ),
            PostprocessingPattern(
                postprocessing_functions.sentence_contains,
                condition_args=({"contact", "contacts", "confirmed"},),
            ),
        ],
        action=postprocessing_functions.remove_ent,
        description="Remove a coronavirus entity if 'denies' and 'contact' are in. This will help get rid of false positives from screening.",
    ),    
```
This rule iterates through each entity and checks a series of conditions which are the "PostprocessingPattern". If all conditions evaluate as True, then some action is taken on the entity, which is 'remove' action in this example.


In our case, we assign "IGNORE" attribute to the COVID-19 mention causing it to be excluded from consideration during the document classification process.

Each rule in the CSV file follows this format: regexPattern#Attribute (the fields are separated by `#`, since the regex patterns themselves may contain commas).

In [None]:
# Load the pattern-only postprocess rules: '#'-separated, no header row,
# read into the columns 'pattern' (a regex) and 'tag' (the attribute to assign).
post_process_pattern_rules = pd.read_csv('postprocess_pattern_rules.csv',delimiter='#',header=None,names=['pattern','tag'])
post_process_pattern_rules

Unnamed: 0,pattern,tag
0,.*education.*,IGNORE
1,.* \?,IGNORE
2,(?=.*\b(?:deny|denies|denied)\b)(?=.*\b(?:cont...,IGNORE
3,(?=.*\b(?:setting of|s/o)\b)(?!.*\b(?:COVID-19...,no_positive
4,(?i)(.*benign.*),uncertain
5,admitted to COVID-19 unit,positive


In [None]:
# Hand the rules table to the session under the relation name PostprocessPatternRules,
# which the spannerlog rules reference as PostprocessPatternRules(Pattern,Tag).
sess.import_rel("PostprocessPatternRules",post_process_pattern_rules)

In [None]:
%%spannerlog
# Same shape as the sentence-context rule, but driven by the postprocess patterns:
# a mention gets the rule's tag when the regex matches a span of its sentence that
# contains the mention; the derivation is recorded as 'post pattern'.
CovidTags(Path,Mention,Tag,'post pattern')<-\
    CovidMentionSents(Path,Mention,Sent),\
    PostprocessPatternRules(Pattern,Tag),\
    rgx(Pattern,Sent)->(ContextSpan),\
    span_contained(Mention,ContextSpan)->(True)

?CovidTags(Path, Mention,Tag,DerivedFrom)

'?CovidTags(Path,Mention,Tag,DerivedFrom)'

Unnamed: 0,Path,Mention,Tag,DerivedFrom
0,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'negated','sentence context'
1,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'positive','sentence context'
2,'sample1.txt',"[@931cb5,94,102) ""COVID-19""",'positive','sentence context'
3,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'IGNORE','post pattern'
4,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'future','sentence context'
5,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'negated','sentence context'
6,'sample3.txt',"[@882253,44,52) ""COVID-19""",'positive','section'
7,'sample6.txt',"[@b2612f,26,34) ""COVID-19""",'patient_experiencer','sentence context'


#### 2 - Postprocess rules utilizing existing attributes and patterns:
```
PostprocessingRule(
        patterns=[
        
            PostprocessingPattern(lambda ent: ent.label_ == "COVID-19"),
            PostprocessingPattern(
                postprocessing_functions.is_modified_by_category,
                condition_args=("DEFINITE_POSITIVE_EXISTENCE",),
            ),
            # PostprocessingPattern(postprocessing_functions.is_modified_by_category, condition_args=("TEST",)),
            PostprocessingPattern(
                postprocessing_functions.sentence_contains,
                condition_args=(
                    {
                        "should",
                        "unless",
                        "either",
                        "if comes back",
                        "if returns",
                        "if s?he tests positive",
                    },
                    True,
                ),
            ),
        ],
        action=set_is_uncertain,
        action_args=(True,),
        description="Subjunctive of test returning positive. 'Will contact patient should his covid-19 test return positive.'",
    ),
```
This rule examines whether a COVID-19 mention possesses a positive attribute and whether the sentence containing it includes any of the words specified in 'condition_args'. If these conditions are met, the uncertain attribute is set to true.


In our case, for each COVID-19 mention in the 'CovidTags' relation we check whether it is labeled 'positive' and, using a regex search, whether any of the words specified in 'condition_args' appear in the same sentence. If both conditions are met, we simply assign it an 'uncertain' attribute.

Each rule in the CSV file follows this format: regexPattern#ExistingAttribute#NewAttribute (the fields are separated by `#`).


In [None]:
# Load the attribute-dependent postprocess rules: '#'-separated, no header row, with the columns
# 'pattern' (a regex), 'old_tag' (tag the mention must already carry) and 'new_tag' (tag to add).
postprocess_attribute_rules = pd.read_csv('postprocess_attributes_rules.csv',delimiter='#',header=None,names=['pattern','old_tag','new_tag'])
postprocess_attribute_rules

Unnamed: 0,pattern,old_tag,new_tag
0,.*pending.*,negated,no_negated
1,.*(?:should|unless|either|if comes back|if ret...,positive,uncertain
2,.*precaution.*,positive,no_future
3,.*(?:re[ -]?test|second test|repeat).*,negated,no_negated
4,.*(?:sign|symptom|s/s).*,positive,uncertain


In [None]:
# Hand the rules table to the session under the relation name PostprocessRulesWithAttributes,
# which the spannerlog rules reference as PostprocessRulesWithAttributes(Pattern,OldTag,Tag).
sess.import_rel("PostprocessRulesWithAttributes",postprocess_attribute_rules)

In [None]:
%%spannerlog
# Derive a new tag from an existing one: when a mention already carries OldTag and the
# rule's regex matches a span of its sentence that contains the mention, add the rule's
# new tag with derivation "post attribute change". How the old tag was derived is ignored.
CovidTags(Path,Mention,Tag,"post attribute change")<-\
    CovidTags(Path,Mention,OldTag,Derivation),\
    PostprocessRulesWithAttributes(Pattern,OldTag,Tag),\
    CovidMentionSents(Path,Mention,Sent),\
    rgx(Pattern,Sent)->(ContextSpan),\
    span_contained(Mention,ContextSpan)->(True)

?CovidTags(Path, Mention,Tag,DerivedFrom)

'?CovidTags(Path,Mention,Tag,DerivedFrom)'

Unnamed: 0,Path,Mention,Tag,DerivedFrom
0,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'negated','sentence context'
1,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'positive','sentence context'
2,'sample1.txt',"[@931cb5,94,102) ""COVID-19""",'positive','sentence context'
3,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'IGNORE','post pattern'
4,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'future','sentence context'
5,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'negated','sentence context'
6,'sample3.txt',"[@882253,44,52) ""COVID-19""",'positive','section'
7,'sample6.txt',"[@b2612f,26,34) ""COVID-19""",'patient_experiencer','sentence context'


#### 3 - Postprocess rules applied to the next sentence:
There's a rule that checks whether the following sentence contains positive mentions. If it does, the COVID-19 mentions in the current sentence are also
marked as positive. To implement this rule in our project, we defined a new relation that pairs each sentence with its subsequent sentence.


In [None]:
# Load the next-sentence rules (pandas' default comma delimiter, no header row) into the
# columns 'pattern' and 'tag', hand them to the session under the relation name
# NextSentencePostprocessPatternRules, and display the table.
next_sentence_postproccessing_rules = pd.read_csv('postprocess_pattern_next_sentence_rules.csv',header=None,names=['pattern','tag'])
sess.import_rel("NextSentencePostprocessPatternRules",next_sentence_postproccessing_rules)
next_sentence_postproccessing_rules


Unnamed: 0,pattern,tag
0,(?i)(?:^(?:positive|detected)|results?(?: be)?...,positive


In [None]:
%%spannerlog

# A mention inherits a rule's tag when the sentence that follows its own sentence matches
# the rule's pattern. The patterns must come from NextSentencePostprocessPatternRules (the
# relation imported from 'postprocess_pattern_next_sentence_rules.csv'), not from the
# same-sentence PostprocessPatternRules.
CovidTags(Path,Mention,Tag,"next sentence")<-\
    CovidMentionSents(Path,Mention,Sent),\
    SentPairs(Path,Sent,NextSent),\
    NextSentencePostprocessPatternRules(Pattern,Tag),\
    rgx(Pattern,NextSent)->(ContextSpan)

?CovidTags(Path, Mention,Tag,DerivedFrom)

'?CovidTags(Path,Mention,Tag,DerivedFrom)'

Unnamed: 0,Path,Mention,Tag,DerivedFrom
0,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'negated','sentence context'
1,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'positive','sentence context'
2,'sample1.txt',"[@931cb5,94,102) ""COVID-19""",'positive','sentence context'
3,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'IGNORE','post pattern'
4,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'future','sentence context'
5,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'negated','sentence context'
6,'sample3.txt',"[@882253,44,52) ""COVID-19""",'positive','section'
7,'sample6.txt',"[@b2612f,26,34) ""COVID-19""",'patient_experiencer','sentence context'


### [Document Classifier](https://github.com/abchapman93/VA_COVID-19_NLP_BSV/blob/master/cov_bsv/knowledge_base/document_classifier.py):

Now we have the basic pieces in place to make our document classification. Each document is classified as either 'POS', 'UNK', or 'NEG' determined by the attributes of its COVID-19 mentions. The Results are stored in a DataFrame.

Document Classifier stage has 2 parts:
 1) **Attribute filtering**: Our pipeline assigns various attributes to each COVID-19 mention. During this stage, the attributes of each mention are reduced to a single attribute, according to the conditions implemented in the 'agg_mention' function.
 2) **Document classification**: Documents are classified based on the conditions implemented in the 'AggDocumentTags' function. This step ensures the accurate categorization of each document according to the specified criteria.


In [None]:
def agg_mention(group):
    """
    Reduce all tags collected for one COVID-19 mention to a single tag.

    Args:
        group: the tags of one mention, exposed through a ``.values``
            attribute (e.g. a pandas Series of strings).

    Returns:
        str: the aggregated tag, chosen by the first rule that applies:
            - 'IGNORE' if 'IGNORE' is present.
            - 'negated' if 'negated' is present and 'no_negated' is not.
            - 'negated' if 'future' is present and 'no_future' is not.
            - 'negated' if an other-experiencer or not-relevant tag is present.
            - 'positive' if 'positive' is present and neither 'uncertain'
              nor 'no_positive' is.
            - 'uncertain' otherwise.
    """
    tags = group.values
    if 'IGNORE' in tags:
        return 'IGNORE'
    if 'negated' in tags and 'no_negated' not in tags:
        return 'negated'
    if 'future' in tags and 'no_future' not in tags:
        return 'negated'
    # The other tags in this pipeline are spelled with underscores
    # (e.g. 'patient_experiencer'), so accept the underscore spelling as well
    # as the spaced one; otherwise this branch can never match such tags.
    excluding_tags = (
        'other experiencer', 'other_experiencer',
        'not relevant', 'not_relevant',
    )
    if any(tag in tags for tag in excluding_tags):
        return 'negated'
    if 'positive' in tags and 'uncertain' not in tags and 'no_positive' not in tags:
        return 'positive'
    return 'uncertain'

# Register agg_mention with the session under the name 'agg_mention' so spannerlog rules can
# call it; the two [str] lists are presumably its input and output column types — TODO confirm.
sess.register_agg('agg_mention',agg_mention,[str],[str])

In [None]:
%%spannerlog
# TODO why dont we see the docid in the span
# Collapse the tags collected for each (Path, Mention) into a single tag using the
# agg_mention aggregation; the Derivation column is not carried over.
AggregatedCovidTags(Path,Mention,agg_mention(Tag))<-\
    CovidTags(Path,Mention,Tag,Derivation)

?AggregatedCovidTags(Path,Mention,Tag)

'?AggregatedCovidTags(Path,Mention,Tag)'

Unnamed: 0,Path,Mention,Tag
0,'sample1.txt',"[@931cb5,84,92) ""COVID-19""",'negated'
1,'sample1.txt',"[@931cb5,94,102) ""COVID-19""",'positive'
2,'sample2.txt',"[@e4b074,87,95) ""COVID-19""",'IGNORE'
3,'sample3.txt',"[@882253,44,52) ""COVID-19""",'positive'
4,'sample6.txt',"[@b2612f,26,34) ""COVID-19""",'uncertain'


In [None]:
def AggDocumentTags(group):
    """
    Turn the aggregated mention tags of one document into a document label.

    Parameters:
        group (pandas.Series): the tags of the document's COVID-19 mentions.

    Returns:
        str: the label of the first condition that holds:
             - 'POS': some mention is tagged "positive".
             - 'UNK': some mention is tagged "uncertain".
             - 'NEG': some mention is tagged "negated".
             - 'UNK': none of these tags occur (e.g. only 'IGNORE' mentions).
    """
    present = group.values
    # Ordered by priority: the first tag that occurs decides the label.
    priority = (('positive', 'POS'), ('uncertain', 'UNK'), ('negated', 'NEG'))
    for tag, label in priority:
        if tag in present:
            return label
    return 'UNK'

# Register AggDocumentTags with the session under the name 'agg_doc_tags' so spannerlog rules can
# call it; the two [str] lists are presumably its input and output column types — TODO confirm.
sess.register_agg('agg_doc_tags',AggDocumentTags,[str],[str])

In [None]:
%%spannerlog
# Classify each document: aggregate the per-mention tags of a Path with agg_doc_tags.
DocumentTags(Path,agg_doc_tags(Tag))<-\
    AggregatedCovidTags(Path,Mention,Tag)

?DocumentTags(Path,Tag)

'?DocumentTags(Path,Tag)'

Unnamed: 0,Path,Tag
0,'sample1.txt','POS'
1,'sample2.txt','UNK'
2,'sample3.txt','POS'
3,'sample6.txt','UNK'


#### Handling unmentioned paths:
At this step, we assign a classification result 'UNK' to paths not identified in the previous DataFrame result. This occurs when our pipeline doesn't detect any mention of COVID-19 or its synonyms in the text of those paths. As a result, these paths are excluded from all types of relations, consistent with our primary focus on COVID-19 entities.

In [None]:
# Fetch every known file path, outer-join it with the per-document results so that documents
# without any classified mention still get a row, and label those rows 'UNK'.
# NOTE(review): `df` and its 'DocResult' column are not produced by the classifier cells of this
# section (the DocumentTags relation names its column 'Tag') — confirm where `df` is defined.
df_path = (sess.run_commands("?FilesPaths(Path)", print_results=False, format_results=True))[0]
df = (pd.merge(df, df_path, on='Path', how='outer'))
df['DocResult'] = df['DocResult'].fillna("UNK")
df

Unnamed: 0,Path,DocResult
0,sample1.txt,POS
1,sample2.txt,POS
2,sample3.txt,POS
3,sample4.txt,UNK
4,sample5.txt,POS
5,sample6.txt,UNK
6,sample7.txt,UNK


## Bringing It All Together

In this section, we will directly compare the original Python Spacy pipeline project with its spannerlog counterpart. Our emphasis is on showcasing the overall brevity of the spannerlog implementation in contrast to the Python Spacy pipeline.

### Code Metrics

Let's commence by providing an estimated count of total lines in each implementation:

- **Total Number of Lines in the original Python implementation:** **4435**
- **Total Number of Lines in our spannerlog implementation:** **596** (7 times smaller!)

And here's a detailed comparison:

![code line comparison](img/line_counting.png)

With the caveat that the number of lines does not fully capture code complexity, let us analyze the lines of code in a little more depth.
Analyzing our implementation vs the original we note that:

- We used the same libraries as the original implementations, so both
  - the core computations, that should turn into ie functions
  - the wrapping logic which remains in pure python
  did not significantly change in size.
- Even if we assume that our 203 lines of Python code correspond to over 300 lines of the original implementation's core and wrapping logic, we are still left with over 4000 lines of code that were converted into 393 lines (107+251+35) of declarative code and data.
- This means that over 90% of the original code base, which constitutes control flow and data ingestion logic, underwent a ten-fold decrease in size while providing a smaller surface area for errors, since declarative languages and data can be statically analyzed to a greater extent than imperative code.

### Implementation - raw lines of code

Now, we will present the combined spannerlog and python code (excluding "generic ie" functions and excluding queries) to visually illustrate the compactness of the implementation:

#### Concept tagger:
```python
def lemmatize_text(text_path, lemma_words_path):
    """Lemmatize the file at text_path in place and yield the rewritten text.

    A token whose lemma is listed in the lemma-words file is replaced by that
    lemma, a number-like token by the literal "like_num"; every other token is
    kept as is. Each token is followed by a single space.

    Args:
        text_path: path of the text file; it is overwritten with the result.
        lemma_words_path: path of a file with one lemma per line (blank lines are skipped).

    Yields:
        The lemmatized text, once.
    """
    # Read the lemmas inside a `with` block so the file handle is closed,
    # and keep them in a set for constant-time membership tests.
    with open(lemma_words_path) as lemma_file:
        lemma_words = {line.strip() for line in lemma_file if line.strip()}

    with open(text_path, 'r') as file:
        contents = file.read()

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(contents)

    pieces = []
    for token in doc:
        if token.lemma_ in lemma_words:
            pieces.append(token.lemma_)
        elif token.like_num:
            pieces.append("like_num")
        else:
            pieces.append(token.text)
    # Every token, including the last one, is followed by one space.
    lemmatized_text = "".join(piece + " " for piece in pieces)

    # Write the lemmatized text back to the same file
    with open(text_path, 'w') as file:
        file.write(lemmatized_text)

    yield lemmatized_text

def annotate_text_with_pos(text_path):
    """Yield a (POS tag, span) pair for each noun-like token of the file at text_path.

    Tokens tagged NOUN, PROPN, PRON or ADJ yield their tag together with a Span
    built from the token's character offsets; every other token yields an empty tuple.

    Args:
        text_path: path of the text file to annotate.
    """
    # The text has to be read from text_path before it can be parsed;
    # without this step `contents` is undefined.
    with open(text_path, 'r') as file:
        contents = file.read()

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(contents)

    for token in doc:
        if token.pos_ in ["NOUN", "PROPN", "PRON", "ADJ"]:
            yield token.pos_, Span(token.idx, token.idx + len(token.text))
        else:
            yield tuple()
```

```python
# Uses `sess`, the session object every other listing works with.
sess.import_rel("concept_tags_rules.csv", relation_name="ConceptTagRules", delimiter=",")

%%spannerlog
LemmaMatches(Label, Span, Path) <- FilesContent(Path, Content), ConceptTagRules(Pattern, Label, "lemma"), py_rgx_span(Content, Pattern) -> (Span)
replace_spans("LemmaMatches", "FilesPaths")
POSTable(POS, Span, Path) <- FilesContent(Path, Content), annotate_text_with_pos(Path) -> (POS, Span)
POSMatches(Label, Span, Path) <- FilesContent(Path, Content), ConceptTagRules(Pattern, Label, "pos"), py_rgx_span(Content, Pattern) -> (Span)
POSRuleMatches(Label, Span, Path) <- POSTable(POS, Span, Path), POSMatches(Label, Span, Path)
replace_spans("POSRuleMatches", "FilesPaths")
```

#### Target matcher:
```python
sess.import_rel("target_rules.csv", relation_name="TargetTagRules", delimiter=",")

%%spannerlog
TargetTagMatches(Label, Span, Path) <- FilesContent(Path, Content), TargetTagRules(Pattern, Label), py_rgx_span(Content,Pattern) -> (Span)
replace_spans("TargetTagMatches", "FilesPaths")
```

#### Sectionizer:
```python

sess.import_rel("section_rules.csv", relation_name="SectionRules", delimiter=",")

%%spannerlog
SectionRulesMatches(Label, Span, Path) <- FilesContent(Path, Content), SectionRules(Pattern, Label), py_rgx_span(Content,Pattern) -> (Span)
replace_spans("SectionRulesMatches", "FilesPaths")

pattern = "(?i)(?:diagnoses :|observation_and_plan :|past_medical_history :|problem_list :)(?:(?!labs :|addendum :|allergies :|chief_complaint :|comments :|family_history :|hospital_course :|imaging :|labs_and_studies :|medications :|neurological :|other :|patient_education :|physical_exam :|reason_for_examination :|signature :|social_history :).)*"

new SectionRulesAttribute(str, str)
SectionRulesAttribute(pattern, "positive")
SectionMatches(Path, Span, CovidAttribute) <- FilesContent(Path, Content), SectionRulesAttribute(Pattern, CovidAttribute), py_rgx_span(Content, Pattern) -> (Span)
CovidMatches(Path, Span) <- FilesContent(Path, Content), py_rgx_span(Content, "COVID-19") -> (Span)
SectionCovidAttributes(Path, CovidSpan, CovidAttribute) <- SectionMatches(Path, Span1, CovidAttribute), CovidMatches(Path, Span2), is_span_contained(Span1, Span2) -> (CovidSpan)

Sents(Path, Sent) <- FilesPaths(Path), sent_tokenization(Path) -> (Sent)
SentSpans(Path, Sent, SentSpan) <- FilesContent(Path, Content), Sents(Path, Sent), py_rgx_span(Content, Sent) -> (SentSpan)

CovidAttributes(Path, CovidSpan, CovidAttribute, Sent) <- SectionCovidAttributes(Path, AbsCovidSpan, CovidAttribute),\
SentSpans(Path, Sent, SentSpan) ,get_relative_span(AbsCovidSpan, SentSpan) -> (CovidSpan)
 
```

#### Context matcher:
```python
sess.import_rel("context_rules.csv", relation_name="ContextRules", delimiter="#")

%%spannerlog
ContextMatches(CovidAttribute, Span, Path, Sent) <- Sents(Path, Sent), ContextRules(Pattern, CovidAttribute),\
py_rgx_span(Sent, Pattern) -> (Span)
CovidSpans(Path, Span, Sent) <- Sents(Path, Sent), py_rgx_span(Sent, "COVID-19") -> (Span)
CovidAttributes(Path, CovidSpan, CovidAttribute, Sent) <- ContextMatches(CovidAttribute, Span1, Path, Sent), CovidSpans(Path, Span2, Sent), is_span_contained(Span1, Span2) -> (CovidSpan)
```

#### Postprocessor:
```python
def next_sent(text_path):
    """Yield (sentence text, following sentence text) pairs for the file at text_path."""
    with open(text_path, 'r') as file:
        contents = file.read()

    doc = spacy.load("en_core_web_sm")(contents)

    # Pair every sentence with its successor; the last sentence has none,
    # so it only ever appears as the second element of a pair.
    sentences = list(doc.sents)
    for current, following in zip(sentences, sentences[1:]):
        yield (current.text, following.text)

sess.register(ie_function=next_sent, ie_function_name = "next_sent", in_rel=[DataTypes.string], out_rel=[DataTypes.string,DataTypes.string])
```

```python
sess.import_relation_from_csv("postprocess_pattern_rules.csv", relation_name="PostprocessRules", delimiter="#")

%%spannerlog
PostprocessMatches(CovidAttribute, Span, Path, Sent) <- Sents(Path, Sent), PostprocessRules(Pattern, CovidAttribute),\
py_rgx_span(Sent, Pattern) -> (Span)
CovidAttributes(Path, CovidSpan, CovidAttribute, Sent) <- PostprocessMatches(CovidAttribute, Span1, Path, Sent), CovidSpans(Path, Span2, Sent), is_span_contained(Span1, Span2) -> (CovidSpan)

NextSent(Path, Sent1, Sent2) <- FilesPaths(Path), next_sent(Path) -> (Sent1, Sent2)
new PostProcessWithNextSentenceRules(str, str)
PostProcessWithNextSentenceRules("(?i)(?:^(?:positive|detected)|results?(?: be)? positive)", "positive")
PostProcessWithNextSentenceMatches(CovidAttribute, Span, Path, Sent) <- Sents(Path, Sent), PostProcessWithNextSentenceRules(Pattern, CovidAttribute),\
py_rgx_span(Sent, Pattern) -> (Span)
CovidAttributes(Path, CovidSpan, CovidAttribute, Sent1) <- CovidSpans(Path, CovidSpan, Sent1), NextSent(Path, Sent1, Sent2), PostProcessWithNextSentenceMatches(CovidAttribute, Span, Path, Sent2)
```

#### Document Classifier:
```python
def attribute_filter(group):
    """Reduce all attributes collected for one COVID-19 mention to a single attribute.

    Args:
        group: the attributes of one mention, exposed through a ``.values``
            attribute (e.g. a pandas Series of strings).

    Returns:
        str: 'IGNORE', 'negated', 'positive' or 'uncertain', decided by the
        first matching rule in the order written below.
    """
    tags = group.values
    if 'IGNORE' in tags:
        return 'IGNORE'
    if 'negated' in tags and 'no_negated' not in tags:
        return 'negated'
    if 'future' in tags and 'no_future' not in tags:
        return 'negated'
    # Attribute names are otherwise spelled with underscores, so accept the
    # underscore spelling as well as the spaced one.
    excluding_tags = (
        'other experiencer', 'other_experiencer',
        'not relevant', 'not_relevant',
    )
    if any(tag in tags for tag in excluding_tags):
        return 'negated'
    if 'positive' in tags and 'uncertain' not in tags and 'no_positive' not in tags:
        return 'positive'
    return 'uncertain'

# Query every (path, span, attribute, sentence) row collected by the pipeline.
df = (sess.run_commands("?CovidAttributes(Path, CovidSpan, CovidAttribute, Sent)", print_results=False, format_results=True))[0]
if len(df) == 0:
    # The empty fallback must expose every column used below, including 'Sent'
    # (a groupby key); without it the groupby raises KeyError.
    df = DataFrame(columns=["Path","CovidSpan","CovidAttribute","Sent"])
# Replace each mention's attributes with its single filtered attribute, then drop the repeated rows.
df['CovidAttribute'] = df.groupby(['CovidSpan', 'Sent'])['CovidAttribute'].transform(attribute_filter)
df = df.drop_duplicates().reset_index(drop=True)

def classify_doc_helper(group):
    """Map the filtered attributes of one document to 'POS', 'NEG' or 'UNK'.

    'positive' wins over 'uncertain', which wins over 'negated'; a document
    with none of these attributes is 'UNK'.
    """
    seen = group.values
    ranked = (('positive', 'POS'), ('uncertain', 'UNK'), ('negated', 'NEG'))
    return next((label for tag, label in ranked if tag in seen), 'UNK')
        
# Label every row with its document's classification, then keep one (Path, DocResult) row per document.
df['DocResult'] = df.groupby('Path')['CovidAttribute'].transform(classify_doc_helper)
df = df[['Path', 'DocResult']]
df = df.drop_duplicates().reset_index(drop=True)

# Outer-join with the full list of file paths so documents that produced no rows above
# still appear, and give those documents the result 'UNK'.
df_path = (sess.run_commands("?FilesPaths(Path)", print_results=False, format_results=True))[0]
df = (pd.merge(df, df_path, on='Path', how='outer'))
df['DocResult'] = df['DocResult'].fillna("UNK")
df
```