In [None]:
#| default_exp banchmark.covid

In [None]:
#| hide
from nbdev.showdoc import show_doc
from IPython.display import display, HTML
%load_ext autoreload
%autoreload 2
from itables import init_notebook_mode,show
init_notebook_mode(all_interactive=False,connected=False)


In [None]:
#| exports
# importing dependencies
import re
import csv
import time
import pandas as pd
from pandas import DataFrame
from pathlib import Path
from spannerlib import get_magic_session,Session
from spannerlib.ie_func.basic import rgx, rgx_is_match, rgx_split, span_arity, span_contained

# Backend selector. These assignments run in order, so the last uncommented one
# ("SPANNERFLOW_PYTHON_IE") is the value in effect; the earlier "OLD" assignment is overwritten.
# Comment/uncomment these lines to switch backend.
VERSION = "OLD"
# VERSION = "SPANNERFLOW"
VERSION = "SPANNERFLOW_PYTHON_IE"
# Span comes from spannerflow for the two SPANNERFLOW variants and from spannerlib otherwise;
# every later cell uses whichever class is bound here.
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    from spannerflow.span import Span
else:
    from spannerlib import Span
# Session object used by all sess.* calls below (presumably the same session the
# %%spannerlog cell magic runs against — confirm against spannerlib).
sess = get_magic_session()


# Register the regex/span IE functions imported from spannerlib.ie_func.basic under the names
# the rules use. The arguments appear to be: name, callable, input types, output types
# (rgx passes span_arity instead of a fixed output list).
sess.register('rgx', rgx, [str, Span], span_arity)
sess.register('rgx_split', rgx_split, [str, Span], [Span,Span])
sess.register('rgx_is_match', rgx_is_match, [str, Span], [bool])
sess.register('span_contained', span_contained, [Span, Span], [bool])    

# spaCy pipeline used by split_sentence, LemmaFromList and PosFromList below.
import spacy
nlp = spacy.load("en_core_web_sm")


In [None]:
# Wall-clock timestamp taken before the pipeline cells run. No cell visible here reads it back
# (presumably kept for timing the benchmark — confirm).
start_time = time.time()

In [None]:
#|exports
# configurations
# File path interpolated into the first `%%spannerlog -o {slog_file}` cell.
slog_file = Path('covid_data/covid_logic.pl')
# Directory whose *.txt files are loaded as the input documents.
input_dir = Path('covid_data/sample_inputs')
# Directory holding the rule CSV files and lemma_words.txt.
data_dir = Path('covid_data/rules_data')

In [None]:
#| exports
def split_sentence(text):
    """
    Splits a text into individual sentences using spaCy's sentence detection.

    Args:
        text (str or Span): the text to split; it is converted with ``str``
            before being handed to spaCy.

    Yields:
        Span: one span per detected sentence, anchored on ``text``, so the
            location of each sentence is kept.
    """

    doc = nlp(str(text))
    for sentence in doc.sents:
        # Use the character offsets spaCy reports for the sentence. Rebuilding them as
        # "previous end + 1" is only right when sentences are separated by exactly one
        # character; any other spacing (double spaces, newlines) shifts every later span.
        # Note that consecutive sentences are therefore one character apart only when a
        # single whitespace character separates them.
        yield Span(text, sentence.start_char, sentence.end_char)

In [None]:
#| exports
class LemmaFromList():
    """IE function that tags tokens whose lemma is in a given list, plus number-like tokens.

    Calling an instance on a text yields ``(Span, tag)`` pairs: the tag is the
    token's lemma when that lemma is in ``lemma_list``, otherwise ``'like_num'``
    when spaCy flags the token as number-like. Other tokens yield nothing.
    """
    def __init__(self,lemma_list):
        self.lemma_list = lemma_list

    def __call__(self,text):
        for token in nlp(str(text)):
            if token.lemma_ in self.lemma_list:
                tag = token.lemma_
            elif token.like_num:
                tag = 'like_num'
            else:
                # neither a listed lemma nor a number: nothing to emit
                continue
            begin = token.idx
            yield (Span(text, begin, begin + len(token.text)), tag)

# The lemma list is the whitespace-separated content of lemma_words.txt in data_dir.
lemma_list = (data_dir/'lemma_words.txt').read_text().split()
# Callable instance later registered with the session as the `lemma` IE function.
lemmatizer = LemmaFromList(lemma_list)

In [None]:
#| exports
class PosFromList():
    """IE function that tags tokens whose part-of-speech is in a given list.

    Calling an instance on a text yields a ``(Span, pos)`` pair for every
    token whose spaCy ``pos_`` is in ``pos_list``.
    """
    def __init__(self,pos_list):
        self.pos_list = pos_list
    def __call__(self,text):
        for token in nlp(str(text)):
            if token.pos_ not in self.pos_list:
                continue
            begin = token.idx
            yield (Span(text, begin, begin + len(token.text)), token.pos_)

# Callable instance later registered as the `pos` IE function; only these four POS tags are emitted.
pos_annotator = PosFromList(["NOUN", "PROPN", "PRON", "ADJ"])

In [None]:
#| exports
# Register the spaCy-based IE functions under the names the rules use.
# Only the declared input type differs between backends: Span for the SPANNERFLOW
# variants, str otherwise. Output types are the same in both branches.
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    sess.register('split_sentence',split_sentence,[Span],[Span])
    sess.register('pos',pos_annotator,[Span],[Span,str])
    sess.register('lemma',lemmatizer,[Span],[Span,str])
else:
    sess.register('split_sentence',split_sentence,[str],[Span])
    sess.register('pos',pos_annotator,[str],[Span,str])
    sess.register('lemma',lemmatizer,[str],[Span,str])

In [None]:
#| exports
def rewrite(text,span_label_pairs):
    """Rewrites a string by replacing the given spans with their labels.

    Assumes that the spans belong to the text. Spans are applied left to right;
    a span that starts inside a region that was already replaced (an overlapping
    or duplicate span) is skipped, and when several spans start at the same
    position the longest one wins.

    Args:
        text (str like): string to rewrite; a Span is converted with ``as_str()``
        span_label_pairs (pd.DataFrame): dataframe with two columns, first is spans in the doc
            to rewrite, second is the string to rewrite each span to
    Returns:
        The rewritten string
    """
    if isinstance(text,Span):
        text = text.as_str()
    # Order by start; on ties put the longest span first so it is the one that is kept.
    ordered_pairs = sorted(
        span_label_pairs.itertuples(index=False,name=None),
        key=lambda pair: (pair[0].start, -pair[0].end),
    )

    pieces = []
    current_pos = 0
    for span,label in ordered_pairs:
        if span.start < current_pos:
            # This span overlaps text that was already replaced. Applying it would emit a
            # second label and could move the cursor backwards, duplicating source text.
            continue
        pieces.append(text[current_pos:span.start])
        pieces.append(label)
        current_pos = span.end

    pieces.append(text[current_pos:])

    # Join once instead of growing a string inside the loop.
    return ''.join(pieces)


In [None]:
#| export
def rewrite_docs(docs,span_label,new_version):
    """Given a dataframe of documents of the form (path,doc,version) and a dataframe of spans to rewrite
    of the form (path,doc,from_span,to_tag), rewrites the documents and returns a new dataframe of the form
    (path,doc,new_version)

    Args:
        docs (pd.DataFrame): three columns (path, doc, version); the version column is ignored
        span_label (pd.DataFrame): four columns (path, doc, span, label). It is not modified.
            An empty result (no rows) is accepted and leaves every document's text unchanged.
        new_version (str): value written to the version column of the result
    Returns:
        pd.DataFrame with columns ['P','D','V'], one row per row of ``docs``
    """
    if len(span_label) == 0:
        # Nothing to rewrite; also covers an empty result that carries no columns at all.
        labelled = pd.DataFrame(columns=['P','D','W','L'])
    else:
        # Rename on a copy so the caller's dataframe keeps its own column names.
        labelled = span_label.copy()
        labelled.columns = ['P','D','W','L']

    new_tuples = []
    for path,doc,_ in docs.itertuples(index=False,name=None):
        # Spans are matched to documents by path only.
        span_label_per_doc = labelled.loc[labelled['P'] == path, ['W','L']]
        new_text = rewrite(doc,span_label_per_doc)
        new_tuples.append((path,new_text,new_version))
    return pd.DataFrame(new_tuples,columns=['P','D','V'])
    

In [None]:
#| export
# Load the rule tables from CSV files under data_dir into session relations.
# Note the delimiter is not uniform: ',' for most files, '#' for the sentence-context and the
# two postprocess rule files.
sess.import_rel("ConceptTagRules",data_dir/"concept_tags_rules.csv" , delim=",")
sess.import_rel("TargetTagRules",data_dir/"target_rules.csv",delim=",")
sess.import_rel("SectionTags",data_dir/"section_tags.csv",delim=",")
sess.import_rel("PositiveSectionTags",data_dir/"positive_section_tags.csv",delim=",")
sess.import_rel("SentenceContextRules",data_dir/'sentence_context_rules.csv',delim="#")
sess.import_rel("PostprocessPatternRules",data_dir/'postprocess_pattern_rules.csv',delim="#")
sess.import_rel("PostprocessRulesWithAttributes",data_dir/'postprocess_attributes_rules.csv',delim="#")
# NOTE(review): no rule visible in this notebook reads NextSentencePostprocessPatternRules.
sess.import_rel("NextSentencePostprocessPatternRules",data_dir/'postprocess_pattern_next_sentence_rules.csv',delim=',')


In [None]:
#| export
from glob import glob
# glob returns matches in a filesystem-dependent order; sort them so the document table
# (and everything displayed from it) is the same on every run and machine.
file_paths = sorted(Path(p) for p in glob(str(input_dir/'*.txt')))
# One row per input file: file name, full text, and the version tag 'raw_text'.
raw_docs = pd.DataFrame([
    [p.name,p.read_text(),'raw_text'] for p in file_paths
],columns=['Path','Doc','Version']
)
# The SPANNERFLOW variants are given an explicit column scheme (the Doc column typed as Span).
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    sess.import_rel('Docs',raw_docs, scheme=[str, Span, str])
else:
    sess.import_rel('Docs',raw_docs)
raw_docs

Unnamed: 0,Path,Doc,Version
0,sample10.txt,patient was screened for cov-19. results came ...,raw_text
1,sample8.txt,Patient was sent for a covid test. Someone was...,raw_text
2,sample9.txt,Patient had contact patient with coronavirus. ...,raw_text
3,sample7.txt,Elevated cholesterol levels require further as...,raw_text
4,sample6.txt,The patient have reported novel coronavirus.,raw_text
5,sample4.txt,neg covid education.,raw_text
6,sample5.txt,positive covid precaution.,raw_text
7,sample1.txt,Patient presents to be tested for COVID-19. Hi...,raw_text
8,sample2.txt,The patient was tested for Coronavirus 2019. R...,raw_text
9,sample3.txt,Problem List: 1. Pneumonia 2. Novel Coronaviru...,raw_text


In [None]:
%%spannerlog -o {slog_file}
# For every document stored with version "raw_text", run the registered `lemma` IE function
# on its text and keep each (span, tag) pair it yields.
Lemmas(P,D,Word,Lem)<-Docs(P,D,"raw_text"),lemma(D)->(Word,Lem).
?Lemmas(A,B,C,D)

I0000 00:00:1734274325.212999  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@0e1178,0,77) "Patient had contact patient with coronavirus. screening positive coronavirus."
[@a6c01c,0,139) "Patient presents to be tested for COVID-19. His wife recently tested positive for novel coronavirus. SARS-COV-2 results came back positive."
[@aad8ff,0,63) "Patient was sent for a covid test. Someone was tested positive."
[@45bf63,0,53) "Problem List: 1. Pneumonia 2. Novel Coronavirus 2019 "
[@2473a3,0,45) "The patient have reported novel coronavirus. "
[@591f89,0,115) "The patient was tested for Coronavirus 2019. Results are positive. Patient underwent no Coronavirus 2019 education."
[@3ac307,0,21) "neg covid education. "
[@9f417c,0,60) "patient was screened for cov-19. results came back positive."
[@2e40a3,0,26) "positive covid precaution."
[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@0e1178,0,77) "Patient had contact

[@0e1178,0,77) "Patient had contact patient with coronavirus. screening positive coronavirus."
[@0e1178,12,19) "contact"
[@0e1178,0,77) "Patient had contact patient with coronavirus. screening positive coronavirus."
[@0e1178,8,11) "had"
[@0e1178,0,77) "Patient had contact patient with coronavirus. screening positive coronavirus."
[@0e1178,20,27) "patient"
[@a6c01c,0,139) "Patient presents to be tested for COVID-19. His wife recently tested positive for novel coronavirus. SARS-COV-2 results came back positive."
[@a6c01c,20,22) "be"
[@a6c01c,0,139) "Patient presents to be tested for COVID-19. His wife recently tested positive for novel coronavirus. SARS-COV-2 results came back positive."
[@a6c01c,0,7) "Patient"
[@aad8ff,0,63) "Patient was sent for a covid test. Someone was tested positive."
[@aad8ff,8,11) "was"
[@aad8ff,0,63) "Patient was sent for a covid test. Someone was tested positive."
[@aad8ff,43,46) "was"
[@45bf63,0,53) "Problem List: 1. Pneumonia 2. Novel Coronavirus 2019 "
[@45b

'?Lemmas(A,B,C,D)'

A,B,C,D
sample1.txt,"[@a6c01c,0,139) ""Patient pr...""","[@a6c01c,0,7) ""Patient""",patient
sample1.txt,"[@a6c01c,0,139) ""Patient pr...""","[@a6c01c,20,22) ""be""",be
sample10.txt,"[@9f417c,0,60) ""patient wa...""","[@9f417c,0,7) ""patient""",patient
sample10.txt,"[@9f417c,0,60) ""patient wa...""","[@9f417c,8,11) ""was""",be
sample2.txt,"[@591f89,0,115) ""The patien...""","[@591f89,39,43) ""2019""",like_num
sample2.txt,"[@591f89,0,115) ""The patien...""","[@591f89,100,104) ""2019""",like_num
sample2.txt,"[@591f89,0,115) ""The patien...""","[@591f89,67,74) ""Patient""",patient
sample2.txt,"[@591f89,0,115) ""The patien...""","[@591f89,53,56) ""are""",be
sample2.txt,"[@591f89,0,115) ""The patien...""","[@591f89,4,11) ""patient""",patient
sample2.txt,"[@591f89,0,115) ""The patien...""","[@591f89,12,15) ""was""",be


In [None]:
#| export
# Export the lemma matches, replace each matched span in the raw documents with its tag,
# and load the rewritten documents into Docs with the version 'lemma'.
lemma_tags = sess.export('?Lemmas(P,D,W,L)')
lemma_docs = rewrite_docs(raw_docs,lemma_tags,'lemma')
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    sess.import_rel('Docs',lemma_docs, scheme=[str, Span, str])
else:
    sess.import_rel('Docs',lemma_docs)


I0000 00:00:1734274328.871972  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@0e1178,0,77) "Patient had contact patient with coronavirus. screening positive coronavirus."
[@0e1178,12,19) "contact"
[@0e1178,0,77) "Patient had contact patient with coronavirus. screening positive coronavirus."
[@0e1178,8,11) "had"
[@0e1178,0,77) "Patient had contact patient with coronavirus. screening positive coronavirus."
[@0e1178,20,27) "patient"
[@a6c01c,0,139) "Patient presents to be tested for COVID-19. His wife recently tested positive for novel coronavirus. SARS-COV-2 results came back positive."
[@a6c01c,20,22) "be"
[@a6c01c,0,139) "Patient presents to be tested for COVID-19. His wife recently tested positive for novel coronavirus. SARS-COV-2 results came back positive."
[@a6c01c,0,7) "Patient"
[@aad8ff,0,63) "Patient was sent for a covid test. Someone was tested positive."
[@aad8ff,8,11) "was"
[@aad8ff,0,63) "Patient was sent for a covid test. Someone was tested positive."
[@aad8ff,43,46) "was"
[@45bf63,0,53) "Problem List: 1. Pneumonia 2. Novel Coronavirus 2019 "
[@45b

In [None]:

%%spannerlog
# Match every concept rule of type "lemma" against the "lemma" version of each document;
# each regex hit becomes a (span, label) pair.
LemmaConceptMatches(Path,Doc,Span,Label) <- 
    Docs(Path,Doc,"lemma"),
    ConceptTagRules(Pattern, Label, "lemma"),
    # TODO CHANGE: on different version
    rgx(Pattern,Doc) -> (Span).
?LemmaConceptMatches(A,B,C,D)

I0000 00:00:1734274331.992565  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@2893ce,0,61) "Patient be sent for a covid test. Someone be tested positive."
[@2893ce,52,60) "positive"
[@2893ce,0,61) "Patient be sent for a covid test. Someone be tested positive."
[@2893ce,22,27) "covid"
[@2893ce,0,61) "Patient be sent for a covid test. Someone be tested positive."
[@2893ce,0,7) "Patient"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,57,65) "positive"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,34,45) "coronavirus"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,66,77) "coronavirus"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,0,7) "Patient"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,21,28) "patient"
[@389fbb,0,71) "Problem List: like_num. Pneumonia like_num. Novel Coron

'?LemmaConceptMatches(A,B,C,D)'

A,B,C,D
sample1.txt,"[@4d073b,0,139) ""patient pr...""","[@4d073b,34,42) ""COVID-19""",COVID-19
sample1.txt,"[@4d073b,0,139) ""patient pr...""","[@4d073b,101,111) ""SARS-COV-2""",COVID-19
sample1.txt,"[@4d073b,0,139) ""patient pr...""","[@4d073b,82,99) ""novel coro...""",COVID-19
sample1.txt,"[@4d073b,0,139) ""patient pr...""","[@4d073b,0,7) ""patient""",patient
sample1.txt,"[@4d073b,0,139) ""patient pr...""","[@4d073b,69,77) ""positive""",positive
sample1.txt,"[@4d073b,0,139) ""patient pr...""","[@4d073b,130,138) ""positive""",positive
sample10.txt,"[@f3a9fd,0,59) ""patient be...""","[@f3a9fd,0,7) ""patient""",patient
sample10.txt,"[@f3a9fd,0,59) ""patient be...""","[@f3a9fd,50,58) ""positive""",positive
sample2.txt,"[@a5d37d,0,121) ""The patien...""","[@a5d37d,26,37) ""Coronaviru...""",COVID-19
sample2.txt,"[@a5d37d,0,121) ""The patien...""","[@a5d37d,90,101) ""Coronaviru...""",COVID-19


In [None]:
#| export
# Export the lemma-level concept matches, replace each matched span in the 'lemma' documents
# with its label, and load the result into Docs with the version 'lemma_concept'.
lemma_concept_matches = sess.export('?LemmaConceptMatches(Path,Doc,Span,Label)')
lemma_concepts = rewrite_docs(lemma_docs,lemma_concept_matches,'lemma_concept')
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    sess.import_rel('Docs',lemma_concepts, scheme=[str, Span, str])
else:
    sess.import_rel('Docs',lemma_concepts)

I0000 00:00:1734274336.749429  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@2893ce,0,61) "Patient be sent for a covid test. Someone be tested positive."
[@2893ce,52,60) "positive"
[@2893ce,0,61) "Patient be sent for a covid test. Someone be tested positive."
[@2893ce,22,27) "covid"
[@2893ce,0,61) "Patient be sent for a covid test. Someone be tested positive."
[@2893ce,0,7) "Patient"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,57,65) "positive"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,34,45) "coronavirus"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,66,77) "coronavirus"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,0,7) "Patient"
[@539a7c,0,78) "Patient have contact patient with coronavirus. screening positive coronavirus."
[@539a7c,21,28) "patient"
[@389fbb,0,71) "Problem List: like_num. Pneumonia like_num. Novel Coron

In [None]:
%%spannerlog
# Here we run the registered `pos` IE function on every "lemma_concept" document and keep
# each (span, POS tag) pair it yields.
# Note: the head variable is named Lem, but in this relation it holds the POS tag.
Pos(P,D,Word,Lem)<-Docs(P,D,"lemma_concept"),pos(D)->(Word,Lem).
?Pos(A,B,C,D)

I0000 00:00:1734274340.526373  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@a2c41c,0,8) "Elevated"
[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@a2c41c,36,43) "further"
[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@a2c41c,9,20) "cholesterol"
[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@a2c41c,21,27) "levels"
[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@a2c41c,44,54) "assessment"
[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@a2c41c,59,68) "lifestyle"
[@a2c41c,0,83) "Elevated cholesterol levels require further assessment and lifestyle adjustments . "
[@a2c41c,69,80) "adjustments"
[@882253,0,62) "Problem List: like_num. Pneumonia like_num. COVID-19 like_num "
[@882253,0,7) "Proble

'?Pos(A,B,C,D)'

A,B,C,D
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,34,42) ""COVID-19""",NOUN
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,82,90) ""COVID-19""",NOUN
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,92,100) ""COVID-19""",PROPN
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,44,47) ""His""",PRON
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,0,7) ""patient""",ADJ
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,69,77) ""positive""",ADJ
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,119,127) ""positive""",ADJ
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,8,16) ""presents""",NOUN
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,101,108) ""results""",NOUN
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,48,52) ""wife""",NOUN


In [None]:
%%spannerlog
# Here we look for matches of concept rules of type "pos" where the matched span is also
# present in Pos, i.e. it is exactly a token that received one of the tracked POS tags.
# POSLabel is bound but not used in the head.
PosConceptMatches(Path,Doc,Span,Label) <- 
    Docs(Path,Doc,"lemma_concept"),
    ConceptTagRules(Pattern, Label, "pos"),
    # TODO CHANGE: on different version
    rgx(Pattern,Doc) -> (Span),
    Pos(Path,Doc,Span,POSLabel).
?PosConceptMatches(A,B,C,D)

I0000 00:00:1734274344.708065  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@1edc3c,0,64) "patient be sent for a COVID-19 test. Someone be tested positive."
[@1edc3c,37,44) "Someone"
[@668ee5,0,128) "patient presents to be tested for COVID-19. His wife recently tested positive for COVID-19. COVID-19 results came back positive."
[@668ee5,48,52) "wife"


'?PosConceptMatches(A,B,C,D)'

A,B,C,D
sample1.txt,"[@668ee5,0,128) ""patient pr...""","[@668ee5,48,52) ""wife""",family
sample8.txt,"[@1edc3c,0,64) ""patient be...""","[@1edc3c,37,44) ""Someone""",other_experiencer


In [None]:
#| export
# Export the POS-constrained concept matches, replace each matched span in the 'lemma_concept'
# documents with its label, and load the result into Docs with the version 'pos_concept'.
pos_concept_matches = sess.export('?PosConceptMatches(P,D,W,L)')
pos_concept_docs = rewrite_docs(lemma_concepts,pos_concept_matches,'pos_concept')
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    sess.import_rel('Docs',pos_concept_docs, scheme=[str, Span, str])
else:
    sess.import_rel('Docs',pos_concept_docs)

I0000 00:00:1734274351.425787  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@1edc3c,0,64) "patient be sent for a COVID-19 test. Someone be tested positive."
[@1edc3c,37,44) "Someone"
[@668ee5,0,128) "patient presents to be tested for COVID-19. His wife recently tested positive for COVID-19. COVID-19 results came back positive."
[@668ee5,48,52) "wife"


In [None]:
%%spannerlog
# Match every target rule pattern against the "pos_concept" version of each document;
# each regex hit becomes a (span, label) pair.
TargetMatches(Path,Doc, Span, Label) <- 
    Docs(Path,Doc,"pos_concept"),
    # TODO CHANGE: on different version
    TargetTagRules(Pattern, Label), rgx(Pattern,Doc) -> (Span).
?TargetMatches(A,B,C,D)

I0000 00:00:1734274356.934909  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@e00245,0,72) "patient have contact patient with COVID-19. screening positive COVID-19."
[@e00245,44,71) "screening positive COVID-19"


'?TargetMatches(A,B,C,D)'

A,B,C,D
sample9.txt,"[@e00245,0,72) ""patient ha...""","[@e00245,44,71) ""screening ...""",positive coronavirus screening


In [None]:
#| export
# Export the target matches, replace each matched span in the 'pos_concept' documents with its
# label, and load the result into Docs with the version 'target_concept'.
target_matches = sess.export('?TargetMatches(P,D,W,L)')
target_rule_docs = rewrite_docs(pos_concept_docs,target_matches,'target_concept')
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    sess.import_rel('Docs',target_rule_docs, scheme=[str, Span, str])
else:
    sess.import_rel('Docs',target_rule_docs)

I0000 00:00:1734274361.267146  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@e00245,0,72) "patient have contact patient with COVID-19. screening positive COVID-19."
[@e00245,44,71) "screening positive COVID-19"


In [None]:
#| export
# Read the same section_tags.csv that backs the SectionTags relation into a DataFrame.
# names= supplies the column names, so the first line of the file is read as data.
section_tags = pd.read_csv(data_dir/'section_tags.csv',names=['literal','tag'])

In [None]:
#| export
# We programmatically build a regex that matches all the section patterns: the section
# literals are joined with '|' as-is (no regex escaping is applied). The pattern is then
# exposed to the rules as the variable $section_delimeter_pattern.
section_delimeter_pattern = section_tags['literal'].str.cat(sep='|')
sess.import_var('section_delimeter_pattern',section_delimeter_pattern)
section_delimeter_pattern

'Lab results:|ADDENDUM:|Addendum:|ALLERGIC REACTIONS:|ALLERGIES:|CC:|CHIEF COMPLAINT:|Chief Complaint:|COMMENTS:|ADMISSION DIAGNOSES:|DIAGNOSES:|Diagnosis:|Primary Diagnosis:|Primary:|SECONDARY DIAGNOSES:|Secondary Diagnoses:|Secondary Diagnosis:|Secondary:|Family History:|Brief Hospital Course:|CONCISE SUMMARY OF HOSPITAL COURSE BY ISSUE/SYSTEM:|HOSPITAL COURSE:|SUMMARY OF HOSPITAL COURSE:|IMAGING:|INTERPRETATION:|Imaging:|MRI:|Radiology:|ADMISSION LABS:|Admission Labs:|Discharge Labs:|ECHO:|FINDINGS:|Findings:|INDICATION:|LABS:|Labs:|MICRO:|Micro:|Microbiology:|Pertinent Results:|STUDIES:|Studies:|ACTIVE MEDICATIONS LIST:|ACTIVE MEDICATIONS:|ADMISSION MEDICATIONS:|CURRENT MEDICATIONS:|DISCHARGE MEDICATIONS:|Discharge Medications:|HOME MEDICATIONS:|MEDICATIONS AT HOME:|MEDICATIONS LIST:|MEDICATIONS ON ADMISSION:|MEDICATIONS ON DISCHARGE:|MEDICATIONS ON TRANSFER:|MEDICATIONS PRIOR TO ADMISSION:|MEDICATIONS:|MEDICATIONS:|Neuro:|A/P:|ASSESSMENT/PLAN:|ASSESSMENT:|Assessment/Plan:|Clinical

In [None]:
%%spannerlog
# we get section spans and their content using our regex pattern and the rgx_split ie function;
# the section title span is converted to a string so it can be joined with SectionTags below
Sections(P,D,Sec,Content)<-Docs(P,D,"target_concept"),
    rgx_split($section_delimeter_pattern,D)->(SecSpan,Content),
    as_str(SecSpan)->(Sec).
?Sections(A,B,C,D)

# a section is positive when the tag of its title is listed in PositiveSectionTags
PositiveSections(P,D,Sec,Content)<-Sections(P,D,Sec,Content),SectionTags(Sec,Tag),PositiveSectionTags(Tag).
?PositiveSections(A,B,C,D)

I0000 00:00:1734274365.347851  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,0,62) "Problem List: like_num. Pneumonia like_num. COVID-19 like_num "
[@882253,13,62) " like_num. Pneumonia like_num. COVID-19 like_num "


'?Sections(A,B,C,D)'

A,B,C,D
sample3.txt,"[@882253,0,62) ""Problem Li...""",Problem List:,"[@882253,13,62) "" like_num...."""


I0000 00:00:1734274370.194009  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,0,62) "Problem List: like_num. Pneumonia like_num. COVID-19 like_num "
[@882253,13,62) " like_num. Pneumonia like_num. COVID-19 like_num "


'?PositiveSections(A,B,C,D)'

A,B,C,D
sample3.txt,"[@882253,0,62) ""Problem Li...""",Problem List:,"[@882253,13,62) "" like_num...."""


In [None]:
%%spannerlog
# Split every "target_concept" document into sentence spans with the registered
# split_sentence IE function.
Sents(P,S)<-Docs(P,D,"target_concept"),split_sentence(D)->(S).
?Sents(A,B)

I0000 00:00:1734274376.343504  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@a2c41c,0,82) "Elevated cholesterol levels require further assessment and lifestyle adjustments ."
[@882253,0,23) "Problem List: like_num."
[@882253,24,43) "Pneumonia like_num."
[@882253,44,52) "COVID-19"
[@882253,53,61) "like_num"
[@e4b074,0,44) "The patient be tested for COVID-19 like_num."
[@e4b074,45,65) "Results be positive."
[@e4b074,66,115) "patient underwent no COVID-19 like_num education."
[@b2612f,0,35) "The patient have reported COVID-19."
[@77c574,0,23) "neg COVID-19 education."
[@f3a9fd,0,31) "patient be screened for cov-19."
[@f3a9fd,32,59) "results came back positive."
[@3db2e4,0,36) "patient be sent for a COVID-19 test."
[@3db2e4,37,74) "other_experiencer be tested positive."
[@6d2862,0,43) "patient have contact patient with COVID-19."
[@6d2862,44,75) "positive coronavirus screening."
[@931cb5,0,43) "patient presents to be tested for COVID-19."
[@931cb5,44,93) "His family recently tested positive for COVID-19."
[@931cb5,94,130) "COVID-19 results came back positive."
[@

'?Sents(A,B)'

A,B
sample1.txt,"[@931cb5,94,130) ""COVID-19 r..."""
sample1.txt,"[@931cb5,44,93) ""His family..."""
sample1.txt,"[@931cb5,0,43) ""patient pr..."""
sample10.txt,"[@f3a9fd,0,31) ""patient be..."""
sample10.txt,"[@f3a9fd,32,59) ""results ca..."""
sample2.txt,"[@e4b074,45,65) ""Results be..."""
sample2.txt,"[@e4b074,0,44) ""The patien..."""
sample2.txt,"[@e4b074,66,115) ""patient un..."""
sample3.txt,"[@882253,44,52) ""COVID-19"""
sample3.txt,"[@882253,24,43) ""Pneumonia ..."""


In [None]:
from itertools import pairwise

def sentence_pairs(text):
    yield from pairwise(split_sentence(text))

# Register sentence_pairs as an IE function; as with the other spaCy-based functions, only the
# declared input type depends on the backend.
# NOTE(review): no rule visible in this notebook calls sentence_pairs — SentPairs is built
# with is_adjacent instead.
if VERSION in ["SPANNERFLOW", "SPANNERFLOW_PYTHON_IE"]:
    sess.register('sentence_pairs',sentence_pairs,[Span],[Span,Span])
else:
    sess.register('sentence_pairs',sentence_pairs,[str],[Span,Span])

In [None]:
def is_adjacent(span1,span2):
    """Yield a single value telling whether ``span2`` directly follows ``span1``.

    The spans count as adjacent when they carry the same ``name`` and ``span2``
    starts exactly one position after ``span1`` ends.
    """
    same_source = span1.name == span2.name
    yield same_source and span2.start == span1.end + 1

# Register is_adjacent as an IE function taking two spans and returning one boolean.
sess.register('is_adjacent',is_adjacent,[Span,Span],[bool])

In [None]:
%%spannerlog
# Pair up sentences of the same document where the second one directly follows the first,
# as decided by the is_adjacent IE function.
SentPairs(P,S1,S2)<-Sents(P,S1),Sents(P,S2),is_adjacent(S1,S2)->(True).
?SentPairs(A,B,C)

I0000 00:00:1734274380.160619  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,0,23) "Problem List: like_num."
[@882253,24,43) "Pneumonia like_num."
[@882253,24,43) "Pneumonia like_num."
[@882253,44,52) "COVID-19"
[@882253,44,52) "COVID-19"
[@882253,53,61) "like_num"
[@e4b074,0,44) "The patient be tested for COVID-19 like_num."
[@e4b074,45,65) "Results be positive."
[@e4b074,45,65) "Results be positive."
[@e4b074,66,115) "patient underwent no COVID-19 like_num education."
[@f3a9fd,0,31) "patient be screened for cov-19."
[@f3a9fd,32,59) "results came back positive."
[@3db2e4,0,36) "patient be sent for a COVID-19 test."
[@3db2e4,37,74) "other_experiencer be tested positive."
[@6d2862,0,43) "patient have contact patient with COVID-19."
[@6d2862,44,75) "positive coronavirus screening."
[@931cb5,0,43) "patient presents to be tested for COVID-19."
[@931cb5,44,93) "His family recently tested positive for COVID-19."
[@931cb5,44,93) "His family recently tested positive for COVID-19."
[@931cb5,94,130) "COVID-19 results came back positive."


'?SentPairs(A,B,C)'

A,B,C
sample1.txt,"[@931cb5,44,93) ""His family...""","[@931cb5,94,130) ""COVID-19 r..."""
sample1.txt,"[@931cb5,0,43) ""patient pr...""","[@931cb5,44,93) ""His family..."""
sample10.txt,"[@f3a9fd,0,31) ""patient be...""","[@f3a9fd,32,59) ""results ca..."""
sample2.txt,"[@e4b074,45,65) ""Results be...""","[@e4b074,66,115) ""patient un..."""
sample2.txt,"[@e4b074,0,44) ""The patien...""","[@e4b074,45,65) ""Results be..."""
sample3.txt,"[@882253,44,52) ""COVID-19""","[@882253,53,61) ""like_num"""
sample3.txt,"[@882253,24,43) ""Pneumonia ...""","[@882253,44,52) ""COVID-19"""
sample3.txt,"[@882253,0,23) ""Problem Li...""","[@882253,24,43) ""Pneumonia ..."""
sample8.txt,"[@3db2e4,0,36) ""patient be...""","[@3db2e4,37,74) ""other_expe..."""
sample9.txt,"[@6d2862,0,43) ""patient ha...""","[@6d2862,44,75) ""positive c..."""


In [None]:
%%spannerlog
# first we get the covid mentions: every occurrence of the literal "COVID-19" in the
# "target_concept" version of a document
# TODO CHANGE: on different version
CovidMentions(Path, Span) <- Docs(Path,D,"target_concept"), rgx("COVID-19",D) -> (Span).
?CovidMentions(A,B)
# then we attach to each mention the sentence that contains it, using the span_contained ie function
# TODO CHANGE: on different version
CovidMentionSents(P,Mention,Sent)<-CovidMentions(P,Mention),Sents(P,Sent),span_contained(Mention,Sent)->(True).
?CovidMentionSents(A,B,C)

I0000 00:00:1734274385.213628  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,44,52) "COVID-19"
[@e4b074,26,34) "COVID-19"
[@e4b074,87,95) "COVID-19"
[@b2612f,26,34) "COVID-19"
[@77c574,4,12) "COVID-19"
[@3db2e4,22,30) "COVID-19"
[@6d2862,34,42) "COVID-19"
[@931cb5,34,42) "COVID-19"
[@931cb5,84,92) "COVID-19"
[@931cb5,94,102) "COVID-19"
[@ffb7c7,9,17) "COVID-19"


'?CovidMentions(A,B)'

A,B
sample1.txt,"[@931cb5,34,42) ""COVID-19"""
sample1.txt,"[@931cb5,84,92) ""COVID-19"""
sample1.txt,"[@931cb5,94,102) ""COVID-19"""
sample2.txt,"[@e4b074,26,34) ""COVID-19"""
sample2.txt,"[@e4b074,87,95) ""COVID-19"""
sample3.txt,"[@882253,44,52) ""COVID-19"""
sample4.txt,"[@77c574,4,12) ""COVID-19"""
sample5.txt,"[@ffb7c7,9,17) ""COVID-19"""
sample6.txt,"[@b2612f,26,34) ""COVID-19"""
sample8.txt,"[@3db2e4,22,30) ""COVID-19"""


I0000 00:00:1734274389.486780  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,44,52) "COVID-19"
[@882253,44,52) "COVID-19"
[@e4b074,26,34) "COVID-19"
[@e4b074,0,44) "The patient be tested for COVID-19 like_num."
[@e4b074,87,95) "COVID-19"
[@e4b074,66,115) "patient underwent no COVID-19 like_num education."
[@b2612f,26,34) "COVID-19"
[@b2612f,0,35) "The patient have reported COVID-19."
[@77c574,4,12) "COVID-19"
[@77c574,0,23) "neg COVID-19 education."
[@3db2e4,22,30) "COVID-19"
[@3db2e4,0,36) "patient be sent for a COVID-19 test."
[@6d2862,34,42) "COVID-19"
[@6d2862,0,43) "patient have contact patient with COVID-19."
[@931cb5,34,42) "COVID-19"
[@931cb5,0,43) "patient presents to be tested for COVID-19."
[@931cb5,84,92) "COVID-19"
[@931cb5,44,93) "His family recently tested positive for COVID-19."
[@931cb5,94,102) "COVID-19"
[@931cb5,94,130) "COVID-19 results came back positive."
[@ffb7c7,9,17) "COVID-19"
[@ffb7c7,0,29) "positive COVID-19 precaution."


'?CovidMentionSents(A,B,C)'

A,B,C
sample1.txt,"[@931cb5,34,42) ""COVID-19""","[@931cb5,0,43) ""patient pr..."""
sample1.txt,"[@931cb5,84,92) ""COVID-19""","[@931cb5,44,93) ""His family..."""
sample1.txt,"[@931cb5,94,102) ""COVID-19""","[@931cb5,94,130) ""COVID-19 r..."""
sample2.txt,"[@e4b074,26,34) ""COVID-19""","[@e4b074,0,44) ""The patien..."""
sample2.txt,"[@e4b074,87,95) ""COVID-19""","[@e4b074,66,115) ""patient un..."""
sample3.txt,"[@882253,44,52) ""COVID-19""","[@882253,44,52) ""COVID-19"""
sample4.txt,"[@77c574,4,12) ""COVID-19""","[@77c574,0,23) ""neg COVID-..."""
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""","[@ffb7c7,0,29) ""positive C..."""
sample6.txt,"[@b2612f,26,34) ""COVID-19""","[@b2612f,0,35) ""The patien..."""
sample8.txt,"[@3db2e4,22,30) ""COVID-19""","[@3db2e4,0,36) ""patient be..."""


In [None]:
%%spannerlog

# note that for ease of debugging, we extended our head to track which rule a fact was derived from
# (the fourth column of CovidTags names the rule that produced the tag)

# a tag is positive if it is contained in a positive section
CovidTags(Path,Mention,'positive','section')<-
    PositiveSections(Path,D,Title,Section),
    CovidMentions(Path,Mention),
    # TODO CHANGE: on different version
    span_contained(Mention,Section)->(True).

?CovidTags(A,B,C,D)

# Context rules tags: the rule pattern must match inside the mention's sentence, the match
# must contain the mention, and the rule's disambiguation pattern must NOT match the sentence
CovidTags(Path,Mention,Tag,'sentence context')<-
    CovidMentionSents(Path,Mention,Sent),
    SentenceContextRules(Pattern,Tag,DisambiguationPattern),
    # TODO CHANGE: on different version
    rgx(Pattern,Sent)->(ContextSpan),
    # TODO CHANGE: on different version
    span_contained(Mention,ContextSpan)->(True),
    # TODO CHANGE: on different version
    rgx_is_match(DisambiguationPattern,Sent)->(False).

?CovidTags(A,B,C,D)

# post processing based on pattern: the pattern match must contain the mention
CovidTags(Path,Mention,Tag,'post pattern')<-
    CovidMentionSents(Path,Mention,Sent),
    PostprocessPatternRules(Pattern,Tag),
    # TODO CHANGE: on different version
    rgx(Pattern,Sent)->(ContextSpan),
    # TODO CHANGE: on different version
    span_contained(Mention,ContextSpan)->(True).

# post processing based on pattern and existing attributes:
# a mention that already carries OldTag gets Tag when the pattern match contains the mention
# notice the recursive call to CovidTags
CovidTags(Path,Mention,Tag,"post attribute change")<-
    CovidTags(Path,Mention,OldTag,Derivation),
    PostprocessRulesWithAttributes(Pattern,OldTag,Tag),
    CovidMentionSents(Path,Mention,Sent),
    # TODO CHANGE: on different version
    rgx(Pattern,Sent)->(ContextSpan),
    # TODO CHANGE: on different version
    span_contained(Mention,ContextSpan)->(True).

?CovidTags(A,B,C,D)

# post processing based on pattern in the next sentence: the pattern only has to match
# somewhere in the sentence that follows the mention's sentence
# NOTE(review): this rule joins PostprocessPatternRules, while the relation
# NextSentencePostprocessPatternRules is imported earlier and never used — confirm which
# rule table is intended here.
CovidTags(Path,Mention,Tag,"next sentence")<-
    CovidMentionSents(Path,Mention,Sent),
    SentPairs(Path,Sent,NextSent),
    PostprocessPatternRules(Pattern,Tag),
    # TODO CHANGE: on different version
    rgx(Pattern,NextSent)->(ContextSpan).

?CovidTags(A,B,C,D)

I0000 00:00:1734274396.075014  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,44,52) "COVID-19"


'?CovidTags(A,B,C,D)'

A,B,C,D
sample3.txt,"[@882253,44,52) ""COVID-19""",positive,section


I0000 00:00:1734274406.004619  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,44,52) "COVID-19"
[@77c574,4,12) "COVID-19"
[@77c574,4,12) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@b2612f,26,34) "COVID-19"


'?CovidTags(A,B,C,D)'

A,B,C,D
sample3.txt,"[@882253,44,52) ""COVID-19""",positive,section
sample4.txt,"[@77c574,4,12) ""COVID-19""",future,sentence context
sample4.txt,"[@77c574,4,12) ""COVID-19""",negated,sentence context
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",future,sentence context
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",positive,sentence context
sample6.txt,"[@b2612f,26,34) ""COVID-19""",patient_experiencer,sentence context


I0000 00:00:1734274423.280963  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,44,52) "COVID-19"
[@77c574,4,12) "COVID-19"
[@77c574,4,12) "COVID-19"
[@77c574,4,12) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@b2612f,26,34) "COVID-19"


'?CovidTags(A,B,C,D)'

A,B,C,D
sample3.txt,"[@882253,44,52) ""COVID-19""",positive,section
sample4.txt,"[@77c574,4,12) ""COVID-19""",IGNORE,post pattern
sample4.txt,"[@77c574,4,12) ""COVID-19""",future,sentence context
sample4.txt,"[@77c574,4,12) ""COVID-19""",negated,sentence context
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",future,sentence context
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",no_future,post attribute change
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",positive,sentence context
sample6.txt,"[@b2612f,26,34) ""COVID-19""",patient_experiencer,sentence context


I0000 00:00:1734274447.441028  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,44,52) "COVID-19"
[@77c574,4,12) "COVID-19"
[@77c574,4,12) "COVID-19"
[@77c574,4,12) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@b2612f,26,34) "COVID-19"


'?CovidTags(A,B,C,D)'

A,B,C,D
sample3.txt,"[@882253,44,52) ""COVID-19""",positive,section
sample4.txt,"[@77c574,4,12) ""COVID-19""",IGNORE,post pattern
sample4.txt,"[@77c574,4,12) ""COVID-19""",future,sentence context
sample4.txt,"[@77c574,4,12) ""COVID-19""",negated,sentence context
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",future,sentence context
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",no_future,post attribute change
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",positive,sentence context
sample6.txt,"[@b2612f,26,34) ""COVID-19""",patient_experiencer,sentence context


In [None]:
#| export
def agg_mention(group):
    """
    Collapse every attribute tag attached to one covid mention into a single label.

    Rules are tried in priority order and the first match wins:
      1. 'IGNORE' present                                   -> 'IGNORE'
      2. 'negated' present, not cancelled by 'no_negated'   -> 'negated'
      3. 'future' present, not cancelled by 'no_future'     -> 'negated'
      4. 'other experiencer' or 'not relevant' present      -> 'negated'
      5. 'positive' present, with neither 'uncertain'
         nor 'no_positive'                                  -> 'positive'
      6. anything else                                      -> 'uncertain'

    `group` is the collection of tags for the mention; only `in` membership
    tests are performed on it.
    """
    def has(tag):
        # single place where membership on the incoming group is evaluated
        return tag in group

    if has('IGNORE'):
        return 'IGNORE'

    # a "no_<attr>" tag cancels the corresponding attribute
    if has('negated') and not has('no_negated'):
        return 'negated'
    if has('future') and not has('no_future'):
        return 'negated'

    if has('other experiencer') or has('not relevant'):
        return 'negated'

    if has('positive') and not has('uncertain') and not has('no_positive'):
        return 'positive'

    return 'uncertain'

#| export
def AggDocumentTags(group):
    """
    Reduce the per-mention tags of one document to a document-level label.

    Tags are looked up in priority order and the first one found decides:
    'positive' -> 'POS', 'uncertain' -> 'UNK', 'negated' -> 'NEG'.
    When none of them is present the document is labelled 'UNK'.
    """
    priority = (
        ('positive', 'POS'),
        ('uncertain', 'UNK'),
        ('negated', 'NEG'),
    )
    for tag, label in priority:
        if tag in group:
            return label
    # no decisive tag at all (e.g. only 'IGNORE' mentions)
    return 'UNK'


# Register the two aggregators above with the session under the names the
# spannerlog rules call them by: agg_mention(...) and agg_doc_tags(...).
# NOTE(review): the trailing [str], [str] look like the input and output
# schemas of each aggregator — confirm against register_agg's signature.
sess.register_agg('agg_mention',agg_mention,[str],[str])
sess.register_agg('agg_doc_tags',AggDocumentTags,[str],[str])

In [None]:
%%spannerlog
# One row per (Path, Mention): agg_mention folds every Tag derived for that mention.
AggregatedCovidTags(Path,Mention,agg_mention(Tag))<-
    CovidTags(Path,Mention,Tag,Derivation).
?AggregatedCovidTags(A,B,C)

# One row per Path: agg_doc_tags folds the per-mention tags of that document.
DocumentTags(Path,agg_doc_tags(Tag))<-
    AggregatedCovidTags(Path,Mention,Tag).
?DocumentTags(A,B)


I0000 00:00:1734274472.739193  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


[@882253,44,52) "COVID-19"
[@77c574,4,12) "COVID-19"
[@ffb7c7,9,17) "COVID-19"
[@b2612f,26,34) "COVID-19"


'?AggregatedCovidTags(A,B,C)'

A,B,C
sample3.txt,"[@882253,44,52) ""COVID-19""",positive
sample4.txt,"[@77c574,4,12) ""COVID-19""",IGNORE
sample5.txt,"[@ffb7c7,9,17) ""COVID-19""",positive
sample6.txt,"[@b2612f,26,34) ""COVID-19""",uncertain


I0000 00:00:1734274498.614330  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


'?DocumentTags(A,B)'

A,B
sample3.txt,POS
sample4.txt,UNK
sample5.txt,POS
sample6.txt,UNK


In [None]:
#| export
# Pull the per-document labels out of the session into Python.
# The merge in the next cell relies on the result having columns 'P' (file name)
# and 'T' (label), i.e. named after the query's free variables.
doc_tags = sess.export('?DocumentTags(P,T)')

I0000 00:00:1734274524.741991  625950 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


In [None]:
#| export
# every input file by name, so documents without any covid mention still get a row
paths = pd.DataFrame([p.name for p in file_paths],columns=['P'])
# outer-join the derived labels onto the file list; files with no label default to 'UNK'
classification = (
    paths
    .merge(doc_tags, how='outer', on='P')
    .assign(T=lambda merged: merged['T'].fillna('UNK'))
)
classification

Unnamed: 0,P,T
0,sample1.txt,UNK
1,sample10.txt,UNK
2,sample2.txt,UNK
3,sample3.txt,POS
4,sample4.txt,UNK
5,sample5.txt,POS
6,sample6.txt,UNK
7,sample7.txt,UNK
8,sample8.txt,UNK
9,sample9.txt,UNK


In [None]:
# Wall-clock duration of the whole run, measured from start_time
# (taken with time.time() right after the imports at the top of the notebook).
end_time = time.time()
print(f"Number of Documents: {len(file_paths)}")
print(f"Time taken: {end_time-start_time:.2f} seconds")

Number of Documents: 10
Time taken: 221.19 seconds


In [None]:
#|hide
# Run nbdev's export step so cells marked for export are written to the library module.
import nbdev; nbdev.nbdev_export()
     

