In [30]:
import spacy
from spacy.tokens import Span

import medspacy
from medspacy.preprocess import Preprocessor, PreprocessingRule
from medspacy.ner import TargetRule
from medspacy.context import ConTextItem, ConTextComponent
from medspacy.section_detection import Sectionizer
from medspacy.postprocess import Postprocessor, PostprocessingRule, PostprocessingPattern
from medspacy.postprocess import postprocessing_functions
from medspacy.visualization import visualize_ent, visualize_dep


import re

# Overview
In this notebook, we'll use a pretrained model for concept extraction instead of defining rules. Around it we'll add preprocessing, context, section detection, and postprocessing. The model has to be installed first:

```bash
pip install https://github.com/abchapman93/spacy_models/raw/master/releases/en_info_3700_i2b2_2012-0.1.0/dist/en_info_3700_i2b2_2012-0.1.0.tar.gz
```

In [2]:
# Read the whole discharge summary into memory. The encoding is spelled out so
# the result does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) -- confirm.
with open("./discharge_summary.txt", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Load the pretrained pipeline by package name (the package installed by the
# pip command in the overview).
nlp = spacy.load("en_info_3700_i2b2_2012")

In [4]:
# Show the names of the components in the freshly loaded pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [5]:
# Run the pipeline over the full note before any custom components are added.
doc = nlp(text)

In [6]:
# Visualize the entities of the baseline document.
visualize_ent(doc)

### Preprocessing

In [7]:
# Wrap the model's tokenizer in a Preprocessor and install the wrapper as the
# pipeline's tokenizer (presumably so preprocessing rules see the raw text
# before the wrapped tokenizer does -- confirm against medspacy).
# NOTE(review): not idempotent -- re-running this cell wraps the already
# installed Preprocessor in a second one.
preprocessor = Preprocessor(nlp.tokenizer)
nlp.tokenizer = preprocessor

In [8]:
# Text transformations handed to the preprocessor. Each entry is either a plain
# callable or a PreprocessingRule pairing a compiled regex with a replacement.
# NOTE(review): presumably the entries are applied in list order to the raw
# text -- confirm against medspacy's Preprocessor.
# Every regex is a raw string so that "\[", "\*" and "\d" reach `re` verbatim
# without triggering Python's invalid-escape-sequence warning.
preprocess_rules = [
    # Lowercase the whole text.
    lambda x: x.lower(),

    PreprocessingRule(
        re.compile(r"\[\*\*[\d]{1,4}-[\d]{1,2}(-[\d]{1,2})?\*\*\]"),
        repl="01-01-2010",
        desc="Replace MIMIC date brackets with a generic date.",
    ),

    PreprocessingRule(
        re.compile(r"\[\*\*[\d]{4}\*\*\]"),
        repl="2010",
        desc="Replace MIMIC year brackets with a generic year.",
    ),

    PreprocessingRule(
        re.compile("dx'd"),
        repl="Diagnosed",
        desc="Replace abbreviation",
    ),

    PreprocessingRule(
        re.compile("tx'd"),
        repl="Treated",
        desc="Replace abbreviation",
    ),

    # No `repl` is passed here, so the rule's default replacement applies
    # (presumably an empty string, matching the "Remove" description -- confirm).
    PreprocessingRule(
        re.compile(r"\[\*\*[^\]]+\]"),
        desc="Remove all other bracketed placeholder text from MIMIC",
    ),
]

In [9]:
# Register the rules with the preprocessor that was installed as nlp.tokenizer.
preprocessor.add(preprocess_rules)

### Context

In [10]:
# Build a ConText component configured with rules="default" and add it to the
# pipeline.
# NOTE(review): re-running this cell calls add_pipe again on the same pipeline.
context = ConTextComponent(nlp, rules="default")
nlp.add_pipe(context)

In [11]:
# Extra ConText item, labelled "HISTORICAL", whose token pattern matches
# "diagnosed in" followed by a four-digit token (compared on lowercase text).
item_data = [
    ConTextItem(
        "diagnosed in <YEAR>",
        "HISTORICAL",
        pattern=[
            {"LOWER": "diagnosed"},
            {"LOWER": "in"},
            # Raw string so "\d" is handed to the regex engine untouched
            # instead of being an invalid Python escape sequence.
            {"LOWER": {"REGEX": r"^[\d]{4}$"}},
        ],
    )
]

In [12]:
# Add the custom item(s) to the ConText component created above.
context.add(item_data)

### Section detection

In [13]:
# Build a Sectionizer configured with patterns="default" and add it to the
# pipeline.
sectionizer = Sectionizer(nlp, patterns="default")
nlp.add_pipe(sectionizer)

In [14]:
# One additional section pattern: the header text "Brief Hospital Course:"
# paired with the section title "hospital_course".
section_patterns = [
    dict(section_title="hospital_course", pattern="Brief Hospital Course:"),
]

In [15]:
# Add the custom section pattern(s) to the sectionizer.
sectionizer.add(section_patterns)

### Postprocessing
Let's remove some of the extracted entities that we know are incorrect.

In [16]:
# Create the postprocessor and add it to the pipeline; its rules are added in
# a later cell.
postprocessor = Postprocessor()
nlp.add_pipe(postprocessor)

In [17]:
# NOTE(review): leftover scratch cell -- "asdf" does not occur in
# "NamePattern1", so this evaluates to None, and the result is never used.
# Safe to delete.
re.search("asdf", "NamePattern1")

In [18]:
def _is_married(ent):
    """Return whether ``ent.lower_`` is exactly "married"."""
    return ent.lower_ == "married"


# A single rule: its only pattern is the _is_married condition, and its action
# is postprocessing_functions.remove_ent.
postprocess_rules = [
    PostprocessingRule(
        patterns=[PostprocessingPattern(condition=_is_married)],
        action=postprocessing_functions.remove_ent,
        description="Remove a specific misclassified span of text.",
    ),
]

In [19]:
# Add the rule(s) to the postprocessor that is already in the pipeline.
postprocessor.add(postprocess_rules)

# Process our document

In [20]:
# Confirm the custom components now follow the model's own components.
nlp.pipe_names

['tagger', 'parser', 'ner', 'context', 'sectionizer', 'postprocessor']

In [21]:
# Re-process the note with the extended pipeline; this overwrites the baseline
# `doc` produced earlier.
doc = nlp(text)

In [22]:
# Visualize the entities of the re-processed document.
visualize_ent(doc)

In [27]:
# Small example containing the "dx'd" / "tx'd" abbreviations and a bracketed
# year placeholder, i.e. text targeted by the preprocessing rules above.
short_text = "Colon cancer dx'd in [**2554**], tx'd with hemicolectomy"
short_doc = nlp(short_text)

In [28]:
# Visualize the entities of the short example.
visualize_ent(short_doc)

In [31]:
# Render the short example with medspacy's visualize_dep helper.
visualize_dep(short_doc)