# Overview
This notebook walks through how to train a spaCy NER model using data from the i2b2 2012 Clinical Event Extraction shared task.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
import glob
import re
import xml.etree.ElementTree as ET
import spacy

from spacy.tokens import Span

In [3]:
# import src.i2b2_utils
# import constants

# Define paths
If you would like to get access to the data for the challenge so you can actually train the model yourself, fill out the data access form on the i2b2 website: https://portal.dbmi.hms.harvard.edu/. Once you're approved, you can download the XML files and change the following path variables in the notebook to read in the original data sources.

In [4]:
# Root directory of the i2b2 2012 download. Set the I2B2_DATADIR environment
# variable to point at your own copy; otherwise the original default is used.
DATADIR = os.environ.get('I2B2_DATADIR', '/Users/alec/Data/i2b2_2012')
# Directories holding the training and test XML files, relative to DATADIR.
TRAINDIR = os.path.join(DATADIR, '2012-07-15.original-annotation.release')
TESTDIR = os.path.join(DATADIR, '2012-08-08.test-data.event-timex-groundtruth/xml')

If you don't have the files downloaded, or if the above path variables are not correct, this notebook will use the sample data below. This won't be nearly enough to actually train a model (that takes several thousand examples), but it will be enough to be able to run the script as an example.

In [5]:
# Fallback data used when the i2b2 files are not available.
# Each example is (text, {'entities': [(start_char, end_char, label), ...]}),
# with character offsets relative to the start of the text.
EXAMPLE_TRAIN_DATA = [
    (' While this patient has had no complaints\nof gastrointestinal bleed in the past,'
     ' he should be sent for\nendoscopy to look for varices as an outpatient.',
     {'entities': [(45, 67, 'PROBLEM'),    # "gastrointestinal bleed"
                   (103, 112, 'TEST'),     # "endoscopy"
                   (125, 132, 'PROBLEM')]})  # "varices"
]

EXAMPLE_TEST_DATA = [
    ('The patient has allergy to Bactrim that causes\nhim to have a rash.',
     {'entities': [(16, 23, 'PROBLEM'),    # "allergy"
                   (27, 34, 'TREATMENT'),  # "Bactrim"
                   # End offset is 65 so the span is "a rash" and does not
                   # swallow the sentence-final period.
                   (59, 65, 'PROBLEM')]})
]

In [6]:
# Use the i2b2 corpus only when both directories that are actually read later
# (TRAINDIR and TESTDIR) exist. Checking DATADIR alone would accept a partial
# download and then fail when the XML files are parsed.
USE_I2B2 = os.path.isdir(TRAINDIR) and os.path.isdir(TESTDIR)
if USE_I2B2:
    print("i2b2 data found. Will use that to train.")
else:
    print("No i2b2 data found. Will use example data.")


i2b2 data found. Will use that to train.


# Load in i2b2 data and convert to spaCy

In [7]:
# Load the small English pipeline without its pretrained NER component; a new
# NER component is trained later. `disable` takes a list of pipe names.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

In [8]:
nlp.pipe_names

['tagger', 'parser']

In [9]:
# XML tag names to read annotations from. Kept as a list because the parser
# iterates over it; a bare string would be iterated character by character.
annotation_classes = ["EVENT"]
# Entity labels the model is trained to recognise.
labels = ['PROBLEM', 'TEST', 'TREATMENT']

In [10]:
def parse_i2b2_xmls(directory, nlp, num_docs=-1, annotation_classes=None, labels=None):
    """Read the i2b2 ``*.xml`` files in ``directory`` and convert them to spaCy Docs.

    Args:
        directory: Path of the directory containing the ``.xml`` files.
        nlp: spaCy pipeline called on each report's text to build the Doc.
        num_docs: How many files to read. The default, -1, reads all files
            in the directory. Files are taken in sorted filename order.
        annotation_classes: XML tag names to read annotations from, as a list
            (a single string is also accepted). Defaults to
            ``["EVENT", "TIMEX3"]``.
        labels: Optional collection of labels to keep; annotations whose
            label is not in it are dropped. ``None`` keeps every annotation.

    Returns:
        A list of spaCy Docs, one per successfully read file, with
        ``doc.ents`` set from the XML annotations.
    """
    if annotation_classes is None:
        annotation_classes = ["EVENT", "TIMEX3"]
    elif isinstance(annotation_classes, str):
        # A bare string would otherwise be iterated one character at a time.
        annotation_classes = [annotation_classes]

    assert os.path.exists(directory), "Directory not found: {}".format(directory)
    # Sort so that the selection made by num_docs is reproducible across runs.
    xml_files = sorted(glob.glob(os.path.join(directory, "*.xml")))
    if num_docs != -1:
        xml_files = xml_files[:num_docs]

    docs = []
    for fname in xml_files:
        rpt_id = os.path.basename(fname).split('.')[0]

        # First read in the xml document
        root = read_i2b2_xml(fname)
        if root is None:
            # The reader signals a file it could not read or parse with None.
            continue
        text_node = root.find('TEXT')
        if text_node is None or text_node.text is None:
            print("No TEXT element found: {}".format(fname))
            continue
        doc = nlp(text_node.text)

        spans = []
        seen_offsets = set()  # (start, end) token offsets already collected
        for anno_class in annotation_classes:
            for tag in root.iter(anno_class):
                span = ent_from_xml(tag, anno_class, doc)
                # If specific labels have been given, restrict to those annotations
                if labels is not None and span.label_ not in labels:
                    continue
                offsets = (span.start, span.end)
                if offsets in seen_offsets:
                    continue
                seen_offsets.add(offsets)
                spans.append(span)

        doc.ents = tuple()
        for span in spans:
            try:
                doc.ents += (span,)
            except ValueError:
                # TODO: merge overlapping annotations
                # The span could not be added to doc.ents (presumably because
                # it overlaps an entity that is already set); report the
                # document and skip the span.
                print(rpt_id)

        docs.append(doc)
    return docs

def read_i2b2_xml(filepath):
    """Parse one i2b2 XML file and return its root element.

    Every '&' in the file is replaced with '+' before parsing, because raw
    ampersands make the XML parser fail. The replacement is one character
    for one character, so character offsets into the text are unaffected.

    Args:
        filepath: Path of the XML file to read.

    Returns:
        The root ``xml.etree.ElementTree.Element``, or ``None`` if the file
        could not be read or parsed (the failure is printed, not raised).
    """
    try:
        with open(filepath) as f:
            xmlstring = f.read()
        # NOTE - having '&' in the tags throws an error
        # simple solution - replace them with '+' signs
        xmlstring = xmlstring.replace('&', '+')
        parser = ET.XMLParser(encoding='utf-8')
        root = ET.fromstring(xmlstring, parser=parser)
    except Exception as e:
        # Best effort: report which file failed and let the caller carry on.
        print("Failed: {}".format(filepath))
        print(e)
        return None
    return root

def ent_from_xml(tag, label, doc):
    """Create a spaCy Span for one annotation tag of an i2b2 XML file.

    Args:
        tag: XML element with ``start``, ``end`` and ``type`` attributes.
            ``start``/``end`` are character offsets into ``doc``'s text.
        label: Name of the annotation class the tag came from. Currently
            unused; the Span is labelled with the tag's ``type`` attribute.
        doc: The spaCy Doc built from the report text.

    Returns:
        A ``Span`` over the tokens covered by the annotation, labelled with
        the tag's ``type``.

    Raises:
        ValueError: If the character offsets cannot be mapped to a token
            span of ``doc``.
    """
    start = int(tag.attrib["start"])
    end = int(tag.attrib["end"])
    ent_type = tag.attrib["type"]

    document_span = doc.char_span(start, end)
    if document_span is None:
        # char_span gives None when the offsets do not map onto tokens; fail
        # with a clear message instead of an AttributeError on None.
        raise ValueError(
            "Annotation {!r} ({}-{}) does not align with token boundaries".format(
                tag.attrib.get("id"), start, end))

    return Span(doc, document_span.start, document_span.end, ent_type)
    
    
    

In [11]:
def make_spacy_data(docs):
    """Turn annotated Docs into sentence-level spaCy training examples.

    Each sentence of each doc becomes one ``(text, {"entities": [...]})``
    tuple. Entities are ``(start_char, end_char, label)`` with offsets
    relative to the start of the sentence. Entities with an empty label, or
    whose offsets select an empty string from the sentence text, are left
    out. Sentences without entities are kept with an empty entity list.
    """
    ents_by_sentence = {}
    for doc in docs:
        # Register every sentence first so entity-free sentences are kept.
        for sentence in doc.sents:
            if sentence not in ents_by_sentence:
                ents_by_sentence[sentence] = []
        for entity in doc.ents:
            ents_by_sentence[entity.sent].append(entity)

    examples = []
    for sentence, entities in ents_by_sentence.items():
        entity_offsets = []
        for entity in entities:
            if entity.label_ == '':
                continue
            # Shift document-level offsets so they are relative to the sentence
            begin = entity.start_char - sentence.start_char
            finish = entity.end_char - sentence.start_char
            if sentence.text[begin:finish] != '':
                entity_offsets.append((begin, finish, entity.label_))
        examples.append((sentence.text, {"entities": entity_offsets}))
    return examples



In [12]:
%%time
if USE_I2B2:
    # Parse the XML files into Docs, then split them into per-sentence examples.
    training_docs = parse_i2b2_xmls(TRAINDIR, nlp, labels=labels, annotation_classes=["EVENT"])
    testing_docs = parse_i2b2_xmls(TESTDIR, nlp, labels=labels, annotation_classes=["EVENT"])
    training_data = make_spacy_data(training_docs)
    testing_data = make_spacy_data(testing_docs)
else:
    # Copy the example lists so that shuffling the training data in place
    # later does not modify the EXAMPLE_* constants.
    training_data = list(EXAMPLE_TRAIN_DATA)
    testing_data = list(EXAMPLE_TEST_DATA)

517
517
517
517
517
517
676
426
156
156
168
141
151
CPU times: user 23 s, sys: 1.96 s, total: 25 s
Wall time: 28.2 s


In [13]:
print(len(training_data))
print(len(testing_data))

7640
5821


# Train Model

In [14]:
output_dir = "./my_saved_model"
n_iter = 250

In [15]:
import random

In [16]:
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [17]:
def save_model(directory, model=None):
    """Write a pipeline to ``directory``, creating the directory if needed.

    Args:
        directory: Target directory (string or ``Path``). Missing parent
            directories are created as well.
        model: Object with a ``to_disk(path)`` method. Defaults to the
            notebook-level ``nlp`` pipeline.
    """
    if model is None:
        model = nlp
    directory = Path(directory)
    # parents/exist_ok: works for nested paths and when the directory exists.
    directory.mkdir(parents=True, exist_ok=True)
    model.to_disk(directory)
    print("Saved model to", directory)

## Training loop
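- Iterate through the training data `n_iter` times, shuffling it and updating the NER model in minibatches, and print the loss after each iteration.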

In [18]:
%%time

# Start from a fresh NER component placed at the end of the pipeline.
if "ner" in nlp.pipe_names:
    nlp.remove_pipe("ner")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner, last=True)

# Register each distinct label once, limited to the subset of labels we are
# interested in. Examples without an "entities" key contribute no labels.
train_labels = {
    ent[2]
    for _, example_annotations in training_data
    for ent in example_annotations.get("entities", [])
    if ent[2] in labels
}
for label in sorted(train_labels):
    ner.add_label(label)
print(ner.labels)

# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  # only train NER
    # Initialize the weights of the components that are still enabled.
    nlp.begin_training()
    for itn in range(n_iter):
        # Shuffles training_data in place, so its order changes every epoch.
        random.shuffle(training_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        # NOTE(review): the compounding start (50.0) is larger than its stop
        # (32.0) - confirm that this batch-size schedule is intended.
        batches = minibatch(training_data, size=compounding(50.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        # Report the accumulated loss after every iteration.
        print(f"{itn} / {n_iter}")
        print("Losses", losses)

('PROBLEM', 'TEST', 'TREATMENT')
0 / 250
Losses {'ner': 31704.66421031952}
1 / 250
Losses {'ner': 25490.831581115723}
2 / 250
Losses {'ner': 22712.53517818451}
3 / 250
Losses {'ner': 21193.946857452393}
4 / 250
Losses {'ner': 19895.48380947113}
5 / 250
Losses {'ner': 18842.101117134094}
6 / 250
Losses {'ner': 17302.569165706635}
7 / 250
Losses {'ner': 16162.998267650604}
8 / 250
Losses {'ner': 15713.022518157959}
9 / 250
Losses {'ner': 14661.14384317398}
10 / 250
Losses {'ner': 13669.028096675873}
11 / 250
Losses {'ner': 12949.016832351685}
12 / 250
Losses {'ner': 12479.133256912231}
13 / 250
Losses {'ner': 11888.445013046265}
14 / 250
Losses {'ner': 11413.031613111496}
15 / 250
Losses {'ner': 11288.397390842438}
16 / 250
Losses {'ner': 10837.266374349594}
17 / 250
Losses {'ner': 10524.761800289154}
18 / 250
Losses {'ner': 10240.038081645966}
19 / 250
Losses {'ner': 9956.56172323227}
20 / 250
Losses {'ner': 9761.051298379898}
21 / 250
Losses {'ner': 9457.001722931862}
22 / 250
Losses {

188 / 250
Losses {'ner': 3539.919009283185}
189 / 250
Losses {'ner': 3555.1445163879544}
190 / 250
Losses {'ner': 3581.314238026738}
191 / 250
Losses {'ner': 3621.02473615855}
192 / 250
Losses {'ner': 3451.8410449400544}
193 / 250
Losses {'ner': 3571.260935753584}
194 / 250
Losses {'ner': 3652.4605939537287}
195 / 250
Losses {'ner': 3622.4723713845015}
196 / 250
Losses {'ner': 3482.4711535573006}
197 / 250
Losses {'ner': 3570.8038476035}
198 / 250
Losses {'ner': 3547.090422645211}
199 / 250
Losses {'ner': 3542.04293128103}
200 / 250
Losses {'ner': 3538.650417484343}
201 / 250
Losses {'ner': 3512.7869878299534}
202 / 250
Losses {'ner': 3490.9966707602143}
203 / 250
Losses {'ner': 3517.351877063513}
204 / 250
Losses {'ner': 3394.517757333815}
205 / 250
Losses {'ner': 3488.210429377854}
206 / 250
Losses {'ner': 3473.238218039274}
207 / 250
Losses {'ner': 3484.7658082842827}
208 / 250
Losses {'ner': 3368.0741888284683}
209 / 250
Losses {'ner': 3240.6222820729017}
210 / 250
Losses {'ner': 3

In [19]:
save_model(output_dir)

Saved model to my_saved_model


# Load and evaluate model
Now, you can load your trained model and use it just like you would with any other model! Let's load and test it on our testing data.

In [20]:
# Load from the directory the model was saved to, rather than repeating the
# path as a separate literal that could drift from output_dir.
new_nlp = spacy.load(output_dir)

In [21]:
# Evaluate on test data
scorer = new_nlp.evaluate(testing_data)
print("F1:", scorer.ents_f)

F1: 74.36894210335355


In [22]:
# Compare predicted and expected entities for the first ten test examples
for text, data in testing_data[:10]:
    doc = new_nlp(text)
    predicted = []
    for ent in doc.ents:
        predicted.append((ent.start_char, ent.end_char, ent.label_))
    print("Predicted:", predicted)
    print("Expected:", data["entities"])
    # One (text, entity type, IOB code) triple per token
    token_rows = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
    print("Tokens", token_rows)
    

Predicted: []
Expected: []
Tokens [('\n', '', 2), ('Admission', '', 2), ('Date', '', 2), (':', '', 2), ('\n', '', 2), ('2012', '', 2), ('-', '', 2), ('06', '', 2), ('-', '', 2), ('07', '', 2), ('\n', '', 2)]
Predicted: []
Expected: []
Tokens [('Discharge', '', 2), ('Date', '', 2), (':', '', 2), ('\n', '', 2)]
Predicted: []
Expected: []
Tokens [('2012', '', 2), ('-', '', 2), ('06', '', 2), ('-', '', 2), ('09', '', 2), ('\n', '', 2), ('Service', '', 2), (':', '', 2), ('\n', '', 2)]
Predicted: []
Expected: []
Tokens [('MEDICINE', '', 2), ('\n', '', 2), ('History', '', 2), ('of', '', 2), ('Present', '', 2), ('Illness', '', 2), (':', '', 2), ('\n', '', 2)]
Predicted: [(51, 62, 'PROBLEM'), (65, 81, 'PROBLEM'), (233, 238, 'TREATMENT')]
Expected: [(51, 62, 'PROBLEM'), (65, 81, 'PROBLEM'), (87, 103, 'PROBLEM'), (233, 238, 'TREATMENT')]
Tokens [('Mr.', '', 2), ('Vazquez', '', 2), ('is', '', 2), ('a', '', 2), ('48', '', 2), ('year', '', 2), ('old', '', 2), ('man', '', 2), ('with', '', 2), ('a', '