# A full example of how to use the package

In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split

## dataset

In [2]:
# we will use DDI data set at https://github.com/isegura/DDICorpus
# NOTE(review): the directories below are named N2C2_2018, not DDI -- confirm which corpus is intended.
# Later cells glob <id>.ann files in these directories and read the matching <id>.txt files.
training_root = "./N2C2_2018/train"
testing_root = "./N2C2_2018/test"

## preprocessing using the NLPreprocessing package

> If you do not want to use this package, see the tutorial `brat2bio.ipynb` or the brat conversion script at https://github.com/nlplab/brat/blob/master/tools/anntoconll.py

In [3]:
# download preprocessing package NLP
! git clone https://github.com/uf-hobi-informatics-lab/NLPreprocessing.git

fatal: destination path 'NLPreprocessing' already exists and is not an empty directory.


In [4]:
# link package to python path
# import necessary functions
import sys

# Only directories belong on sys.path; a path to a single .py file is never searched,
# so just the package root and its text_process sub-directory are added.
# The membership check keeps sys.path free of duplicates when this cell is re-run.
for pkg_dir in ("./NLPreprocessing", "./NLPreprocessing/text_process"):
    if pkg_dir not in sys.path:
        sys.path.append(pkg_dir)

import logging
from annotation2BIO import generate_BIO, pre_processing, read_annotation_brat, BIOdata_to_file, logger
from sentence_tokenization import logger as logger1

# change log level to error to avoid too much log information in jupyter notebook
logger1.setLevel(logging.ERROR)
logger.setLevel(logging.ERROR)

In [5]:
# document ids = names of the .ann files (without extension) found in each data directory
train_file_ids = {ann_path.stem for ann_path in Path(training_root).glob("*.ann")}
test_file_ids = {ann_path.stem for ann_path in Path(testing_root).glob("*.ann")}

In [6]:
# generate BIO from brat annotation
train_root = Path(training_root)
train_bio = "./2018n2c2/bio/trains"
output_root = Path(train_bio)
output_root.mkdir(parents=True, exist_ok=True)

for fid in train_file_ids:
    txt_fn = train_root / (fid + ".txt")
    ann_fn = train_root / (fid + ".ann")
    bio_fn = output_root / (fid + ".bio.txt")
    
    txt, sents = pre_processing(txt_fn)
    e2idx, entities, rels = read_annotation_brat(ann_fn)
    nsents, sent_bound = generate_BIO(sents, entities, file_id=fid, no_overlap=False)
    
    BIOdata_to_file(bio_fn, nsents)


2021-11-01 09:43:32,229 ERROR ['LNC', (11457, 11460), (12552, 12555)]	('NC', 'Route', (11458, 11460)) not matched by their offsets.
2021-11-01 09:43:35,909 ERROR ['mgMWF', (9937, 9942), (11370, 11375)]	('MWF', 'Frequency', (9939, 9942)) not matched by their offsets.
2021-11-01 09:43:36,301 ERROR ['uRBCs', (1134, 1139), (1222, 1227)]	('RBCs', 'Drug', (1135, 1139)) not matched by their offsets.
2021-11-01 09:43:40,201 ERROR ['2Lnc', (7411, 7415), (8296, 8300)]	('nc', 'Route', (7413, 7415)) not matched by their offsets.
2021-11-01 09:43:43,731 ERROR ['upRBCs', (945, 951), (1010, 1016)]	('pRBCs', 'Drug', (946, 951)) not matched by their offsets.
2021-11-01 09:43:43,852 ERROR ['/', (1835, 1836), (2048, 2049)]	('Hypoxia', 'ADE', (1836, 1843)) not matched by their offsets.
2021-11-01 09:43:47,514 ERROR ['LNS', (1947, 1950), (2087, 2090)]	('NS', 'Drug', (1948, 1950)) not matched by their offsets.
2021-11-01 09:43:49,611 ERROR ['?', (613, 614), (651, 652)]	('reaction', 'ADE', (605, 613)) not ma

In [7]:
# now we have to split the train and dev sets
# for transformer NER, we need to name these two datasets as train.txt and dev.txt
# Sort the ids before splitting: the iteration order of a set of strings can differ between
# interpreter runs (string hash randomization), so random_state alone would not make the
# train/dev split reproducible after a kernel restart.
train_file_ids = sorted(train_file_ids)
train_ids, dev_ids = train_test_split(train_file_ids, train_size=0.9, random_state=13, shuffle=True)
len(train_ids), len(dev_ids)

(272, 31)

In [8]:
def merge_bio_files(src_dir, file_ids, dest_fn):
    """Concatenate per-document BIO files into one file.

    Args:
        src_dir: directory holding the "<id>.bio.txt" files.
        file_ids: iterable of document ids; files are appended in this order.
        dest_fn: path of the merged output file (overwritten if it exists).
    """
    with open(dest_fn, "w") as fout:
        for fid in file_ids:
            # every source file is opened and closed on its own, so an interrupted run
            # leaves no open handle or module-level reader state behind
            with open(Path(src_dir) / (fid + ".bio.txt")) as fin:
                fout.writelines(fin)


merged = output_root / "merge"  # this will be the final data dir we use for training
merged.mkdir(exist_ok=True, parents=True)

# train
merge_bio_files(output_root, train_ids, merged / "train.txt")

# dev
merge_bio_files(output_root, dev_ids, merged / "dev.txt")

In [9]:
# generate BIO files with the gold-standard labels for the test set
test_root = Path(testing_root)
test_gold_bio = "./2018n2c2/bio/test_gold"
output_root = Path(test_gold_bio)
output_root.mkdir(parents=True, exist_ok=True)

for fid in test_file_ids:
    txt_fn = test_root / (fid + ".txt")
    ann_fn = test_root / (fid + ".ann")
    bio_fn = output_root / (fid + ".bio.txt")

    txt, sents = pre_processing(txt_fn)
    e2idx, entities, rels = read_annotation_brat(ann_fn)
    nsents, sent_bound = generate_BIO(sents, entities, file_id=fid, no_overlap=False)

    BIOdata_to_file(bio_fn, nsents)

# test gold
merged = output_root / "merge"  # dir holding the merged gold-standard test file (test.txt)
merged.mkdir(exist_ok=True, parents=True)

with open(merged / "test.txt", "w") as f:
    for fid in test_file_ids:
        # open/close each per-document file explicitly instead of going through
        # fileinput's module-level reader state
        with open(output_root / (fid + ".bio.txt")) as bio_f:
            f.writelines(bio_f)

2021-11-01 09:44:19,099 ERROR ['/', (9915, 9916), (10741, 10742)]	('VOMITING', 'ADE', (9916, 9924)) not matched by their offsets.
2021-11-01 09:44:21,621 ERROR ['LNS', (937, 940), (985, 988)]	('NS', 'Drug', (938, 940)) not matched by their offsets.
2021-11-01 09:44:22,874 ERROR ['uFFP', (5294, 5298), (5849, 5853)]	('FFP', 'Drug', (5295, 5298)) not matched by their offsets.
2021-11-01 09:44:22,875 ERROR ['uPRBC', (5352, 5357), (5917, 5922)]	('PRBC', 'Drug', (5353, 5357)) not matched by their offsets.
2021-11-01 09:44:24,886 ERROR ['LNS', (869, 872), (913, 916)]	('NS', 'Route', (870, 872)) not matched by their offsets.
2021-11-01 09:44:37,368 ERROR [',', (1881, 1882), (1963, 1964)]	('OD', 'ADE', (1879, 1881)) not matched by their offsets.


## training model

In [None]:
"""
Next we will train the NER model

We will just use a BERT model pre-trained on general English corpora as an example

In general we need GPU to train the model, running with CPU the training will be extremely slow. 

To use GPU, you just need to run 'export CUDA_VISIBLE_DEVICES=0' before run the training command
"""

# -1 indicates using CPU for training; 
# 0 indicate we use the GPU with ID as 0, etc.
! export CUDA_VISIBLE_DEVICES=0 

# this is just an example, please refer to the readme for how to set hyperparameters
! python ../src/run_transformer_ner.py \
      --model_type bert \
      --pretrained_model bert-base-uncased \
      --data_dir ./ddi/ddi_bio/merge \
      --new_model_dir ./new_bert_ner_model \
      --overwrite_model_dir \
      --max_seq_length 128 \
      --data_has_offset_information \
      --save_model_core \
      --do_train \
      --model_selection_scoring strict-f_score-1 \
      --do_lower_case \
      --train_batch_size 8 \
      --train_steps 1000 \
      --learning_rate 1e-5 \
      --num_train_epochs 20 \
      --gradient_accumulation_steps 1 \
      --do_warmup \
      --seed 13 \
      --warmup_ratio 0.1 \
      --max_num_checkpoints 1 \
      --log_file ./log.txt \
      --progress_bar \
      --early_stop 3

## do prediction on each test set file and format prediction as brat output

In [13]:
"""
running prediction

In our transformer package, we have the format conversion between bio and brat; bioc implemented

you still have to convert the txt files for prediction to BIO format,
but here you do not need to assign a real annotation label, we just use O as dummy
"""

# generate bio
test_root = Path(testing_root)
test_bio = "./2018n2c2/bio/test"
output_root = Path(test_bio)
output_root.mkdir(parents=True, exist_ok=True)

for fn in test_root.glob("*.txt"):
    txt_fn = fn
    bio_fn = output_root / (fn.stem + ".bio.txt")

    txt, sents = pre_processing(txt_fn)
    annotations = [] # here we just use an empty list for annotation so that all words will be labeled as O
    # pass the id of the file being processed (fn.stem); `fid` would be a stale value left
    # over from a loop in an earlier cell (or undefined on a fresh kernel)
    nsents, sent_bound = generate_BIO(sents, annotations, file_id=fn.stem, no_overlap=False)

    BIOdata_to_file(bio_fn, nsents)

In [None]:
# run prediction
! export CUDA_VISIBLE_DEVICES=0

# use format 1 for BRAT, 2 for BioC, 0 as default for BIO
# see readme for more information on parameters
! python ../ClinicalTransformerNER/src/run_transformer_batch_prediction.py \
      --model_type bert \
      --pretrained_model ./new_bert_ner_model \
      --raw_text_dir ./N2C2_2018/test \ # this is the dir where original text file located
      --preprocessed_text_dir ./2018n2c2/bio/test \# this is the dir where the BIO file located
      --output_dir ./ddi/results \
      --max_seq_length 128 \
      --do_lower_case \
      --eval_batch_size 8 \
      --log_file ./log.txt\
      --do_format 1 \
      --do_copy \
      --data_has_offset_information

# the bio prediction output will be generateed in ./ddi/results
# the brat prediction output will be generateed in ./ddi/results_formatted_output

## evaluation using brat eval script

In [None]:
# run evaluation
# we have a brat_eval.py script used for evaluation of NER and relation extraction based on brat format

! python ../src/eval_scripts/brat_eval.py --f1 ./ddi/DDICorpusBrat/Test/MedLine --f2 ./ddi/results_formatted_output/