A clinical named entity recognition model which can recognize the disease names from clinical text.
Named Entity Recognition: (17.3 MB), 8 datasets on biomedical named entity recognition.
train.tsv, test.tsv , dev.tsv and devel.tsv.
In These tsv files each word is annotated using the BIO format.
A few lines from  train.tsv in BC5CDR-disease dataset looks like:
Selegiline	O
-	O
induced	O
postural	B
hypotension	I
in	O
Parkinson	B
'	I
s	I
disease	I
:	O
a	O
longitudinal	O
study	O
on	O
the	O
effects	O
of	O
drug	O
withdrawal	O
.	O
Here it is of the format:
word \t label\n
for instance:
postural	B
hypotension	I

here B-> Begin entity, I-> inside entity and O-> outside entity



In [1]:
#Import all required libraries
import spacy
import random
import time
import numpy as np
import sys
from spacy import displacy
from itertools import chain
import matplotlib.pyplot as plt 
from matplotlib.ticker import MaxNLocator



In [2]:
def load_data_spacy(file_path):
    ''' Converts data from:
    word \t label \n word \t label \n \n word \t label
    to: sentence, {entities : [(start, end, label), (stard, end, label)]}
    '''
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    start =0
    end = 0 # initialize counter to keep track of start and end characters
    for line in file:
        line = line.strip("\n").split("\t")
        # lines with len > 1 are words
        if len(line) > 1:
            label = line[1]
            if(label != 'O'):
                label = line[1]+"_Disease"     # the .txt is formatted: label \t word, label[0:2] = label_type
            #label_type = line[0][0] # beginning of annotations - "B", intermediate - "I"
            word = line[0]
            sentence.append(word)
            start = end
            end += (len(word) + 1)  # length of the word + trailing space
           
            if label == 'I_Disease' :  # if at the end of an annotation
                entities.append(( start,end-1, label))  # append the annotation
                              
            if label == 'B_Disease':                         # if beginning new annotation
                entities.append(( start,end-1, label))# start annotation at beginning of word
                
           
           
            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)
 
        # lines with len == 1 are breaks between sentences
        if len(line) == 1:
            if(len(entities) > 0):
                sentence = " ".join(sentence)
                training_data.append([sentence, {'entities' : entities}])
            # reset the counters and temporary lists
            end = 0 
            start = 0
            entities, sentence = [], []
            
    file.close()
    return training_data, unique_labels   

In [3]:
# unzip NERdata downloaded from BioBERT website
import zipfile
with zipfile.ZipFile('NERdata.zip', 'r') as zip_ref:
    zip_ref.extractall('NERdata')

In [4]:
TRAIN_DATA, LABELS = load_data_spacy("NERdata/BC5CDR-disease/train.tsv")
print(len(TRAIN_DATA))

TEST_DATA, _ = load_data_spacy("NERdata/BC5CDR-disease/test.tsv")
print(len(TEST_DATA))

VALID_DATA, _ = load_data_spacy("NERdata/BC5CDR-disease/train_dev.tsv")
print(len(VALID_DATA))


2658
2842
5385


In [5]:
TRAIN_DATA[1]

["OBJECTIVES : The United Kingdom Parkinson ' s Disease Research Group ( UKPDRG ) trial found an increased mortality in patients with Parkinson ' s disease ( PD ) randomized to receive 10 mg selegiline per day and L - dopa compared with those taking L - dopa alone .",
 {'entities': [(32, 41, 'B_Disease'),
   (42, 43, 'I_Disease'),
   (44, 45, 'I_Disease'),
   (46, 53, 'I_Disease'),
   (132, 141, 'B_Disease'),
   (142, 143, 'I_Disease'),
   (144, 145, 'I_Disease'),
   (146, 153, 'I_Disease'),
   (156, 158, 'B_Disease')]}]

In [6]:
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

In [7]:
nlp = spacy.blank("en") # load a new spacy model

In [8]:
# saving TRAIN Data in spaCy format

db = DocBin() # create a DocBin object

for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy") # save the docbin object

100%|██████████| 2658/2658 [00:01<00:00, 2292.36it/s]


In [9]:
# saving VALID Data in spaCy format

db = DocBin()
for text, annot in tqdm(VALID_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("./valid.spacy") # save the docbin object

100%|██████████| 5385/5385 [00:01<00:00, 3183.66it/s]


# Download base_config file from https://spacy.io/usage/training#quickstart   
Upload and install the config file

In [10]:
! pip install spacy-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers
  Downloading spacy_transformers-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.5/193.5 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<4.27.0,>=3.4.0
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m

In [14]:
! python -m spacy init fill-config base_config.cfg config.cfg

2023-02-02 09:28:27.217752: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


# Training the Custom NER here 

In [15]:
! python -m spacy train config.cfg --verbose --output ./ner_demo/training/ --paths.train train.spacy --paths.dev valid.spacy

2023-02-02 09:29:14.308466: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[2023-02-02 09:29:15,037] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
DEBUG:spacy:Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;4mℹ Saving to output directory: ner_demo/training[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-02-02 09:29:15,527] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2023-02-02 09:29:15,538] [DEBUG] Loading corpus from path: valid.spacy
DEBUG:spacy:Loading corpus from path: valid.spacy
[2023-02-02 09:29:15,540] [DEBUG] Loading corpus from path: train.spacy
DEBUG:spacy:Loading corpus from path: train.spacy
[2023-02-02 09:29:15,540] [INFO] Pipeline: ['transformer', 'ner']
INFO:spacy:Pipeline: ['transformer', 'ner']
[2023-02-02 09:29:15,544] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2023-02-02 09:29:15,545] [INFO] Finished initial

Model testing on test data

In [20]:
import spacy_transformers

ner = spacy.load(R"ner_demo/training/model-best") #load the best model   

In [22]:
test_sentences = [x[0] for x in TEST_DATA[0:4000]] # extract the sentences from [sentence, entity]
for i in range(0, 5):
    x = test_sentences[i]
    doc = ner(x)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    displacy.render(doc, jupyter=True, style = "ent")

Torsade 0 7 B_Disease
de 8 10 I_Disease
pointes 11 18 I_Disease
ventricular 19 30 B_Disease
tachycardia 31 42 I_Disease
dilated 111 118 B_Disease
cardiomyopathy 119 133 I_Disease
congestive 138 148 B_Disease
heart 149 154 I_Disease
failure 155 162 I_Disease


heart 79 84 B_Disease
failure 85 92 I_Disease
dilated 106 113 B_Disease
cardiomyopathy 114 128 I_Disease
ventricular 156 167 B_Disease
arrhythmias 168 179 I_Disease
QT 194 196 B_Disease
prolongation 197 209 I_Disease
torsade 214 221 B_Disease
de 222 224 I_Disease
pointes 225 232 I_Disease
ventricular 233 244 B_Disease
tachycardia 245 256 I_Disease


torsade 15 22 B_Disease
de 23 25 I_Disease
pointes 26 33 I_Disease
ventricular 34 45 I_Disease
tachycardia 46 57 I_Disease
arrhythmias 138 149 B_Disease
rhythm 227 233 B_Disease
disturbances 234 246 I_Disease


dyspnea 109 116 B_Disease
loss 119 123 B_Disease
of 124 126 I_Disease
consciousness 127 140 I_Disease
rash 171 175 I_Disease
heat 250 254 B_Disease
pain 268 272 B_Disease


allergy 94 101 B_Disease


In [23]:
ner = spacy.load(R"ner_demo/training/model-best") #load the best model
doc = ner("Selegiline - induced postural hypotension in Parkinson ' s disease : a longitudinal study on the effects of drug withdrawal.The aims of this study were to confirm our previous findings in a separate cohort of patients and to determine the time course of the cardiovascular consequences of stopping selegiline in the expectation that this might shed light on the mechanisms by which the drug causes orthostatic hypotension")
displacy.render(doc,jupyter=True, style = "ent")

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
! mv '/content/drive/MyDrive/bmw/ner_demo' '/content/drive/MyDrive/spaCy_NER_trained_model'

In [29]:
! mv '/content/base_config.cfg' '/content/drive/MyDrive/spaCy_NER_trained_model'

In [30]:
! mv '/content/NERdata.zip' '/content/drive/MyDrive/spaCy_NER_trained_model'