In [1]:
import spacy


spaCy is a free, open-source library for advanced natural language processing (NLP) in Python. It is designed to make it easy to build systems for information extraction and general-purpose natural language processing.

In [2]:
# Make the locale module report UTF-8 as the preferred encoding for this session.
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally.

    `do_setlocale` is accepted so the signature stays call-compatible with
    `locale.getpreferredencoding`, but its value is ignored.
    """
    encoding = "UTF-8"
    return encoding


# Monkey-patch: anything that asks the locale module for the preferred
# encoding from here on receives the stub defined above.
setattr(locale, "getpreferredencoding", getpreferredencoding)


Official documentation for the model: https://spacy.io/models/en/#en_core_web_lg

In [3]:
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [4]:
# load the model 
nlp = spacy.load('en_core_web_lg')
nlp

<spacy.lang.en.English at 0x7fe6f2244220>

In [5]:
doc = nlp("Monkey D. Luffy, the future Pirate King, sets sail with his crew, the Straw Hat Pirates, in search of the legendary treasure, One Piece.")

In [6]:
doc

Monkey D. Luffy, the future Pirate King, sets sail with his crew, the Straw Hat Pirates, in search of the legendary treasure, One Piece.

In [7]:
type(doc)

spacy.tokens.doc.Doc

In [8]:
# finding the entities in the doc
doc.ents

(Pirate King, the Straw Hat Pirates, One)

In [9]:
doc.ents[0], type(doc.ents[0])

(Pirate King, spacy.tokens.span.Span)

In [10]:
from spacy import displacy 
displacy.render(doc, style='ent', jupyter=True)

In [11]:
!pip install opendatasets -q

In [12]:
# download the dataset
import opendatasets as od 
url = "https://www.kaggle.com/datasets/finalepoch/medical-ner"

od.download(url)

Downloading medical-ner.zip to ./medical-ner


100%|██████████| 26.2k/26.2k [00:00<00:00, 39.4MB/s]







In [13]:
# Load the annotated dataset.
# The path is relative to the working directory because `od.download` placed
# the files under ./medical-ner; this keeps the cell working outside Colab's
# /content directory as well.
import json
with open('./medical-ner/Corona2.json', 'r', encoding="utf-8") as f:
  data = json.load(f)

In [14]:
#checking sample
data.keys()

dict_keys(['examples'])

In [15]:
data['examples'][0].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [16]:
data['examples'][0]['content']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [17]:
data['examples'][0]['annotations'][0]

{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
 'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
 'end': 371,
 'start': 360,
 'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'tag_name': 'Medicine',
 'value': 'Diosmectite',
 'correct': None,
 'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
   'annotator_id': 1,
   'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'name': 'Ashpat123',
   'reason': 'exploration'}],
 'model_annotations': []}

In [18]:
# Reshape every annotated example into the form used further down:
#   {'text': <raw content>, 'entities': [(start, end, LABEL), ...]}
# Labels are the annotation tag names converted to upper case.
training_data = [
    {
        'text': example['content'],
        'entities': [
            (annotation['start'], annotation['end'], annotation['tag_name'].upper())
            for annotation in example['annotations']
        ],
    }
    for example in data['examples']
]

print(training_data[0])


{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]", 'entities': [(360, 371, 'MEDICINE'), (383, 408, 'MEDICINE'), (104, 112, 'MEDICALCONDITION'), (679,

In [19]:
training_data[0]

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'entities': [(360, 371, 'MEDICINE'),
  (383, 408, 'MEDICINE'),
  (104, 112, 'MEDICALCONDITION'),


In [20]:
training_data[0]['text']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [None]:
training_data[0]['entities']

[(360, 371, 'MEDICINE'),
 (383, 408, 'MEDICINE'),
 (104, 112, 'MEDICALCONDITION'),
 (679, 689, 'MEDICINE'),
 (6, 23, 'MEDICINE'),
 (25, 37, 'MEDICINE'),
 (461, 470, 'MEDICALCONDITION'),
 (577, 589, 'MEDICINE'),
 (853, 865, 'MEDICALCONDITION'),
 (188, 198, 'MEDICINE'),
 (754, 762, 'MEDICALCONDITION'),
 (870, 880, 'MEDICALCONDITION'),
 (823, 833, 'MEDICINE'),
 (852, 853, 'MEDICALCONDITION'),
 (461, 469, 'MEDICALCONDITION'),
 (535, 543, 'MEDICALCONDITION'),
 (692, 704, 'MEDICINE'),
 (563, 571, 'MEDICALCONDITION')]

In [None]:
training_data[0]['text'][360:371]

'Diosmectite'

In [None]:
training_data[0]['text'][383:408]

'aluminomagnesium silicate'

In [None]:
training_data[0].keys()

dict_keys(['text', 'entities'])

In [21]:
training_data[1]['text']

'Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal.[2]'

In [24]:
len(training_data)

31

In [35]:
# Create a small sample file of example texts for trying out the model.

import random

# Write up to five *distinct* texts. The count is derived from the data
# rather than from a hard-coded index range, so the cell keeps working if the
# number of examples changes.
n_samples = min(5, len(training_data))

# `with` guarantees the file is closed even if a write fails; the explicit
# encoding keeps the output independent of the platform default.
with open("sample.txt", "w", encoding="utf-8") as f:
  for example in random.sample(training_data, n_samples):
    f.write(example['text'] + "\n\n")

In [None]:
# The DocBin class in spaCy is used to efficiently serialize a collection of Doc objects.
# It is faster and produces smaller data sizes than pickle, and allows you to deserialize
# without executing arbitrary Python code.

from spacy.tokens import DocBin 
from tqdm.notebook import tqdm 

# NOTE: this rebinds `nlp`. From here on it is a blank English pipeline,
# not the en_core_web_lg pipeline loaded at the top of the notebook.
nlp = spacy.blank('en') # load a new, blank spaCy pipeline
doc_bin = DocBin() # create an empty DocBin to collect the training docs

In [None]:
from spacy.util import filter_spans
# filter_spans() removes duplicate and overlapping spans from a sequence of
# spans. When spans overlap, the (first) longest span is kept and the shorter
# ones are dropped. This is needed here because `doc.ents` does not accept
# overlapping entities.

# Start from an empty DocBin so that re-running this cell does not append the
# same documents to the bin a second time.
doc_bin = DocBin()

# Convert every training example into a Doc carrying its gold entities.
for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']

    # Tokenize only; no pipeline components are run on the text.
    doc = nlp.make_doc(text)

    # Entity spans that could be aligned to token boundaries
    ents = []

    for start, end, label in labels:
        # Map the character offsets onto tokens. 'contract' shrinks the span
        # to the tokens that lie completely inside [start, end); if no token
        # fits, char_span returns None.
        span = doc.char_span(start, end, label=label, alignment_mode='contract')

        if span is None:
            # Report which annotation was dropped so the data can be inspected
            print(f'Skipping entity: [{start}:{end}] {label} {text[start:end]!r} (no token alignment)')
        else:
            ents.append(span)

    # Drop overlapping / nested spans before assigning them as entities
    filtered_ents = filter_spans(ents)

    # Set the filtered spans as the entities for the document
    doc.ents = filtered_ents

    # Add the processed document to the document bin
    doc_bin.add(doc)

# Save the document bin to disk
doc_bin.to_disk('train.spacy')


  0%|          | 0/31 [00:00<?, ?it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


In [None]:
%%writefile base_config.cfg
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
# train/dev are left null here; they are supplied on the command line
# via --paths.train and --paths.dev when training is started.
[paths]
train = null
dev = null
# Package providing the static word vectors (referenced again under [initialize]).
vectors = "en_core_web_lg"
[system]
gpu_allocator = null

# Two components: a tok2vec layer and the NER component that listens to it.
[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

# The embedding width is tied to the encoder width defined in the next section.
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 1000, 2500, 2500]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

# Listener on the tok2vec component above; its width is tied to the encoder width.
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[corpora]

# Training corpus, read from the path given as paths.train.
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

# Development corpus, read from the path given as paths.dev.
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

# Vectors are taken from the package named in [paths].
[initialize]
vectors = ${paths.vectors}

Writing base_config.cfg


In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


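> Open the newly generated `config.cfg` file and adjust `max_epochs` and `batch_size` as needed. Alternatively, override them on the command line with `--training.max_epochs` and `--nlp.batch_size`, as the training command below does.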

In [None]:
# Train the pipeline from config.cfg, writing the output to the current
# directory. Batch size and epoch count are overridden on the command line.
# NOTE(review): --paths.dev points at the same ./train.spacy file as
# --paths.train, so the reported ENTS_* scores are computed on the training
# data itself. Consider evaluating on a held-out dev set.
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./train.spacy  --output . --nlp.batch_size 100 --training.max_epochs=100

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-05-16 16:59:31,519] [INFO] Set up nlp object from config
[2023-05-16 16:59:31,557] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-05-16 16:59:31,564] [INFO] Created vocabulary
[2023-05-16 16:59:35,225] [INFO] Added vectors: en_core_web_lg
[2023-05-16 16:59:39,156] [INFO] Finished initializing nlp object
[2023-05-16 16:59:42,225] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    153.29    0.55    0.91    0.39    0.01
  7     200        499.25   3640.10   70.00   82.80   60.63    0.70
 14     400        192.16   1153.36   87.20   97.10   79.13    0.87
 22     600         72.20    471.57   93.66   97.45   90.16    0.94


In [None]:
nlp_ner = spacy.load("model-best")

In [None]:
# Run the trained pipeline on a sample passage and highlight the entities it predicts.
doc = nlp_ner(
    "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] "
    "Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] "
    "These agents should be used only if bloody diarrhea is not present."
)

# One highlight colour per entity label, passed to displaCy through `options`.
colors = dict(PATHOGEN="#F67DE3", MEDICINE="#7DF6D9", MEDICALCONDITION="#a6e22d")
options = dict(colors=colors)

spacy.displacy.render(doc, style="ent", jupyter=True, options=options)

In [None]:
# Persist the trained NER pipeline.
# `nlp` was rebound to a blank English pipeline earlier in the notebook
# (spacy.blank('en')), so saving `nlp` would not store the trained model;
# `nlp_ner` is the pipeline loaded from "model-best".
# Note: to_disk() writes a directory, not a pickle file, despite the ".pkl"
# suffix in the name (kept unchanged for compatibility).
nlp_ner.to_disk('model-best.pkl')