# Part 1: Train your own NER with Spacy
This notebook trains an NER model on the [ADE Corpus](https://huggingface.co/datasets/ade_corpus_v2) from Hugging Face.
First, we explore how off-the-shelf models from spaCy and Hugging Face (BERT) perform NER on the data.
Then we train a transformer-based NER model with the spaCy CLI and evaluate it.

In [None]:
!pip install 'transformers[torch]'
!pip install datasets
!pip install zstandard
!pip install 'spacy[transformers]'
!python -m spacy download en_core_web_trf

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.2 MB/s

In [None]:
import datasets
from datasets import DatasetInfo, DatasetDict
import os
import torch
import spacy
from spacy import displacy
import random
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin
from spacy.training.example import Example
from pathlib import Path
from transformers import pipeline

In [None]:
# Make CUDA kernel launches synchronous so a failing kernel is reported at the
# call that triggered it instead of at some later, unrelated call.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

cuda_available = torch.cuda.is_available()
device = 'cuda' if cuda_available else 'cpu'
print(f'is CUDA available {cuda_available}')
if cuda_available:
    # The device queries below raise on a CPU-only runtime, so they are only
    # issued when a GPU is actually present.
    print('CUDA device name:', torch.cuda.get_device_name(0))
    # mem_get_info() reports (free bytes, total bytes) for the current device.
    print(f'(Free memory, Total memory){torch.cuda.mem_get_info()}')
else:
    print('No CUDA device found; continuing on CPU.')

is CUDA available True
CUDA device name: Tesla T4
(Free memory, Available Memory)(15727394816, 15835398144)


## Load Data
[Ade Corpus](https://huggingface.co/datasets/ade_corpus_v2) is a dataset for Adverse Drug Reaction. We will focus on the subset Ade_corpus_v2_drug_ade_relation, as it contains the relation between drug and effect. The goal is to identify the drug and the effect; these are the two labels we will train the NER model on.


In [None]:
# Fixed seed so the sampled subset (and every output that depends on it) is
# the same on each Restart & Run All.
SPLIT_SEED = 42

ds_raw = datasets.load_dataset("ade_corpus_v2", "Ade_corpus_v2_drug_ade_relation")
# Keep the run small: sample 1000 training rows and hold out 200 more.
train_testvalid = ds_raw['train'].train_test_split(train_size=1000, test_size=200, seed=SPLIT_SEED)
# Split the 200 held-out rows in half: 100 for test, 100 for validation.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three subsets into a single DatasetDict.
ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
ds

Downloading builder script:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.84k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/307k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/868k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'drug', 'effect', 'indexes'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'drug', 'effect', 'indexes'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['text', 'drug', 'effect', 'indexes'],
        num_rows: 100
    })
})

In [None]:
ds["train"][0]

{'text': 'Diarrhoea, T-CD4+ lymphopenia and bilateral patchy pulmonary infiltrates developed in a male 60 yrs of age, who was treated with oxaliplatinum and 5-fluorouracil for unresectable rectum carcinoma.',
 'drug': 'oxaliplatinum',
 'effect': 'Diarrhoea',
 'indexes': {'drug': {'start_char': [129], 'end_char': [142]},
  'effect': {'start_char': [0], 'end_char': [9]}}}

In [None]:
text1 = ds["train"][0]["text"]
text2 = "My name is Sukanya and I live in Bern"

### Spacy transformers off the shelf model

In [None]:
nlp_vanilla = spacy.load('en_core_web_trf')

In [None]:
# Run the general-purpose spaCy pipeline on the ADE example sentence.
doc = nlp_vanilla(text1)
displacy.render(doc, style='ent', jupyter=True, options={'distance': 90}) # Fails to identify the entities we care about (drug, effect)

In [None]:
# Same pipeline on an everyday sentence, for comparison.
doc = nlp_vanilla(text2)
displacy.render(doc, style='ent', jupyter=True, options={'distance': 90}) # Can identify the entities

In [None]:
nlp_vanilla.get_pipe("ner").labels # shows all the entity labels used by Spacy

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

### Transformers off the shelf model

In [None]:
# NOTE(review): `pipeline` is already imported in the import cell at the top of
# the notebook; this second import is redundant but harmless.
from transformers import pipeline
# Off-the-shelf BERT NER model; with aggregation_strategy='simple' the results
# are reported as 'entity_group' records (see the outputs of the next cells).
ner_hg = pipeline('ner', model ="dslim/bert-base-NER", aggregation_strategy = 'simple')

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
ner_hg(text1)

[{'entity_group': 'MISC',
  'score': 0.9815594,
  'word': 'T',
  'start': 11,
  'end': 12},
 {'entity_group': 'MISC',
  'score': 0.84758747,
  'word': 'CD',
  'start': 13,
  'end': 15}]

In [None]:
ner_hg(text2)

[{'entity_group': 'PER',
  'score': 0.99726796,
  'word': 'Su',
  'start': 11,
  'end': 13},
 {'entity_group': 'PER',
  'score': 0.73005754,
  'word': '##kan',
  'start': 13,
  'end': 16},
 {'entity_group': 'PER',
  'score': 0.68934876,
  'word': '##ya',
  'start': 16,
  'end': 18},
 {'entity_group': 'LOC',
  'score': 0.9955876,
  'word': 'Bern',
  'start': 33,
  'end': 37}]

In [None]:
def convert_ner_format(row):
    """
    Converts one row of the data into a (text, {"entities": [...]}) pair.

    Args:
        row: mapping-like record (a dict or a pandas row) with a "text" entry
            and an "indexes" entry shaped like
            {label: {"start_char": [...], "end_char": [...]}}.

    Returns:
        (text, {"entities": [(start, end, label), ...]}) with one tuple per
        label, built from the first start/end offset listed for that label.
        None when the row is malformed (missing key, empty offset list,
        missing or non-numeric offset), so callers can filter such rows out.
    """
    try:
        text = row["text"]
        entities = []
        for label, offsets in row["indexes"].items():
            # Only the first listed span of each label is used.
            start_pos = int(offsets.get("start_char")[0])
            end_pos = int(offsets.get("end_char")[0])
            entities.append((start_pos, end_pos, label))
        return (text, {"entities": entities})
    except (KeyError, IndexError, TypeError, ValueError, AttributeError):
        # Malformed row: report it as None rather than aborting the whole
        # conversion. Unlike a bare `except`, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

In [None]:
ds_train_df = ds["train"].to_pandas()
ds_test_df = ds["test"].to_pandas()
ds_valid_df = ds["valid"].to_pandas()

In [None]:
ds_train_df.head(1)

Unnamed: 0,text,drug,effect,indexes
0,"Diarrhoea, T-CD4+ lymphopenia and bilateral pa...",oxaliplatinum,Diarrhoea,"{'drug': {'start_char': [129], 'end_char': [14..."


In [None]:
convert_ner_format(ds_train_df.iloc[0])

('Diarrhoea, T-CD4+ lymphopenia and bilateral patchy pulmonary infiltrates developed in a male 60 yrs of age, who was treated with oxaliplatinum and 5-fluorouracil for unresectable rectum carcinoma.',
 {'entities': [(129, 142, 'drug'), (0, 9, 'effect')]})

In [None]:
# Convert the train, test and validation frames to the NER tuple format.
# Rows the converter could not handle come back as None and are dropped here.
train_ner, test_ner, valid_ner = (
    [sample for sample in frame.apply(convert_ner_format, axis=1).to_list() if sample is not None]
    for frame in (ds_train_df, ds_test_df, ds_valid_df)
)

In [None]:
len(train_ner), len(test_ner), len(valid_ner) #we lost some rows which could not be processed by the function. To resolve, check the dataset more closely and fix the exception in the convert_ner_format

(992, 99, 100)

## Spacy training command line
[Documentation](https://spacy.io/usage/training#quickstart)
1. Create a file base_config.cfg from the documentation link and save it locally. In the widget, select the appropriate language, select only NER as the component and set the respective hardware.
2. Convert base config to config file (CLI) ```python -m spacy init fill-config base_config.cfg config.cfg --diff```
3. Convert the training and validation data to train.spacy, dev.spacy docbin file (See Code below)
4. Debug and check the configurations are done correctly (CLI) ```python -m spacy debug data config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy```
5. Train Model (CLI) ```python -m spacy train config.cfg --output ./ner_output --paths.train ./train.spacy --paths.dev ./dev.spacy```
6. Load Model
7. Evaluate Model output (CLI)

In [None]:
#Step 2
!python -m spacy init fill-config base_config.cfg config.cfg --diff

2023-11-10 11:55:51.472679: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-10 11:55:51.472762: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-10 11:55:51.472806: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;2m✔ Auto-filled config with all values[0m
[1m

[paths]
train = null
dev = null
vectors = null
[38;5;16;48;5;2minit_tok2vec = null[0m

[system]
gpu_allocator = "pytorch"
[38;5;16;48;5;2mseed = 0[0m

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128
[38;5;16;48;5;2mdisabled = [][0m
[38;5;16;48;5;2mbefore_creation = null[0m
[38

In [None]:
#Step 3
def create_spacy_doc_bin_file(dataset, file_name):
    """
    Serialise NER samples into a spaCy DocBin file.

    Args:
        dataset: sequence of (text, {"entities": [(start, end, label), ...]})
            samples.
        file_name: path of the .spacy file to write.

    A sample is excluded (and counted) when one of its character spans cannot
    be mapped onto the tokenised text or when spaCy rejects the sample. The
    number of excluded samples is printed before the file is written.
    """
    nlp = spacy.blank("en")
    db = DocBin()
    exception_count = 0
    for sample in dataset:
        try:
            text, annotations = sample
            doc = nlp(text)
            ents = []
            for start, end, label in annotations.get("entities"):
                span = doc.char_span(start, end, label=label)
                if span is None:
                    # char_span yields None when the offsets do not line up
                    # with token boundaries. Drop the whole sample: keeping it
                    # with an entity missing would teach the model that the
                    # text is not an entity.
                    raise ValueError(f"span ({start}, {end}, {label!r}) is not aligned to tokens")
                ents.append(span)
            doc.ents = ents
            db.add(doc)
        except Exception:
            # Best-effort conversion: skip the sample and keep going.
            # `Exception` rather than a bare `except` so that Ctrl-C and
            # SystemExit still stop a long conversion.
            exception_count += 1
    print("Number of cases where conversion failed and were excluded", exception_count)
    db.to_disk(file_name)

create_spacy_doc_bin_file(dataset = train_ner, file_name = "./train.spacy")
create_spacy_doc_bin_file(dataset = valid_ner, file_name = "./dev.spacy")
create_spacy_doc_bin_file(dataset = test_ner, file_name = "./test.spacy")

Number of cases where conversion failed and were excluded 19
Number of cases where conversion failed and were excluded 2
Number of cases where conversion failed and were excluded 3


In [None]:
#Step 4
!python -m spacy debug data config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --paths.test ./test.spacy

2023-11-10 11:58:36.337940: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-10 11:58:36.338011: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-10 11:58:36.338055: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[1m
Downloading (…)lve/main/config.json: 100% 481/481 [00:00<00:00, 3.55MB/s]
Downloading (…)olve/main/vocab.json: 100% 899k/899k [00:00<00:00, 13.3MB/s]
Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 73.6MB/s]
Downloading (…)/main/tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 60.7MB/s]
Downloading model.safetensors: 100% 499M/499M [00:02<

In [None]:
#Step 5
!python -m spacy train config.cfg --output ./ner_output --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0

2023-11-10 12:00:26.238908: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-10 12:00:26.238957: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-10 12:00:26.238991: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Saving to output directory: ner_output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel f

In [None]:
#Step 6 Load Model
model_trained = spacy.load(Path("ner_output/model-last"))

In [None]:
for text, _ in test_ner[0:5]:
    doc = model_trained(text)
    print("\nText: ", text)
    print('Entities: ', [(ent.text, ent.label_) for ent in doc.ents])


Text:  Here we describe a patient with Crohn's disease who developed a severe infliximab infusion reaction (IIR), complicated 1 day later by severe swelling of the forearm and hand ipsilateral to the site of infliximab infusion.
Entities:  [('infliximab', 'drug')]

Text:  Fever, lymphadenopathy, eosinophilia, lymphocytosis, hepatitis, and dermatitis: a severe adverse reaction to minocycline.
Entities:  [('minocycline', 'drug')]

Text:  This case highlights the need to monitor liver enzymes in patients treated with 6-TG and identifies the need for additional research focused on the mechanism of thiopurine-induced hepatic injury.
Entities:  [('thiopurine', 'drug'), ('hepatic injury', 'effect')]

Text:  A patient with coccidioidal meningitis was treated with intrathecally administered amphotericin B, and an acute toxic delirium with EEG abnormalities developed.
Entities:  [('amphotericin B', 'drug'), ('acute toxic delirium', 'effect')]

Text:  Both had impaired lung function and abnormal

In [None]:
doc = model_trained(text1)
print("\nText: ", text1)
print('Entities: ', [(ent.text, ent.label_) for ent in doc.ents])


Text:  Diarrhoea, T-CD4+ lymphopenia and bilateral patchy pulmonary infiltrates developed in a male 60 yrs of age, who was treated with oxaliplatinum and 5-fluorouracil for unresectable rectum carcinoma.
Entities:  [('oxaliplatinum', 'drug')]


## Evaluation
#### Step 7
```python -m spacy benchmark accuracy "ner_output/model-last" ./test.spacy   ```

In [None]:
!python -m spacy benchmark accuracy "ner_output/model-last" ./dev.spacy  --gpu-id 0

2023-11-10 12:50:23.351214: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-10 12:50:23.351285: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-10 12:50:23.351334: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   68.93 
NER R   72.45 
NER F   70.65 
SPEED   94    

[1m

             P       R       F
drug     77.78   85.71   81.55
effect   59.18   59.18   59.18



In [None]:
!python -m spacy benchmark accuracy "ner_output/model-last" ./test.spacy   --gpu-id 0

2023-11-10 12:51:03.444081: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-10 12:51:03.444153: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-10 12:51:03.444189: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     100.00
NER P   72.68 
NER R   73.44 
NER F   73.06 
SPEED   92    

[1m

             P       R       F
drug     76.92   83.33   80.00
effect   67.78   63.54   65.59



# Part 2:  Relation Extraction
We use a text2text model from [Babelscape/rebel-large](https://huggingface.co/Babelscape/rebel-large) for Relation Extraction, which recognises up to 200 different relation types.

[Github and paper link](https://github.com/Babelscape/rebel/tree/main)

In [None]:
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')

# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    """
    Parse the decoded model output into a list of relation triplets.

    The input is a string of the form
    ``<triplet> head <subj> tail <obj> relation ...``; the ``<s>``, ``<pad>``
    and ``</s>`` markers are removed before parsing.

    Returns:
        A list of dicts with the keys 'head', 'type' and 'tail', in the order
        the triplets appear in the text.
    """
    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    found = []
    # Text gathered so far for each part of the triplet being built.
    parts = {'head': '', 'type': '', 'tail': ''}
    # Part that plain tokens are currently appended to; None until the first marker.
    target = None

    def snapshot():
        return {name: value.strip() for name, value in parts.items()}

    for piece in cleaned.split():
        if piece == "<triplet>":
            # A new head starts: flush the triplet in progress, if it has a relation.
            target = 'head'
            if parts['type']:
                found.append(snapshot())
                parts['type'] = ''
            parts['head'] = ''
        elif piece == "<subj>":
            # Another tail for the same head: flush the previous one first.
            target = 'tail'
            if parts['type']:
                found.append(snapshot())
            parts['tail'] = ''
        elif piece == "<obj>":
            target = 'type'
            parts['type'] = ''
        elif target is not None:
            parts[target] += ' ' + piece

    # Flush the last triplet only when all three parts are present.
    if all(parts.values()):
        found.append(snapshot())
    return found

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

In [None]:
# We need to use the tokenizer manually since we need special tokens.
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text1, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
print(extracted_text[0])
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)

<s><triplet> oxaliplatinum <subj> rectum carcinoma <obj> medical condition treated <triplet> rectum carcinoma <subj> oxaliplatinum <obj> drug used for treatment</s>
[{'head': 'oxaliplatinum', 'type': 'medical condition treated', 'tail': 'rectum carcinoma'}, {'head': 'rectum carcinoma', 'type': 'drug used for treatment', 'tail': 'oxaliplatinum'}]


In [None]:
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor(text2, return_tensors=True, return_text=False)[0]["generated_token_ids"]])
print(extracted_text[0])
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)

<s><triplet> Sukanya <subj> Bern <obj> residence</s>
[{'head': 'Sukanya', 'type': 'residence', 'tail': 'Bern'}]


In [None]:
extracted_text = triplet_extractor.tokenizer.batch_decode([triplet_extractor("Malaria is caused by mosquitoes and not flies", return_tensors=True, return_text=False)[0]["generated_token_ids"]])
print(extracted_text[0])
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)

<s><triplet> Malaria <subj> mosquito <obj> has cause <triplet> mosquito <subj> Malaria <obj> has effect</s>
[{'head': 'Malaria', 'type': 'has cause', 'tail': 'mosquito'}, {'head': 'mosquito', 'type': 'has effect', 'tail': 'Malaria'}]
