<a href="https://colab.research.google.com/github/AhmedElgamiel/NER_methods_evaluation/blob/main/NERTestSetDataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# First get clean text inputs to provide them to the different model

## Install Beautifulsoup library that will be used for scrapping the text from `p` tag

In [1]:
!pip install bs4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Read the text file 

In [2]:
ner_file = open("/content/news_sample_ner.txt","r")

## Scrap the file and get the text of `p` tag without any additional html tags , then remove any non-important spaces and make the sentece on a single line


In [3]:
from bs4 import BeautifulSoup

# parsing the html file
htmlParse = BeautifulSoup(ner_file, 'html.parser')
pure_sentences = []
# getting all the paragraphs
for para in htmlParse.find_all("p"):
    pure_sentences.append(para.get_text().replace("\n", " ").strip())
    #print(para.get_text().replace("\n", "").strip())

In [4]:
pure_sentences[6]

"``Because his capsule is not pressurized,'' Branson said in an interview, ``Steve will have to stay at around 20,000 feet, which is usually well below the core of the jet stream. That could slow him down a lot. We will be at 30,000 to 35,000 feet, where our chances of riding the high-speed jet stream are much better."

# Second , get the sentences with their NER tags to use them in creating the validation set to test the models later

## Read the file as raw text then split it on the P tag

In [5]:
from pathlib import Path
txt = Path('/content/news_sample_ner.txt').read_text()

In [6]:
sentences_with_NER_tags = txt.split('</p>')

In [7]:
# remove the last elemt of the list as it does not contain text of <p> tag
sentences_with_NER_tags.pop()

'</TEXT>\n<TRAILER>\nNYT-<TIMEX TYPE="DATE">04-03-96</TIMEX> <TIMEX TYPE="TIME">2047EST</TIMEX>\n</TRAILER>\n</DOC>'

In [8]:
# The first elemet of the list contains text of <p> tag and some other text , so I will remove the other text
sentences_with_NER_tags[0] = sentences_with_NER_tags[0].split('<p>')[1] 

In [9]:
# Clean the sentences : Remove unnecessary spaces , <p> tags and "\n"
for i in range(0,len(sentences_with_NER_tags)) :
  sentences_with_NER_tags[i] = sentences_with_NER_tags[i].replace("\n", "").replace("<p>", "").strip()

In [10]:
print(sentences_with_NER_tags[0])

Shortly after <ENAMEX TYPE="PERSON">Fossett</ENAMEX>'s launching <TIMEX TYPE="DATE">Monday</TIMEX> his competitors senthim telegrams of congratulation.


# Now we have sentences with their NER tags , It's time to create the test set from them

In [11]:
import re
def get_ground_truth_entities(sentence) :
  return re.findall('<ENAMEX TYPE="([a-zA-Z]*)">([^\<]*)</ENAMEX>', sentence)

In [12]:
ground_truth_entities = []
for sentence in sentences_with_NER_tags :
  ground_truth_entities.append(get_ground_truth_entities(sentence))

In [13]:
len(ground_truth_entities)

173

In [14]:
ground_truth_entities[1]

[('ORGANIZATION', 'Virgin'),
 ('PERSON', 'Richard Branson'),
 ('ORGANIZATION', 'Virgin Atlantic Airways'),
 ('PERSON', 'Per Lindstrand'),
 ('ORGANIZATION', 'Lindstrand Balloons Ltd.'),
 ('LOCATION', 'Oswestry'),
 ('LOCATION', 'England'),
 ('PERSON', 'Rory McCarthy')]

# Annotate the data with Spacy using statistal and transformer methods

## First install important libraries

In [15]:
# install spacy library
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.5.0


In [16]:
# Install spacy-transformers
!pip install spacy[transformers]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers<1.3.0,>=1.1.2
  Downloading spacy_transformers-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.5/193.5 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.27.0,>=3.4.0
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)


In [17]:
# Download the traditional spacy english language large model
!python -m spacy download en_core_web_lg 


2023-01-29 04:04:24.471678: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [18]:
# Download the spacy transformer (roberta-base) english model
!python -m spacy download en_core_web_trf


2023-01-29 04:04:54.431561: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-trf==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.3/460.3 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-trf
Successfully installed en-core-web-trf-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


## Develope functions for getting , printing and visualize entities

In [19]:
def print_entities(pipeline, text):
    
  # Create a document 
  document = pipeline(text)
    
  # Entity text & label extraction
  for entity in document.ents:
    print(entity.text + '->', entity.label_)
        
        
def visualize_entities(pipeline, text):
  
  # Create a document 
  document = pipeline(text)
        
  # Show entities in pretty manner
  displacy.render(document, jupyter=True, style='ent')

In [20]:
def get_entities(pipeline, text):
  labels = []
  # Create a document 
  document = pipeline(text)
    
  # Entity text & label extraction
  for entity in document.ents:
    #print(entity)
    #print(entity.text + '->', entity.label_)
    #if entity.label_ in ["ORG" , "PERSON" , "LOC"] :
    labels.append((entity.label_ , entity.text))
  return labels

def get_specific_entities(pipeline, text):
  labels = []
  # Create a document 
  document = pipeline(text)
    
  # Entity text & label extraction
  for entity in document.ents:
    # -------------------------------- IMPORTANT NOTES ---------------------- #
    # We take just locations , persons and organizations in consideration
    # In the annotated text I found LOC and GPE are the same , so I treated them as they are same in the function
    if entity.label_ in ["ORG" , "PERSON" , "LOC" ,"GPE"] :
      txt = entity.text
      if entity.label_ == "LOC" or entity.label_ == "GPE" :
        label = "LOCATION"
      elif entity.label_ == "ORG" :
        label = "ORGANIZATION"
      elif entity.label_ == "PERSON" :
        label = "PERSON"

      labels.append((label , txt))
  return labels

## Load NER statistical model and get the prediction for the whole dataset

In [21]:
import spacy
from spacy import displacy

# Load English large model
nlp_sm = spacy.load("en_core_web_lg")



In [22]:
predictions_from_statistical_model = []
for i in pure_sentences :
  predictions_from_statistical_model.append(get_specific_entities(nlp_sm,i))

## Load NER transformers-based model and get the prediction for the whole dataset

In [23]:
# Load the spacy transformer (roberta-base) model
roberta_nlp = spacy.load("en_core_web_trf")


In [24]:
predictions_from_transformer_based_model = []
for i in pure_sentences :
  predictions_from_transformer_based_model.append(get_specific_entities(roberta_nlp,i))

In [25]:
short_text = pure_sentences[8]

In [26]:
sentences_with_NER_tags[8]

'The Dutch team, led by <ENAMEX TYPE="PERSON">Henk Brink</ENAMEX>, recently tested the inflationof its <ENAMEX TYPE="ORGANIZATION">Unicef</ENAMEX> Flyer balloon at <ENAMEX TYPE="LOCATION">Cape Kennedy</ENAMEX> in <ENAMEX TYPE="ORGANIZATION">NASA</ENAMEX>\'s huge VehicleAssembly Building, where space shuttles are prepared for flight.<ENAMEX TYPE="PERSON">Brink</ENAMEX>, who intends to launch his balloon from <ENAMEX TYPE="LOCATION">Nijmegen</ENAMEX>, <ENAMEX TYPE="LOCATION">theNetherlands</ENAMEX>, is a helicopter instructor and veteran balloonist.'

In [27]:
sm_labels = get_entities(nlp_sm, short_text)
sm_specific_labels = get_specific_entities(nlp_sm, short_text)
sm_labels

[('NORP', 'Dutch'),
 ('PERSON', 'Henk Brink'),
 ('ORG', 'Unicef'),
 ('FAC', 'Cape Kennedy'),
 ('ORG', 'NASA'),
 ('ORG', 'Vehicle Assembly Building'),
 ('PERSON', 'Brink'),
 ('GPE', 'Nijmegen'),
 ('GPE', 'Netherlands')]

In [44]:
#sm_specific_labels

In [29]:
trf_lables = get_entities(roberta_nlp, short_text)
trf_specific_lables = get_specific_entities(roberta_nlp, short_text)

trf_lables

[('NORP', 'Dutch'),
 ('PERSON', 'Henk Brink'),
 ('PRODUCT', 'Unicef Flyer'),
 ('LOC', 'Cape Kennedy'),
 ('ORG', 'NASA'),
 ('FAC', 'Vehicle Assembly Building'),
 ('PERSON', 'Brink'),
 ('GPE', 'Nijmegen'),
 ('GPE', 'Netherlands')]

# Now it's time to evaluate the model predictions usinf percision , recall and f1 metrics

In [30]:
# A function to get True Pos , False Pos and False Neg with ecaxt match method
def get_TPOS_FPOS_FNEG_with_exact_match_method(annotations , predictions) :
  len_of_samples = len(annotations)
  true_pos = false_pos = false_neg = 0
  for i in range(len_of_samples) :
    for annot in annotations[i] :
      if annot in predictions[i] :
        true_pos += 1
      else :
        false_neg += 1

    for pred in predictions[i] :
      if pred not in annotations[i] :
        false_pos += 1
    
  return true_pos , false_pos , false_neg


In [31]:
annot = ground_truth_entities[0]
annot

[('PERSON', 'Fossett')]

In [32]:
pred = predictions_from_statistical_model[0]
pred

[('ORGANIZATION', 'Fossett')]

In [33]:
true_pos , false_pos , false_neg = get_TPOS_FPOS_FNEG_with_exact_match_method([annot] , [pred])

In [34]:
print(true_pos , false_pos , false_neg)

0 1 1


### Get precision , recall and f1 for the statistical model

In [35]:
true_pos , false_pos , false_neg = get_TPOS_FPOS_FNEG_with_exact_match_method(ground_truth_entities,predictions_from_statistical_model)

In [36]:
print(true_pos , false_pos , false_neg)

272 121 157


In [37]:
Precision = true_pos / (true_pos + false_pos) 
Recall = true_pos / (true_pos + false_neg) 

In [38]:
f1_score = (2 * (Precision * Recall)) / (Precision + Recall)

In [39]:
print(Precision , Recall , f1_score)

0.6921119592875318 0.634032634032634 0.6618004866180048


### Get precision , recall and f1 for the transfomer-based model

In [40]:
true_pos , false_pos , false_neg = get_TPOS_FPOS_FNEG_with_exact_match_method(ground_truth_entities,predictions_from_transformer_based_model)

In [41]:
Precision = true_pos / (true_pos + false_pos) 
Recall = true_pos / (true_pos + false_neg) 

In [42]:
f1_score = (2 * (Precision * Recall)) / (Precision + Recall)

In [43]:
print(Precision , Recall , f1_score)

0.7902813299232737 0.7202797202797203 0.753658536585366
