<a href="https://colab.research.google.com/github/AhmedElgamiel/NER_methods_evaluation/blob/main/NERTestSetDataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install important dependencies

In [None]:
!pip install bs4

In [None]:
# install spacy library
!pip install -U spacy

In [None]:
# Install spacy-transformers
!pip install spacy[transformers]

In [None]:
# Download the traditional spacy english language large model
!python -m spacy download en_core_web_lg 


In [None]:
# Download the spacy transformer (roberta-base) english model
!python -m spacy download en_core_web_trf


## Import our dependencies

In [6]:
from bs4 import BeautifulSoup
from pathlib import Path
import re
import spacy
from spacy import displacy




## Fuction Definitions

In [7]:
def get_ground_truth_entities(sentence) :
  return re.findall('<ENAMEX TYPE="([a-zA-Z]*)">([^\<]*)</ENAMEX>', sentence)

In [8]:
def print_entities(pipeline, text):
    
  # Create a document 
  document = pipeline(text)
    
  # Entity text & label extraction
  for entity in document.ents:
    print(entity.text + '->', entity.label_)
        
        
def visualize_entities(pipeline, text):
  
  # Create a document 
  document = pipeline(text)
        
  # Show entities in pretty manner
  displacy.render(document, jupyter=True, style='ent')

In [9]:

def get_specific_entities(pipeline, text):
  labels = []
  # Create a document 
  document = pipeline(text)
    
  # Entity text & label extraction
  for entity in document.ents:
    # -------------------------------- IMPORTANT NOTES ---------------------- #
    # We take just locations , persons and organizations in consideration
    # In the annotated text I found LOC and GPE are the same , so I treated them as they are same in the function
    if entity.label_ in ["ORG" , "PERSON" , "LOC" ,"GPE"] :
      txt = entity.text
      if entity.label_ == "LOC" or entity.label_ == "GPE" :
        label = "LOCATION"
      elif entity.label_ == "ORG" :
        label = "ORGANIZATION"
      elif entity.label_ == "PERSON" :
        label = "PERSON"

      labels.append((label , txt))
  return labels

In [10]:
# A function to get True Pos , False Pos and False Neg with ecaxt match method
def get_TPOS_FPOS_FNEG_with_exact_match_method(annotations , predictions) :
  len_of_samples = len(annotations)
  true_pos = false_pos = false_neg = 0
  for i in range(len_of_samples) :
    for annot in annotations[i] :
      if annot in predictions[i] :
        true_pos += 1
      else :
        false_neg += 1

    for pred in predictions[i] :
      if pred not in annotations[i] :
        false_pos += 1
    
  return true_pos , false_pos , false_neg


In [48]:
def calculatePercision_Recall_F1(true_pos , false_pos , false_neg) :
  Precision = true_pos / (true_pos + false_pos) 
  Recall = true_pos / (true_pos + false_neg) 
  f1_score = (2 * (Precision * Recall)) / (Precision + Recall)
  return Precision , Recall , f1_score

# First get clean text inputs to provide them to the different model

## Read the text file 

In [11]:
ner_file = open("/content/news_sample_ner.txt","r")

## Scrap the file and get the text of `p` tag without any additional html tags , then remove any non-important spaces and make the sentece on a single line


In [12]:
# parsing the html file
htmlParse = BeautifulSoup(ner_file, 'html.parser')
pure_sentences = []
# getting all the paragraphs
for para in htmlParse.find_all("p"):
    pure_sentences.append(para.get_text().replace("\n", " ").strip())
    #print(para.get_text().replace("\n", "").strip())

In [13]:
pure_sentences[6]

"``Because his capsule is not pressurized,'' Branson said in an interview, ``Steve will have to stay at around 20,000 feet, which is usually well below the core of the jet stream. That could slow him down a lot. We will be at 30,000 to 35,000 feet, where our chances of riding the high-speed jet stream are much better."

# Second , get the sentences with their NER tags to use them in creating the validation set to test the models later

## Read the file as raw text then split it on the P tag

In [14]:
txt = Path('/content/news_sample_ner.txt').read_text()

In [15]:
sentences_with_NER_tags = txt.split('</p>')

In [16]:
# remove the last elemt of the list as it does not contain text of <p> tag
sentences_with_NER_tags.pop()

'</TEXT>\n<TRAILER>\nNYT-<TIMEX TYPE="DATE">04-03-96</TIMEX> <TIMEX TYPE="TIME">2047EST</TIMEX>\n</TRAILER>\n</DOC>'

In [17]:
# The first elemet of the list contains text of <p> tag and some other text , so I will remove the other text
sentences_with_NER_tags[0] = sentences_with_NER_tags[0].split('<p>')[1] 

In [18]:
# Clean the sentences : Remove unnecessary spaces , <p> tags and "\n"
for i in range(0,len(sentences_with_NER_tags)) :
  sentences_with_NER_tags[i] = sentences_with_NER_tags[i].replace("\n", "").replace("<p>", "").strip()

In [19]:
print(sentences_with_NER_tags[0])

Shortly after <ENAMEX TYPE="PERSON">Fossett</ENAMEX>'s launching <TIMEX TYPE="DATE">Monday</TIMEX> his competitors senthim telegrams of congratulation.


# Now we have sentences with their NER tags , It's time to create the test set from them

In [20]:
ground_truth_entities = []
for sentence in sentences_with_NER_tags :
  ground_truth_entities.append(get_ground_truth_entities(sentence))

In [21]:
len(ground_truth_entities)

173

In [22]:
ground_truth_entities[1]

[('ORGANIZATION', 'Virgin'),
 ('PERSON', 'Richard Branson'),
 ('ORGANIZATION', 'Virgin Atlantic Airways'),
 ('PERSON', 'Per Lindstrand'),
 ('ORGANIZATION', 'Lindstrand Balloons Ltd.'),
 ('LOCATION', 'Oswestry'),
 ('LOCATION', 'England'),
 ('PERSON', 'Rory McCarthy')]

# Annotate the data with Spacy using statistal and transformer methods

## Load NER statistical model and get the prediction for the whole dataset

In [23]:
# Load English large model
nlp_sm = spacy.load("en_core_web_lg")

In [24]:
predictions_from_statistical_model = []
for i in pure_sentences :
  predictions_from_statistical_model.append(get_specific_entities(nlp_sm,i))

## Load NER transformers-based model and get the prediction for the whole dataset

In [25]:
# Load the spacy transformer (roberta-base) model
roberta_nlp = spacy.load("en_core_web_trf")


In [26]:
predictions_from_transformer_based_model = []
for i in pure_sentences :
  predictions_from_transformer_based_model.append(get_specific_entities(roberta_nlp,i))

In [28]:
text = pure_sentences[8]

In [29]:
sentences_with_NER_tags[8]

'The Dutch team, led by <ENAMEX TYPE="PERSON">Henk Brink</ENAMEX>, recently tested the inflationof its <ENAMEX TYPE="ORGANIZATION">Unicef</ENAMEX> Flyer balloon at <ENAMEX TYPE="LOCATION">Cape Kennedy</ENAMEX> in <ENAMEX TYPE="ORGANIZATION">NASA</ENAMEX>\'s huge VehicleAssembly Building, where space shuttles are prepared for flight.<ENAMEX TYPE="PERSON">Brink</ENAMEX>, who intends to launch his balloon from <ENAMEX TYPE="LOCATION">Nijmegen</ENAMEX>, <ENAMEX TYPE="LOCATION">theNetherlands</ENAMEX>, is a helicopter instructor and veteran balloonist.'

In [30]:
sm_specific_labels = get_specific_entities(nlp_sm, text)
sm_specific_labels

[('PERSON', 'Henk Brink'),
 ('ORGANIZATION', 'Unicef'),
 ('ORGANIZATION', 'NASA'),
 ('ORGANIZATION', 'Vehicle Assembly Building'),
 ('PERSON', 'Brink'),
 ('LOCATION', 'Nijmegen'),
 ('LOCATION', 'Netherlands')]

In [None]:
#sm_specific_labels

In [32]:
trf_specific_lables = get_specific_entities(roberta_nlp, text)
trf_specific_lables

[('PERSON', 'Henk Brink'),
 ('LOCATION', 'Cape Kennedy'),
 ('ORGANIZATION', 'NASA'),
 ('PERSON', 'Brink'),
 ('LOCATION', 'Nijmegen'),
 ('LOCATION', 'Netherlands')]

# Now it's time to evaluate the model predictions usinf percision , recall and f1 metrics

### Get precision , recall and f1 for the statistical model

In [45]:
true_pos , false_pos , false_neg = get_TPOS_FPOS_FNEG_with_exact_match_method(ground_truth_entities,predictions_from_statistical_model)

In [46]:
print(true_pos , false_pos , false_neg)

272 121 157


In [49]:
Precision , Recall , f1_score = calculatePercision_Recall_F1(true_pos , false_pos , false_neg) 

In [51]:
print("Precision: " + str(Precision) +", Recall: "  + str(Recall) + ",f1_score: " + str(f1_score))

Precision: 0.6921119592875318, Recall: 0.634032634032634,f1_score: 0.6618004866180048


### Get precision , recall and f1 for the transfomer-based model

In [52]:
true_pos , false_pos , false_neg = get_TPOS_FPOS_FNEG_with_exact_match_method(ground_truth_entities,predictions_from_transformer_based_model)

In [53]:
Precision , Recall , f1_score = calculatePercision_Recall_F1(true_pos , false_pos , false_neg) 

In [54]:
print("Precision: " + str(Precision) +", Recall: "  + str(Recall) + ",f1_score: " + str(f1_score))

Precision: 0.7902813299232737, Recall: 0.7202797202797203,f1_score: 0.753658536585366
