# Import required libraries

Import the required libraries:
*   `nltk`
*   `spacy`
*   `scikit-learn`
*   `conll`
*   `pandas`
*   `operator`


In [2]:
import nltk
from nltk.corpus.reader import ConllCorpusReader

import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from sklearn.metrics import classification_report

# to import conll
from conll import evaluate

# for nice tables
import pandas as pd
import operator

# 0. Evaluate spaCy NER on CoNLL 2003 data


The function `remerge_sent()` merges the tokens of doc when the token is not followed by a whitespace (e.g. `01-01-2021` date is tokenized by spaCy as five different tokens and we need to merge them together).
Merging tokens is required in order to align the token of CoNLL 2003 data with that found by spaCy.
This method, as pointed out by the creator of spaCy [here](https://github.com/explosion/spaCy/issues/379), is the best way to put the tokens back together.

In [3]:
def remerge_sent(doc):
  ''' Merge tokens of doc when tokens are not followed by a whitespace.
  '''
  with doc.retokenize() as retokenizer:
    i = 0
    while i < len(doc)-1:            
      tok = doc[i]
      # check if there isn't a whitespace after the token in the original sentence
      if not tok.whitespace_:
        j = i+1
        # find the last token index to merge
        while j < len(doc) and not doc[j].whitespace_:
            j += 1
        
        # in-place operation.
        retokenizer.merge(doc[i:j+1])

        i = j+1
      else:
        i += 1

The function `get_doc_from_sent_list()` accepts a list of words that represents a sentence, applies spaCy pipeline, gets a `Doc` object and merges tokens not followed by a whitespace.

In [4]:
def get_doc_from_sent_list(sent_list):
  ''' Get spaCy doc from a list of words and apply remerge_sent().
  '''
  sentence = ' '.join([text for text, pos, iob in sent_list])
  doc = nlp(sentence)
  remerge_sent(doc)

  return doc

Read CoNLL 2003 data using the `ConllCorpusReader()` function from `nltk` and remove empty lists using `filter()`.

In [5]:
train = list(filter(None, ConllCorpusReader('data/conll2003', 'train.txt', ['words', 'pos', 'ignore', 'chunk', 'ne']).iob_sents() ))
test = list(filter(None, ConllCorpusReader('data/conll2003', 'test.txt', ['words', 'pos', 'ignore', 'chunk', 'ne']).iob_sents() ))

In [6]:
# getting references
refs = [[(text, iob) for text, pos, iob in sent] for sent in test]

# getting spaCy documents needed to obtain the hypothesis list
doc_hyps = [get_doc_from_sent_list(sent) for sent in test]
tok_hyps = [[t for t in doc] for doc in doc_hyps]

Find the mapping between spaCy and CoNLL entity types.
Each entity type of spaCy is mapped to the entity type of CoNLL with the maximum occurrences wrt. training set sentences. The function below calculates how many times a spaCy entity type corresponds to a CoNLL type in the test sentence and then gets the maximum one. It returns a dictionary, mapping spaCy entity types to CoNLL types.

In [7]:
def get_spacy2conll_mapping(train):
  ''' Find the mapping between spacy and CoNLL entity types.
  Each entity type of spacy is mapped to the entity type of CoNLL with the
  maximum occurrences wrt. training set sentences.
  '''
  refs = [[(text, iob) for text, pos, iob in sent] for sent in train]
  tok_hyps = [[t for t in get_doc_from_sent_list(sent)] for sent in train]

  hyp_ref_mapping_counter = {'':{}}
  for r_sent, h_sent in zip(refs, tok_hyps):
    for r, h in zip(r_sent, h_sent):
      if r[1] != 'O' or h.ent_iob_ != 'O':
        if h.ent_type_ not in hyp_ref_mapping_counter:
          hyp_ref_mapping_counter[h.ent_type_] = dict()
        
        iob_label = 'O'      
        if len(r[1].split('-')) > 1:
          iob_label = r[1].split('-')[1]

        if iob_label not in hyp_ref_mapping_counter[h.ent_type_]:
          hyp_ref_mapping_counter[h.ent_type_][iob_label] = 1
        else:
          hyp_ref_mapping_counter[h.ent_type_][iob_label] += 1

  # calculate spacy-to-conll entity types mapping
  spacy2conll_mapping = dict()
  for k,v in hyp_ref_mapping_counter.items():
    # take the mapping with the maximum occurrences
    spacy2conll_mapping[k] = max(v.items(), key=operator.itemgetter(1))[0]

  return spacy2conll_mapping

In [8]:
spacy2conll_mapping = get_spacy2conll_mapping(train[:4000]) # consider only the first 4000 sentences to speed up computation
print(spacy2conll_mapping)

{'': 'ORG', 'ORG': 'ORG', 'NORP': 'MISC', 'PERSON': 'PER', 'DATE': 'O', 'GPE': 'LOC', 'LOC': 'LOC', 'CARDINAL': 'O', 'QUANTITY': 'O', 'PERCENT': 'O', 'MONEY': 'O', 'LANGUAGE': 'MISC', 'TIME': 'O', 'ORDINAL': 'O', 'PRODUCT': 'ORG', 'EVENT': 'MISC', 'FAC': 'LOC', 'LAW': 'MISC', 'WORK_OF_ART': 'O'}


Create an hypothesis list where spaCy IOB labels are mapped to CoNLL labels, using the dictionary `spacy2conll_mapping` calculated before.

The parameter `ext_entities` should be set to `True` when named entities have been extended. In this way, user defined attributes are going to be used.

In [9]:
def spacy2conll_hyps(doc_hyps, ext_entities=False):
  conll_hyps = list()
  for doc in doc_hyps:
    sent_list = list()
    for t in doc:
      # map spacy iob to conll iob
      if ext_entities: # extended named entities, use user defined attributes
        if t._.ent_iob_ == 'O' or spacy2conll_mapping[t._.ent_type_] == 'O':
          iob = 'O'
        else:
          iob = '-'.join([t._.ent_iob_, spacy2conll_mapping[t._.ent_type_]])
      else:
        if t.ent_iob_ == 'O' or spacy2conll_mapping[t.ent_type_] == 'O':
          iob = 'O'
        else:
          iob = '-'.join([t.ent_iob_, spacy2conll_mapping[t.ent_type_]])

      sent_list.append((t.text, iob))

    conll_hyps.append(sent_list)
  
  return conll_hyps

In [10]:
conll_hyps = spacy2conll_hyps(doc_hyps)

## 0.1 Report token-level performance (per class and total).

Using `sklearn.classification_report()`, show token-level performance results.

In [11]:
report = classification_report([r[1] for r_sent in refs for r in r_sent], [r[1] for r_sent in conll_hyps for r in r_sent], digits=3)
print(report)

              precision    recall  f1-score   support

       B-LOC      0.760     0.681     0.718      1668
      B-MISC      0.802     0.541     0.646       702
       B-ORG      0.503     0.309     0.383      1661
       B-PER      0.800     0.629     0.704      1617
       I-LOC      0.535     0.560     0.548       257
      I-MISC      0.555     0.352     0.431       216
       I-ORG      0.415     0.515     0.460       835
       I-PER      0.838     0.788     0.812      1156
           O      0.949     0.981     0.965     38323

    accuracy                          0.909     46435
   macro avg      0.684     0.595     0.630     46435
weighted avg      0.902     0.909     0.904     46435



## 0.2 Report CoNLL chunk-level performance (per class and total).

Using the `evaluate()` function from `conll.py`, show chunk-level performance of spaCy pre-trained model. 

In [12]:
results = evaluate(refs, conll_hyps)
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
ORG,0.452,0.278,0.344,1661
LOC,0.748,0.671,0.708,1668
MISC,0.791,0.534,0.638,702
PER,0.774,0.609,0.681,1617
total,0.69,0.521,0.593,5648


# 1. Grouping of Entities

The function `group_entities()` groups more named entities when part of a noun chunk, otherwise the named entity is treated as a group of one element. The function returns a list-of-list containing the different groups.

The function proceeds grouping entities that are part of a noun chunk and that are between two noun chunks, till the end of the document.

In [1]:
def group_entities(doc):
  noun_chunks = list(doc.noun_chunks)
  groups = []

  first_chunk_start = len(doc)
  if len(noun_chunks) > 0:
    first_chunk_start = noun_chunks[0].start

  # group named entities (groups of one element) before the first noun chunk
  groups += [[ent.label_] for ent in doc[:first_chunk_start].ents]

  for i, chunk in enumerate(noun_chunks):
    # group named entities that are in this noun chunk
    g = [ent.label_ for ent in chunk.ents]
    if len(g) > 0:
      groups.append(g)

    if i == len(noun_chunks)-1:
      span = doc[chunk.end:]
    else:
      span = doc[chunk.end:noun_chunks[i+1].start]
    
    # group named entities (groups of one element) between this noun chunk and the next one
    groups += [[ent.label_] for ent in span.ents]

  return groups

Example of grouping entities.

In [14]:
sent = "Apple's Steve Jobs died in 2011 in Palo Alto, California."
print(group_entities(nlp(sent)))

[['ORG', 'PERSON'], ['DATE'], ['GPE'], ['GPE']]


## Frequency of groups

Calculate frequency of each group of entity types. It has been decided to treat groups with the same elements but different order as different because order can be important in this analysis. The list of group types with their frequency in the descending order is shown below.

The most frequent groups with more than one element are:
```
CARDINAL-PERSON: 52
NORP-PERSON:     42
GPE-PERSON:      35
GPE-GPE:         24
```

In [28]:
occurrencies = dict() # number of occurrencies for each group
for doc in doc_hyps:
  groups = group_entities(doc)
  for g in groups:
    k = '-'.join(g) # groups with same elements but different order are counted as different
    if k not in occurrencies: # add to dict if not present
      occurrencies[k] = 1
    else:
      occurrencies[k] += 1

total = sum(occurrencies.values())
print("{:30} {:<15} {}\n{}".format("group entity types", "frequency", "occurrencies", "-"*60))
for k,v in sorted(occurrencies.items(), key=operator.itemgetter(1), reverse=True):
  print("{:30} {:<15} {}".format(k, round(v/total, 6), v))

group entity types             frequency       occurrencies
------------------------------------------------------------
CARDINAL                       0.230221        1583
GPE                            0.181937        1251
PERSON                         0.153287        1054
DATE                           0.12929         889
ORG                            0.11591         797
NORP                           0.04203         289
MONEY                          0.020361        140
ORDINAL                        0.015416        106
PERCENT                        0.011344        78
TIME                           0.010908        75
EVENT                          0.007853        54
CARDINAL-PERSON                0.007563        52
LOC                            0.007417        51
QUANTITY                       0.006981        48
NORP-PERSON                    0.006108        42
GPE-PERSON                     0.00509         35
GPE-GPE                        0.00349         24
PRODUCT           

# 2. Fix segmentation errors

Add user defined attributes `ent_iob_`, `ent_type_` and `ent_id` to `Token` class. These attributes are used to extend named entities without modifing the original `doc`. The attribute `ent_id` is used to identify named entities in the function `extend_entity()`.

In [16]:
token = nlp(".")[0] # get a token
token.set_extension("ent_iob_", default='O', force=True)
token.set_extension("ent_type_", default='', force=True)
token.set_extension("ent_id", default=None, force=True)

The function `extend_entity_p2c_rec()` extends recursively named entities from a parent token that is part of an entity to a child token that is not. Child token must have a `'compound'` dependency relation with its parent. Child token attribute `ent_iob_` is initially assigned to `'I'` and successively the function `extend_entity()` reassign it correctly.

In [17]:
def extend_entity_p2c_rec(token):
  for child_token in token.children:
    if token._.ent_iob_ != 'O' and child_token.ent_iob_ == 'O' and child_token.dep_ == 'compound':
      # set child token as part of the entity
      child_token._.ent_iob_ = 'I'
      child_token._.ent_type_ = token._.ent_type_
      child_token._.ent_id = token._.ent_id
    else:
      child_token._.ent_iob_ = child_token.ent_iob_
      child_token._.ent_type_ = child_token.ent_type_

    extend_entity_p2c_rec(child_token)    

The function `extend_entity_c2p_rec()` extends recursively named entities from a child token that is part of an entity to a parent token that is not. Child token must have a `'compound'` dependency relation with its parent. Parent token attribute `ent_iob_` is initially assigned to `'I'` and successively the function `extend_entity()` reassigns it correctly. The dependency tree is visited in post-order.

In [18]:
def extend_entity_c2p_rec(token):
  token._.ent_iob_ = token.ent_iob_
  token._.ent_type_ = token.ent_type_

  for child_token in token.children:
    extend_entity_c2p_rec(child_token)
    if child_token._.ent_iob_ != 'O' and token.ent_iob_ == 'O' and child_token.dep_ == 'compound':
      token._.ent_iob_ = 'I'
      token._.ent_type_ = child_token._.ent_type_
      token._.ent_id = child_token._.ent_id

The function `extend_entity()` invokes the previous functions to extend named entities. If `p2c` parameter is set to `True`, named entities are extended from parent to child tokens (`extend_entity_p2c_rec()`), otherwise named entities are extended from child to parent tokens (`extend_entity_c2p_rec()`).
Both versions have been implemented in order to compare the results.

First the function assigns to each token the entity id that it belongs to, invokes one of the two functions to extend named entities recursively from the head of the sentence and then changes the value of the attribute `ent_iob_` of each token in order to correctly assign `'B'` to the first token of the entity and `'I'` to the successive ones.

In [19]:
def extend_entity(doc_hyps, p2c):
  for doc in doc_hyps:
    # assign an id to each recognized named entity
    for i, ent in enumerate(doc.ents):
      for token in ent:
        token._.ent_id = i
    
    # extend named entities
    for token in doc:
      if token.head == token: # root token
        if p2c: # if p2c is True, apply parent-to-child extension
          token._.ent_iob_ = token.ent_iob_
          token._.ent_type_ = token.ent_type_
          extend_entity_p2c_rec(token)
        else: # else apply child-to-parent
          extend_entity_c2p_rec(token)

    # assign ent_iob_ correctly to each extended entity
    entity_ids = set()
    for token in doc:
      if token._.ent_id is not None:
        if token._.ent_id not in entity_ids:
          token._.ent_iob_ = 'B'
          entity_ids.add(token._.ent_id)
        else:
          token._.ent_iob_ = 'I'


Example of extending named entities where the token `'university'` is added to the named entity `'New York'` because token `'York'` has a `'compound'` relation to `'university'`.

In [20]:
sent = "New York university was established in 1000."
doc = nlp(sent)

displacy.render(doc, jupyter=True, style="ent")
displacy.render(doc, jupyter=True)

# before extend named entities
print([(t.text, t.ent_iob_, t.ent_type_) for t in doc])

# extend named entities (child-to-parent)
extend_entity([doc], p2c=False)
print([(t.text, t._.ent_iob_, t._.ent_type_) for t in doc])

[('New', 'B', 'GPE'), ('York', 'I', 'GPE'), ('university', 'O', ''), ('was', 'O', ''), ('established', 'O', ''), ('in', 'O', ''), ('1000', 'B', 'DATE'), ('.', 'O', '')]
[('New', 'B', 'GPE'), ('York', 'I', 'GPE'), ('university', 'I', 'GPE'), ('was', 'O', ''), ('established', 'O', ''), ('in', 'O', ''), ('1000', 'B', 'DATE'), ('.', 'O', '')]


## Comparing results

Show token-level and chunk-level performance results of extended named entities, extending from parent to child.

In [21]:
extend_entity(doc_hyps, p2c=True)
ext_p2c_hyps = spacy2conll_hyps(doc_hyps, ext_entities=True)
extended_entity_results = evaluate(refs, ext_p2c_hyps)
pd_tbl = pd.DataFrame().from_dict(extended_entity_results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
ORG,0.442,0.272,0.337,1661
LOC,0.724,0.651,0.686,1668
MISC,0.787,0.533,0.636,702
PER,0.644,0.508,0.568,1617
total,0.64,0.484,0.551,5648


In [22]:
report = classification_report([r[1] for r_sent in refs for r in r_sent], [r[1] for r_sent in ext_p2c_hyps for r in r_sent], digits=3)
print(report)

              precision    recall  f1-score   support

       B-LOC      0.736     0.660     0.696      1668
      B-MISC      0.800     0.540     0.645       702
       B-ORG      0.495     0.305     0.377      1661
       B-PER      0.668     0.525     0.588      1617
       I-LOC      0.439     0.560     0.492       257
      I-MISC      0.539     0.352     0.426       216
       I-ORG      0.402     0.522     0.454       835
       I-PER      0.683     0.797     0.735      1156
           O      0.950     0.973     0.961     38323

    accuracy                          0.898     46435
   macro avg      0.635     0.582     0.597     46435
weighted avg      0.892     0.898     0.893     46435



Show token-level and chunk-level performance results of extended named entities, extending from child to parent.

In [23]:
extend_entity(doc_hyps, p2c=False)
ext_c2p_hyps = spacy2conll_hyps(doc_hyps, ext_entities=True)
extended_entity_results = evaluate(refs, ext_c2p_hyps)
pd_tbl = pd.DataFrame().from_dict(extended_entity_results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
ORG,0.289,0.224,0.253,1661
LOC,0.688,0.627,0.656,1668
MISC,0.71,0.489,0.579,702
PER,0.764,0.602,0.674,1617
total,0.599,0.484,0.536,5648


In [24]:
report = classification_report([r[1] for r_sent in refs for r in r_sent], [r[1] for r_sent in ext_c2p_hyps for r in r_sent], digits=3)
print(report)

              precision    recall  f1-score   support

       B-LOC      0.760     0.662     0.708      1668
      B-MISC      0.805     0.540     0.646       702
       B-ORG      0.419     0.317     0.361      1661
       B-PER      0.791     0.541     0.642      1617
       I-LOC      0.315     0.576     0.407       257
      I-MISC      0.374     0.356     0.365       216
       I-ORG      0.332     0.526     0.407       835
       I-PER      0.708     0.800     0.751      1156
           O      0.951     0.964     0.957     38323

    accuracy                          0.892     46435
   macro avg      0.606     0.587     0.583     46435
weighted avg      0.894     0.892     0.890     46435



We can conclude that extending named entities using the `'compound'` relation is not a great choice. Performances are lower than in the original NER of spaCy.