<a href="https://colab.research.google.com/github/Bentley97/NLU_Second_Assignment/blob/main/SecondAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Clone the repository into the notebook environment and unzip the CoNLL-2003 archive.

In [1]:
%%bash
# Clone only when the checkout is missing, so the cell can be re-run safely.
if [ ! -d NLU_Second_Assignment ]; then
  git clone https://github.com/Bentley97/NLU_Second_Assignment.git
fi

# -p: do not fail when the target directory already exists.
mkdir -p NLU_Second_Assignment/src/conll2003

# -o: overwrite previously extracted files instead of prompting on a re-run.
unzip -q -o NLU_Second_Assignment/src/conll2003.zip -d NLU_Second_Assignment/src/conll2003

Cloning into 'NLU_Second_Assignment'...


In [2]:
### POST-PROCESS: reassemble tokens that are not separated by whitespace
def reassemble_tokens(doc):
  """Merge every run of whitespace-free adjacent tokens of `doc` into one token.

  The notebook feeds spaCy a sentence built by joining the CoNLL tokens with
  single spaces, so merging tokens that have no whitespace between them is
  meant to bring the spaCy tokenization back in line with the CoNLL one.

  Args:
    doc: object supporting `len()`, integer indexing, slicing and
      `retokenize()`; its tokens must expose `whitespace_`
      (presumably a spaCy `Doc` -- it is produced by `nlp(sentence)` in the caller).

  Returns:
    The same `doc` object, retokenized in place.
  """
  # i: index of the token currently inspected
  i = 0
  # j: start index of the run waiting to be merged; -1 means no run is open
  j = -1
  doc_length = len(doc)
  while i != doc_length:
    # a token without trailing whitespace (and that is not the last token)
    # is glued to the following token: open a run here if none is open yet
    if doc[i].whitespace_ == "" and doc[i] != doc[-1]:
      if j == -1:
        j = i
    elif j != -1:
      # doc[i] closes the open run: merge tokens j..i (inclusive) into one
      with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[j:i+1])
      # the i-j+1 merged tokens now count as a single token
      doc_length -= i-j
      # the merged token sits at index j; continue right after it
      i = j
      j = -1

    i += 1

  return doc

### translate the spaCy entity annotation of each token into a CoNLL-style tag
def convert_labels_into_conll(doc, convert_dict):
  """Return one `(token text, tag)` tuple per token of `doc`.

  Tokens whose `ent_type_` is empty keep their bare `ent_iob_` value; all
  other tokens get `<ent_iob_>-<convert_dict[ent_type_]>`.

  Args:
    doc: iterable of tokens exposing `text`, `ent_type_` and `ent_iob_`.
    convert_dict: mapping from the token's entity type to the target label.

  Returns:
    List of `(text, tag)` tuples in token order.

  Raises:
    KeyError: if a non-empty `ent_type_` is missing from `convert_dict`.
  """
  def conll_tag(token):
    # tokens outside any entity carry no type suffix
    if token.ent_type_ == "":
      return token.ent_iob_
    return token.ent_iob_ + "-" + convert_dict[token.ent_type_]

  return [(token.text, conll_tag(token)) for token in doc]

### flatten a list of tagged sentences into one list of labels, keeping the order
def convert_in_ordered_list_of_label(l):
  """Collect the second element of every tuple of every sentence in `l`.

  Args:
    l: list of sentences, each a list of tuples whose item 1 is the label.

  Returns:
    Flat list of labels, sentence by sentence, in the original order.
  """
  ordered_labels = []
  for tagged_sentence in l:
    ordered_labels.extend(pair[1] for pair in tagged_sentence)
  return ordered_labels
  
### builds the list of (token, NE tag) tuples of one CoNLL sentence
def build_references(sentence):
  """Extract the `(word, NE tag)` pairs from one CoNLL sentence.

  The previous implementation ignored its argument and always read the
  global loop variable `sent`; the argument is now used.

  Args:
    sentence: the CoNLL rows of one sentence, i.e. a sequence of items
      whose element 0 is a string with four space-separated fields (the
      first is the word, the fourth the NE tag).
      For backward compatibility a plain `str` is still accepted: the rows
      cannot be recovered from it, so the function falls back to the
      global `sent`, exactly as the old code did for every call.

  Returns:
    List of `(word, tag)` tuples, one per row, in order.

  Raises:
    ValueError: if a row does not contain exactly four fields.
  """
  if isinstance(sentence, str):
    # legacy call style: rely on the caller's loop variable
    rows = sent
  else:
    rows = sentence

  references = []
  for row in rows:
    word, _pos, _chunk, ne_tag = row[0].split(" ")
    references.append((word, ne_tag))
  return references



In [3]:
import os
import sys
sys.path.insert(0, os.path.abspath('NLU_Second_Assignment/src'))

from conll import read_corpus_conll

import spacy
from spacy.tokens import Doc
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

### dataset splits (the dev path used to be stored in `trn_url` and was immediately overwritten)
dev_url = "NLU_Second_Assignment/src/conll2003/dev.txt"
trn_url = "NLU_Second_Assignment/src/conll2003/train.txt"
tst_url = "NLU_Second_Assignment/src/conll2003/test.txt"

### mapping from the spaCy entity labels to the CoNLL ones;
### it never changes, so it is built once instead of once per sentence
convert_dict = {
    "PERSON": "PER",
    "ORG": "ORG",
    "LOC": "LOC",
    "GPE": "LOC",
    "FAC": "LOC",
    "CARDINAL": "MISC",
    "DATE": "MISC",
    "EVENT": "MISC",
    "LANGUAGE": "MISC",
    "LAW": "MISC",
    "MONEY": "MISC",
    "NORP": "MISC",
    "ORDINAL": "MISC",
    "PERCENT": "MISC",
    "PRODUCT": "MISC",
    "QUANTITY": "MISC",
    "TIME": "MISC",
    "WORK_OF_ART": "MISC"
}

raw_corpus = read_corpus_conll(tst_url) # reading the file
raw_corpus.remove(raw_corpus[0])  # drop the first entry (the -DOCSTART- header)

### loading the English pipeline
nlp = spacy.load("en_core_web_sm")

hyps = []
refs = []

### cycle over all sentences in the corpus
for sent in raw_corpus:
  ### the text given to spaCy: the first field of every row, joined by single spaces
  sentence = " ".join([elem[0].split(" ")[0] for elem in sent])

  ### build the references from the CoNLL rows themselves (not from the joined text)
  refs.append(build_references(sent))

  ### call to the NER of spaCy
  doc = nlp(sentence)

  ### POST-PROCESS: reassemble tokens so they line up with the CoNLL tokens
  doc = reassemble_tokens(doc)

  ### list of (text, label) tuples of the sentence, with labels in CoNLL format
  hyps.append(convert_labels_into_conll(doc, convert_dict))


### adapt hypotheses and references to the sklearn input format
hyps_for_sklearn = convert_in_ordered_list_of_label(hyps)
refs_for_sklearn = convert_in_ordered_list_of_label(refs)

### token-level scoring only makes sense when both sequences are aligned
assert len(hyps_for_sklearn) == len(refs_for_sklearn), "hypotheses and references are not aligned"

### labels that occur in the references
labels = sorted(set(refs_for_sklearn))


### the total accuracy is reported in the row labeled "accuracy"
print("PERFORMANCES token-level:")
print(classification_report(refs_for_sklearn, hyps_for_sklearn, labels=labels, digits=3))

print("Total accuracy: ",accuracy_score(refs_for_sklearn,hyps_for_sklearn))


PERFORMANCES token-level:
              precision    recall  f1-score   support

       B-LOC      0.786     0.726     0.755      1668
      B-MISC      0.091     0.581     0.157       702
       B-ORG      0.523     0.337     0.410      1661
       B-PER      0.780     0.623     0.693      1617
       I-LOC      0.537     0.591     0.563       257
      I-MISC      0.055     0.426     0.097       216
       I-ORG      0.458     0.550     0.499       835
       I-PER      0.736     0.776     0.755      1156
           O      0.950     0.840     0.892     38553

    accuracy                          0.797     46665
   macro avg      0.546     0.606     0.536     46665
weighted avg      0.890     0.797     0.836     46665

Total accuracy:  0.7966998821386478


**EVALUATE FUNCTION**

In [4]:
from conll import evaluate
import pandas as pd

### chunk-level scores from conll.evaluate, shown as a table with one row per key of `results`
results = evaluate(refs, hyps)

print("PERFORMANCES chunk-level:")
### from_dict is a classmethod: call it on the class, no throwaway instance needed
pd_tbl = pd.DataFrame.from_dict(results, orient='index')
pd_tbl.round(decimals=3)

PERFORMANCES chunk-level:


Unnamed: 0,p,r,f,s
ORG,0.464,0.299,0.363,1661
PER,0.74,0.592,0.658,1617
LOC,0.777,0.718,0.746,1668
MISC,0.087,0.558,0.151,702
total,0.363,0.539,0.433,5648


# **2. Grouping of entities**

In [5]:
def grouping_entities(doc):
  """Group the entity labels of `doc` by the noun chunk they belong to.

  Entities are visited in `doc.ents` order. If an entity is the first entity
  of a noun chunk, the labels of all entities of that chunk form one group
  and those entities are not visited again. Any other entity that has not
  been grouped yet forms a group of its own.

  Compared with the previous version, the noun chunks are scanned once
  instead of once per entity, already-grouped entities are tracked in a set,
  and the commented-out debug code is removed. The result is unchanged.

  Args:
    doc: parsed document exposing `ents` and `noun_chunks`; entities expose
      `label_` and `start_char`, chunks expose `ents`.

  Returns:
    List of groups; each group is the list of entity labels it contains.
  """
  entities = doc.ents
  if not entities:
    # as before, the noun chunks are not touched when there are no entities
    return []

  # start offset of a chunk's first entity -> all entities of that chunk
  # (the first chunk wins, like the `break` of the original scan)
  chunk_ents_by_first_start = {}
  for chunk in doc.noun_chunks:
    chunk_ents = chunk.ents
    if len(chunk_ents) != 0:
      chunk_ents_by_first_start.setdefault(chunk_ents[0].start_char, chunk_ents)

  groups = []
  grouped_starts = set()  # start offsets of entities already placed in a group
  for ent in entities:
    start = ent.start_char
    if start in grouped_starts:
      continue

    chunk_ents = chunk_ents_by_first_start.get(start)
    if chunk_ents is None:
      groups.append([ent.label_])
    else:
      groups.append([chunk_ent.label_ for chunk_ent in chunk_ents])
      grouped_starts.update(chunk_ent.start_char for chunk_ent in chunk_ents)

  return groups
  

In [6]:
import spacy

### quick manual check of grouping_entities on a short two-sentence text
test_sentence = "Apple's Steve Jobs died in 2011 in Palo Alto , California . Autonomous cars shift insurance liability toward manufacturers in 1996"

nlp = spacy.load("en_core_web_sm")
doc = nlp(test_sentence)
groups_of_entities = grouping_entities(doc)

### header and result on separate lines
print("Test grouping function", groups_of_entities, sep="\n")


Test grouping function
[['ORG', 'PERSON'], ['DATE'], ['GPE'], ['GPE'], ['DATE']]


In [41]:
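### The order of the labels inside a group is kept (e.g. "GPE, PERSON" and "PERSON, GPE" are counted separately), because a chunk with a different order has a different meaning (it is not clear which statistic the assignment actually asks for)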
from collections import defaultdict

def counting(groups):
  """Count how many times each group of labels occurs.

  Args:
    groups: iterable of groups, each an iterable of label strings.

  Returns:
    `defaultdict(int)` mapping the labels of a group joined by ", "
    (in their original order) to the number of occurrences.
  """
  frequencies = defaultdict(int)

  for group in groups:
    frequencies[", ".join(group)] += 1

  return frequencies


In [51]:
import spacy
### the original line was a bare `from collections`, which is a SyntaxError
from collections import defaultdict

nlp = spacy.load("en_core_web_sm")

groups = []

### collect the entity groups of every sentence of the corpus
for sent in raw_corpus:
  ### the text given to spaCy: the first field of every row, joined by single spaces
  sentence = " ".join([elem[0].split(" ")[0] for elem in sent])

  doc = nlp(sentence)

  groups.extend(grouping_entities(doc))


### frequency of each group, most frequent first
counts = counting(groups)
sort_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)

print("NE groups frequencies:")
for comb in sort_counts:
  print(comb)

NE groups frequencies:
('CARDINAL', 2116)
('GPE', 1346)
('DATE', 1140)
('PERSON', 1105)
('ORG', 955)
('NORP', 308)
('MONEY', 151)
('ORDINAL', 117)
('TIME', 92)
('PERCENT', 86)
('QUANTITY', 82)
('EVENT', 58)
('LOC', 57)
('NORP, PERSON', 47)
('CARDINAL, PERSON', 45)
('GPE, PERSON', 26)
('PRODUCT', 26)
('ORG, PERSON', 25)
('FAC', 21)
('CARDINAL, NORP', 16)
('CARDINAL, ORG', 13)
('WORK_OF_ART', 12)
('GPE, ORG', 11)
('GPE, GPE', 11)
('CARDINAL, GPE', 11)
('PERSON, PERSON', 10)
('DATE, EVENT', 9)
('ORG, ORG', 9)
('LANGUAGE', 8)
('LAW', 8)
('NORP, ORG', 7)
('PERSON, GPE', 6)
('DATE, ORG', 6)
('GPE, CARDINAL', 5)
('DATE, TIME', 5)
('DATE, NORP', 5)
('ORG, GPE', 4)
('CARDINAL, CARDINAL', 4)
('NORP, NORP', 4)
('GPE, NORP', 4)
('ORG, DATE', 4)
('CARDINAL, DATE', 3)
('ORDINAL, PERSON', 3)
('GPE, ORDINAL', 3)
('NORP, GPE', 3)
('ORDINAL, CARDINAL', 2)
('DATE, PRODUCT', 2)
('NORP, DATE', 2)
('ORG, NORP', 2)
('MONEY, ORG', 2)
('PERSON, ORG', 2)
('ORG, LOC', 2)
('DATE, PERSON', 2)
('ORDINAL, EVENT', 2)