<a href="https://colab.research.google.com/github/Bentley97/NLU_Second_Assignment/blob/main/SecondAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To clone the repository into the notebook and unzip the conll2003 archive

In [1]:
%%bash
git clone https://github.com/Bentley97/NLU_Second_Assignment.git

mkdir NLU_Second_Assignment/src/conll2003

unzip -q NLU_Second_Assignment/src/conll2003.zip -d NLU_Second_Assignment/src/conll2003

Cloning into 'NLU_Second_Assignment'...


In [127]:
### POST-PROCESS reassamble tokens
def reassemble_tokens(doc):
  i = 0
  j = -1
  doc_length = len(doc)
  while i != doc_length:
    if doc[i].whitespace_ == "" and doc[i] != doc[-1]:
      if j == -1:
        j = i
    elif j != -1:
      with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[j:i+1])
      doc_length -= i-j
      i = j
      j = -1

    i += 1

  return doc

### convert labels from the spacy format to the conll format
def convert_labels_into_conll(doc):
  temp_hyp = []
  for token in doc:
    if token.ent_type_ == "":
      temp_hyp.append((token.text, token.ent_iob_))
    else:
      if token.ent_type_ == "PERSON":
        temp_hyp.append((token.text, token.ent_iob_+"-PER"))
      elif token.ent_type_ == "ORG":
        temp_hyp.append((token.text, token.ent_iob_+"-ORG"))
      elif token.ent_type_ == "LOC" or token.ent_type_ == "GPE" or token.ent_type_ == "FAC":
        temp_hyp.append((token.text, token.ent_iob_+"-LOC"))
      else:
        temp_hyp.append((token.text, token.ent_iob_+"-MISC")) #includes also numbers (CARDINAL,DATE,MONEY,ORDINAL,PERCENT,QUANTITY,TIME)

  return temp_hyp

### convert a list of lists of tuples into a list of string(label) with the same order
def convert_in_ordered_list_of_label(l):
  return [tup[1] for sent in l for tup in sent ]
  
### builds a list of tuples from a text sentence
def build_references(sentence):
  return [(e0,e3) for elem in sent for e0,e1,e2,e3 in [elem[0].split(" ")]]



In [128]:
import os
import sys
sys.path.insert(0, os.path.abspath('NLU_Second_Assignment/src'))

from conll import read_corpus_conll

import spacy
from spacy.tokens import Doc
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

trn_url = "NLU_Second_Assignment/src/conll2003/dev.txt"
trn_url = "NLU_Second_Assignment/src/conll2003/train.txt"
tst_url = "NLU_Second_Assignment/src/conll2003/test.txt"


raw_corpus = read_corpus_conll(tst_url) # reading the file
raw_corpus.remove(raw_corpus[0])  # remove -DOCSTAR-

### loading the english pipeline
nlp = spacy.load("en_core_web_sm")

hyps = []
refs = []

### cicle over all sentences in the corpus
for sent in raw_corpus:
  sentence = " ".join([elem[0].split(" ")[0] for elem in sent])
  
  ### building list of references for a sentence and append it to the list of references of the whole
  refs.append(build_references(sentence))
  
  ### call to the NER of spacy
  doc = nlp(sentence)

  ### POST-PROCESS reassamble tokens
  doc = reassemble_tokens(doc)

  ### build the list of tuple (text, label) for a sentence converting labels in conll format and appending to list of hypoteses
  hyps.append(convert_labels_into_conll(doc))
  
 
### adapt hypoteses and references to sklear input format
hyps_for_sklearn = convert_in_ordered_list_of_label(hyps)
refs_for_sklearn = convert_in_ordered_list_of_label(refs)

### extract labels present 
labels = sorted(list(set(refs_for_sklearn)))


### total accuracy is labeld as accuracy
print("PERFORMANCES token-level:")
print(classification_report(refs_for_sklearn, hyps_for_sklearn, labels=labels, digits=3))

print("Total accuracy: ",accuracy_score(refs_for_sklearn,hyps_for_sklearn))


PERFORMANCES token-level:
              precision    recall  f1-score   support

       B-LOC      0.786     0.726     0.755      1668
      B-MISC      0.091     0.581     0.157       702
       B-ORG      0.523     0.337     0.410      1661
       B-PER      0.780     0.623     0.693      1617
       I-LOC      0.537     0.591     0.563       257
      I-MISC      0.055     0.426     0.097       216
       I-ORG      0.458     0.550     0.499       835
       I-PER      0.736     0.776     0.755      1156
           O      0.950     0.840     0.892     38553

    accuracy                          0.797     46665
   macro avg      0.546     0.606     0.536     46665
weighted avg      0.890     0.797     0.836     46665

Total accuracy:  0.7966998821386478


**EVALUATE FUNCTION**

In [125]:
from conll import evaluate
import pandas as pd

results = evaluate(refs, hyps)

print("PERFORMANCES chunk-level:")
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

PERFORMANCES chunk-level:


Unnamed: 0,p,r,f,s
LOC,0.777,0.718,0.746,1668
MISC,0.087,0.558,0.151,702
ORG,0.464,0.299,0.363,1661
PER,0.74,0.592,0.658,1617
total,0.363,0.539,0.433,5648


In [None]:
####################################################     NON SERVE     ######################################################
def extract_label(refs, hyps, label):
  r = []
  h = []

  for i in range(len(refs)):
    if hyps[i] == label:
    #if refs[i] == label or hyps[i] == label or refs[i] != hyps[i]:
      r.append(refs[i])
      h.append(hyps[i])
  
  return r,h