<a href="https://colab.research.google.com/github/Bentley97/NLU_Second_Assignment/blob/main/SecondAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To clone the repository into the notebook and unzip the conll2003 archive

In [1]:
%%bash
git clone https://github.com/Bentley97/NLU_Second_Assignment.git

mkdir NLU_Second_Assignment/src/conll2003

unzip -q NLU_Second_Assignment/src/conll2003.zip -d NLU_Second_Assignment/src/conll2003

Cloning into 'NLU_Second_Assignment'...


In [36]:
import os
import sys
sys.path.insert(0, os.path.abspath('NLU_Second_Assignment/src'))

from conll import read_corpus_conll

import spacy
from spacy.tokens import Doc
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

trn_url = "NLU_Second_Assignment/src/conll2003/dev.txt"
trn_url = "NLU_Second_Assignment/src/conll2003/train.txt"
tst_url = "NLU_Second_Assignment/src/conll2003/test.txt"

raw_corpus = read_corpus_conll(tst_url)
raw_corpus.remove(raw_corpus[0])  #remove -DOCSTAR-

#getting hypotheses sentence by sentence
nlp = spacy.load("en_core_web_sm")

hyps = []
refs = []

for sent in raw_corpus:
  sentence = " ".join([elem[0].split(" ")[0] for elem in sent])
  
  refs.append([(e0,e3) for elem in sent for e0,e1,e2,e3 in [elem[0].split(" ")]])
  

  doc = nlp(sentence)


  ##### POST-PROCESS reassamble tokens
  i = 0
  j = -1
  doc_length = len(doc)
  while i != doc_length:
    if doc[i].whitespace_ == "" and doc[i] != doc[-1]:
      if j == -1:
        j = i
    elif j != -1:
      with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[j:i+1])
      doc_length -= i-j
      i = j
      j = -1

    i += 1
  #####

  ##### build the list of tuple (text, label) converting labels in conll format
  temp_hyp = []
  for token in doc:
    if token.ent_type_ == "":
      temp_hyp.append((token.text, token.ent_iob_))
    else:
      if token.ent_type_ == "PERSON":
        temp_hyp.append((token.text, token.ent_iob_+"-PER"))
      elif token.ent_type_ == "ORG":
        temp_hyp.append((token.text, token.ent_iob_+"-ORG"))
      elif token.ent_type_ == "LOC" or token.ent_type_ == "GPE" or token.ent_type_ == "FAC":
        temp_hyp.append((token.text, token.ent_iob_+"-LOC"))
      else:
        temp_hyp.append((token.text, token.ent_iob_+"-MISC")) #includes also numbers (CARDINAL,DATE,MONEY,ORDINAL,PERCENT,QUANTITY,TIME)
  
  hyps.append(temp_hyp)
  #####
  
 
##### adapt hypoteses and references to sklear input format
hyps_for_sklearn = [tup[1] for sent in hyps for tup in sent ]
refs_for_sklearn = [tup[1] for sent in refs for tup in sent ]

labels = sorted(list(set(refs_for_sklearn)))

### total accuracy is labeld as accuracy
print(classification_report(refs_for_sklearn, hyps_for_sklearn, labels=labels, digits=3))

print("Total accuracy: ",accuracy_score(refs_for_sklearn,hyps_for_sklearn))


              precision    recall  f1-score   support

       B-LOC      0.786     0.726     0.755      1668
      B-MISC      0.091     0.581     0.157       702
       B-ORG      0.523     0.337     0.410      1661
       B-PER      0.780     0.623     0.693      1617
       I-LOC      0.537     0.591     0.563       257
      I-MISC      0.055     0.426     0.097       216
       I-ORG      0.458     0.550     0.499       835
       I-PER      0.736     0.776     0.755      1156
           O      0.950     0.840     0.892     38553

    accuracy                          0.797     46665
   macro avg      0.546     0.606     0.536     46665
weighted avg      0.890     0.797     0.836     46665

Total accuracy:  0.7966998821386478


**EVALUATE FUNCTION**

In [None]:
from conll import evaluate
import pandas as pd

results = evaluate(refs, hyps)

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

{'ORG': {'p': 0.7948350071736011, 'r': 0.39571428571428574, 'f': 0.5283738674296614, 's': 1400}, 'PER': {'p': 0.6912751677852349, 'r': 0.2802721088435374, 'f': 0.398838334946757, 's': 735}, 'LOC': {'p': 0.0297753899928798, 'r': 0.8487084870848709, 'f': 0.0575323619535989, 's': 1084}, 'MISC': {'p': 0.5702479338842975, 'r': 0.20294117647058824, 'f': 0.299349240780911, 's': 340}, 'total': {'p': 0.05463234834759793, 'r': 0.4914301770160157, 'f': 0.09833300536924071, 's': 3559}}


Unnamed: 0,p,r,f,s
ORG,0.795,0.396,0.528,1400
PER,0.691,0.28,0.399,735
LOC,0.03,0.849,0.058,1084
MISC,0.57,0.203,0.299,340
total,0.055,0.491,0.098,3559
