In [12]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import mapping, read_tsv_file, read_iob2_file
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

## Computing metrics for predictions on the non-augmented and augmented test sets

In [13]:
# Input files (relative paths): for each test set, the gold annotations (.tsv)
# and the corresponding model predictions (.iob2).

# non-augmented test set
path_test, path_test_pred = (
    "../data/no_overlap_da_news/da_news_test.tsv",
    "predictions/test_pred.iob2",
)

# augmented ("me") test set
path_me_test, path_me_test_pred = (
    "../data/me_data/middle_eastern_test.tsv",
    "predictions/me_test_pred.iob2",
)

In [14]:
# Derive the tag -> id and id -> tag lookup tables from the non-augmented
# test file, then show both directions.
label2id, id2label = mapping(path_test)

for lookup in (label2id, id2label):
    print(lookup)

{'I-LOC': 0, 'B-PER': 1, 'B-LOC': 2, 'B-ORG': 3, 'O': 4, 'I-PER': 5, 'I-ORG': 6, 'B-MISC': 7, 'I-MISC': 8}
{0: 'I-LOC', 1: 'B-PER', 2: 'B-LOC', 3: 'B-ORG', 4: 'O', 5: 'I-PER', 6: 'I-ORG', 7: 'B-MISC', 8: 'I-MISC'}


In [15]:
# Load both test sets. Gold annotations live in .tsv files and model
# predictions in .iob2 files; every reader receives the same label2id table.
test_data, me_test_data = (
    read_tsv_file(gold_path, label2id)
    for gold_path in (path_test, path_me_test)
)
test_pred, me_test_pred = (
    read_iob2_file(pred_path, label2id)
    for pred_path in (path_test_pred, path_me_test_pred)
)

In [16]:
# Sanity check: show the entry at index 1 (i.e. the second sentence) of every
# gold and predicted data set.
for dataset in (test_data, test_pred, me_test_data, me_test_pred):
    print(dataset[1])

{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforandring', 'til', 'Werwolf', ',', 'men', 'det', 'fik', 'han', 'ikke', 'lov', 'til', ',', 'for', 'det', 'betragtes', 'som', 'upassende', 'og', 'anstødeligt', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]}
{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforandring', 'til', 'Werwolf', ',', 'men', 'det', 'fik', 'han', 'ikke', 'lov', 'til', ',', 'for', 'det', 'betragtes', 'som', 'upassende', 'og', 'anstødeligt', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]}
{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforand

In [17]:
# Gold tag sequences of the non-augmented test set: one list of tags per sentence.
true_labels = [sentence['ner_tags'] for sentence in test_data]

print(true_labels[1])
print(len(test_data) == len(true_labels))

# Predicted tag sequences, in the same per-sentence layout.
pred_labels = [sentence['ner_tags'] for sentence in test_pred]

print(pred_labels[1])
print(len(test_data) == len(pred_labels))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


In [18]:
# Same extraction for the augmented (ME) test set.

# Gold tag sequences: one list of tags per sentence.
me_true_labels = [sent['ner_tags'] for sent in me_test_data]

print(me_true_labels[1])
# Sizes are checked against the ME gold data itself; comparing with the
# non-augmented `test_data` would validate against an unrelated data set.
print(len(me_true_labels) == len(me_test_data))

# Predicted tag sequences for the ME test set.
me_pred_labels = [sent['ner_tags'] for sent in me_test_pred]

print(me_pred_labels[1])
# Every gold ME sentence must have a corresponding prediction.
print(len(me_pred_labels) == len(me_test_data))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


## Non-augmented test set

In [19]:
# Overall precision, recall and F1 of the predictions on the non-augmented
# test set (seqeval scores whole entity spans, not single tokens).
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)

# The scores are fractions in [0, 1]. The `%` presentation type multiplies by
# 100 and appends the sign, so 0.8507 is shown as "85.07%" rather than the
# misleading "0.85%".
print(f"Precision: {precision:.2%}")
print(f"Recall: {recall:.2%}")
print(f"F1: {f1:.2%}")

# Per-entity-type breakdown plus micro/macro/weighted averages.
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

Precision: 0.85%
Recall: 0.78%
F1: 0.81%

Classification Report:
              precision    recall  f1-score   support

         LOC       0.79      0.79      0.79        58
        MISC       0.65      0.43      0.52        35
         ORG       0.85      0.68      0.76        82
         PER       0.92      0.95      0.93       118

   micro avg       0.85      0.78      0.81       293
   macro avg       0.80      0.71      0.75       293
weighted avg       0.84      0.78      0.81       293



In [20]:
# Cross-check the scores on the non-augmented test set with the stand-alone
# span_f1.py script, which also reports unlabeled and loose (partial-overlap)
# scores.
# NOTE(review): the gold file passed here is the .iob2 file, not the .tsv file
# loaded via path_test -- presumably the same annotations in another format;
# confirm.
!python span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 predictions/test_pred.iob2

recall:    0.7808219178082192
precision: 0.8507462686567164
slot-f1:   0.8142857142857143

unlabeled
ul_recall:    0.8561643835616438
ul_precision: 0.9328358208955224
ul_slot-f1:   0.8928571428571429

loose (partial overlap with same label)
l_recall:    0.7945205479452054
l_precision: 0.8694029850746269
l_slot-f1:   0.830276779405892


## Augmented test set

In [21]:
# compute precision, recall, and F1 score
precision = precision_score(me_true_labels, me_pred_labels)
recall = recall_score(me_true_labels, me_pred_labels)
f1 = f1_score(me_true_labels, me_pred_labels)

print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1: {f1:.2f}%")

# optionally, print a detailed classification report
print("\nClassification Report:")
print(classification_report(me_true_labels, me_pred_labels))

Precision: 0.75%
Recall: 0.71%
F1: 0.73%

Classification Report:
              precision    recall  f1-score   support

         LOC       0.76      0.78      0.77        58
        MISC       0.65      0.43      0.52        35
         ORG       0.75      0.48      0.58        82
         PER       0.76      0.93      0.84       118

   micro avg       0.75      0.71      0.73       293
   macro avg       0.73      0.65      0.68       293
weighted avg       0.74      0.71      0.71       293



In [22]:
# Cross-check the scores on the augmented (ME) test set with the stand-alone
# span_f1.py script, which also reports unlabeled and loose (partial-overlap)
# scores.
# NOTE(review): the gold file passed here is the .iob2 file, not the .tsv file
# loaded via path_me_test -- presumably the same annotations in another
# format; confirm.
!python span_f1.py ../data/me_data/middle_eastern_test.iob2 predictions/me_test_pred.iob2

recall:    0.708904109589041
precision: 0.7527272727272727
slot-f1:   0.73015873015873

unlabeled
ul_recall:    0.8698630136986302
ul_precision: 0.9236363636363636
ul_slot-f1:   0.8959435626102293

loose (partial overlap with same label)
l_recall:    0.7363013698630136
l_precision: 0.7818181818181819
l_slot-f1:   0.7583774250440918
