In [1]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import mapping, read_tsv_file, read_iob2_file
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

## Computing metrics for predictions on non-augmented and augmented test set

In [2]:
# paths to data
path_test = "../data/no_overlap_da_news/da_news_test.tsv"
path_test_pred = "predictions/test_pred.iob2" 
path_me_test = "../data/me_data/middle_eastern_test.tsv"
path_me_test_pred = "predictions/me_test_pred.iob2" 

In [3]:
# mapping labels
label2id, id2label = mapping(path_test)

print(label2id)
print(id2label)

{'B-ORG': 0, 'O': 1, 'I-PER': 2, 'B-MISC': 3, 'I-ORG': 4, 'B-PER': 5, 'I-LOC': 6, 'I-MISC': 7, 'B-LOC': 8}
{0: 'B-ORG', 1: 'O', 2: 'I-PER', 3: 'B-MISC', 4: 'I-ORG', 5: 'B-PER', 6: 'I-LOC', 7: 'I-MISC', 8: 'B-LOC'}


In [4]:
# reading in files
test_data = read_tsv_file(path_test, label2id)
test_pred = read_iob2_file(path_test_pred, label2id)
me_test_data = read_tsv_file(path_me_test, label2id)
me_test_pred = read_iob2_file(path_me_test_pred, label2id)

In [5]:
# first sentence
print(test_data[1])
print(test_pred[1])
print(me_test_data[1])
print(me_test_pred[1])

{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforandring', 'til', 'Werwolf', ',', 'men', 'det', 'fik', 'han', 'ikke', 'lov', 'til', ',', 'for', 'det', 'betragtes', 'som', 'upassende', 'og', 'anstødeligt', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforandring', 'til', 'Werwolf', ',', 'men', 'det', 'fik', 'han', 'ikke', 'lov', 'til', ',', 'for', 'det', 'betragtes', 'som', 'upassende', 'og', 'anstødeligt', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforand

In [6]:
# saving all true labels
true_labels = []

for sent in test_data:
    true_labels.append(sent['ner_tags'])

print(true_labels[1])
print(len(true_labels) == len(test_data))

# saving all predicted labels
pred_labels = []

for sent in test_pred:
    pred_labels.append(sent['ner_tags'])

print(pred_labels[1])
print(len(pred_labels) == len(test_data))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


In [7]:
# for ME predictions
me_true_labels = []

for sent in me_test_data:
    me_true_labels.append(sent['ner_tags'])

print(me_true_labels[1])
print(len(me_true_labels) == len(test_data))

me_pred_labels = []

for sent in me_test_pred:
    me_pred_labels.append(sent['ner_tags'])

print(me_pred_labels[1])
print(len(me_pred_labels) == len(test_data))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


## Non-augmented test set

In [8]:
# compute precision, recall, and F1 score
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)

print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1: {f1:.2f}%")

# optionally, print a detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

Precision: 0.82%
Recall: 0.77%
F1: 0.79%

Classification Report:
              precision    recall  f1-score   support

         LOC       0.77      0.79      0.78        58
        MISC       0.70      0.54      0.61        35
         ORG       0.74      0.63      0.68        82
         PER       0.92      0.92      0.92       118

   micro avg       0.82      0.77      0.79       293
   macro avg       0.78      0.72      0.75       293
weighted avg       0.81      0.77      0.79       293



In [9]:
!python span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 predictions/test_pred.iob2

recall:    0.7705479452054794
precision: 0.8302583025830258
slot-f1:   0.7992895204262876

unlabeled
ul_recall:    0.8561643835616438
ul_precision: 0.922509225092251
ul_slot-f1:   0.8880994671403197

loose (partial overlap with same label)
l_recall:    0.7842465753424658
l_precision: 0.8487084870848709
l_slot-f1:   0.8152051942825745


## Augmented test set

In [10]:
# compute precision, recall, and F1 score
precision = precision_score(me_true_labels, me_pred_labels)
recall = recall_score(me_true_labels, me_pred_labels)
f1 = f1_score(me_true_labels, me_pred_labels)

print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1: {f1:.2f}%")

# optionally, print a detailed classification report
print("\nClassification Report:")
print(classification_report(me_true_labels, me_pred_labels))

Precision: 0.75%
Recall: 0.71%
F1: 0.73%

Classification Report:
              precision    recall  f1-score   support

         LOC       0.75      0.71      0.73        58
        MISC       0.45      0.51      0.48        35
         ORG       0.79      0.54      0.64        82
         PER       0.83      0.90      0.86       118

   micro avg       0.75      0.71      0.73       293
   macro avg       0.70      0.66      0.68       293
weighted avg       0.75      0.71      0.73       293



In [11]:
!python span_f1.py ../data/me_data/middle_eastern_test.iob2 predictions/me_test_pred.iob2

recall:    0.7123287671232876
precision: 0.7482014388489209
slot-f1:   0.7298245614035087

unlabeled
ul_recall:    0.8767123287671232
ul_precision: 0.920863309352518
ul_slot-f1:   0.8982456140350877

loose (partial overlap with same label)
l_recall:    0.7328767123287672
l_precision: 0.7733812949640287
l_slot-f1:   0.7525844019890081
