In [73]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import mapping, read_tsv_file, read_iob2_file
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

## Computing metrics for predictions on non-augmented and augmented test set

In [74]:
# paths to data
path_test = "../data/no_overlap_da_news/da_news_test.tsv"
path_test_pred = "predictions/test_pred.iob2" 
path_me_test = "../data/me_data/middle_eastern_test.tsv"
path_me_test_pred = "predictions/me_test_pred.iob2" 

In [75]:
# mapping labels
label2id, id2label = mapping(path_test)

print(label2id)
print(id2label)

{'I-ORG': 0, 'B-ORG': 1, 'I-LOC': 2, 'B-MISC': 3, 'I-PER': 4, 'B-LOC': 5, 'I-MISC': 6, 'B-PER': 7, 'O': 8}
{0: 'I-ORG', 1: 'B-ORG', 2: 'I-LOC', 3: 'B-MISC', 4: 'I-PER', 5: 'B-LOC', 6: 'I-MISC', 7: 'B-PER', 8: 'O'}


In [76]:
# reading in files
test_data = read_tsv_file(path_test, label2id)
test_pred = read_iob2_file(path_test_pred, label2id)
me_test_data = read_tsv_file(path_me_test, label2id)
me_test_pred = read_iob2_file(path_me_test_pred, label2id)

In [77]:
# first sentence
print(test_data[1])
print(test_pred[1])
print(me_test_data[1])
print(me_test_pred[1])

{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforandring', 'til', 'Werwolf', ',', 'men', 'det', 'fik', 'han', 'ikke', 'lov', 'til', ',', 'for', 'det', 'betragtes', 'som', 'upassende', 'og', 'anstødeligt', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]}
{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforandring', 'til', 'Werwolf', ',', 'men', 'det', 'fik', 'han', 'ikke', 'lov', 'til', ',', 'for', 'det', 'betragtes', 'som', 'upassende', 'og', 'anstødeligt', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]}
{'tokens': ['I', '1983', 'ville', 'en', 'mand', 'have', 'navneforand

In [78]:
# saving all true labels
true_labels = []

for sent in test_data:
    true_labels.append(sent['ner_tags'])

print(true_labels[1])
print(len(true_labels) == len(test_data))

# saving all predicted labels
pred_labels = []

for sent in test_pred:
    pred_labels.append(sent['ner_tags'])

print(pred_labels[1])
print(len(pred_labels) == len(test_data))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


In [79]:
# for ME predictions
me_true_labels = []

for sent in me_test_data:
    me_true_labels.append(sent['ner_tags'])

print(me_true_labels[1])
print(len(me_true_labels) == len(test_data))

me_pred_labels = []

for sent in me_test_pred:
    me_pred_labels.append(sent['ner_tags'])

print(me_pred_labels[1])
print(len(me_pred_labels) == len(test_data))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


## Non-augmented test set

In [80]:
# compute precision, recall, and F1 score
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)

print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1: {f1:.2f}%")

# optionally, print a detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

Precision: 0.79%
Recall: 0.73%
F1: 0.76%

Classification Report:
              precision    recall  f1-score   support

         LOC       0.79      0.82      0.81        91
        MISC       0.48      0.22      0.30        63
         ORG       0.72      0.66      0.69       122
         PER       0.90      0.92      0.91       169

   micro avg       0.79      0.73      0.76       445
   macro avg       0.72      0.66      0.68       445
weighted avg       0.77      0.73      0.74       445



In [84]:
!python span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 predictions/test_pred.iob2

recall:    0.7268623024830699
precision: 0.7970297029702971
slot-f1:   0.7603305785123967

unlabeled
ul_recall:    0.8261851015801355
ul_precision: 0.905940594059406
ul_slot-f1:   0.8642266824085006

loose (partial overlap with same label)
l_recall:    0.7516930022573364
l_precision: 0.8267326732673267
l_slot-f1:   0.7874291135386946


## Augmented test set

In [81]:
# compute precision, recall, and F1 score
precision = precision_score(me_true_labels, me_pred_labels)
recall = recall_score(me_true_labels, me_pred_labels)
f1 = f1_score(me_true_labels, me_pred_labels)

print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1: {f1:.2f}%")

# optionally, print a detailed classification report
print("\nClassification Report:")
print(classification_report(me_true_labels, me_pred_labels))

Precision: 0.71%
Recall: 0.67%
F1: 0.69%

Classification Report:
              precision    recall  f1-score   support

         LOC       0.80      0.65      0.72        91
        MISC       0.41      0.21      0.27        63
         ORG       0.63      0.59      0.61       122
         PER       0.78      0.91      0.84       169

   micro avg       0.71      0.67      0.69       445
   macro avg       0.65      0.59      0.61       445
weighted avg       0.69      0.67      0.67       445



In [None]:
!python span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 predictions/test_pred.iob2