In [2]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import mapping, read_tsv_file, read_iob2_file
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# Computing metrics for predictions on non-augmented test set

In [3]:
# paths to data
path_test = "../data/da_news_new/new_da_news_test.tsv"
path_test_pred = "../baseline/outputs/test_pred.iob2" 

In [4]:
# mapping labels
label2id, id2label = mapping(path_test)

print(label2id)
print(id2label)

{'B-LOC': 0, 'O': 1, 'B-ORG': 2, 'I-PER': 3, 'I-MISC': 4, 'I-ORG': 5, 'B-MISC': 6, 'I-LOC': 7, 'B-PER': 8}
{0: 'B-LOC', 1: 'O', 2: 'B-ORG', 3: 'I-PER', 4: 'I-MISC', 5: 'I-ORG', 6: 'B-MISC', 7: 'I-LOC', 8: 'B-PER'}


In [5]:
# reading in files
test_data = read_tsv_file(path_test, label2id)
test_pred = read_iob2_file(path_test_pred, label2id)

In [6]:
# first sentence
print(test_data[0])
print(test_pred[0])

{'tokens': ['Under', 'rejser', 'og', 'ophold', 'i', 'udlandet', 'følger', 'sygeplejersker', 'og', 'hjælpere', 'med', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'tokens': ['Under', 'rejser', 'og', 'ophold', 'i', 'udlandet', 'følger', 'sygeplejersker', 'og', 'hjælpere', 'med', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
# saving all true labels
true_labels = []

for sent in test_data:
    true_labels.append(sent['ner_tags'])

print(true_labels[0])
print(len(true_labels) == len(test_data))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


In [8]:
# saving all predicted labels
pred_labels = []

for sent in test_pred:
    pred_labels.append(sent['ner_tags'])

print(pred_labels[0])
print(len(true_labels) == len(test_data))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
True


In [9]:
# compute precision, recall, and F1 score
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)

print(f"Precision: {precision:.2f}%")
print(f"Recall: {recall:.2f}%")
print(f"F1: {f1:.2f}%")

# optionally, print a detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

Precision: 0.77%
Recall: 0.73%
F1: 0.75%

Classification Report:
              precision    recall  f1-score   support

         LOC       0.77      0.77      0.77        91
        MISC       0.58      0.24      0.34        63
         ORG       0.64      0.71      0.67       122
         PER       0.91      0.89      0.90       169

   micro avg       0.77      0.73      0.75       445
   macro avg       0.72      0.65      0.67       445
weighted avg       0.76      0.73      0.73       445

