In [12]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import label_mapping, read_tsv_file, read_iob2_file
from scripts.evaluation import extract_true_and_pred_labels
from seqeval.metrics import classification_report

# Computing metrics for the final model's predictions on the non-augmented and augmented test sets

In [13]:
# Input locations, relative to this notebook's directory.
# Test sets (non-augmented, then the middle-eastern test set used in the
# "Augmented test set" section):
path_test = "../data/no_overlap_da_news/da_news_test.tsv"
path_me_test = "../data/me_data/middle_eastern_test.tsv"
# Prediction files of the final model for each of the two test sets:
path_test_pred = "final_preds/final_test_pred.iob2"
path_me_test_pred = "final_preds/final_me_test_pred.iob2"

In [14]:
# Build the label <-> id lookup tables from the non-augmented test file;
# the same `label2id` is passed to every file reader further down.
label2id, id2label = label_mapping(path_test)

# Show both mappings, one per line, as a quick sanity check.
print(label2id, id2label, sep="\n")

{'I-PER': 0, 'B-PER': 1, 'I-ORG': 2, 'B-ORG': 3, 'O': 4, 'B-MISC': 5, 'I-MISC': 6, 'I-LOC': 7, 'B-LOC': 8}
{0: 'I-PER', 1: 'B-PER', 2: 'I-ORG', 3: 'B-ORG', 4: 'O', 5: 'B-MISC', 6: 'I-MISC', 7: 'I-LOC', 8: 'B-LOC'}


In [15]:
# Non-augmented test set: test data (.tsv) and the matching predictions (.iob2).
test_data = read_tsv_file(path_test, label2id)
test_pred = read_iob2_file(path_test_pred, label2id)

# Augmented (middle-eastern) test set: same two readers, same label mapping.
me_test_data = read_tsv_file(path_me_test, label2id)
me_test_pred = read_iob2_file(path_me_test_pred, label2id)

In [16]:
# Collect the true and predicted labels for both test sets; these feed the
# classification reports in the two sections below.
true_labels, pred_labels = extract_true_and_pred_labels(
    test_data, test_pred
)
me_true_labels, me_pred_labels = extract_true_and_pred_labels(
    me_test_data, me_test_pred
)

## Non-augmented test set

In [17]:
# print span-f1
!python ../scripts/span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 final_preds/final_test_pred.iob2

recall:    0.8047945205479452
precision: 0.8362989323843416
slot-f1:   0.8202443280977312

unlabeled
ul_recall:    0.8904109589041096
ul_precision: 0.9252669039145908
ul_slot-f1:   0.9075043630017452

loose (partial overlap with same label)
l_recall:    0.821917808219178
l_precision: 0.8540925266903915
l_slot-f1:   0.837696335078534


In [18]:
# seqeval classification report (per entity type) for the non-augmented test set
report = classification_report(true_labels, pred_labels)
print("\nClassification Report:", report, sep="\n")


Classification Report:
              precision    recall  f1-score   support

         LOC       0.80      0.88      0.84        58
        MISC       0.59      0.63      0.61        35
         ORG       0.82      0.71      0.76        82
         PER       0.96      0.89      0.93       118

   micro avg       0.84      0.81      0.82       293
   macro avg       0.79      0.78      0.78       293
weighted avg       0.85      0.81      0.82       293



## Augmented test set

In [19]:
# print span-f1
!python ../scripts/span_f1.py ../data/me_data/middle_eastern_test.iob2 final_preds/final_me_test_pred.iob2

recall:    0.7808219178082192
precision: 0.7916666666666666
slot-f1:   0.7862068965517242

unlabeled
ul_recall:    0.9178082191780822
ul_precision: 0.9305555555555556
ul_slot-f1:   0.9241379310344828

loose (partial overlap with same label)
l_recall:    0.797945205479452
l_precision: 0.8055555555555556
l_slot-f1:   0.8017323208353109


In [20]:
# seqeval classification report (per entity type) for the augmented test set
me_report = classification_report(me_true_labels, me_pred_labels)
print("\nClassification Report:", me_report, sep="\n")


Classification Report:
              precision    recall  f1-score   support

         LOC       0.78      0.86      0.82        58
        MISC       0.65      0.63      0.64        35
         ORG       0.72      0.60      0.65        82
         PER       0.87      0.92      0.89       118

   micro avg       0.79      0.78      0.79       293
   macro avg       0.75      0.75      0.75       293
weighted avg       0.78      0.78      0.78       293

