In [10]:
# imports
import sys
# Relative entry, resolved against the working directory; presumably the parent
# directory is where the local `scripts` package imported below lives.
sys.path.append("../")

from scripts.load_data import label_mapping, read_tsv_file, read_iob2_file
from scripts.evaluation import extract_true_and_pred_labels
from seqeval.metrics import classification_report

## Computing metrics for predictions on the non-augmented and augmented test sets

In [11]:
# Input locations, grouped per test set as (reference file, baseline predictions).

# non-augmented Danish news test set
path_test, path_test_pred = (
    "../data/no_overlap_da_news/da_news_test.tsv",
    "baseline_preds/test_pred.iob2",
)

# augmented test set
path_me_test, path_me_test_pred = (
    "../data/me_data/middle_eastern_test.tsv",
    "baseline_preds/me_test_pred.iob2",
)

In [12]:
# Build the label -> id and id -> label lookups from the non-augmented test file,
# then show both (one per line) so the encoding can be checked by eye.
label2id, id2label = label_mapping(path_test)

print(label2id, id2label, sep="\n")

{'O': 0, 'B-ORG': 1, 'I-ORG': 2, 'B-LOC': 3, 'I-LOC': 4, 'B-PER': 5, 'B-MISC': 6, 'I-PER': 7, 'I-MISC': 8}
{0: 'O', 1: 'B-ORG', 2: 'I-ORG', 3: 'B-LOC', 4: 'I-LOC', 5: 'B-PER', 6: 'B-MISC', 7: 'I-PER', 8: 'I-MISC'}


In [13]:
# Load the reference data and the baseline predictions, paired per test set.
# Both readers receive the same label2id mapping.
test_data, test_pred = (
    read_tsv_file(path_test, label2id),
    read_iob2_file(path_test_pred, label2id),
)
me_test_data, me_test_pred = (
    read_tsv_file(path_me_test, label2id),
    read_iob2_file(path_me_test_pred, label2id),
)

In [14]:
# Collect the true and predicted labels for each test set; these are the
# inputs to the seqeval reports further down.
(true_labels, pred_labels), (me_true_labels, me_pred_labels) = (
    extract_true_and_pred_labels(test_data, test_pred),
    extract_true_and_pred_labels(me_test_data, me_test_pred),
)

## Non-augmented test set

In [15]:
# seqeval report (precision / recall / F1 per entity type) for the
# non-augmented test set, printed under a header line
print(
    "\nClassification Report:",
    classification_report(true_labels, pred_labels),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         LOC       0.78      0.81      0.80        58
        MISC       0.67      0.51      0.58        35
         ORG       0.79      0.61      0.69        82
         PER       0.97      0.95      0.96       118

   micro avg       0.85      0.77      0.81       293
   macro avg       0.80      0.72      0.76       293
weighted avg       0.85      0.77      0.81       293



In [16]:
!python ../scripts/span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 baseline_preds/test_pred.iob2

recall:    0.7773972602739726
precision: 0.8664122137404581
slot-f1:   0.8194945848375452

unlabeled
ul_recall:    0.8458904109589042
ul_precision: 0.9427480916030534
ul_slot-f1:   0.8916967509025271

loose (partial overlap with same label)
l_recall:    0.7876712328767124
l_precision: 0.8816793893129771
l_slot-f1:   0.8320283137058382


## Augmented test set

In [17]:
# seqeval report (precision / recall / F1 per entity type) for the
# augmented test set, printed under a header line
print(
    "\nClassification Report:",
    classification_report(me_true_labels, me_pred_labels),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         LOC       0.77      0.81      0.79        58
        MISC       0.67      0.51      0.58        35
         ORG       0.69      0.46      0.55        82
         PER       0.80      0.90      0.84       118

   micro avg       0.76      0.71      0.73       293
   macro avg       0.73      0.67      0.69       293
weighted avg       0.75      0.71      0.72       293



In [18]:
!python ../scripts/span_f1.py ../data/me_data/middle_eastern_test.iob2 baseline_preds/me_test_pred.iob2

recall:    0.7191780821917808
precision: 0.7749077490774908
slot-f1:   0.7460035523978686

unlabeled
ul_recall:    0.8767123287671232
ul_precision: 0.9446494464944649
ul_slot-f1:   0.9094138543516873

loose (partial overlap with same label)
l_recall:    0.7294520547945206
l_precision: 0.7896678966789668
l_slot-f1:   0.7583665388358802
