In [18]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import label_mapping, read_tsv_file, read_iob2_file
from scripts.evaluation import extract_true_and_pred_labels
from seqeval.metrics import classification_report

In [19]:
# paths to data
path_test = "../data/no_overlap_da_news/da_news_test.tsv"
path_test_pred = "final_preds/final_me_test_pred.iob2" 
path_me_test = "../data/me_data/middle_eastern_test.tsv"
path_me_test_pred = "final_preds/final_test_pred.iob2" 

In [20]:
# mapping labels
label2id, id2label = label_mapping(path_test)

print(label2id)
print(id2label)

{'B-PER': 0, 'B-MISC': 1, 'I-PER': 2, 'B-ORG': 3, 'O': 4, 'I-MISC': 5, 'B-LOC': 6, 'I-ORG': 7, 'I-LOC': 8}
{0: 'B-PER', 1: 'B-MISC', 2: 'I-PER', 3: 'B-ORG', 4: 'O', 5: 'I-MISC', 6: 'B-LOC', 7: 'I-ORG', 8: 'I-LOC'}


In [21]:
# reading in files
test_data = read_tsv_file(path_test, label2id)
test_pred = read_iob2_file(path_test_pred, label2id)
me_test_data = read_tsv_file(path_me_test, label2id)
me_test_pred = read_iob2_file(path_me_test_pred, label2id)

In [22]:
# saving all true and predicted labels
true_labels, pred_labels = extract_true_and_pred_labels(test_data, test_pred)
me_true_labels, me_pred_labels = extract_true_and_pred_labels(me_test_data, me_test_pred)

## Non-augmented test set

In [23]:
!python ../scripts/span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 final_preds/final_test_pred.iob2

recall:    0.7808219178082192
precision: 0.8507462686567164
slot-f1:   0.8142857142857143

unlabeled
ul_recall:    0.8493150684931506
ul_precision: 0.9253731343283582
ul_slot-f1:   0.8857142857142857

loose (partial overlap with same label)
l_recall:    0.7945205479452054
l_precision: 0.8694029850746269
l_slot-f1:   0.830276779405892


In [24]:
# print classification report
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))


Classification Report:
              precision    recall  f1-score   support

         LOC       0.73      0.81      0.77        58
        MISC       0.70      0.54      0.61        35
         ORG       0.75      0.56      0.64        82
         PER       0.82      0.93      0.87       118

   micro avg       0.78      0.76      0.77       293
   macro avg       0.75      0.71      0.72       293
weighted avg       0.77      0.76      0.76       293



## Augmented test set

In [25]:
!python ../scripts/span_f1.py ../data/me_data/middle_eastern_test.iob2 final_preds/final_me_test_pred.iob2

recall:    0.7636986301369864
precision: 0.7852112676056338
slot-f1:   0.7743055555555555

unlabeled
ul_recall:    0.9143835616438356
ul_precision: 0.9401408450704225
ul_slot-f1:   0.9270833333333333

loose (partial overlap with same label)
l_recall:    0.773972602739726
l_precision: 0.795774647887324
l_slot-f1:   0.7847222222222222


In [26]:
# print a classification report
print("\nClassification Report:")
print(classification_report(me_true_labels, me_pred_labels))


Classification Report:
              precision    recall  f1-score   support

         LOC       0.77      0.83      0.80        58
        MISC       0.66      0.54      0.59        35
         ORG       0.79      0.61      0.69        82
         PER       0.93      0.93      0.93       118

   micro avg       0.83      0.77      0.80       293
   macro avg       0.79      0.73      0.75       293
weighted avg       0.83      0.77      0.80       293

