In [None]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import label_mapping, read_tsv_file, read_iob2_file
from scripts.evaluation import extract_true_and_pred_labels
from seqeval.metrics import classification_report

## Computing metrics for predictions on the non-augmented and augmented test sets

In [16]:
# Input files, grouped per test set as (gold annotations, baseline predictions).

# Non-augmented test set
path_test, path_test_pred = (
    "../data/no_overlap_da_news/da_news_test.tsv",
    "baseline_preds/test_pred.iob2",
)

# Augmented ("me") test set
path_me_test, path_me_test_pred = (
    "../data/me_data/middle_eastern_test.tsv",
    "baseline_preds/me_test_pred.iob2",
)

In [17]:
# Build the label <-> id lookup tables from the non-augmented test file;
# the same `label2id` is reused when reading every gold/prediction file below.
# NOTE(review): the printed ids are not in sorted label order — confirm that
# label_mapping assigns ids deterministically across kernel restarts.
label2id, id2label = label_mapping(path_test)

# Show both directions of the mapping, one dict per line.
print(label2id, id2label, sep="\n")

{'B-LOC': 0, 'I-LOC': 1, 'I-PER': 2, 'O': 3, 'I-ORG': 4, 'B-MISC': 5, 'B-ORG': 6, 'I-MISC': 7, 'B-PER': 8}
{0: 'B-LOC', 1: 'I-LOC', 2: 'I-PER', 3: 'O', 4: 'I-ORG', 5: 'B-MISC', 6: 'B-ORG', 7: 'I-MISC', 8: 'B-PER'}


In [18]:
# Load gold annotations (.tsv) and baseline predictions (.iob2) for each test
# set, converting labels with the shared `label2id` mapping.

# Non-augmented test set
test_data, test_pred = (
    read_tsv_file(path_test, label2id),
    read_iob2_file(path_test_pred, label2id),
)

# Augmented ("me") test set
me_test_data, me_test_pred = (
    read_tsv_file(path_me_test, label2id),
    read_iob2_file(path_me_test_pred, label2id),
)

In [None]:
# Pair each gold file with its predictions and keep the resulting
# (true, predicted) label collections for the classification reports below.
(true_labels, pred_labels) = extract_true_and_pred_labels(
    test_data, test_pred
)
(me_true_labels, me_pred_labels) = extract_true_and_pred_labels(
    me_test_data, me_test_pred
)

## Non-augmented test set

In [20]:
# seqeval classification report for the non-augmented test set:
# header line first, then the report text on the following lines.
print(
    "\nClassification Report:",
    classification_report(true_labels, pred_labels),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         LOC       0.77      0.79      0.78        58
        MISC       0.70      0.54      0.61        35
         ORG       0.74      0.63      0.68        82
         PER       0.92      0.92      0.92       118

   micro avg       0.82      0.77      0.79       293
   macro avg       0.78      0.72      0.75       293
weighted avg       0.81      0.77      0.79       293



In [21]:
!python ../scripts/span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 baseline_preds/test_pred.iob2

recall:    0.7705479452054794
precision: 0.8302583025830258
slot-f1:   0.7992895204262876

unlabeled
ul_recall:    0.8561643835616438
ul_precision: 0.922509225092251
ul_slot-f1:   0.8880994671403197

loose (partial overlap with same label)
l_recall:    0.7842465753424658
l_precision: 0.8487084870848709
l_slot-f1:   0.8152051942825745


## Augmented test set

In [22]:
# seqeval classification report for the augmented ("me") test set:
# header line first, then the report text on the following lines.
print(
    "\nClassification Report:",
    classification_report(me_true_labels, me_pred_labels),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         LOC       0.75      0.71      0.73        58
        MISC       0.45      0.51      0.48        35
         ORG       0.79      0.54      0.64        82
         PER       0.83      0.90      0.86       118

   micro avg       0.75      0.71      0.73       293
   macro avg       0.70      0.66      0.68       293
weighted avg       0.75      0.71      0.73       293



In [23]:
!python ../scripts/span_f1.py ../data/me_data/middle_eastern_test.iob2 baseline_preds/me_test_pred.iob2

recall:    0.7123287671232876
precision: 0.7482014388489209
slot-f1:   0.7298245614035087

unlabeled
ul_recall:    0.8767123287671232
ul_precision: 0.920863309352518
ul_slot-f1:   0.8982456140350877

loose (partial overlap with same label)
l_recall:    0.7328767123287672
l_precision: 0.7733812949640287
l_slot-f1:   0.7525844019890081
