In [10]:
# Felicia: necessary import setup — make the project root importable so `scripts` can be found
import sys
import os

# Absolute path of the directory above the notebook; change this if scripts/ lives elsewhere.
project_root = os.path.abspath("..")
# Put it at the front of the import search path, but only when it is not already listed,
# so re-running this cell never adds a duplicate entry.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [11]:
# imports
import sys

import os
import sys
# Make the directory above the notebook importable so the `scripts` package resolves.
# Guarded: an unconditional append adds another copy of the same directory every time
# this cell is re-run (and whenever an earlier cell has already registered it).
if os.path.abspath("../") not in sys.path:
    sys.path.append(os.path.abspath("../"))

from scripts.load_data import label_mapping, read_tsv_file, read_iob2_file
from scripts.evaluation import extract_true_and_pred_labels
from seqeval.metrics import classification_report

## Computing metrics for predictions on the non-augmented and augmented test sets

In [12]:
# paths to data
# Two test sets are evaluated, each with a gold file (.tsv) and a prediction file (.iob2):
#   * path_test / path_test_pred       -> the da_news test set
#   * path_me_test / path_me_test_pred -> the "middle_eastern" test set
# The gold files are addressed relative to the parent directory ("../data/..."), while the
# prediction files are addressed relative to the notebook's own directory ("baseline_preds/...").
# NOTE(review): the directory name suggests these are baseline-model predictions — confirm.
path_test = "../data/no_overlap_da_news/da_news_test.tsv"
path_test_pred = "baseline_preds/test_pred.iob2"
path_me_test = "../data/me_data/middle_eastern_test.tsv"
path_me_test_pred = "baseline_preds/me_test_pred.iob2"

In [13]:
# Build the label -> id and id -> label lookup tables from the test file,
# then show both dictionaries (one per line).
label2id, id2label = label_mapping(path_test)
print(label2id, id2label, sep="\n")

{'I-PER': 0, 'B-LOC': 1, 'I-ORG': 2, 'B-ORG': 3, 'O': 4, 'B-PER': 5, 'I-MISC': 6, 'I-LOC': 7, 'B-MISC': 8}
{0: 'I-PER', 1: 'B-LOC', 2: 'I-ORG', 3: 'B-ORG', 4: 'O', 5: 'B-PER', 6: 'I-MISC', 7: 'I-LOC', 8: 'B-MISC'}


In [14]:
# Load gold annotations (TSV) and predictions (IOB2), one test set per statement.
# Within each tuple the gold file is read first and the prediction file second.
test_data, test_pred = (
    read_tsv_file(path_test, label2id),
    read_iob2_file(path_test_pred, label2id),
)
me_test_data, me_test_pred = (
    read_tsv_file(path_me_test, label2id),
    read_iob2_file(path_me_test_pred, label2id),
)

In [15]:
# Collect the gold and predicted label sequences for both test sets:
# first pair -> da_news test set, second pair -> Middle Eastern test set.
(true_labels, pred_labels), (me_true_labels, me_pred_labels) = (
    extract_true_and_pred_labels(test_data, test_pred),
    extract_true_and_pred_labels(me_test_data, me_test_pred),
)

## Non-augmented test set

In [16]:
# Per-label precision / recall / F1 for the non-augmented test set,
# printed under a header line.
print("\nClassification Report:", classification_report(true_labels, pred_labels), sep="\n")


Classification Report:
              precision    recall  f1-score   support

         LOC       0.78      0.81      0.80        58
        MISC       0.67      0.51      0.58        35
         ORG       0.79      0.61      0.69        82
         PER       0.97      0.95      0.96       118

   micro avg       0.85      0.77      0.81       293
   macro avg       0.80      0.72      0.76       293
weighted avg       0.85      0.77      0.81       293



In [17]:
# Run the project's span_f1.py script on the non-augmented test set
# (first argument: the .iob2 test file, second argument: the prediction file).
# It reports strict, unlabeled and loose (partial-overlap) recall / precision / slot-F1.
!python ../scripts/span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 baseline_preds/test_pred.iob2

recall:    0.7773972602739726
precision: 0.8664122137404581
slot-f1:   0.8194945848375452

unlabeled
ul_recall:    0.8458904109589042
ul_precision: 0.9427480916030534
ul_slot-f1:   0.8916967509025271

loose (partial overlap with same label)
l_recall:    0.7876712328767124
l_precision: 0.8816793893129771
l_slot-f1:   0.8320283137058382


## Augmented test set

In [18]:
# Per-label precision / recall / F1 for the augmented (Middle Eastern) test set,
# printed under a header line.
print("\nClassification Report:", classification_report(me_true_labels, me_pred_labels), sep="\n")


Classification Report:


              precision    recall  f1-score   support

         LOC       0.77      0.81      0.79        58
        MISC       0.67      0.51      0.58        35
         ORG       0.69      0.46      0.55        82
         PER       0.80      0.90      0.84       118

   micro avg       0.76      0.71      0.73       293
   macro avg       0.73      0.67      0.69       293
weighted avg       0.75      0.71      0.72       293



In [19]:
# Run the project's span_f1.py script on the Middle Eastern test set
# (first argument: the .iob2 test file, second argument: the prediction file).
# It reports strict, unlabeled and loose (partial-overlap) recall / precision / slot-F1.
!python ../scripts/span_f1.py ../data/me_data/middle_eastern_test.iob2 baseline_preds/me_test_pred.iob2

recall:    0.7191780821917808
precision: 0.7749077490774908
slot-f1:   0.7460035523978686

unlabeled
ul_recall:    0.8767123287671232
ul_precision: 0.9446494464944649
ul_slot-f1:   0.9094138543516873

loose (partial overlap with same label)
l_recall:    0.7294520547945206
l_precision: 0.7896678966789668
l_slot-f1:   0.7583665388358802


# Statistical significance 

## Test whether the paired sentence-level F1 differences are normally distributed

In [30]:
from scipy.stats import wilcoxon
from scipy.stats import shapiro
from scipy.stats import normaltest
from seqeval.metrics import f1_score
import numpy as np

In [34]:
# sentence-level F1 scores
# One F1 value per sentence and per test set, used as paired samples by the
# significance tests further down.
#
# zip() stops silently at the shortest input, so it cannot guarantee that the same
# sentences are being compared; check the alignment explicitly before pairing.
if not (len(true_labels) == len(pred_labels) == len(me_true_labels) == len(me_pred_labels)):
    raise ValueError(
        "Sentence counts differ, so sentence-level scores cannot be paired: "
        f"true={len(true_labels)}, pred={len(pred_labels)}, "
        f"me_true={len(me_true_labels)}, me_pred={len(me_pred_labels)}"
    )

# Each prediction is scored against the gold labels of its OWN test set, matching the
# corpus-level classification reports (the Middle Eastern predictions were previously
# scored against the non-augmented gold labels).
# NOTE(review): zero_division=0 presumably makes sentences with an undefined score
# (e.g. no entities at all) count as 0.0, which lowers the mean — confirm this is intended.
f1_baseline = [
    f1_score([gold], [pred], zero_division=0)
    for gold, pred in zip(true_labels, pred_labels)
]
f1_me = [
    f1_score([gold], [pred], zero_division=0)
    for gold, pred in zip(me_true_labels, me_pred_labels)
]

In [36]:
# Element-wise paired differences: baseline F1 minus Middle Eastern F1, per sentence.
diffs = np.subtract(f1_baseline, f1_me)

# Shapiro-Wilk normality test on those differences; only the p-value is reported.
stat, p = shapiro(diffs)
print(f"Shapiro-Wilk p-value for normality: {p}")

Shapiro-Wilk p-value for normality: 1.694830695554986e-39


In [37]:
# Second normality check on the paired differences, this time with scipy.stats.normaltest.
# `diffs` is the array computed in the Shapiro-Wilk cell, so that cell must run first.
# `stat` and `p` are reused here and overwrite the Shapiro-Wilk results.
stat, p = normaltest(diffs)
print(f"D’Agostino and Pearson test p-value: {p}")

D’Agostino and Pearson test p-value: 8.36553482381241e-36


## Wilcoxon signed-rank test:

In [39]:
# Wilcoxon signed-rank test
# Paired, non-parametric comparison of the per-sentence F1 scores (baseline vs.
# Middle Eastern test set); both normality checks on the differences returned
# p-values far below any usual threshold.
# NOTE(review): with SciPy's default zero handling, sentence pairs whose difference is
# exactly 0 are presumably left out of the ranking — confirm against the wilcoxon docs.
stat, p = wilcoxon(f1_baseline, f1_me)

print("\nStatistical Significance Test:")
print(f"Wilcoxon signed-rank test statistic = {stat:.4f}, p-value = {p:.4f}")

# average F1s and difference
# These are means over sentence-level scores, so they are not comparable to the
# corpus-level F1 values in the classification reports.
print(f"Mean F1 - Baseline: {np.mean(f1_baseline):.4f}")
print(f"Mean F1 - Middle Eastern: {np.mean(f1_me):.4f}")
print(f"Mean Difference: {np.mean(np.array(f1_baseline) - np.array(f1_me)):.4f}")



Statistical Significance Test:
Wilcoxon signed-rank test statistic = 458.5000, p-value = 0.0280
Mean F1 - Baseline: 0.3115
Mean F1 - Middle Eastern: 0.2844
Mean Difference: 0.0271
