In [1]:
# Necessary import setup: put the parent directory on sys.path so that
# project packages located there can be imported from this notebook.
import os
import sys

project_root = os.path.abspath("..")  # or adjust to wherever scripts/ lives
already_importable = project_root in sys.path
if not already_importable:
    # Insert at the front so this location takes precedence during import lookup.
    sys.path.insert(0, project_root)

In [2]:
# imports
import os
import sys

# Make the parent directory importable so the `scripts` package resolves.
# The membership check keeps the cell idempotent: re-running it (or running it
# after another cell already registered the same directory) no longer appends
# duplicate entries to sys.path.
_scripts_parent = os.path.abspath("../")
if _scripts_parent not in sys.path:
    sys.path.append(_scripts_parent)

from scripts.load_data import label_mapping, read_tsv_file, read_iob2_file
from scripts.evaluation import extract_true_and_pred_labels
from seqeval.metrics import classification_report

## Computing metrics for predictions on the non-augmented and augmented test sets

In [3]:
# Paths to data, grouped per test set as (annotated .tsv file, prediction .iob2 file).

# da_news test set
path_test, path_test_pred = (
    "../data/no_overlap_da_news/da_news_test.tsv",
    "baseline_preds/test_pred.iob2",
)

# middle_eastern ("me") test set
path_me_test, path_me_test_pred = (
    "../data/me_data/middle_eastern_test.tsv",
    "baseline_preds/me_test_pred.iob2",
)

In [4]:
# Build the label -> id and id -> label lookup tables from the test file,
# then show both so the encoding used below is visible in the notebook.
label2id, id2label = label_mapping(path_test)

for mapping in (label2id, id2label):
    print(mapping)

{'I-MISC': 0, 'I-PER': 1, 'O': 2, 'I-LOC': 3, 'B-PER': 4, 'I-ORG': 5, 'B-LOC': 6, 'B-MISC': 7, 'B-ORG': 8}
{0: 'I-MISC', 1: 'I-PER', 2: 'O', 3: 'I-LOC', 4: 'B-PER', 5: 'I-ORG', 6: 'B-LOC', 7: 'B-MISC', 8: 'B-ORG'}


In [5]:
# Read the annotated .tsv files and the .iob2 prediction files for both test sets.
# label2id is handed to every reader (presumably to encode tags as ids --
# confirm in scripts/load_data).
test_data, test_pred = (
    read_tsv_file(path_test, label2id),
    read_iob2_file(path_test_pred, label2id),
)
me_test_data, me_test_pred = (
    read_tsv_file(path_me_test, label2id),
    read_iob2_file(path_me_test_pred, label2id),
)

In [6]:
# Collect the true and predicted labels for each test set; these feed the
# classification reports and the significance test further down.
(true_labels, pred_labels), (me_true_labels, me_pred_labels) = (
    extract_true_and_pred_labels(test_data, test_pred),
    extract_true_and_pred_labels(me_test_data, me_test_pred),
)

## Non-augmented test set

In [7]:
# Classification report (precision / recall / F1 / support per entity type)
# for the non-augmented test set.
print("\nClassification Report:")
baseline_report = classification_report(true_labels, pred_labels)
print(baseline_report)


Classification Report:
              precision    recall  f1-score   support

         LOC       0.79      0.78      0.78        58
        MISC       0.60      0.60      0.60        35
         ORG       0.80      0.68      0.74        82
         PER       0.92      0.92      0.92       118

   micro avg       0.82      0.79      0.80       293
   macro avg       0.78      0.75      0.76       293
weighted avg       0.82      0.79      0.80       293



In [8]:
# Run the project's span_f1.py script as a shell command on the da_news test
# .iob2 file and the baseline prediction file (presumably gold first, then
# predictions -- confirm against the script). It prints recall / precision /
# slot-F1 in three variants: default, unlabeled, and loose.
!python ../scripts/span_f1.py ../data/no_overlap_da_news/da_news_test.iob2 baseline_preds/test_pred.iob2

recall:    0.7876712328767124
precision: 0.8273381294964028
slot-f1:   0.8070175438596491

unlabeled
ul_recall:    0.8664383561643836
ul_precision: 0.9100719424460432
ul_slot-f1:   0.8877192982456141

loose (partial overlap with same label)
l_recall:    0.8047945205479452
l_precision: 0.8453237410071942
l_slot-f1:   0.824561403508772


## Augmented test set

In [9]:
# Classification report (precision / recall / F1 / support per entity type)
# for the augmented (middle_eastern) test set.
print("\nClassification Report:")
me_report = classification_report(me_true_labels, me_pred_labels)
print(me_report)


Classification Report:
              precision    recall  f1-score   support

         LOC       0.78      0.78      0.78        58
        MISC       0.66      0.60      0.63        35
         ORG       0.70      0.54      0.61        82
         PER       0.77      0.90      0.83       118

   micro avg       0.74      0.74      0.74       293
   macro avg       0.72      0.70      0.71       293
weighted avg       0.74      0.74      0.73       293



In [10]:
# Run the project's span_f1.py script as a shell command on the middle_eastern
# test .iob2 file and its prediction file (presumably gold first, then
# predictions -- confirm against the script). It prints recall / precision /
# slot-F1 in three variants: default, unlabeled, and loose.
!python ../scripts/span_f1.py ../data/me_data/middle_eastern_test.iob2 baseline_preds/me_test_pred.iob2

recall:    0.7397260273972602
precision: 0.75
slot-f1:   0.7448275862068966

unlabeled
ul_recall:    0.8938356164383562
ul_precision: 0.90625
ul_slot-f1:   0.9000000000000001

loose (partial overlap with same label)
l_recall:    0.7671232876712328
l_precision: 0.7777777777777778
l_slot-f1:   0.7724137931034483


# Statistical significance 

## Test whether the paired sentence-level F1 differences are normally distributed

In [11]:
from scipy.stats import wilcoxon
from scipy.stats import shapiro
from scipy.stats import normaltest
from seqeval.metrics import f1_score
import numpy as np

In [12]:
# Sentence-level F1 scores for the two test sets, paired by sentence position.
#
# Each prediction sequence is scored against the true labels of its OWN test
# set: baseline predictions against `true_labels`, middle_eastern predictions
# against `me_true_labels`. (Previously both were scored against `true_labels`,
# which is only correct if the two label sets happen to be identical.)
#
# zip() stops silently at the shortest input, so it cannot by itself guarantee
# that the same sentences are being compared; check the lengths explicitly.
assert (
    len(true_labels) == len(pred_labels) == len(me_true_labels) == len(me_pred_labels)
), "test sets / predictions differ in number of sentences; pairing would be misaligned"

# zero_division=0: a sentence whose F1 is undefined is scored as 0 rather than
# triggering a warning.
# NOTE(review): sentences without any entities presumably end up with F1 = 0
# this way, which pulls the mean sentence-level F1 down -- confirm this is intended.
f1_baseline = [
    f1_score([gold], [predicted], zero_division=0)
    for gold, predicted in zip(true_labels, pred_labels)
]
f1_me = [
    f1_score([gold], [predicted], zero_division=0)
    for gold, predicted in zip(me_true_labels, me_pred_labels)
]

In [13]:
# Per-sentence F1 differences (baseline minus middle_eastern), followed by a
# Shapiro-Wilk test of whether those differences look normally distributed.
baseline_scores = np.array(f1_baseline)
me_scores = np.array(f1_me)
diffs = baseline_scores - me_scores

shapiro_result = shapiro(diffs)
stat, p = shapiro_result
print(f"Shapiro-Wilk p-value for normality: {p}")

Shapiro-Wilk p-value for normality: 4.9185356264299756e-39


In [14]:
# Second normality check on the same differences (D'Agostino and Pearson test).
normality_result = normaltest(diffs)
stat, p = normality_result
print(f"D’Agostino and Pearson test p-value: {p}")

D’Agostino and Pearson test p-value: 2.8345455491278046e-37


## Wilcoxon signed-rank test:

In [15]:
# Wilcoxon signed-rank test on the paired sentence-level F1 scores.
wilcoxon_result = wilcoxon(f1_baseline, f1_me)
stat, p = wilcoxon_result

print("\nStatistical Significance Test:")
print(f"Wilcoxon signed-rank test statistic = {stat:.4f}, p-value = {p:.4f}")

# Summary statistics: mean F1 per test set and the mean of the paired differences.
mean_f1_baseline = np.mean(f1_baseline)
mean_f1_me = np.mean(f1_me)
paired_differences = np.array(f1_baseline) - np.array(f1_me)
mean_difference = np.mean(paired_differences)

print(f"Mean F1 - Baseline: {mean_f1_baseline:.4f}")
print(f"Mean F1 - Middle Eastern: {mean_f1_me:.4f}")
print(f"Mean Difference: {mean_difference:.4f}")



Statistical Significance Test:
Wilcoxon signed-rank test statistic = 538.5000, p-value = 0.0281
Mean F1 - Baseline: 0.3152
Mean F1 - Middle Eastern: 0.2905
Mean Difference: 0.0247
