# Token-based evaluation

In [1]:
from datasets import load_dataset
import pandas as pd
import spacy
from tqdm import tqdm

from utils import filter_outer_ene_spans, evaluate_multilabel_token_dataset

In [2]:
# Load the GeoEDdA dataset (identifier "GEODE/GeoEDdA") via the `datasets` library.
dataset = load_dataset("GEODE/GeoEDdA")

# Only the test split is evaluated in this notebook; wrap it in a DataFrame so the
# rows (text, meta, tokens, spans) can be iterated one by one further down.
test_split = dataset['test']
df = pd.DataFrame(test_split)

# Preview the first rows as a sanity check on the loaded columns.
df.head()

Unnamed: 0,text,meta,tokens,spans
0,"COMPIEGNE, (Géog. mod.) ville de France, dans ...","{'volume': 3, 'head': 'COMPIEGNE', 'author': '...","[{'text': 'COMPIEGNE', 'start': 0, 'end': 9, '...","[{'text': 'COMPIEGNE', 'start': 0, 'end': 9, '..."
1,"HILPERHAUSEN, (Géog.) ville d'Allemagne en Fra...","{'volume': 8, 'head': 'HILPERHAUSEN', 'author'...","[{'text': 'HILPERHAUSEN', 'start': 0, 'end': 1...","[{'text': 'HILPERHAUSEN', 'start': 0, 'end': 1..."
2,"Patane ou Patany, (Géog. mod.) royaume des Ind...","{'volume': 12, 'head': 'Patane ou Patany', 'au...","[{'text': 'Patane', 'start': 0, 'end': 6, 'id'...","[{'text': 'Patane ou Patany', 'start': 0, 'end..."
3,"* ABYDE ou ABYDOS, sub. Ville maritime de Phry...","{'volume': 1, 'head': 'ABYDE ou ABYDOS', 'auth...","[{'text': '*', 'start': 0, 'end': 1, 'id': 0, ...","[{'text': 'ABYDE ou ABYDOS', 'start': 2, 'end'..."
4,"DUSSELDORP, (Géog. mod.) ville du cercle de We...","{'volume': 5, 'head': 'DUSSELDORP', 'author': ...","[{'text': 'DUSSELDORP', 'start': 0, 'end': 10,...","[{'text': 'DUSSELDORP', 'start': 0, 'end': 10,..."


In [3]:
# Load the spaCy pipeline whose predictions are evaluated below.
# NOTE(review): judging by its name this is a custom French span-categorizer package
# for EDdA; it must be installed in the environment for spacy.load to find it — confirm.
nlp = spacy.load("fr_spacy_custom_spancat_edda")

In [4]:
# Parallel lists: gold_docs[i] and pred_docs[i] hold the (filtered) gold and
# predicted spans for the i-th row of `df`.
gold_docs = []
pred_docs = []

# `df.iterrows()` is a generator, so tqdm cannot infer its length by itself;
# passing `total` gives a real progress bar (percentage + ETA) instead of a bare counter.
for _, row in tqdm(df.iterrows(), total=len(df)):
  # NOTE(review): filter_outer_ene_spans comes from utils; presumably it drops the
  # outer ENE spans — confirm. The same filter is applied to gold and predictions.
  gold_docs.append(filter_outer_ene_spans(row['spans']))

  doc = nlp(row['text'])
  # Convert the spaCy spans stored under the 'sc' key into plain dicts with the
  # same fields as the gold annotations (character offsets, token offsets, label, text).
  spans = [
      {
          "start": span.start_char,
          "end": span.end_char,
          "token_start": span.start,      # index of the first token of the span
          "token_end": span.end - 1,      # spaCy's span.end is exclusive; store the last token index
          "label": span.label_,
          "text": span.text
      }
      for span in doc.spans['sc']
  ]
  pred_docs.append(filter_outer_ene_spans(spans))


200it [00:03, 61.91it/s]


In [5]:
# Compute token-level scores from the gold and predicted spans gathered above.
# The result is indexed below by 'per_label', 'macro_avg' and 'micro_avg'.
metrics = evaluate_multilabel_token_dataset(gold_docs, pred_docs)

# One line per label with its score dict.
print("=== Scores par label ===")
per_label_scores = metrics['per_label']
for label, scores in per_label_scores.items():
    print(f"{label}: {scores}")

# The two aggregate averages share the same layout: a heading, then the scores.
averaged_sections = (
    ("\n=== Moyenne macro ===", 'macro_avg'),
    ("\n=== Moyenne micro ===", 'micro_avg'),
)
for heading, metric_key in averaged_sections:
    print(heading)
    print(metrics[metric_key])


=== Scores par label ===
Domain-mark: {'precision': 0.997, 'recall': 0.921, 'f1': 0.958}
ENE-Misc: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
ENE-Person: {'precision': 0.963, 'recall': 0.794, 'f1': 0.871}
ENE-Spatial: {'precision': 0.937, 'recall': 0.931, 'f1': 0.934}
Head: {'precision': 1.0, 'recall': 0.291, 'f1': 0.451}
Latlong: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
NC-Person: {'precision': 0.926, 'recall': 0.667, 'f1': 0.775}
NC-Spatial: {'precision': 0.96, 'recall': 0.921, 'f1': 0.94}
NP-Misc: {'precision': 0.925, 'recall': 0.56, 'f1': 0.698}
NP-Person: {'precision': 0.938, 'recall': 0.901, 'f1': 0.92}
NP-Spatial: {'precision': 0.931, 'recall': 0.939, 'f1': 0.935}
Relation: {'precision': 0.963, 'recall': 0.343, 'f1': 0.506}

=== Moyenne macro ===
{'precision': 0.795, 'recall': 0.606, 'f1': 0.666}

=== Moyenne micro ===
{'precision': 0.949, 'recall': 0.644, 'f1': 0.768}
