In [51]:
import os
import glob
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
)

In [52]:
# Directory searched for the result tables (or "output").
OUTPUT_DIR = "../results/"
# Field separator used when the tables are read.
SEP = "\t"

# Collect the simulated-span tables in alphabetical order.
# (The original author noted "spans_long_*" as an alternative file prefix.)
pattern = os.path.join(OUTPUT_DIR, "simulated_spans_*.tsv")
files = sorted(glob.glob(pattern))

print(f"{len(files)} fichiers trouvés")
for path in files:
    print(" -", Path(path).name)


2 fichiers trouvés
 - simulated_spans_falcon7b.tsv
 - simulated_spans_qwen3-32b.tsv


In [53]:
# Read every result table, tag each row with the name of the file it came
# from, then stack all tables into one frame with a fresh 0..n-1 index.
# NOTE(review): the scores displayed further down are identical for both
# models (including mean latency) -- confirm the input files are not copies
# of each other.
dfs = [
    pd.read_csv(path, sep=SEP).assign(source_file=Path(path).name)
    for path in files
]
df_all = pd.concat(dfs, ignore_index=True)

print("Shape globale :", df_all.shape)

Shape globale : (5160, 18)


In [54]:
# Work on a derived frame so df_all keeps the data exactly as loaded.
df = df_all.assign(
    # Binary prediction: was AT LEAST one span returned for this prompt / sentence?
    pred_annotation=(df_all["spans_count"] > 0).astype(int),
    # Non-empty span: the row carries a non-negative span index.
    has_span=df_all["span_index"] >= 0,
)

In [56]:
def eval_annotation(df_sub):
    """Score the binary annotation prediction for one group of rows.

    Compares the ``pred_annotation`` column against the ``gold_annotation``
    column of ``df_sub``.

    Parameters
    ----------
    df_sub : pandas.DataFrame
        Rows to evaluate; must contain ``gold_annotation`` and
        ``pred_annotation``.

    Returns
    -------
    pandas.Series
        Values labelled ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average="binary"``; a metric whose
        denominator is zero is reported as 0 (``zero_division=0``).
    """
    gold = df_sub["gold_annotation"]
    predicted = df_sub["pred_annotation"]

    # The fourth returned value (support) is not needed here.
    precision, recall, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average="binary", zero_division=0
    )
    accuracy = accuracy_score(gold, predicted)

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f_score,
    }
    return pd.Series(metrics)


# Score the binary "at least one span" prediction for every (model, prompt) pair.
# Only the two columns read by eval_annotation are handed to apply(), so the
# grouping columns are never part of the applied frame. Applying on the
# grouping columns is deprecated in pandas and is presumably what produced the
# warning fragment left in this cell's output.
# NOTE(review): the input looks like one row per span (long format); if so,
# these metrics are weighted by span count rather than computed once per
# sentence -- confirm this is intended.
annotation_scores = (
    df
    .groupby(["model", "prompt_name"])[["gold_annotation", "pred_annotation"]]
    .apply(eval_annotation)
    .reset_index()
)

# Prefix the metric columns so they stay unambiguous once merged with the
# other score tables.
annotation_scores_renamed = annotation_scores.rename(columns={
    "accuracy": "ann_accuracy",
    "precision": "ann_precision",
    "recall": "ann_recall",
    "f1": "ann_f1",
})


annotation_scores_renamed.sort_values("ann_f1", ascending=False)


  .apply(eval_annotation)


Unnamed: 0,model,prompt_name,ann_accuracy,ann_precision,ann_recall,ann_f1
2,falcon7b,span_detection_with_examples,0.569575,0.604076,0.816929,0.694561
5,qwen3-32b,span_detection_with_examples,0.569575,0.604076,0.816929,0.694561
0,falcon7b,negation_detection,0.552204,0.591667,0.822394,0.688207
3,qwen3-32b,negation_detection,0.552204,0.591667,0.822394,0.688207
1,falcon7b,span_detection,0.548276,0.58011,0.825147,0.681265
4,qwen3-32b,span_detection,0.548276,0.58011,0.825147,0.681265


In [57]:
# A prompt whose name contains one of these keywords is treated as a negation prompt.
NEGATION_PROMPT_KEYWORDS = ["negation"]

# Case-insensitive match on the prompt name; the keywords are joined into a
# single regex alternation. na=False makes a missing prompt_name count as
# "no match" instead of leaving NaN in the mask, which boolean indexing
# rejects (and it matches the na=False already used for span_text below).
mask_neg_prompt = df["prompt_name"].str.contains(
    "|".join(NEGATION_PROMPT_KEYWORDS), case=False, na=False
)

# Keep only the negation-prompt rows that are flagged by has_span.
df_neg = df[mask_neg_prompt & df["has_span"]].copy()

# A span is predicted as negated when its text contains NOT_ or NEG_.
# NOTE(review): every negation score displayed below is 0.0 -- check that
# span_text actually carries these markers.
df_neg["pred_negated"] = (
    df_neg["span_text"].str.contains("NOT_|NEG_", regex=True, na=False)
).astype(bool)


In [59]:
# Precision / recall / F1 of the negation prediction per (model, prompt) pair.
# Only the two columns that are read are passed to apply(), so the grouping
# columns are never part of the applied frame (applying on them is deprecated
# in pandas). Each group returns a labelled Series, so apply() builds the
# three metric columns directly.
neg_scores = (
    df_neg
    .groupby(["model", "prompt_name"])[["gold_negated", "pred_negated"]]
    .apply(
        lambda group: pd.Series(
            # [:3] drops the trailing support value.
            precision_recall_fscore_support(
                group["gold_negated"],
                group["pred_negated"],
                average="binary",
                zero_division=0,
            )[:3],
            index=["precision", "recall", "f1"],
        )
    )
    .reset_index()
)


# Prefix the metric columns so they stay unambiguous once merged with the
# other score tables.
neg_scores_renamed = neg_scores.rename(columns={
    "precision": "neg_precision",
    "recall": "neg_recall",
    "f1": "neg_f1",
})


neg_scores_renamed.sort_values("neg_f1", ascending=False)


  .apply(


Unnamed: 0,model,prompt_name,neg_precision,neg_recall,neg_f1
0,falcon7b,negation_detection,0.0,0.0,0.0
1,qwen3-32b,negation_detection,0.0,0.0,0.0


In [60]:
def levenshtein_distance(a: str, b: str) -> int:
    """
    Classic Levenshtein (edit) distance computed by dynamic programming.

    Both arguments are converted with ``str()`` before comparison. Only two
    rows of the dynamic-programming table are kept at any time, so memory use
    is proportional to the shorter string instead of ``len(a) * len(b)``.

    Parameters
    ----------
    a, b : str
        Strings to compare (any object is accepted and stringified).

    Returns
    -------
    int
        Minimum number of single-character insertions, deletions and
        substitutions needed to turn one string into the other.
    """
    a = str(a)
    b = str(b)

    if a == b:
        return 0

    # The distance is symmetric: keep the shorter string on the inner axis so
    # the two rows are as small as possible.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)

    # previous[j] is the distance between the processed prefix of a and b[:j].
    previous = list(range(len(b) + 1))

    for i, char_a in enumerate(a, start=1):
        current = [i]  # distance between a[:i] and the empty prefix of b
        for j, char_b in enumerate(b, start=1):
            cost = 0 if char_a == char_b else 1
            current.append(min(
                previous[j] + 1,         # deletion
                current[j - 1] + 1,      # insertion
                previous[j - 1] + cost,  # substitution
            ))
        previous = current

    return previous[-1]


def levenshtein_similarity(a: str, b: str) -> float:
    """
    Normalised Levenshtein similarity between 0 and 1.

    Both inputs are stripped and lower-cased before comparison. A missing
    value (``None`` or a float NaN) is treated as empty text instead of being
    compared as the literal string "none" / "nan".

    Parameters
    ----------
    a, b : str
        Texts to compare (other objects are stringified).

    Returns
    -------
    float
        1.0 when both texts are empty, 0.0 when exactly one is empty,
        otherwise ``1 - distance / max(len(a), len(b))``.
    """
    def _clean(value) -> str:
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value).strip().lower()

    a = _clean(a)
    b = _clean(b)

    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0

    dist = levenshtein_distance(a, b)
    return 1.0 - dist / max(len(a), len(b))


In [61]:
# Rows that have a gold span text and are flagged by has_span.
df_span = df[df["gold_span_text"].notna() & df["has_span"]].copy()

# Similarity between the gold text and the predicted text of each row.
# The two columns are iterated directly instead of DataFrame.apply(axis=1):
# on an empty frame apply() returns a DataFrame rather than a Series, which
# breaks the column assignment, and it builds a Series for every row.
df_span["lev_sim"] = [
    levenshtein_similarity(gold_text, predicted_text)
    for gold_text, predicted_text in zip(
        df_span["gold_span_text"], df_span["span_text"]
    )
]

df_span[[
    "gold_span_text",
    "span_text",
    "lev_sim"
]].head()


Unnamed: 0,gold_span_text,span_text,lev_sim
3,en période néo-natale notamment pas de notion ...,fever,0.035088
9,trouble du spectre autistique,fever,0.103448
10,trouble du spectre autistique,respiratory distress,0.275862
11,trouble du spectre autistique,respiratory distress,0.275862
12,trouble du spectre autistique,fever,0.103448


In [62]:
# Mean, median and 75th percentile of the similarity for every
# (model, prompt) pair, highest mean first.
lev_scores = (
    df_span
    .groupby(["model", "prompt_name"])["lev_sim"]
    .agg(
        lev_sim_mean="mean",
        lev_sim_median="median",
        lev_sim_p75=lambda sims: sims.quantile(0.75),
    )
    .reset_index()
    .sort_values("lev_sim_mean", ascending=False)
)

# Shorten the "lev_sim_" prefix to "lev_"; the other columns are unaffected.
lev_scores_renamed = lev_scores.rename(
    columns=lambda name: name.replace("lev_sim_", "lev_")
)


lev_scores_renamed


Unnamed: 0,model,prompt_name,lev_mean,lev_median,lev_p75
1,falcon7b,span_detection,0.140016,0.129331,0.18007
4,qwen3-32b,span_detection,0.140016,0.129331,0.18007
2,falcon7b,span_detection_with_examples,0.131801,0.127273,0.170892
5,qwen3-32b,span_detection_with_examples,0.131801,0.127273,0.170892
0,falcon7b,negation_detection,0.131121,0.125,0.166667
3,qwen3-32b,negation_detection,0.131121,0.125,0.166667


In [63]:
# Descriptive statistics per (model, prompt) pair, computed over all rows of df.
group_keys = ["model", "prompt_name"]
overview = (
    df
    .groupby(group_keys)
    .agg(
        n_sentences=("sentence", "nunique"),
        mean_spans=("spans_count", "mean"),
        pct_with_span=("pred_annotation", "mean"),
        mean_latency_s=("latency_s", "mean"),
    )
    .reset_index()
    .sort_values(group_keys)
)

# Attach each score table to the overview. Left joins keep every
# (model, prompt) pair even when a table has no row for it, which shows up as
# NaN in that table's columns.
summary = overview
for score_table in (annotation_scores_renamed, lev_scores_renamed, neg_scores_renamed):
    summary = summary.merge(score_table, on=group_keys, how="left")

# Display order of the final table; names absent from summary are skipped.
cols = [
    "model", "prompt_name",
    "n_sentences", "pct_with_span", "mean_spans", "mean_latency_s",
    "ann_accuracy", "ann_precision", "ann_recall", "ann_f1",
    "lev_mean", "lev_median", "lev_p75",
    "neg_precision", "neg_recall", "neg_f1",
]
available = [name for name in cols if name in summary.columns]
summary = summary[available]
summary



Unnamed: 0,model,prompt_name,n_sentences,pct_with_span,mean_spans,mean_latency_s,ann_accuracy,ann_precision,ann_recall,ann_f1,lev_mean,lev_median,lev_p75,neg_precision,neg_recall,neg_f1
0,falcon7b,negation_detection,442,0.835267,1.955916,0.903788,0.552204,0.591667,0.822394,0.688207,0.131121,0.125,0.166667,0.0,0.0,0.0
1,falcon7b,span_detection,442,0.832184,1.972414,0.893844,0.548276,0.58011,0.825147,0.681265,0.140016,0.129331,0.18007,,,
2,falcon7b,span_detection_with_examples,442,0.810142,1.885613,0.891175,0.569575,0.604076,0.816929,0.694561,0.131801,0.127273,0.170892,,,
3,qwen3-32b,negation_detection,442,0.835267,1.955916,0.903788,0.552204,0.591667,0.822394,0.688207,0.131121,0.125,0.166667,0.0,0.0,0.0
4,qwen3-32b,span_detection,442,0.832184,1.972414,0.893844,0.548276,0.58011,0.825147,0.681265,0.140016,0.129331,0.18007,,,
5,qwen3-32b,span_detection_with_examples,442,0.810142,1.885613,0.891175,0.569575,0.604076,0.816929,0.694561,0.131801,0.127273,0.170892,,,
