# **Administration interne et pilotage (SG, RH, achats, systèmes)**

In [8]:
import json
from collections import Counter, defaultdict

In [24]:
PATH = "dataset_domaine1_admin_interne.jsonl"

# Read the JSONL file: every non-blank line is parsed as one JSON record.
rows = []
with open(PATH, "r", encoding="utf-8") as f:
    rows.extend(json.loads(text) for text in map(str.strip, f) if text)

print(f"Total lignes: {len(rows)}")

# Distinct cluster identifiers
clusters = set(row["cluster_id"] for row in rows)
print(f"Clusters uniques: {len(clusters)}")

# Row counts per domain / pair_type / anchor_type
by_domain = Counter(row["domain"] for row in rows)
by_pair_type = Counter(row["pair_type"] for row in rows)
by_anchor_type = Counter(row["anchor_type"] for row in rows)

# Each section prints its title, then one "count  label" line per value,
# most frequent first.
for title, counts in (
    ("\n--- Lignes par domaine ---", by_domain),
    ("\n--- Lignes par pair_type ---", by_pair_type),
    ("\n--- Lignes par anchor_type ---", by_anchor_type),
):
    print(title)
    for label, count in counts.most_common():
        print(f"{count:5d}  {label}")

# Check: a cluster is standard when the set of its pair types is exactly
# def/context/qa (the comparison is on the set of types, not on the row count).
required_pair_types = {"def", "context", "qa"}
cluster_pairs = defaultdict(set)
for row in rows:
    cluster_pairs[row["cluster_id"]].add(row["pair_type"])

incomplete = {}
for cluster_id, pair_types in cluster_pairs.items():
    if pair_types != required_pair_types:
        incomplete[cluster_id] = sorted(pair_types)

print("\n--- Clusters incomplets / non standards ---")
if incomplete:
    for cluster_id, pair_types in sorted(incomplete.items()):
        print(f"{cluster_id}: {pair_types}")
else:
    print("OK: tous les clusters ont def/context/qa")

# Exact duplicates: rows sharing cluster_id, pair_type, anchor and positive.
# The first occurrence is remembered; every later occurrence is recorded.
seen = set()
dups = []
for row in rows:
    key = tuple(row.get(field) for field in ("cluster_id", "pair_type", "anchor", "positive"))
    if key not in seen:
        seen.add(key)
        continue
    dups.append(key)

print("\n--- Doublons exacts ---")
print(f"{len(dups)} doublon(s)")
examples = dups[:5]
if examples:
    print("Exemples:", examples)

Total lignes: 750
Clusters uniques: 250

--- Lignes par domaine ---
  750  Administration interne et pilotage (SG, RH, achats, systèmes)

--- Lignes par pair_type ---
  250  def
  250  context
  250  qa

--- Lignes par anchor_type ---
  375  acronym
  375  term

--- Clusters incomplets / non standards ---
OK: tous les clusters ont def/context/qa

--- Doublons exacts ---
0 doublon(s)


# **Juridique public et commande publique (DAJ, marchés publics, contentieux)**

In [None]:
PATH = "dataset_domaine2_juridique.jsonl"

# Read the JSONL file: every non-blank line is parsed as one JSON record.
rows = []
with open(PATH, "r", encoding="utf-8") as f:
    rows.extend(json.loads(text) for text in map(str.strip, f) if text)

print(f"Total lignes: {len(rows)}")

# Distinct cluster identifiers
clusters = set(row["cluster_id"] for row in rows)
print(f"Clusters uniques: {len(clusters)}")

# Row counts per domain / pair_type / anchor_type
by_domain = Counter(row["domain"] for row in rows)
by_pair_type = Counter(row["pair_type"] for row in rows)
by_anchor_type = Counter(row["anchor_type"] for row in rows)

# Each section prints its title, then one "count  label" line per value,
# most frequent first.
for title, counts in (
    ("\n--- Lignes par domaine ---", by_domain),
    ("\n--- Lignes par pair_type ---", by_pair_type),
    ("\n--- Lignes par anchor_type ---", by_anchor_type),
):
    print(title)
    for label, count in counts.most_common():
        print(f"{count:5d}  {label}")

# Check: a cluster is standard when the set of its pair types is exactly
# def/context/qa (the comparison is on the set of types, not on the row count).
required_pair_types = {"def", "context", "qa"}
cluster_pairs = defaultdict(set)
for row in rows:
    cluster_pairs[row["cluster_id"]].add(row["pair_type"])

incomplete = {}
for cluster_id, pair_types in cluster_pairs.items():
    if pair_types != required_pair_types:
        incomplete[cluster_id] = sorted(pair_types)

print("\n--- Clusters incomplets / non standards ---")
if incomplete:
    for cluster_id, pair_types in sorted(incomplete.items()):
        print(f"{cluster_id}: {pair_types}")
else:
    print("OK: tous les clusters ont def/context/qa")

# Exact duplicates: rows sharing cluster_id, pair_type, anchor and positive.
# The first occurrence is remembered; every later occurrence is recorded.
seen = set()
dups = []
for row in rows:
    key = tuple(row.get(field) for field in ("cluster_id", "pair_type", "anchor", "positive"))
    if key not in seen:
        seen.add(key)
        continue
    dups.append(key)

print("\n--- Doublons exacts ---")
print(f"{len(dups)} doublon(s)")
examples = dups[:5]
if examples:
    print("Exemples:", examples)

Total lignes: 750
Clusters uniques: 250

--- Lignes par domaine ---
  750  Juridique public et commande publique (DAJ, marchés publics, contentieux)

--- Lignes par pair_type ---
  250  def
  250  context
  250  qa

--- Lignes par anchor_type ---
  375  acronym
  375  term

--- Clusters incomplets / non standards ---
OK: tous les clusters ont def/context/qa

--- Doublons exacts ---
0 doublon(s)


# **Fiscalité et recouvrement (DGFiP, DLF, contrôle, procédures)**

In [None]:
PATH = "dataset_domaine3_fiscalite.jsonl"

# Read the JSONL file: every non-blank line is parsed as one JSON record.
rows = []
with open(PATH, "r", encoding="utf-8") as f:
    rows.extend(json.loads(text) for text in map(str.strip, f) if text)

print(f"Total lignes: {len(rows)}")

# Distinct cluster identifiers
clusters = set(row["cluster_id"] for row in rows)
print(f"Clusters uniques: {len(clusters)}")

# Row counts per domain / pair_type / anchor_type
by_domain = Counter(row["domain"] for row in rows)
by_pair_type = Counter(row["pair_type"] for row in rows)
by_anchor_type = Counter(row["anchor_type"] for row in rows)

# Each section prints its title, then one "count  label" line per value,
# most frequent first.
for title, counts in (
    ("\n--- Lignes par domaine ---", by_domain),
    ("\n--- Lignes par pair_type ---", by_pair_type),
    ("\n--- Lignes par anchor_type ---", by_anchor_type),
):
    print(title)
    for label, count in counts.most_common():
        print(f"{count:5d}  {label}")

# Check: a cluster is standard when the set of its pair types is exactly
# def/context/qa (the comparison is on the set of types, not on the row count).
required_pair_types = {"def", "context", "qa"}
cluster_pairs = defaultdict(set)
for row in rows:
    cluster_pairs[row["cluster_id"]].add(row["pair_type"])

incomplete = {}
for cluster_id, pair_types in cluster_pairs.items():
    if pair_types != required_pair_types:
        incomplete[cluster_id] = sorted(pair_types)

print("\n--- Clusters incomplets / non standards ---")
if incomplete:
    for cluster_id, pair_types in sorted(incomplete.items()):
        print(f"{cluster_id}: {pair_types}")
else:
    print("OK: tous les clusters ont def/context/qa")

# Exact duplicates: rows sharing cluster_id, pair_type, anchor and positive.
# The first occurrence is remembered; every later occurrence is recorded.
seen = set()
dups = []
for row in rows:
    key = tuple(row.get(field) for field in ("cluster_id", "pair_type", "anchor", "positive"))
    if key not in seen:
        seen.add(key)
        continue
    dups.append(key)

print("\n--- Doublons exacts ---")
print(f"{len(dups)} doublon(s)")
examples = dups[:5]
if examples:
    print("Exemples:", examples)

Total lignes: 750
Clusters uniques: 250

--- Lignes par domaine ---
  750  Fiscalité et recouvrement (DGFiP, DLF, contrôle, procédures)

--- Lignes par pair_type ---
  250  def
  250  context
  250  qa

--- Lignes par anchor_type ---
  375  acronym
  375  term

--- Clusters incomplets / non standards ---
OK: tous les clusters ont def/context/qa

--- Doublons exacts ---
0 doublon(s)


# **Budget, comptabilité publique et exécution (DB, AE/CP, Chorus)**

In [None]:
PATH = "dataset_domaine4_budget.jsonl"

# Read the JSONL file: every non-blank line is parsed as one JSON record.
rows = []
with open(PATH, "r", encoding="utf-8") as f:
    rows.extend(json.loads(text) for text in map(str.strip, f) if text)

print(f"Total lignes: {len(rows)}")

# Distinct cluster identifiers
clusters = set(row["cluster_id"] for row in rows)
print(f"Clusters uniques: {len(clusters)}")

# Row counts per domain / pair_type / anchor_type
by_domain = Counter(row["domain"] for row in rows)
by_pair_type = Counter(row["pair_type"] for row in rows)
by_anchor_type = Counter(row["anchor_type"] for row in rows)

# Each section prints its title, then one "count  label" line per value,
# most frequent first.
for title, counts in (
    ("\n--- Lignes par domaine ---", by_domain),
    ("\n--- Lignes par pair_type ---", by_pair_type),
    ("\n--- Lignes par anchor_type ---", by_anchor_type),
):
    print(title)
    for label, count in counts.most_common():
        print(f"{count:5d}  {label}")

# Check: a cluster is standard when the set of its pair types is exactly
# def/context/qa (the comparison is on the set of types, not on the row count).
required_pair_types = {"def", "context", "qa"}
cluster_pairs = defaultdict(set)
for row in rows:
    cluster_pairs[row["cluster_id"]].add(row["pair_type"])

incomplete = {}
for cluster_id, pair_types in cluster_pairs.items():
    if pair_types != required_pair_types:
        incomplete[cluster_id] = sorted(pair_types)

print("\n--- Clusters incomplets / non standards ---")
if incomplete:
    for cluster_id, pair_types in sorted(incomplete.items()):
        print(f"{cluster_id}: {pair_types}")
else:
    print("OK: tous les clusters ont def/context/qa")

# Exact duplicates: rows sharing cluster_id, pair_type, anchor and positive.
# The first occurrence is remembered; every later occurrence is recorded.
seen = set()
dups = []
for row in rows:
    key = tuple(row.get(field) for field in ("cluster_id", "pair_type", "anchor", "positive"))
    if key not in seen:
        seen.add(key)
        continue
    dups.append(key)

print("\n--- Doublons exacts ---")
print(f"{len(dups)} doublon(s)")
examples = dups[:5]
if examples:
    print("Exemples:", examples)

Total lignes: 750
Clusters uniques: 250

--- Lignes par domaine ---
  750  Budget, comptabilité publique et exécution (DB, AE/CP, Chorus)

--- Lignes par pair_type ---
  250  def
  250  context
  250  qa

--- Lignes par anchor_type ---
  375  acronym
  375  term

--- Clusters incomplets / non standards ---
OK: tous les clusters ont def/context/qa

--- Doublons exacts ---
0 doublon(s)


# **Trésor, dette, financement et douanes/anti-fraude (DG_Trésor, AFT, DGDDI, Tracfin)**

In [None]:
PATH = "dataset_domaine5_tresor_douanes.jsonl"

# Read the JSONL file: every non-blank line is parsed as one JSON record.
rows = []
with open(PATH, "r", encoding="utf-8") as f:
    rows.extend(json.loads(text) for text in map(str.strip, f) if text)

print(f"Total lignes: {len(rows)}")

# Distinct cluster identifiers
clusters = set(row["cluster_id"] for row in rows)
print(f"Clusters uniques: {len(clusters)}")

# Row counts per domain / pair_type / anchor_type
by_domain = Counter(row["domain"] for row in rows)
by_pair_type = Counter(row["pair_type"] for row in rows)
by_anchor_type = Counter(row["anchor_type"] for row in rows)

# Each section prints its title, then one "count  label" line per value,
# most frequent first.
for title, counts in (
    ("\n--- Lignes par domaine ---", by_domain),
    ("\n--- Lignes par pair_type ---", by_pair_type),
    ("\n--- Lignes par anchor_type ---", by_anchor_type),
):
    print(title)
    for label, count in counts.most_common():
        print(f"{count:5d}  {label}")

# Check: a cluster is standard when the set of its pair types is exactly
# def/context/qa (the comparison is on the set of types, not on the row count).
required_pair_types = {"def", "context", "qa"}
cluster_pairs = defaultdict(set)
for row in rows:
    cluster_pairs[row["cluster_id"]].add(row["pair_type"])

incomplete = {}
for cluster_id, pair_types in cluster_pairs.items():
    if pair_types != required_pair_types:
        incomplete[cluster_id] = sorted(pair_types)

print("\n--- Clusters incomplets / non standards ---")
if incomplete:
    for cluster_id, pair_types in sorted(incomplete.items()):
        print(f"{cluster_id}: {pair_types}")
else:
    print("OK: tous les clusters ont def/context/qa")

# Exact duplicates: rows sharing cluster_id, pair_type, anchor and positive.
# The first occurrence is remembered; every later occurrence is recorded.
seen = set()
dups = []
for row in rows:
    key = tuple(row.get(field) for field in ("cluster_id", "pair_type", "anchor", "positive"))
    if key not in seen:
        seen.add(key)
        continue
    dups.append(key)

print("\n--- Doublons exacts ---")
print(f"{len(dups)} doublon(s)")
examples = dups[:5]
if examples:
    print("Exemples:", examples)

Total lignes: 750
Clusters uniques: 250

--- Lignes par domaine ---
  750  Trésor, dette, financement et douanes/anti-fraude (DG_Trésor, AFT, DGDDI, Tracfin)

--- Lignes par pair_type ---
  250  def
  250  context
  250  qa

--- Lignes par anchor_type ---
  375  acronym
  375  term

--- Clusters incomplets / non standards ---
OK: tous les clusters ont def/context/qa

--- Doublons exacts ---
0 doublon(s)


# **Dataset Baconnier (domaine administratif multi-termes)**

In [21]:
PATH = "dataset_domaine6_administratif_v3.jsonl"

# Read the JSONL file: every non-blank line is parsed as one JSON record.
rows = []
with open(PATH, "r", encoding="utf-8") as f:
    rows.extend(json.loads(text) for text in map(str.strip, f) if text)

print(f"Total lignes: {len(rows)}")

# Distinct cluster identifiers
clusters = set(row["cluster_id"] for row in rows)
print(f"Clusters uniques: {len(clusters)}")

# Row counts per domain / pair_type / anchor_type
by_domain = Counter(row["domain"] for row in rows)
by_pair_type = Counter(row["pair_type"] for row in rows)
by_anchor_type = Counter(row["anchor_type"] for row in rows)

# Each section prints its title, then one "count  label" line per value,
# most frequent first.
for title, counts in (
    ("\n--- Lignes par domaine ---", by_domain),
    ("\n--- Lignes par pair_type ---", by_pair_type),
    ("\n--- Lignes par anchor_type ---", by_anchor_type),
):
    print(title)
    for label, count in counts.most_common():
        print(f"{count:5d}  {label}")

# Check: a cluster is standard when the set of its pair types is exactly
# def/context/qa (the comparison is on the set of types, not on the row count).
required_pair_types = {"def", "context", "qa"}
cluster_pairs = defaultdict(set)
for row in rows:
    cluster_pairs[row["cluster_id"]].add(row["pair_type"])

incomplete = {}
for cluster_id, pair_types in cluster_pairs.items():
    if pair_types != required_pair_types:
        incomplete[cluster_id] = sorted(pair_types)

print("\n--- Clusters incomplets / non standards ---")
if incomplete:
    for cluster_id, pair_types in sorted(incomplete.items()):
        print(f"{cluster_id}: {pair_types}")
else:
    print("OK: tous les clusters ont def/context/qa")

# Exact duplicates: rows sharing cluster_id, pair_type, anchor and positive.
# The first occurrence is remembered; every later occurrence is recorded.
seen = set()
dups = []
for row in rows:
    key = tuple(row.get(field) for field in ("cluster_id", "pair_type", "anchor", "positive"))
    if key not in seen:
        seen.add(key)
        continue
    dups.append(key)

print("\n--- Doublons exacts ---")
print(f"{len(dups)} doublon(s)")
examples = dups[:5]
if examples:
    print("Exemples:", examples)

Total lignes: 750
Clusters uniques: 250

--- Lignes par domaine ---
  750  Administratif & juridique (dataset Baconnier)

--- Lignes par pair_type ---
  250  def
  250  context
  250  qa

--- Lignes par anchor_type ---
  375  acronym
  375  term

--- Clusters incomplets / non standards ---
OK: tous les clusters ont def/context/qa

--- Doublons exacts ---
0 doublon(s)


# **Dataset final : Tous les domaines**

In [25]:
PATH = "Dataset_Bercy_4k_lines.jsonl"

# Read the JSONL file: every non-blank line is parsed as one JSON record.
rows = []
with open(PATH, "r", encoding="utf-8") as f:
    rows.extend(json.loads(text) for text in map(str.strip, f) if text)

print(f"Total lignes: {len(rows)}")

# Distinct cluster identifiers
clusters = set(row["cluster_id"] for row in rows)
print(f"Clusters uniques: {len(clusters)}")

# Row counts per domain / pair_type / anchor_type
by_domain = Counter(row["domain"] for row in rows)
by_pair_type = Counter(row["pair_type"] for row in rows)
by_anchor_type = Counter(row["anchor_type"] for row in rows)

# Each section prints its title, then one "count  label" line per value,
# most frequent first.
for title, counts in (
    ("\n--- Lignes par domaine ---", by_domain),
    ("\n--- Lignes par pair_type ---", by_pair_type),
    ("\n--- Lignes par anchor_type ---", by_anchor_type),
):
    print(title)
    for label, count in counts.most_common():
        print(f"{count:5d}  {label}")

# Check: a cluster is standard when the set of its pair types is exactly
# def/context/qa (the comparison is on the set of types, not on the row count).
required_pair_types = {"def", "context", "qa"}
cluster_pairs = defaultdict(set)
for row in rows:
    cluster_pairs[row["cluster_id"]].add(row["pair_type"])

incomplete = {}
for cluster_id, pair_types in cluster_pairs.items():
    if pair_types != required_pair_types:
        incomplete[cluster_id] = sorted(pair_types)

print("\n--- Clusters incomplets / non standards ---")
if incomplete:
    for cluster_id, pair_types in sorted(incomplete.items()):
        print(f"{cluster_id}: {pair_types}")
else:
    print("OK: tous les clusters ont def/context/qa")

# Exact duplicates: rows sharing cluster_id, pair_type, anchor and positive.
# The first occurrence is remembered; every later occurrence is recorded.
seen = set()
dups = []
for row in rows:
    key = tuple(row.get(field) for field in ("cluster_id", "pair_type", "anchor", "positive"))
    if key not in seen:
        seen.add(key)
        continue
    dups.append(key)

print("\n--- Doublons exacts ---")
print(f"{len(dups)} doublon(s)")
examples = dups[:5]
if examples:
    print("Exemples:", examples)

Total lignes: 4500
Clusters uniques: 1500

--- Lignes par domaine ---
  750  Administration interne et pilotage (SG, RH, achats, systèmes)
  750  Juridique public et commande publique (DAJ, marchés publics, contentieux)
  750  Fiscalité et recouvrement (DGFiP, DLF, contrôle, procédures)
  750  Budget, comptabilité publique et exécution (DB, AE/CP, Chorus)
  750  Trésor, dette, financement et douanes/anti-fraude (DG_Trésor, AFT, DGDDI, Tracfin)
  750  Administratif & juridique (dataset Baconnier)

--- Lignes par pair_type ---
 1500  def
 1500  context
 1500  qa

--- Lignes par anchor_type ---
 2250  acronym
 2250  term

--- Clusters incomplets / non standards ---
OK: tous les clusters ont def/context/qa

--- Doublons exacts ---
0 doublon(s)


# **Split JSONL 90 (train) / 10 (test)**

In [26]:
import random
from pathlib import Path

In [27]:
# CONFIG
INPUT_PATH = Path("Dataset_Bercy_4k_lines.jsonl")
OUT_TRAIN = Path("bercy_train_90.jsonl")
OUT_TEST  = Path("bercy_test_10.jsonl")

TEST_RATIO = 0.10
SEED = 42
random.seed(SEED)  # seed the global RNG so the cluster shuffles below are reproducible

# 1) Load rows
# Malformed lines are still skipped (best effort), but they are counted and
# reported so a damaged input file cannot silently shrink the dataset.
rows = []
bad_line_numbers = []
with INPUT_PATH.open("r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            rows.append(json.loads(line))
        except json.JSONDecodeError:
            bad_line_numbers.append(line_no)

print(f"Loaded rows: {len(rows)}")
if bad_line_numbers:
    print(
        f"WARNING: skipped {len(bad_line_numbers)} malformed JSON line(s); "
        f"first line numbers: {bad_line_numbers[:5]}"
    )

# 2) Group by cluster_id
# Rows with a missing/empty cluster_id cannot be assigned to a cluster; they
# are left out of both splits, and that is reported instead of hidden.
by_cluster = defaultdict(list)
rows_without_cluster = 0
for r in rows:
    cid = r.get("cluster_id")
    if cid:
        by_cluster[cid].append(r)
    else:
        rows_without_cluster += 1

cluster_ids = list(by_cluster.keys())
print(f"Unique cluster_id: {len(cluster_ids)}")
if rows_without_cluster:
    print(
        f"WARNING: {rows_without_cluster} row(s) have no cluster_id "
        "and are excluded from both splits"
    )

# 3) Build cluster metadata (domain + anchor_type) and stratify
# Assume each cluster has consistent domain + anchor_type; when it does not,
# the most frequent value among the cluster's rows is used.
cluster_meta = {}
strata = defaultdict(list)  # (domain, anchor_type) -> [cluster_id,...]
cluster_sizes = {}          # cluster_id -> nb_rows
unstratified_clusters = []  # clusters that can never be drawn into the test set

for cid, items in by_cluster.items():
    row_domains = [it.get("domain") for it in items if it.get("domain")]
    row_anchor_types = [it.get("anchor_type") for it in items if it.get("anchor_type")]
    if not row_domains or not row_anchor_types:
        unstratified_clusters.append(cid)
        continue

    dom = Counter(row_domains).most_common(1)[0][0]
    at  = Counter(row_anchor_types).most_common(1)[0][0]

    cluster_meta[cid] = (dom, at)
    strata[(dom, at)].append(cid)
    cluster_sizes[cid] = len(items)
    if at not in ("acronym", "term"):
        # Only the "acronym" and "term" strata are sampled further down.
        unstratified_clusters.append(cid)

if unstratified_clusters:
    print(
        f"WARNING: {len(unstratified_clusters)} cluster(s) lack a domain/anchor_type "
        "or use an anchor_type other than acronym/term; they stay in train"
    )

# Target test size in lines
target_test_lines = int(len(rows) * TEST_RATIO)
print(f"Target test lines (approx): {target_test_lines}")

# Compute target per domain, then split 50/50 by anchor_type inside domain
domains = sorted({d for (d, a) in strata.keys()})
n_domains = len(domains)

# Ideal: equal domain share in test lines
target_per_domain_lines = {d: target_test_lines // n_domains for d in domains}
# Distribute remainder: one extra line for each of the first `rem` domains
rem = target_test_lines - sum(target_per_domain_lines.values())
for d in domains[:rem]:
    target_per_domain_lines[d] += 1

# For each domain, 50% acronym / 50% term (in LINES, approx)
def pick_clusters_for_target(candidates, target_lines):
    """Randomly pick whole clusters until their line total reaches target_lines.

    Clusters are taken in shuffled order and a cluster is added whenever the
    running total is still below ``target_lines``, so the returned total can
    exceed the target by up to (size of the last picked cluster - 1) lines.

    Args:
        candidates: iterable of cluster ids, each a key of ``cluster_sizes``.
            The caller's sequence is not modified (a copy is shuffled).
        target_lines: number of lines to reach.

    Returns:
        (picked, total): the list of picked cluster ids and the sum of their
        sizes in lines. ``total`` is below ``target_lines`` only when the
        candidates are exhausted first.
    """
    pool = list(candidates)  # shuffle a copy so the stratum list itself is untouched
    random.shuffle(pool)
    picked = []
    total = 0
    for cid in pool:
        if total >= target_lines:
            break
        picked.append(cid)
        total += cluster_sizes[cid]
    return picked, total

test_clusters = set()
test_lines_count = 0

for d in domains:
    # candidates per anchor_type
    cand_acr = strata.get((d, "acronym"), [])
    cand_term = strata.get((d, "term"), [])

    dom_target = target_per_domain_lines[d]
    acr_target = dom_target // 2
    term_target = dom_target - acr_target  # "term" absorbs the odd line, if any

    picked_acr, acr_lines = pick_clusters_for_target(cand_acr, acr_target)
    picked_term, term_lines = pick_clusters_for_target(cand_term, term_target)

    test_clusters.update(picked_acr)
    test_clusters.update(picked_term)
    test_lines_count += (acr_lines + term_lines)

# 4) Build train/test rows
# Whole clusters go to one side only, so no cluster_id is shared between splits.
train_rows, test_rows = [], []
for cid, items in by_cluster.items():
    if cid in test_clusters:
        test_rows.extend(items)
    else:
        train_rows.extend(items)

# 5) Save
def write_jsonl(path: Path, data):
    """Write each item of ``data`` as one JSON line (UTF-8, non-ASCII kept) to ``path``."""
    with path.open("w", encoding="utf-8") as f:
        for r in data:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

write_jsonl(OUT_TRAIN, train_rows)
write_jsonl(OUT_TEST, test_rows)

print("Split done")
print(f"Train rows: {len(train_rows)} -> {OUT_TRAIN}")
print(f"Test  rows: {len(test_rows)} -> {OUT_TEST}")

# 6) Sanity checks
train_cids = {r["cluster_id"] for r in train_rows if "cluster_id" in r}
test_cids  = {r["cluster_id"] for r in test_rows if "cluster_id" in r}
overlap = train_cids.intersection(test_cids)
print(f"Overlap cluster_ids (should be 0): {len(overlap)}")

# Invariants of the split: no leakage, every clustered row lands in exactly
# one split, and the line total tracked while picking matches the test set.
assert not overlap, "some cluster_id values appear in both train and test"
assert len(train_rows) + len(test_rows) + rows_without_cluster == len(rows)
assert test_lines_count == len(test_rows)

# Domain distribution
train_dom = Counter(r.get("domain") for r in train_rows)
test_dom  = Counter(r.get("domain") for r in test_rows)
print("\nTrain domain counts:", dict(train_dom))
print("Test domain counts:", dict(test_dom))

# Anchor_type distribution
train_at = Counter(r.get("anchor_type") for r in train_rows)
test_at  = Counter(r.get("anchor_type") for r in test_rows)
print("\nTrain anchor_type counts:", dict(train_at))
print("Test anchor_type counts:", dict(test_at))

Loaded rows: 4500
Unique cluster_id: 1500
Target test lines (approx): 450
Split done
Train rows: 4032 -> bercy_train_90.jsonl
Test  rows: 468 -> bercy_test_10.jsonl
Overlap cluster_ids (should be 0): 0

Train domain counts: {'Administration interne et pilotage (SG, RH, achats, systèmes)': 672, 'Juridique public et commande publique (DAJ, marchés publics, contentieux)': 672, 'Fiscalité et recouvrement (DGFiP, DLF, contrôle, procédures)': 672, 'Budget, comptabilité publique et exécution (DB, AE/CP, Chorus)': 672, 'Trésor, dette, financement et douanes/anti-fraude (DG_Trésor, AFT, DGDDI, Tracfin)': 672, 'Administratif & juridique (dataset Baconnier)': 672}
Test domain counts: {'Administration interne et pilotage (SG, RH, achats, systèmes)': 78, 'Juridique public et commande publique (DAJ, marchés publics, contentieux)': 78, 'Fiscalité et recouvrement (DGFiP, DLF, contrôle, procédures)': 78, 'Budget, comptabilité publique et exécution (DB, AE/CP, Chorus)': 78, 'Trésor, dette, financement e