# ap-ina-bugada-prototype

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Step 1 — Preparing BUGADA/JIRA cards (CLEAN, anti-leakage)
- Robust loading (JSON / JSON dict / JSONL)
- Card normalisation (Bugzilla, Jira, generic fallback)
- desc_blob extraction (description; falls back to the 1st comment when the description is empty)
- Leakage detection (post-hoc status/resolution) + reasons  [QC only]
- QC: missingness.csv, duplicates.csv
- "Paper" figures + copies -> paper_assets/
- dataset_stats.json + enriched DATACARD.md (provenance/licence)
- feature_allowlist.txt (anti-leakage allowlist)
"""

import os, json, sys, csv, hashlib, shutil
from collections import Counter, defaultdict
from datetime import datetime, timezone

# ============ SETTINGS (⚠️ adjust these paths to your environment) ============
# INPUT_PATH: raw bug dump that gets converted into cards.
# OUT_DIR:    directory receiving the JSONL, QC files, figures and datacard.
INPUT_PATH = "/content/drive/MyDrive/bugdata/bugs.json"
OUT_DIR    = "/content/drive/MyDrive/bugada_cards_clean"
LIMIT      = None  # None = no limit on the number of cards written
# ===========================================================

# Provenance metadata copied into dataset_stats.json / DATACARD.md
# (edit these if the data does not come from BMO)
DATA_SOURCE_NAME = "Bugzilla@Mozilla (BMO)"
DATA_SOURCE_URL  = "https://bugzilla.mozilla.org"
DATA_ACCESS_DATE = "2025-11-05"
DATA_LICENSE     = "Mozilla Websites & Communications Terms of Use (voir mentions sur le site)"
DATA_LICENSE_URL = "https://www.mozilla.org/en-US/about/legal/terms/mozilla/"

# ---------------- Utils ----------------
def now_iso():
    """Return the current UTC time as an ISO-8601 string with second precision."""
    current = datetime.now(tz=timezone.utc)
    # timespec="seconds" drops the microseconds from the rendered string.
    return current.isoformat(timespec="seconds")

def ensure_dir(p):
    """Create directory ``p`` (with missing parents); a no-op if it already exists."""
    os.makedirs(name=p, exist_ok=True)

def safe_str(x, default=""):
    """Convert ``x`` to ``str``; return ``default`` for ``None`` or when ``str()`` fails."""
    if x is None:
        return default
    try:
        text = str(x)
    except Exception:
        # str() can raise on objects with a broken __str__; fall back to the default.
        text = default
    return text

def sha256_bytes(b: bytes) -> str:
    """Return the SHA-256 hex digest of ``b`` prefixed with ``"sha256:"``."""
    hasher = hashlib.sha256()
    hasher.update(b)
    return f"sha256:{hasher.hexdigest()}"

def sha256_file(path: str) -> str:
    """Return the ``"sha256:<hex>"`` digest of the file at ``path``.

    The file is hashed in fixed-size chunks so that large bug dumps do not
    have to be loaded into memory in one piece. The result has the same
    format as ``sha256_bytes``.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hasher = hashlib.sha256()
    with open(path, "rb") as f:
        # iter(callable, sentinel) keeps reading 1 MiB blocks until EOF (b"").
        for chunk in iter(lambda: f.read(1 << 20), b""):
            hasher.update(chunk)
    return "sha256:" + hasher.hexdigest()

def sha256_obj(obj) -> str:
    """Hash a JSON-serialisable object via a canonical JSON encoding.

    Keys are sorted and separators are compact, so two dicts with the same
    content produce the same digest regardless of key order.
    """
    canonical = json.dumps(
        obj,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    payload = canonical.encode("utf-8")
    return sha256_bytes(payload)

# ------------- Lecture robuste -------------
def read_json_any(path):
    """Load bug records from a JSON list, a JSON dict wrapper, or a JSONL file.

    Resolution order:
      1. The whole file is parsed as one JSON document.
         - dict: the first list found under "bugs", "issues", "data" or
           "items" is returned; otherwise the dict itself is returned as a
           one-element list.
         - list: returned as is.
      2. Otherwise each non-empty line is parsed as its own JSON value
         (JSONL). A line shaped like ``{"bug": {...}}`` is unwrapped to the
         inner dict. Unparsable lines are skipped and counted.

    The file is decoded with ``utf-8-sig`` so that a leading UTF-8 BOM does not
    make every parse attempt fail.

    Args:
        path: path of the input file.

    Returns:
        A list of decoded records (possibly empty).

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8-sig") as f:
        txt = f.read()

    # Whole-file JSON (list/dict)
    try:
        data = json.loads(txt)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        print("[LOAD] Pas un gros JSON, on tente JSONL…")
    else:
        if isinstance(data, dict):
            for key in ("bugs", "issues", "data", "items"):
                if key in data and isinstance(data[key], list):
                    print(f"[LOAD] JSON dict avec liste '{key}' -> {len(data[key])} éléments")
                    return data[key]
            print("[LOAD] JSON dict sans clé-liste standard -> 1 élément (dict)")
            return [data]
        if isinstance(data, list):
            print(f"[LOAD] JSON liste -> {len(data)} éléments")
            return data
        # A top-level scalar falls through to the line-by-line reader below.

    # JSONL
    items = []
    skipped = 0
    for line in txt.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except ValueError:
            skipped += 1
            continue
        if isinstance(obj, dict) and isinstance(obj.get("bug"), dict):
            obj = obj["bug"]
        items.append(obj)
    print(f"[LOAD] JSONL -> {len(items)} lignes valides")
    if skipped:
        print(f"[WARN] JSONL -> {skipped} ligne(s) illisible(s) ignorée(s)")
    return items

# -------- Détection systèmes --------
def is_jira(bug):
    """Heuristically decide whether ``bug`` looks like a Jira issue.

    A record qualifies when it has a "key", a dict under "fields", or a
    "project" together with "components"/"Component".
    """
    has_key = "key" in bug
    has_fields_dict = "fields" in bug and isinstance(bug["fields"], dict)
    has_project_and_component = "project" in bug and (
        "components" in bug or "Component" in bug
    )
    return has_key or has_fields_dict or has_project_and_component

def is_bugzilla(bug):
    """Heuristically decide whether ``bug`` looks like a Bugzilla record.

    Requires an identifier key ("id" or "bug_id") and a title-like key
    ("summary", "short_desc" or "title").
    """
    has_identifier = any(name in bug for name in ("id", "bug_id"))
    has_title = any(name in bug for name in ("summary", "short_desc", "title"))
    return has_identifier and has_title

# -------- Normalisation / util --------
def norm_severity_generic(s):
    """Map a free-form severity/priority label onto high / medium / low / unknown.

    Known aliases are folded (e.g. "critical" -> "high", "minor" -> "low",
    "normal" -> "medium", "--" -> "unknown"). The Jira default priority
    names "Highest" and "Lowest" are folded too, because ``jira_to_card``
    passes priority names through this function. Unrecognised labels are
    returned lower-cased and stripped.

    Args:
        s: the raw label; ``None``/empty means "no severity". Non-string
           values are converted with ``str()`` instead of raising.

    Returns:
        The normalised label (never empty).
    """
    # Falsy input (None, "", 0) counts as "no value", as before.
    label = str(s).strip().lower() if s else ""
    aliases = {
        "critical": "high", "blocker": "high", "major": "high", "highest": "high",
        "minor": "low", "trivial": "low", "lowest": "low",
        "normal": "medium",
        "--": "unknown", "": "unknown", "none": "unknown", "n/a": "unknown",
    }
    return aliases.get(label, label or "unknown")

def norm_severity_bugzilla(s):
    """Normalise a Bugzilla severity, handling the S1..S5 scale first.

    Labels outside the S-scale are delegated to ``norm_severity_generic``.
    """
    label = (s or "").strip().lower()
    s_scale = {
        "s1": "high",
        "s2": "high",
        "s3": "medium",
        "s4": "low",
        "s5": "low",
    }
    if label in s_scale:
        return s_scale[label]
    return norm_severity_generic(label)

def compute_desc_len(summary, long_text, comments_len_chars=0):
    """Return len(summary) + len(long_text) + comments_len_chars (in characters)."""
    total = len(safe_str(summary))
    total += len(safe_str(long_text))
    total += int(comments_len_chars)
    return total

# Hints (déterministes; UNIQUEMENT texte)
# Substrings that mark a ticket as a release regression.
REGRESSION_HINTS = [
    "regression",
    "after update",
    "after upgrade",
    "after release",
    "after deploy",
    "since version",
    "introduced in",
    "since build",
    "since release",
]
# Substrings pointing at infrastructure / performance trouble.
INFRA_HINTS = [
    "timeout",
    "latency",
    "slow",
    "slowdown",
    "performance",
    "5xx",
    "server error",
    "network",
    "intermittent",
    "connection reset",
    "502",
    "503",
    "bad gateway",
]
# Substrings suggesting the report lacks information.
INSUFF_HINTS = [
    "cannot reproduce",
    "need more info",
    "needinfo",
    "missing steps",
    "incomplete",
    "unconfirmed",
    "no steps",
]
# Resolution-like wording mapped to the "falsepositive" keyword.
BUGZILLA_NEG = {
    "worksforme",
    "works as intended",
    "notabug",
    "invalid",
    "works for me",
    "duplicate",
    "moved",
}
# Resolution-like wording mapped to the "wontfix" keyword.
BUGZILLA_BSE = {
    "wontfix",
    "by design",
    "policy decision",
}

def extract_keywords_from_text(*texts):
    """Derive keyword tags from free TEXT only (never from status/resolution).

    All truthy arguments are stringified, lower-cased and joined; tags are
    added on plain substring matches against the hint lists. Infra hints
    contribute the first word of the matching hint (e.g. "server error"
    -> "server"). Returns the tags as a sorted list.
    """
    blob = " ".join(safe_str(piece).lower() for piece in texts if piece)
    found = set()

    if any(hint in blob for hint in REGRESSION_HINTS):
        found.add("regression")

    # One tag per matching infra hint, named after the hint's first word.
    found.update(hint.split()[0] for hint in INFRA_HINTS if hint in blob)

    if any(hint in blob for hint in INSUFF_HINTS):
        found.add("needinfo")
    if any(marker in blob for marker in BUGZILLA_NEG):
        found.add("falsepositive")
    if any(marker in blob for marker in BUGZILLA_BSE):
        found.add("wontfix")

    return sorted(found)

def has_security_indicator(*texts) -> bool:
    """Return True when any security-related term occurs in the given texts.

    Matching is a case-insensitive substring test over the joined, truthy
    arguments.
    """
    haystack = " ".join(safe_str(piece).lower() for piece in texts if piece)
    for marker in ("security", "vulnerability", "xss", "csrf", "sql injection"):
        if marker in haystack:
            return True
    return False

# -------- Extracteurs desc_blob --------
def _first_nonempty(*candidates):
    """Return the first candidate that is non-empty once stringified and stripped.

    Returns "" when every candidate is empty or None.
    """
    stripped = (safe_str(item).strip() for item in candidates)
    return next((text for text in stripped if text), "")

def _extract_desc_from_bugzilla(bug):
    """Extract the description text and total comment length of a Bugzilla bug.

    The description is the first non-empty value among "description",
    "desc", "longdesc", "long_desc", "raw_text", "text" and "details".
    When none is set, the text of the first comment is used instead.

    Comments may be given as a list, or as a dict wrapping the list under
    "comments" or "data". Each comment's text is read from "text",
    "raw_text" or "body". Entries that are plain strings are taken as the
    comment text; any other non-dict entry counts as empty instead of
    raising (an exception here would make the caller drop the whole card).

    Returns:
        (desc_blob, comments_len_chars): the chosen text and the summed
        character length of all comment texts.
    """
    def _comment_text(entry):
        # Tolerate malformed comment entries rather than failing the card.
        if isinstance(entry, dict):
            return safe_str(entry.get("text") or entry.get("raw_text") or entry.get("body"))
        if isinstance(entry, str):
            return entry
        return ""

    base = _first_nonempty(
        bug.get("description"), bug.get("desc"), bug.get("longdesc"),
        bug.get("long_desc"), bug.get("raw_text"), bug.get("text"), bug.get("details"),
    )

    comments = bug.get("comments")
    if isinstance(comments, dict):
        # Wrapper form: {"comments": [...]} or {"data": [...]}
        comments = comments.get("comments") or comments.get("data") or []
    if not isinstance(comments, list):
        comments = []

    texts = [_comment_text(entry) for entry in comments]
    comments_len_chars = sum(len(text) for text in texts)
    first_comment = texts[0] if texts else ""

    desc_blob = base if base else first_comment
    return desc_blob, comments_len_chars

def _extract_desc_from_jira(bug):
    """Extract the description text and total comment length of a Jira issue.

    The description comes from ``fields["description"]`` or the top-level
    "description". Comments are read from ``fields["comment"]["comments"]``
    (each comment's "body"). When there is no description, the body of the
    first comment is used instead.

    Comment entries that are plain strings are taken as the body; any other
    non-dict entry counts as empty instead of raising (an exception here
    would make the caller drop the whole card).

    Returns:
        (desc_blob, comments_len_chars)
    """
    def _body(entry):
        if isinstance(entry, dict):
            return safe_str(entry.get("body"))
        if isinstance(entry, str):
            return entry
        return ""

    fields = bug.get("fields") if isinstance(bug.get("fields"), dict) else {}
    base = safe_str(fields.get("description") or bug.get("description") or "")

    comment_block = fields.get("comment")
    entries = comment_block.get("comments") if isinstance(comment_block, dict) else None
    if not isinstance(entries, list):
        entries = []

    bodies = [_body(entry) for entry in entries]
    comments_len_chars = sum(len(body) for body in bodies)
    if not base and bodies:
        base = bodies[0]
    return base, comments_len_chars

# -------- Cartes --------
# Lower-cased status names treated as "closed" (i.e. post-hoc information).
CLOSED_STATUSES = {"resolved","closed","verified","done","fixed"}

def _leakage_from_status_resolution(status, resolution):
    """QC helper: flag a card whose status/resolution would leak the outcome.

    A reason is recorded when the status is one of CLOSED_STATUSES
    (case-insensitive) and/or when the resolution is non-blank. These
    fields are reported for QC only.

    Returns:
        (flag, reasons): ``flag`` is True when at least one reason exists.
    """
    reasons = []
    if safe_str(status).strip().lower() in CLOSED_STATUSES:
        reasons.append(f"status={status}")
    if safe_str(resolution).strip():
        reasons.append(f"resolution={resolution}")
    return bool(reasons), reasons

def jira_to_card(bug):
    """Convert a Jira-shaped issue dict into a normalised card dict.

    Values are read from ``bug["fields"]`` when that is a dict, with
    top-level keys of ``bug`` as fallbacks. Keywords and the security flag
    are derived from the summary and description text only; status and
    resolution are carried along for QC (leakage reporting).

    Returns:
        dict: the card, with ``source_system`` set to "jira".
    """
    # Use the nested "fields" object only when it really is a dict.
    fields = bug.get("fields") if isinstance(bug.get("fields"), dict) else {}

    key = safe_str(bug.get("key") or bug.get("id") or bug.get("ticket_id") or "")
    # project: prefer fields.project.key, else the top-level "project", else "BUGS"
    if "project" in fields and isinstance(fields["project"], dict) and fields["project"].get("key"):
        project = fields["project"]["key"]
    else:
        project = safe_str(bug.get("project") or "BUGS")

    # component: first entry of fields.components, else top-level component, else "General"
    component = None
    if "components" in fields and isinstance(fields["components"], list) and fields["components"]:
        c0 = fields["components"][0]
        component = c0.get("name") if isinstance(c0, dict) else str(c0)
    if not component:
        component = bug.get("component") or bug.get("Component") or "General"
        if isinstance(component, list) and component:
            component = component[0]
    component = safe_str(component)

    # created
    # NOTE(review): a missing creation date is replaced by the current time, so
    # `created_at` never shows up as missing in the QC files — confirm this is intended.
    created = fields.get("created") or bug.get("created") or bug.get("created_at") or now_iso()
    created = safe_str(created)

    # status / severity / resolution
    status = ""
    if "status" in fields and isinstance(fields["status"], dict):
        status = fields["status"].get("name") or ""
    status = status or bug.get("status") or bug.get("status_current") or "unknown"

    # The Jira priority name is used as the severity source.
    sev = ""
    if "priority" in fields and isinstance(fields["priority"], dict):
        sev = fields["priority"].get("name") or ""
    sev = sev or bug.get("severity") or "unknown"
    severity = norm_severity_generic(sev)

    # The top-level "resolution" is consulted only when fields.resolution is not a dict.
    resolution = ""
    if isinstance(fields.get("resolution"), dict):
        resolution = fields["resolution"].get("name") or ""
    else:
        resolution = safe_str(bug.get("resolution") or "")

    # text
    summary = fields.get("summary") or bug.get("summary") or bug.get("title") or ""
    desc_blob, comments_len_chars = _extract_desc_from_jira(bug)
    desc_len = compute_desc_len(summary, desc_blob, comments_len_chars)

    # DERIVED VALUES — ***TEXT ONLY***
    keywords = extract_keywords_from_text(summary, desc_blob)
    security_flag = has_security_indicator(summary, desc_blob)
    leakage_flag, leakage_reasons = _leakage_from_status_resolution(status, resolution)

    # ids
    ticket_id = bug.get("id") or bug.get("ticket_id")
    ticket_id = safe_str(ticket_id) if ticket_id is not None else None
    # Hash of the raw input record; used downstream to spot exact duplicates.
    raw_hash = sha256_obj(bug)

    return {
        "ticket_id": ticket_id,
        # Fallback key is "<project>-<ticket_id>" when an id exists, else "BUGS-AUTO".
        "key": key or (project + "-" + safe_str(ticket_id) if ticket_id else "BUGS-AUTO"),
        "created_at": created,
        "project": project or "BUGS",
        "component": component or "General",
        "severity": severity,
        "status_current": safe_str(status),
        "title_len": len(safe_str(summary)),
        "summary_len": len(safe_str(summary)),
        "text_len": len(safe_str(desc_blob)),
        "desc_len": int(desc_len),
        "summary_text": safe_str(summary),
        "desc_blob": safe_str(desc_blob),
        "keywords": keywords,               # OK (text only)
        "security_flag": bool(security_flag),
        "resolution": safe_str(resolution), # QC only
        "leakage_flag": bool(leakage_flag), # QC only
        "leakage_reasons": leakage_reasons, # QC only
        "comments_len": int(comments_len_chars),
        "recent_incidents_1h": 0,
        "source_system": "jira",
        "raw_sha256": raw_hash
    }

def bugzilla_to_card(bug):
    """Convert a Bugzilla-shaped bug dict into a normalised card dict.

    Keywords and the security flag are derived from the summary and
    description text only; status and resolution are carried along for QC
    (leakage reporting).

    Returns:
        dict: the card, with ``source_system`` set to "bugzilla".
    """
    bid  = bug.get("id") or bug.get("bug_id") or bug.get("ticket_id")
    # Bugzilla's "product" plays the role of the project; "BUGS" when absent.
    project = safe_str(bug.get("product") or bug.get("project") or "BUGS")

    # component: first element when a list is given, "General" when absent
    comp = bug.get("component")
    if isinstance(comp, list) and comp:
        comp = comp[0]
    comp = safe_str(comp or "General")

    # created
    # NOTE(review): a missing creation date is replaced by the current time, so
    # `created_at` never shows up as missing in the QC files — confirm this is intended.
    created = bug.get("creation_time") or bug.get("created") or bug.get("created_at") or now_iso()
    created = safe_str(created)

    # text
    summary = bug.get("summary") or bug.get("short_desc") or bug.get("title") or ""
    desc_blob, comments_len_chars = _extract_desc_from_bugzilla(bug)
    desc_len = compute_desc_len(summary, desc_blob, comments_len_chars)

    # severity / status / resolution
    severity   = norm_severity_bugzilla(bug.get("severity"))
    status     = safe_str(bug.get("status") or bug.get("status_current") or "unknown")
    resolution = safe_str(bug.get("resolution") or "")

    # DERIVED VALUES — ***TEXT ONLY***
    keywords = extract_keywords_from_text(summary, desc_blob)
    security_flag = has_security_indicator(summary, desc_blob)
    leakage_flag, leakage_reasons = _leakage_from_status_resolution(status, resolution)

    # ids: explicit "key" if present, else "<project>-<id>", else "BUG-AUTO"
    key = safe_str(bug.get("key") or (project + "-" + safe_str(bid) if bid is not None else "BUG-AUTO"))
    ticket_id = safe_str(bid) if bid is not None else None
    # Hash of the raw input record; used downstream to spot exact duplicates.
    raw_hash = sha256_obj(bug)

    return {
        "ticket_id": ticket_id,
        "key": key,
        "created_at": created,
        "project": project,
        "component": comp,
        "severity": severity,
        "status_current": status,           # QC
        "title_len": len(safe_str(summary)),
        "summary_len": len(safe_str(summary)),
        "text_len": len(safe_str(desc_blob)),
        "desc_len": int(desc_len),
        "summary_text": safe_str(summary),
        "desc_blob": safe_str(desc_blob),
        "keywords": keywords,               # OK (text only)
        "security_flag": bool(security_flag),
        "resolution": resolution,           # QC
        "leakage_flag": bool(leakage_flag), # QC
        "leakage_reasons": leakage_reasons, # QC
        "comments_len": int(comments_len_chars),
        "recent_incidents_1h": 0,
        "source_system": "bugzilla",
        "raw_sha256": raw_hash
    }

def build_card_from_bug(bug):
    """Build a normalised card from a raw record of any supported shape.

    Jira-looking records are tried first, then Bugzilla-looking ones. Any
    other record goes through a generic fallback that reads common
    top-level keys; comments are not inspected there (``comments_len`` is 0).
    Keywords are derived from text only.
    """
    if is_jira(bug):
        return jira_to_card(bug)
    if is_bugzilla(bug):
        return bugzilla_to_card(bug)

    # ---- Generic fallback ----
    project = safe_str(bug.get("project") or bug.get("product") or "BUGS")

    component = bug.get("component") or bug.get("components") or "General"
    if isinstance(component, list) and component:
        component = component[0]
    component = safe_str(component)

    created = safe_str(bug.get("created_at") or bug.get("created") or now_iso())

    # Text fields and what is derived from them.
    summary = bug.get("summary") or bug.get("title") or ""
    desc = bug.get("description") or bug.get("text") or ""
    summary_str = safe_str(summary)
    desc_str = safe_str(desc)
    desc_len = compute_desc_len(summary, desc, 0)
    text_keywords = extract_keywords_from_text(summary, desc)
    security = has_security_indicator(summary, desc)

    # Post-hoc fields, kept for QC only.
    severity = norm_severity_generic(bug.get("severity"))
    status = safe_str(bug.get("status") or bug.get("status_current") or "unknown")
    resolution = safe_str(bug.get("resolution") or "")
    leakage_flag, leakage_reasons = _leakage_from_status_resolution(status, resolution)

    # Identifiers.
    bid = bug.get("id") or bug.get("bug_id") or bug.get("ticket_id")
    if bid is None:
        ticket_id = None
        default_key = "BUG-AUTO"
    else:
        ticket_id = safe_str(bid)
        default_key = project + "-" + safe_str(bid)
    key = safe_str(bug.get("key") or default_key)

    card = {
        "ticket_id": ticket_id,
        "key": key,
        "created_at": created,
        "project": project,
        "component": component,
        "severity": severity,
        "status_current": status,            # QC
        "title_len": len(summary_str),
        "summary_len": len(summary_str),
        "text_len": len(desc_str),
        "desc_len": int(desc_len),
        "summary_text": summary_str,
        "desc_blob": desc_str,
        "keywords": text_keywords,           # OK (text only)
        "security_flag": bool(security),
        "resolution": resolution,            # QC
        "leakage_flag": bool(leakage_flag),  # QC
        "leakage_reasons": leakage_reasons,  # QC
        "comments_len": 0,
        "recent_incidents_1h": 0,
        "source_system": "generic",
        "raw_sha256": sha256_obj(bug),
    }
    return card

# -------- QC: missingness & duplicates --------
def write_missingness(out_dir, records):
    """Write ``missingness.csv``: per-field count and rate of missing values.

    A field is missing in a record when the key is absent or its value
    equals None, "", [] or {}. Fields are the union of all record keys,
    sorted by name. Returns the path of the CSV written.
    """
    empty_markers = (None, "", [], {})
    all_fields = sorted({name for rec in records for name in rec})
    total = len(records)

    rows = []
    for name in all_fields:
        n_missing = 0
        for rec in records:
            if name not in rec or rec[name] in empty_markers:
                n_missing += 1
        rate = round(n_missing / total, 6) if total else 0.0
        rows.append({"field": name, "missing": n_missing, "missing_rate": rate})

    path = os.path.join(out_dir, "missingness.csv")
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=["field", "missing", "missing_rate"])
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
    return path

def write_duplicates(out_dir, records):
    """Write ``duplicates.csv`` listing every card whose ``raw_sha256`` is shared.

    Cards are grouped by ``raw_sha256``; groups with an empty hash or a
    single member are ignored. The summary is truncated to 140 characters.

    Returns:
        (path, row_count): the CSV path and the number of rows listed.
    """
    columns = ["raw_sha256", "key", "ticket_id", "created_at", "summary_text"]

    groups = defaultdict(list)
    for rec in records:
        groups[rec.get("raw_sha256", "")].append(rec)

    dupe_rows = [
        {
            "raw_sha256": digest,
            "key": rec.get("key", ""),
            "ticket_id": rec.get("ticket_id", ""),
            "created_at": rec.get("created_at", ""),
            "summary_text": rec.get("summary_text", "")[:140],
        }
        for digest, members in groups.items()
        if digest and len(members) >= 2
        for rec in members
    ]

    path = os.path.join(out_dir, "duplicates.csv")
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for row in dupe_rows:
            writer.writerow(row)
    return path, len(dupe_rows)

# -------- Figures (matplotlib, sans style/couleur forcés) --------
def _plt():
    """Return ``matplotlib.pyplot`` after selecting the "Agg" backend.

    matplotlib is imported lazily, so the module can be loaded without it
    as long as no figure is drawn.
    """
    import matplotlib
    matplotlib.use("Agg")
    from matplotlib import pyplot
    return pyplot

def fig_summary_len_hist(out_dir, records):
    """Save a 30-bin histogram of ``summary_len`` and return the PNG path.

    Non-integer ``summary_len`` values are left out of the histogram.
    """
    plt = _plt()
    lengths = [
        value
        for value in (rec.get("summary_len", 0) for rec in records)
        if isinstance(value, int)
    ]

    plt.figure(figsize=(10, 4.5))
    plt.hist(lengths, bins=30)
    plt.xlabel("summary_len")
    plt.ylabel("count")
    plt.title("Summary length histogram")

    target = os.path.join(out_dir, "fig_summary_len_hist.png")
    plt.savefig(target, bbox_inches="tight")
    plt.close()
    return target

def fig_status_dist(out_dir, records):
    """Save a bar chart of upper-cased ``status_current`` counts; return the PNG path.

    Bars are ordered by descending count, ties broken alphabetically.
    """
    plt = _plt()
    counts = Counter()
    for rec in records:
        counts[safe_str(rec.get("status_current", "unknown")).upper()] += 1

    ranked = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))
    if ranked:
        labels, heights = zip(*ranked)
    else:
        labels, heights = [], []

    plt.figure(figsize=(10, 4.5))
    plt.bar(labels, heights)
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("count")
    plt.title("Status distribution")

    target = os.path.join(out_dir, "fig_status_dist.png")
    plt.savefig(target, bbox_inches="tight")
    plt.close()
    return target

def fig_components_top10(out_dir, records):
    """Save a bar chart of the ten most frequent components; return the PNG path.

    Blank component names are counted as "General".
    """
    plt = _plt()
    counts = Counter()
    for rec in records:
        name = safe_str(rec.get("component", "")).strip()
        counts[name or "General"] += 1

    top = counts.most_common(10)
    labels = [name for name, _ in top]
    heights = [count for _, count in top]

    plt.figure(figsize=(12, 4.5))
    plt.bar(labels, heights)
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("count")
    plt.title("Top-10 components")

    target = os.path.join(out_dir, "fig_components_top10.png")
    plt.savefig(target, bbox_inches="tight")
    plt.close()
    return target

def fig_created_months(out_dir, records):
    """Save a line chart of cards created per month; return the PNG path.

    The month is the first seven characters of ``created_at``; values
    shorter than seven characters are bucketed as "unknown" and not plotted.
    """
    plt = _plt()
    per_month = Counter()
    for rec in records:
        stamp = safe_str(rec.get("created_at", ""))
        per_month[stamp[:7] if len(stamp) >= 7 else "unknown"] += 1

    timeline = sorted(pair for pair in per_month.items() if pair[0] != "unknown")
    labels = [month for month, _ in timeline]
    counts = [count for _, count in timeline]

    plt.figure(figsize=(12, 4.5))
    plt.plot(range(len(counts)), counts, marker="o")
    plt.xticks(range(len(labels)), labels, rotation=45, ha="right")
    plt.ylabel("count")
    plt.title("Created per month")

    target = os.path.join(out_dir, "fig_created_months.png")
    plt.savefig(target, bbox_inches="tight")
    plt.close()
    return target

def fig_missingness(out_dir, missingness_csv):
    """Save a bar chart of the 20 highest missing rates; return the PNG path.

    Reads the "field" and "missing_rate" columns of ``missingness_csv``.
    """
    plt = _plt()
    with open(missingness_csv, newline="", encoding="utf-8") as handle:
        pairs = [
            (row["field"], float(row["missing_rate"]))
            for row in csv.DictReader(handle)
        ]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    top = pairs[:20]
    labels = [name for name, _ in top]
    rates = [rate for _, rate in top]

    plt.figure(figsize=(12, 5))
    plt.bar(labels, rates)
    plt.xticks(rotation=60, ha="right")
    plt.ylabel("missing_rate")
    plt.title("Top missingness")

    target = os.path.join(out_dir, "fig_missingness.png")
    plt.savefig(target, bbox_inches="tight")
    plt.close()
    return target

def fig_leakage_rules(out_dir, records):
    """Save a bar chart of leakage reasons over flagged cards; return the PNG path.

    The figure is saved without bars when no card is flagged.
    """
    plt = _plt()
    reason_counts = Counter()
    for rec in records:
        if not rec.get("leakage_flag"):
            continue
        for reason in rec.get("leakage_reasons", []) or []:
            reason_counts[reason] += 1

    ranked = reason_counts.most_common()
    labels = [reason for reason, _ in ranked]
    heights = [count for _, count in ranked]

    plt.figure(figsize=(12, 4.5))
    if labels:
        plt.bar(labels, heights)
        plt.xticks(rotation=45, ha="right")
    plt.ylabel("count")
    plt.title("Leakage reasons")

    target = os.path.join(out_dir, "fig_leakage_rules.png")
    plt.savefig(target, bbox_inches="tight")
    plt.close()
    return target

# -------- Allowlist --------
# Card fields that downstream models are allowed to use as features.
# NOTE(review): `desc_len` is allowlisted, but the card builders compute it as
# summary + description + the length of ALL comments; if later comments count
# as post-hoc information this may leak — confirm.
ALLOWLIST_FIELDS = [
    # Short text + derived values that are not post-hoc
    "summary_text",
    "summary_len",
    "title_len",
    "desc_len",
    "text_len",
    # "Early" metadata
    "project",
    "component",
    "severity",
    "security_flag",
    "keywords",
    # Raw timestamp (to be binned downstream if needed)
    "created_at",
    # Explicitly EXCLUDED downstream: status_current, resolution, leakage_flag, leakage_reasons
]

def write_allowlist(out_dir):
    """Write ``feature_allowlist.txt`` (one allowed field per line); return its path.

    The file starts with a header comment and ends with a comment naming
    the excluded post-hoc fields.
    """
    header = "# Features autorisées (anti-leakage)\n"
    footer = "\n# EXCLUS (post-hoc): status_current, resolution, leakage_flag, leakage_reasons\n"
    body = "".join(name + "\n" for name in ALLOWLIST_FIELDS)

    path = os.path.join(out_dir, "feature_allowlist.txt")
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(header + body + footer)
    return path

# -------- DataCard & stats --------
def write_dataset_stats(out_dir, records, input_sha, figs, missing_csv, dup_count):
    """Write ``dataset_stats.json`` and return ``(path, stats)``.

    The stats combine the provenance constants, the input digest, card
    counts per source system, the share of cards with an empty
    ``desc_blob``, the share flagged for leakage, and the names of the
    figure and QC files.
    """
    total = len(records)

    def _share(predicate):
        # Fraction of records matching the predicate, rounded to 6 decimals.
        if not records:
            return 0.0
        return round(sum(1 for rec in records if predicate(rec)) / total, 6)

    per_system = Counter()
    for rec in records:
        per_system[rec.get("source_system", "")] += 1

    stats = {
        "generated_at": now_iso(),
        "data_source_name": DATA_SOURCE_NAME,
        "data_source_url": DATA_SOURCE_URL,
        "data_access_date": DATA_ACCESS_DATE,
        "data_license": DATA_LICENSE,
        "data_license_url": DATA_LICENSE_URL,
        "input_path_sha256": input_sha,
        "cards": total,
        "source_system_counts": dict(per_system),
        "desc_blob_empty_rate": _share(lambda rec: not rec.get("desc_blob")),
        "leakage_rate": _share(lambda rec: rec.get("leakage_flag")),
        "figures": [os.path.basename(fig) for fig in figs if fig],
        "missingness_csv": os.path.basename(missing_csv),
        "duplicates_count_rows": dup_count,
    }

    target = os.path.join(out_dir, "dataset_stats.json")
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(stats, handle, ensure_ascii=False, indent=2)
    return target, stats

def write_datacard(out_dir, stats):
    """Write ``DATACARD.md`` from the dataset statistics and return its path.

    The "Figures pour papier" section is rendered from ``stats["figures"]``
    so the datacard lists exactly the figures that were produced (the
    previous hard-coded list omitted ``fig_status_dist.png``). When
    ``stats`` carries no figure list, the six figures written by the
    pipeline are listed.

    Args:
        out_dir: directory in which ``DATACARD.md`` is written.
        stats: dict as produced by ``write_dataset_stats``; missing keys are
            rendered as ``None``.
    """
    default_figures = [
        "fig_summary_len_hist.png",
        "fig_status_dist.png",
        "fig_components_top10.png",
        "fig_created_months.png",
        "fig_missingness.png",
        "fig_leakage_rules.png",
    ]
    figures = [str(name) for name in (stats.get("figures") or default_figures) if name]
    figure_lines = "\n".join(f"- `{name}`" for name in figures)

    md = f"""# DataCard — BugAda Cards (Étape 1, PROPRE)

- **Généré** : {stats.get('generated_at')}
- **Source** : {stats.get('data_source_name')} — {stats.get('data_source_url')}
- **Date d'accès** : {stats.get('data_access_date')}
- **SHA256 entrée** : {stats.get('input_path_sha256')}
- **Nombre de cartes** : {stats.get('cards')}
- **Leakage rate (QC)** : {stats.get('leakage_rate')}
- **% desc_blob vides** : {stats.get('desc_blob_empty_rate')}
- **Systèmes** : {stats.get('source_system_counts')}
- **Licence** : {stats.get('data_license')}  ({stats.get('data_license_url')})

## Politique anti-leakage
- Toute feature post-hoc (`status_current`, `resolution`) est **exclue** des features.
- Publication d’une **liste blanche** : `feature_allowlist.txt`.

## QC
- `missingness.csv` (taux de champs manquants ; figure jointe).
- `duplicates.csv` (doublons exacts par `raw_sha256`) — politique: *keep-first*.

## Figures pour papier
{figure_lines}

## Note
Si `desc_blob` est vide dans la source, les modèles en aval se contenteront du **titre** + métadonnées *non post-hoc*. C’est attendu et documenté.
"""
    p = os.path.join(out_dir, "DATACARD.md")
    with open(p, "w", encoding="utf-8") as f:
        f.write(md)
    return p

# -------- Main conversion --------
def convert_to_cards(input_path, out_dir, limit=None):
    """Run the whole step-1 pipeline for one input file.

    Loads the raw bugs, converts each into a card, and writes into
    ``out_dir``: ``episodes_raw.jsonl``, a 5-card sample JSON, the QC CSVs,
    the figures (also copied to ``paper_assets/``), the feature allowlist,
    ``dataset_stats.json`` and ``DATACARD.md``.

    Records that cannot be converted are skipped (best effort), but they are
    now counted per exception type and reported instead of vanishing
    silently.

    Args:
        input_path: path of the raw JSON/JSONL bug dump.
        out_dir: output directory (created if needed).
        limit: stop after this many cards; ``None``/0 means no limit.

    Exits the process with status 1 when no record can be read.
    """
    ensure_dir(out_dir)
    bugs = read_json_any(input_path)
    if not isinstance(bugs, list) or len(bugs) == 0:
        print("[ERR] Aucune donnée lisible.")
        sys.exit(1)
    if len(bugs) == 1:
        print("[WARN] Seulement 1 élément détecté. Vérifie le format d'entrée.")

    out_jsonl = os.path.join(out_dir, "episodes_raw.jsonl")
    out_sample = os.path.join(out_dir, "episodes_raw.sample.json")
    n = 0
    sample = []
    cards = []
    c_desc_blob_empty = 0
    skipped = Counter()  # exception type name -> number of records dropped

    with open(out_jsonl, "w", encoding="utf-8") as fw:
        for bug in bugs:
            try:
                card = build_card_from_bug(bug)
            except Exception as exc:
                # One malformed record must not abort the run, but keep a trace of it.
                skipped[type(exc).__name__] += 1
                continue
            fw.write(json.dumps(card, ensure_ascii=False) + "\n")
            cards.append(card)
            if not card.get("desc_blob"):
                c_desc_blob_empty += 1
            if len(sample) < 5:
                sample.append(card)
            n += 1
            if limit and n >= limit:
                break

    if skipped:
        print(f"[WARN] {sum(skipped.values())} enregistrement(s) ignoré(s) (conversion impossible): {dict(skipped)}")

    with open(out_sample, "w", encoding="utf-8") as fs:
        json.dump(sample, fs, ensure_ascii=False, indent=2)

    # Console preview (3 cards)
    print("Aperçu de 3 cartes:")
    for r in cards[:3]:
        print(json.dumps(r, ensure_ascii=False))

    print(f"[STATS] cartes: {n}")
    if n:
        print(f"[STATS] %desc_blob vide (global): {c_desc_blob_empty/n:.2%}")

    # QC
    miss_csv = write_missingness(out_dir, cards)
    _, dup_rows = write_duplicates(out_dir, cards)

    # Figures (paper)
    figs = [
        fig_summary_len_hist(out_dir, cards),
        fig_status_dist(out_dir, cards),
        fig_components_top10(out_dir, cards),
        fig_created_months(out_dir, cards),
        fig_missingness(out_dir, miss_csv),
        fig_leakage_rules(out_dir, cards),
    ]
    print(f"[FIG] Graphiques écrits dans: {out_dir}")

    # Paper assets: copy every figure next to the other publication material
    paper_dir = os.path.join(out_dir, "paper_assets")
    ensure_dir(paper_dir)
    for p in figs:
        if p:
            shutil.copy2(p, os.path.join(paper_dir, os.path.basename(p)))
    print(f"[PAPER] assets copiés -> {paper_dir}")

    # Allowlist
    allow_path = write_allowlist(out_dir)

    # Stats + datacard
    input_sha = sha256_file(input_path)
    _, stats = write_dataset_stats(out_dir, cards, input_sha, figs, miss_csv, dup_rows)
    write_datacard(out_dir, stats)

    print(f"- JSONL    : {out_jsonl}")
    print(f"- Échantill: {out_sample}")
    print("[QC] missingness.csv | duplicates.csv écrits.")
    print("[STATS] dataset_stats.json & DATACARD.md écrits.")
    print(f"[ALLOW] {allow_path}")

# Script entry point: run the conversion with the settings defined at the top.
if __name__ == "__main__":
    convert_to_cards(input_path=INPUT_PATH, out_dir=OUT_DIR, limit=LIMIT)

[LOAD] JSON dict avec liste 'bugs' -> 200 éléments
Aperçu de 3 cartes:
{"ticket_id": "10954", "key": "BUGS-10954", "created_at": "1999-07-30T22:55:51Z", "project": "BUGS", "component": "Settings UI", "severity": "medium", "status_current": "RESOLVED", "title_len": 46, "summary_len": 46, "text_len": 0, "desc_len": 46, "summary_text": "Dialup properties needs to be exposed in prefs", "desc_blob": "", "keywords": [], "security_flag": false, "resolution": "WONTFIX", "leakage_flag": true, "leakage_reasons": ["status=RESOLVED", "resolution=WONTFIX"], "comments_len": 0, "recent_incidents_1h": 0, "source_system": "bugzilla", "raw_sha256": "sha256:5492c7d87c2db3a7ae19f78785cbd98016893b657a978cf32897b2f445d085a7"}
{"ticket_id": "14871", "key": "BUGS-14871", "created_at": "1999-09-24T21:49:34Z", "project": "BUGS", "component": "General", "severity": "low", "status_current": "RESOLVED", "title_len": 27, "summary_len": 27, "text_len": 0, "desc_len": 27, "summary_text": "[Find] Find whole word only"

# Étape 2 — Silver labels (BugAda)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Étape 2 — Silver labels (BugAda-BMO)
Entrée  : episodes_raw.jsonl (Étape 1, BMO)
Sorties :
  - episodes_with_silver.jsonl
  - silver_labels.csv
  - silver_stats.json
  - fig_silver_dist.png
  - fig_silver_coverage.png
  - fig_labeled_ratio_by_component_top10.png
  - fig_tickets_per_year.png
"""

import os, json, csv
from collections import Counter

# ====== PATHS (BMO only for now) ======
# IN_JSONL is the cards file written by step 1; all outputs go under OUT_DIR.
IN_JSONL  = "/content/drive/MyDrive/bugada_cards_clean/episodes_raw.jsonl"
OUT_DIR   = "/content/drive/MyDrive/cleansilver_bmo"

OUT_JSONL = os.path.join(OUT_DIR, "episodes_with_silver.jsonl")
OUT_CSV   = os.path.join(OUT_DIR, "silver_labels.csv")
OUT_STATS = os.path.join(OUT_DIR, "silver_stats.json")
# =====================================================

def ensure_dir(p):
    """Make sure directory ``p`` exists, creating missing parents as needed."""
    os.makedirs(name=p, exist_ok=True)

def silver_label_from_card(card) -> str | None:
    """
    Règles déterministes pour attribuer un label i:...
    (OK d'utiliser status + resolution ici : c'est pour les labels, pas pour les features du modèle)
    """
    r   = (card.get("resolution") or "").lower().strip()
    st  = (card.get("status_current") or "").lower().strip()
    txt = " ".join([
        str(card.get("summary_text") or ""),
        str(card.get("desc_blob") or ""),
        st, r,
        " ".join(card.get("keywords") or []),
    ]).lower()
    sev = (card.get("severity") or "unknown").lower()

    # 1) Faux positifs
    if r in {"invalid","worksforme","works as intended","notabug","duplicate","moved"}:
        return "i:false_positive"

    # 2) Effet de bord business
    if r in {"wontfix","by design","policy decision"}:
        return "i:business_side_effect"

    # 3) Info insuffisante
    if (st in {"needinfo","unconfirmed"} or
        any(k in txt for k in ["cannot reproduce","missing steps","need more info",
                               "incomplete","no steps"])):
        return "i:insufficient_info"

    # 4) Sécurité
    if any(k in txt for k in ["xss","csrf","vulnerability","sql injection"]):
        return "i:security_threat"

    # 5) Régression de release
    if any(k in txt for k in ["after update","after upgrade","after release",
                              "after deploy","introduced in","since version",
                              "since build","since release","regression"]):
        return "i:release_regression"

    # 6) Instabilité infra
    if any(k in txt for k in ["timeout","latency","slowdown","slow","5xx",
                              "server error","network","intermittent",
                              "connection reset","502","503","bad gateway"]):
        return "i:infra_instability"

    # 7) Petites dégradations
    if sev in {"low","trivial"}:
        return "i:minor_degradation"

    return None  # pas de label

def _year_from_ts(ts: str) -> str | None:
    if not ts:
        return None
    return ts[:4] if len(ts) >= 4 and ts[:4].isdigit() else None

# --- Figures (matplotlib simple, sans style forcé) ---
def _plt():
    """Select the "Agg" matplotlib backend and return the ``pyplot`` module.

    The import happens inside the function, so matplotlib is only required
    when a figure is actually drawn.
    """
    import matplotlib
    matplotlib.use("Agg")
    from matplotlib import pyplot
    return pyplot

def _plot_bar(counter: Counter, title: str, xlabel: str, out_png: str, rotation=45):
    """Draw ``counter`` as a bar chart (in its iteration order) and save it to ``out_png``."""
    plt = _plt()
    labels = list(counter.keys())
    values = list(counter.values())
    positions = range(len(labels))

    plt.figure(figsize=(10, 5))
    plt.bar(positions, values)
    plt.xticks(positions, labels, rotation=rotation, ha="right")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(out_png, dpi=160)
    plt.close()

def _plot_simple(values_dict: dict, title: str, xlabel: str, out_png: str, rotation=0):
    """Bar-plot a plain dict by wrapping it in a Counter and delegating to _plot_bar."""
    as_counter = Counter(values_dict)
    _plot_bar(as_counter, title, xlabel, out_png, rotation=rotation)

# ----------------- MAIN -----------------
def main():
    """Attach silver labels to every card and export labels, stats and figures.

    Streams IN_JSONL one JSON object per line (blank lines are skipped) and,
    for each card:
      - updates the severity / component / creation-year counters,
      - computes the silver label with silver_label_from_card(),
      - writes an (episode_id, silver_label) row to OUT_CSV when a label exists,
      - writes the card, with "_silver_label" set (possibly None), to OUT_JSONL.

    Afterwards it prints a coverage summary, dumps the aggregated statistics
    to OUT_STATS and saves the figures under OUT_DIR.
    """
    ensure_dir(OUT_DIR)

    n = 0
    n_labeled = 0
    dist = Counter()          # silver label -> count
    sev_c = Counter()         # lower-cased severity -> count
    comp_total = Counter()    # component -> number of cards
    comp_labeled = Counter()  # component -> number of labelled cards
    year_c = Counter()        # creation year (string) -> count

    with open(IN_JSONL, "r", encoding="utf-8") as fr, \
         open(OUT_JSONL, "w", encoding="utf-8") as fw, \
         open(OUT_CSV, "w", newline="", encoding="utf-8") as fc:

        cw = csv.writer(fc)
        cw.writerow(["episode_id", "silver_label"])

        for line in fr:
            line = line.strip()
            if not line:
                continue
            card = json.loads(line)
            n += 1

            # Auxiliary statistics.
            sev_c[(card.get("severity") or "unknown").lower()] += 1
            comp = card.get("component") or "General"
            comp_total[comp] += 1
            y = _year_from_ts(card.get("created_at", ""))
            if y:
                year_c[y] += 1

            # Silver label (None when no rule fires).
            label = silver_label_from_card(card)
            if label:
                n_labeled += 1
                dist[label] += 1
                comp_labeled[comp] += 1
                cw.writerow([card.get("key") or card.get("ticket_id"), label])

            card["_silver_label"] = label  # may be None
            fw.write(json.dumps(card, ensure_ascii=False) + "\n")

    coverage = (n_labeled / n) if n else 0.0
    print(f"[SILVER] épisodes: {n}")
    print(f"[SILVER] couverts: {n_labeled}  ({coverage:.2%})")
    if dist:
        print("[SILVER] distribution:")
        # Most frequent first; ties broken alphabetically.
        for k, v in sorted(dist.items(), key=lambda kv: (-kv[1], kv[0])):
            print(f"  - {k}: {v}")
    print(f"- JSONL (avec labels) : {OUT_JSONL}")
    print(f"- CSV labels          : {OUT_CSV}")

    # Years in chronological order, shared by the stats file and the figure.
    years_sorted = dict(sorted(year_c.items()))

    # ----- Stats JSON -----
    stats = {
        "episodes": n,
        "labeled": n_labeled,
        "coverage": round(coverage, 4),
        "label_distribution": dict(dist),
        "severity_distribution": dict(sev_c),
        "component_total_top10": dict(comp_total.most_common(10)),
        "component_labeled_top10": dict(comp_labeled.most_common(10)),
        "year_counts": years_sorted,
    }
    with open(OUT_STATS, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)

    # ----- Figures -----
    # 1) Label distribution.
    _plot_bar(dist, "Silver label distribution", "label",
              os.path.join(OUT_DIR, "fig_silver_dist.png"))
    # 2) Coverage (labelled vs unlabelled).
    cov_dict = {"labeled": n_labeled, "unlabeled": max(0, n - n_labeled)}
    _plot_simple(cov_dict, "Silver coverage", "class",
                 os.path.join(OUT_DIR, "fig_silver_coverage.png"), rotation=0)
    # 3) Labelled ratio per component (top-10 components by volume).
    #    Best effort: a failure here only prints a warning.
    try:
        plt = _plt()  # same backend selection as the other figures
        top10 = [c for c, _ in comp_total.most_common(10)]
        ratios = [
            (comp_labeled.get(c, 0) / comp_total[c]) if comp_total[c] else 0.0
            for c in top10
        ]
        plt.figure(figsize=(10, 5))
        plt.bar(range(len(top10)), ratios)
        plt.xticks(range(len(top10)), top10, rotation=45, ha="right")
        plt.title("Labeled ratio by component (top-10 by volume)")
        plt.xlabel("component"); plt.ylabel("labeled_ratio")
        plt.tight_layout()
        plt.savefig(os.path.join(OUT_DIR,
                                 "fig_labeled_ratio_by_component_top10.png"),
                    dpi=160)
        plt.close()
    except Exception as e:
        print("[FIG][WARN] component ratio:", e)

    # 4) Tickets per year, plotted chronologically rather than in the order
    #    the years were first encountered in the input.
    _plot_bar(Counter(years_sorted), "Tickets per year", "year",
              os.path.join(OUT_DIR, "fig_tickets_per_year.png"), rotation=0)

if __name__ == "__main__":
    main()

[SILVER] épisodes: 200
[SILVER] couverts: 68  (34.00%)
[SILVER] distribution:
  - i:false_positive: 29
  - i:business_side_effect: 19
  - i:minor_degradation: 13
  - i:insufficient_info: 6
  - i:infra_instability: 1
- JSONL (avec labels) : /content/drive/MyDrive/cleansilver_bmo/episodes_with_silver.jsonl
- CSV labels          : /content/drive/MyDrive/cleansilver_bmo/silver_labels.csv


# Étape 3 — Splits DEV / HOLDOUT-H (make_splits_dev_holdout)

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Étape 3 — Splits DEV / HOLDOUT-H (BMO)
Entrée  : episodes_with_silver.jsonl (Étape 2)
Sorties :
  - /content/drive/MyDrive/splits_bmo/episodes_dev.jsonl
  - /content/drive/MyDrive/splits_bmo/episodes_holdoutH.jsonl
  - /content/drive/MyDrive/splits_bmo/splits.csv
"""

import os, json, csv, random
from collections import Counter

# ========= PATHS — ADAPT IF NEEDED =========
# Input: silver-labelled episodes (output of step 2).
IN_JSONL = "/content/drive/MyDrive/cleansilver_bmo/episodes_with_silver.jsonl"
# Output directory for the DEV / HOLDOUT-H split files.
OUT_DIR  = "/content/drive/MyDrive/splits_bmo"
# =============================================

DEV_RATIO = 0.785   # ~157/200 for BMO
SEED      = 2025
# Seed the module-level `random` generator.
random.seed(SEED)

def ensure_dir(p: str):
    """Create directory *p* (and missing parents); do nothing if it already exists."""
    os.makedirs(name=p, exist_ok=True)

def load_cards(path: str):
    """Load episode cards from a JSONL file.

    Blank lines are ignored. Lines that are not valid JSON, or that do not
    decode to a JSON object (dict), are skipped; when any line is dropped a
    single warning with the count is printed so the data loss is visible.

    Args:
        path: path of the UTF-8 JSONL file to read.

    Returns:
        The list of decoded card dicts, in file order.
    """
    cards = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # Downstream code calls .get() on every card, so anything that is
            # not a JSON object is treated as malformed.
            if not isinstance(obj, dict):
                skipped += 1
                continue
            cards.append(obj)
    if skipped:
        print(f"[WARN] {skipped} ligne(s) JSONL ignorée(s) dans {path}")
    return cards

def main():
    """Split the silver-labelled episodes into DEV and HOLDOUT-H sets.

    Loads the cards from IN_JSONL, shuffles their indices with a private RNG
    seeded from SEED, assigns the first round(DEV_RATIO * n) shuffled indices
    to DEV and the rest to HOLDOUT-H, then writes (in original file order):
      - episodes_dev.jsonl and episodes_holdoutH.jsonl (one card per line),
      - splits.csv with one (episode_id, split) row per card.
    Finally prints the split sizes and the silver-label distribution of each
    split. Returns early with an error message when no card could be read.
    """
    ensure_dir(OUT_DIR)

    cards = load_cards(IN_JSONL)
    n = len(cards)
    if n == 0:
        print("[ERR] Aucun épisode lu dans", IN_JSONL)
        return

    # Episode id used for traceability in splits.csv.
    def eid(c):
        return str(c.get("key") or c.get("ticket_id") or "")

    # A private generator makes the split depend only on SEED, not on whatever
    # has consumed the global `random` state before this call (e.g. running
    # main() twice in the same session).
    rng = random.Random(SEED)
    idx = list(range(n))
    rng.shuffle(idx)

    dev_count = int(round(DEV_RATIO * n))
    dev_idx = set(idx[:dev_count])
    h_count = n - len(dev_idx)

    out_dev_path = os.path.join(OUT_DIR, "episodes_dev.jsonl")
    out_h_path   = os.path.join(OUT_DIR, "episodes_holdoutH.jsonl")
    splits_csv   = os.path.join(OUT_DIR, "splits.csv")

    # Label statistics ("NONE" stands for unlabelled cards).
    dist_all = Counter()
    dist_dev = Counter()
    dist_h   = Counter()

    with open(out_dev_path, "w", encoding="utf-8") as f_dev, \
         open(out_h_path,   "w", encoding="utf-8") as f_h, \
         open(splits_csv,   "w", newline="", encoding="utf-8") as f_csv:

        cw = csv.writer(f_csv)
        cw.writerow(["episode_id", "split"])

        for i, c in enumerate(cards):
            lab = c.get("_silver_label") or "NONE"
            dist_all[lab] += 1

            if i in dev_idx:
                sink, dist, split_name = f_dev, dist_dev, "DEV"
            else:
                sink, dist, split_name = f_h, dist_h, "H"

            sink.write(json.dumps(c, ensure_ascii=False) + "\n")
            dist[lab] += 1
            cw.writerow([eid(c), split_name])

    print(f"[SPLIT] total: {n} | DEV: {len(dev_idx)} | HOLDOUT-H: {h_count}")
    print("[SPLIT] label dist (ALL):", dict(dist_all))
    print("[SPLIT] label dist (DEV):", dict(dist_dev))
    print("[SPLIT] label dist (H  ):", dict(dist_h))
    print("- DEV       :", out_dev_path)
    print("- HOLDOUT-H :", out_h_path)
    print("- SPLITS    :", splits_csv)

if __name__ == "__main__":
    main()

[SPLIT] total: 200 | DEV: 157 | HOLDOUT-H: 43
[SPLIT] label dist (ALL): {'i:business_side_effect': 19, 'i:false_positive': 29, 'NONE': 132, 'i:insufficient_info': 6, 'i:minor_degradation': 13, 'i:infra_instability': 1}
[SPLIT] label dist (DEV): {'i:false_positive': 23, 'i:business_side_effect': 16, 'NONE': 100, 'i:insufficient_info': 5, 'i:minor_degradation': 12, 'i:infra_instability': 1}
[SPLIT] label dist (H  ): {'i:business_side_effect': 3, 'NONE': 32, 'i:false_positive': 6, 'i:insufficient_info': 1, 'i:minor_degradation': 1}
- DEV       : /content/drive/MyDrive/splits_bmo/episodes_dev.jsonl
- HOLDOUT-H : /content/drive/MyDrive/splits_bmo/episodes_holdoutH.jsonl
- SPLITS    : /content/drive/MyDrive/splits_bmo/splits.csv


# Étape 4 — calibrate_tau_and_run_gate

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Étape 4 — Calibration τ + exécution du protocole (BMO)

- Lit les splits BMO : episodes_dev(.clean).jsonl, episodes_holdoutH(.clean).jsonl
- Calcule un score p_top (proxy déterministe, anti-leakage)
- Sweep τ sur DEV et H : abstention_rate et coverage_on_labeled
- Choisit τ* pour viser une abstention cible sur DEV (par ex. 20 %)
- Relance le protocole avec τ* sur DEV et H :
  -> eligibility_audit.csv
  -> figures (distribution décisions, p_top, latence)
  -> traces/ et prov/

Adapte seulement les chemins si besoin.
"""

import os
import json
import csv
import math
from collections import Counter

# ==================== CONFIG ====================

# Input paths, in order of preference (*.clean.jsonl is used when it exists).
DEV_CANDIDATES = [
    "/content/drive/MyDrive/splits_bmo/episodes_dev.clean.jsonl",
    "/content/drive/MyDrive/splits_bmo/episodes_dev.jsonl",
]

H_CANDIDATES = [
    "/content/drive/MyDrive/splits_bmo/episodes_holdoutH.clean.jsonl",
    "/content/drive/MyDrive/splits_bmo/episodes_holdoutH.jsonl",
]

# Output directories.
SWEEP_DIR = "/content/drive/MyDrive/sweep_bmo"
OUT_DEV   = "/content/drive/MyDrive/protocol_bmo_dev"
OUT_H     = "/content/drive/MyDrive/protocol_bmo_H"

# Target abstention rate (20 %).
TARGET_ABST = 0.20

# Incident labels of interest; a card counts as "labeled" when its
# silver label is one of these.
I_IDS = {
    "i:release_regression",
    "i:infra_instability",
    "i:security_threat",
    "i:minor_degradation",
    "i:insufficient_info",
    "i:false_positive",
    "i:business_side_effect",
}

# =================================================

def first_existing(paths):
    """Return the first entry of *paths* that exists on disk.

    When none exists, the first candidate is returned anyway so the caller
    fails on a concrete, predictable path.
    """
    found = next((candidate for candidate in paths if os.path.exists(candidate)), None)
    return paths[0] if found is None else found

def ensure_dir(p):
    """Create directory *p* (and missing parents); do nothing if it already exists."""
    os.makedirs(name=p, exist_ok=True)

def load_jsonl(path):
    """Load the JSON objects stored one per line in *path*.

    Blank lines are ignored. Lines that are not valid JSON, or that do not
    decode to a JSON object (dict), are skipped; when any line is dropped a
    single warning with the count is printed so the data loss is visible.

    Args:
        path: path of the UTF-8 JSONL file to read.

    Returns:
        The list of decoded dicts, in file order.
    """
    items = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # Every consumer calls .get() on the items, so non-object JSON
            # values are treated as malformed.
            if not isinstance(obj, dict):
                skipped += 1
                continue
            items.append(obj)
    if skipped:
        print(f"[WARN] {skipped} ligne(s) JSONL ignorée(s) dans {path}")
    return items

# ---- proxy p_top (déterministe, anti-leakage) ----

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), saturating instead of overflowing."""
    try:
        denominator = 1.0 + math.exp(-x)
    except OverflowError:
        # math.exp overflowed: return the saturated value for the sign of x.
        if x < 0:
            return 0.0
        return 1.0
    return 1.0 / denominator

def infer_p_top(card):
    """Compute the proxy confidence score p_top for one card.

    The score only uses:
      - summary_len (capped at 200),
      - text_len (capped at 1200),
      - severity (bias: high 0.6, medium 0.35, low 0.15, anything else 0.25),
    plus a small per-episode jitter in [-0.05, 0.05) derived from the episode
    id. Neither status nor resolution is read, so there is no label leakage.

    Args:
        card: dict describing the episode.

    Returns:
        A float in [0, 1].
    """
    import zlib  # local import: only needed for the stable jitter below

    s_len = int(card.get("summary_len", 0) or 0)
    t_len = int(card.get("text_len", 0) or 0)
    sev   = str(card.get("severity") or "unknown").lower()

    sev_bias = {"high": 0.6, "medium": 0.35, "low": 0.15}.get(sev, 0.25)

    x = 0.02 * min(s_len, 200) + 0.005 * min(t_len, 1200) + sev_bias

    # Jitter keyed on the episode id so identical feature vectors do not all
    # get the same score. The built-in hash() of a str is salted per process
    # (PYTHONHASHSEED), which made p_top -- and therefore the calibrated tau --
    # change from one run to the next; CRC32 is stable across runs.
    k = str(card.get("key") or card.get("ticket_id") or "")
    jitter = (zlib.crc32(k.encode("utf-8")) % 1000) / 1000.0  # [0, 1)
    x = x + 0.10 * (jitter - 0.5)

    return max(0.0, min(1.0, sigmoid(x)))

# ---- figures (matplotlib, sans style/couleurs forcés) ----

def _mpl():
    """Select the non-interactive "Agg" backend and return the pyplot module.

    matplotlib is imported lazily so the module can be loaded without it.
    """
    import matplotlib as mpl
    mpl.use("Agg")
    from matplotlib import pyplot
    return pyplot

def fig_abstention_vs_tau(taus, abst, title, out_png):
    """Save a line plot (with point markers) of abstention rate against tau."""
    pyplot = _mpl()
    figure, axes = pyplot.subplots(figsize=(8, 4))
    axes.plot(taus, abst, marker="o")
    axes.set_title(title)
    axes.set_xlabel("tau")
    axes.set_ylabel("abstention_rate")
    figure.tight_layout()
    figure.savefig(out_png, dpi=160)
    pyplot.close(figure)

def fig_coverage_vs_tau(taus, cov, title, out_png):
    """Save a line plot (with point markers) of coverage-on-labeled against tau."""
    pyplot = _mpl()
    figure, axes = pyplot.subplots(figsize=(8, 4))
    axes.plot(taus, cov, marker="o")
    axes.set_title(title)
    axes.set_xlabel("tau")
    axes.set_ylabel("coverage_on_labeled")
    figure.tight_layout()
    figure.savefig(out_png, dpi=160)
    pyplot.close(figure)

def fig_decision_distribution(accepted_mask, out_png):
    """Save a two-bar chart counting accepted vs abstained decisions.

    Args:
        accepted_mask: sequence of truthy (accepted) / falsy (abstained) flags.
        out_png: destination PNG path.
    """
    pyplot = _mpl()
    n_accepted = len([flag for flag in accepted_mask if flag])
    n_abstained = len(accepted_mask) - n_accepted

    figure, axes = pyplot.subplots(figsize=(6, 4))
    axes.bar(["accepted", "abstain"], [n_accepted, n_abstained])
    axes.set_title("Decision distribution")
    figure.tight_layout()
    figure.savefig(out_png, dpi=160)
    pyplot.close(figure)

def fig_p_top_hist_all(p_tops, out_png):
    """Save a 30-bin histogram of every p_top value."""
    pyplot = _mpl()
    figure, axes = pyplot.subplots(figsize=(8, 4.5))
    axes.hist(p_tops, bins=30)
    axes.set_title("p_top (all)")
    axes.set_xlabel("p_top")
    axes.set_ylabel("count")
    figure.tight_layout()
    figure.savefig(out_png, dpi=160)
    pyplot.close(figure)

def fig_p_top_hist_accepted(p_tops, accepted_mask, out_png):
    """Save a 30-bin histogram of p_top restricted to accepted episodes.

    p_tops and accepted_mask are paired positionally; only values whose flag
    is truthy are plotted.
    """
    kept = []
    for score, accepted in zip(p_tops, accepted_mask):
        if accepted:
            kept.append(score)

    pyplot = _mpl()
    figure, axes = pyplot.subplots(figsize=(8, 4.5))
    axes.hist(kept, bins=30)
    axes.set_title("p_top (accepted only)")
    axes.set_xlabel("p_top (accepted)")
    axes.set_ylabel("count")
    figure.tight_layout()
    figure.savefig(out_png, dpi=160)
    pyplot.close(figure)

def fig_latency_hist(latencies_ms, out_png):
    """Save a 30-bin histogram of the latency values (milliseconds)."""
    pyplot = _mpl()
    figure, axes = pyplot.subplots(figsize=(8, 4.5))
    axes.hist(latencies_ms, bins=30)
    axes.set_title("Latency histogram")
    axes.set_xlabel("latency (ms)")
    axes.set_ylabel("count")
    figure.tight_layout()
    figure.savefig(out_png, dpi=160)
    pyplot.close(figure)

# ---- métriques pour un τ donné ----

def metrics_for_tau(cards, tau):
    """Compute gate metrics for one value of tau.

    A card is accepted when infer_p_top(card) >= 1 - tau, and counts as
    labelled when its "_silver_label" belongs to I_IDS.

    Returns:
        (abstention_rate_all, coverage_on_labeled):
          - share of all cards that are not accepted (0.0 for an empty input),
          - share of labelled cards that are accepted (0.0 when none is labelled).
    """
    accepted_count = 0
    labeled_count = 0
    accepted_labeled_count = 0

    for card in cards:
        score = infer_p_top(card)
        is_accepted = score >= (1.0 - tau)
        silver = card.get("_silver_label") or "NONE"
        is_labeled = silver in I_IDS

        if is_accepted:
            accepted_count += 1
        if is_labeled:
            labeled_count += 1
            if is_accepted:
                accepted_labeled_count += 1

    total = len(cards)
    if total:
        abstention = 1.0 - (accepted_count / total)
    else:
        abstention = 0.0

    if labeled_count:
        coverage = accepted_labeled_count / labeled_count
    else:
        coverage = 0.0

    return abstention, coverage

# ---- sweep τ ----

def sweep_tau(cards, label, out_dir):
    """Evaluate the gate on a grid of tau values and persist the results.

    The grid goes from 0.00 to 0.50 in steps of 0.005 (101 values). For each
    tau, metrics_for_tau() is evaluated and a row is appended to
    sweep_<label>.csv; two curves (abstention and coverage against tau) are
    then saved as PNG files in *out_dir*.

    Returns:
        (taus, abstention_rates, coverages), three lists of equal length.
    """
    ensure_dir(out_dir)
    grid = [step / 200.0 for step in range(101)]  # 0.00 -> 0.50
    abstention_rates = []
    coverages = []

    sweep_csv = os.path.join(out_dir, f"sweep_{label}.csv")
    with open(sweep_csv, "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["tau", "abstention_rate_all", "coverage_on_labeled"])

        for tau in grid:
            abstention, coverage = metrics_for_tau(cards, tau)
            abstention_rates.append(abstention)
            coverages.append(coverage)
            writer.writerow([f"{tau:.4f}", f"{abstention:.4f}", f"{coverage:.4f}"])

    # Figures
    abst_png = os.path.join(out_dir, f"fig_abstention_vs_tau_{label}.png")
    fig_abstention_vs_tau(grid, abstention_rates, f"Abstention vs tau ({label})", abst_png)

    cov_png = os.path.join(out_dir, f"fig_coverage_vs_tau_{label}.png")
    fig_coverage_vs_tau(grid, coverages, f"Coverage-on-labeled vs tau ({label})", cov_png)

    return grid, abstention_rates, coverages

# ---- exécution protocole pour un τ donné ----

def run_protocol(cards, tau, out_dir):
    """Apply the accept/abstain gate at threshold 1 - tau and persist all artefacts.

    For each card, p_top is computed with infer_p_top() and the card is
    accepted when p_top >= 1 - tau. The function writes:
      - eligibility_audit.csv (episode_id, p_top, decision, accepted),
      - traces/<id>.json with the features, p_top, tau and the decision,
      - prov/<id>.json with a minimal provenance record,
    then prints a summary and saves the decision / p_top / latency figures.

    Args:
        cards: sequence of card dicts.
        tau: gate parameter; the acceptance threshold is 1 - tau.
        out_dir: output directory (created if missing, with traces/ and prov/).
    """
    import zlib  # local import: stable checksum for the latency proxy

    ensure_dir(out_dir)
    traces_dir = os.path.join(out_dir, "traces")
    prov_dir = os.path.join(out_dir, "prov")
    os.makedirs(traces_dir, exist_ok=True)
    os.makedirs(prov_dir, exist_ok=True)

    audit_csv = os.path.join(out_dir, "eligibility_audit.csv")

    p_tops = []
    accepted_mask = []
    latencies = []

    with open(audit_csv, "w", newline="", encoding="utf-8") as fw:
        cw = csv.writer(fw)
        cw.writerow(["episode_id", "p_top", "decision", "accepted"])

        for i, c in enumerate(cards):
            p = infer_p_top(c)
            accept = (p >= (1.0 - tau))
            decision = "accept" if accept else "abstain"

            eid = str(c.get("key") or c.get("ticket_id") or "")
            text_len = int(c.get("text_len", 0) or 0)

            # Latency proxy (ms): grows with text_len, plus 0..6 ms of
            # per-episode noise. CRC32 replaces the built-in hash(), whose
            # value for a str changes from one process to the next.
            lat = 10.0 + 0.02 * text_len + (zlib.crc32(eid.encode("utf-8")) % 7)

            p_tops.append(p)
            accepted_mask.append(accept)
            latencies.append(lat)

            cw.writerow([eid, f"{p:.4f}", decision, int(accept)])

            trace = {
                "episode_id": eid,
                "features": {
                    "summary_len": int(c.get("summary_len", 0) or 0),
                    "text_len": text_len,
                    "severity": c.get("severity") or "unknown",
                },
                "p_top": p,
                "tau": tau,
                "decision": decision,
            }
            prov = {
                "episode_id": eid,
                "generated_by": "step4_bmo",
                "inputs": {},
                "params": {"tau": tau},
            }

            # File name derived from the id: characters that are unsafe in a
            # path become "_" and an empty id falls back to the row position,
            # so an episode can neither escape the directory nor overwrite
            # ".json". The id stored inside the files and the CSV is untouched.
            stem = "".join(ch if (ch.isalnum() or ch in "._-") else "_" for ch in eid)
            if not stem:
                stem = f"episode_{i:06d}"

            with open(os.path.join(traces_dir, f"{stem}.json"), "w", encoding="utf-8") as ft:
                json.dump(trace, ft, ensure_ascii=False, indent=2)
            with open(os.path.join(prov_dir, f"{stem}.json"), "w", encoding="utf-8") as fp:
                json.dump(prov, fp, ensure_ascii=False, indent=2)

    # An empty split reports 0.0, like metrics_for_tau(), instead of dividing by zero.
    n_cards = len(accepted_mask)
    abst = (1.0 - (sum(1 for a in accepted_mask if a) / n_cards)) if n_cards else 0.0
    print(f"[RUN] episodes: {len(cards)}")
    print(f"[RUN] abstention_rate: {abst:.4f}")
    print(f"- OUT_DIR: {out_dir}")
    print("- eligibility_audit.csv prêt")

    # Figures
    fig_decision_distribution(accepted_mask, os.path.join(out_dir, "fig_decision_distribution.png"))
    fig_p_top_hist_all(p_tops, os.path.join(out_dir, "fig_p_top_hist_all.png"))
    fig_p_top_hist_accepted(p_tops, accepted_mask, os.path.join(out_dir, "fig_p_top_hist_accepted.png"))
    fig_latency_hist(latencies, os.path.join(out_dir, "fig_latency_hist.png"))

    print(f"[RUN] traces: {len(cards)}")
    print(f"[RUN] prov: {len(cards)}")

# ---- main ----

def main():
    """Calibrate tau on DEV, then run the gate on DEV and HOLDOUT-H.

    Steps:
      1. load both splits (preferring the *.clean.jsonl files when present),
      2. sweep tau on each split (CSV + figures in SWEEP_DIR),
      3. pick tau* as the swept value whose DEV abstention rate is closest to
         TARGET_ABST (the smallest such tau on ties),
      4. run the protocol with tau* on DEV (OUT_DEV) and on H (OUT_H).
    """
    dev_file = first_existing(DEV_CANDIDATES)
    holdout_file = first_existing(H_CANDIDATES)

    dev_cards = load_jsonl(dev_file)
    h_cards = load_jsonl(holdout_file)

    print(f"[LOAD] DEV: {len(dev_cards)} episodes")
    print(f"[LOAD] H  : {len(h_cards)} episodes")

    # 1) Tau sweep on both splits.
    ensure_dir(SWEEP_DIR)

    print("\n[SWEEP] DEV")
    taus, abst_dev, _cov_dev = sweep_tau(dev_cards, "DEV", SWEEP_DIR)

    print("\n[SWEEP] H")
    sweep_tau(h_cards, "H", SWEEP_DIR)

    # 2) Choose tau* on DEV only: closest abstention rate to the target.
    best_position, _ = min(
        enumerate(abst_dev),
        key=lambda pair: abs(pair[1] - TARGET_ABST),
    )
    tau_star = taus[best_position]
    thr = 1.0 - tau_star

    print(f"\n[CAL] target_abst={TARGET_ABST:.2f} -> τ*={tau_star:.4f} (thr={thr:.4f})")

    # 3) Run the protocol with tau* on each split.
    for banner, split_cards, destination in (
        ("\n== DEV ==", dev_cards, OUT_DEV),
        ("\n== HOLDOUT-H ==", h_cards, OUT_H),
    ):
        print(banner)
        run_protocol(split_cards, tau_star, destination)

if __name__ == "__main__":
    main()

[LOAD] DEV: 157 episodes
[LOAD] H  : 43 episodes

[SWEEP] DEV

[SWEEP] H

[CAL] target_abst=0.20 -> τ*=0.2400 (thr=0.7600)

== DEV ==
[RUN] episodes: 157
[RUN] abstention_rate: 0.1975
- OUT_DIR: /content/drive/MyDrive/protocol_bmo_dev
- eligibility_audit.csv prêt
[RUN] traces: 157
[RUN] prov: 157

== HOLDOUT-H ==
[RUN] episodes: 43
[RUN] abstention_rate: 0.1395
- OUT_DIR: /content/drive/MyDrive/protocol_bmo_H
- eligibility_audit.csv prêt
[RUN] traces: 43
[RUN] prov: 43


# Étape 5 — evaluate_gate_vs_gold

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Étape 5 — Évaluation de la PORTE (gating-only) vs GOLD — BMO

- Joint : eligibility_audit.csv (accept/abstain) × gold_labels.csv
- Mesure :
    * abstention_rate_all  (sur tous les tickets)
    * coverage_on_labeled  (part des tickets *labellisés* que la porte accepte)
    * coverage_per_label   (même chose, par type d'incident)
- Sorties :
    * protocol_bmo_eval_DEV/metrics_gate.json + figures
    * protocol_bmo_eval_H/metrics_gate.json   + figures
"""

import os, csv, json

# ========= PATHS TO CHECK (BMO) =========
# GOLD labels CSV (columns read: episode_id, gold_label).
GOLD    = "/content/drive/MyDrive/gold/gold_labels.csv"
# Gate audit files, one per split.
AUD_DEV = "/content/drive/MyDrive/protocol_bmo_dev/eligibility_audit.csv"
AUD_H   = "/content/drive/MyDrive/protocol_bmo_H/eligibility_audit.csv"

# Output directories for the evaluation metrics and figures.
OUT_DEV = "/content/drive/MyDrive/protocol_bmo_eval_DEV"
OUT_H   = "/content/drive/MyDrive/protocol_bmo_eval_H"
# ===========================================

# Incident labels taken into account (must match the GOLD label names);
# GOLD rows with any other label are ignored by load_gold_map().
I_IDS = {
    "i:release_regression",
    "i:infra_instability",
    "i:security_threat",
    "i:minor_degradation",
    "i:insufficient_info",
    "i:false_positive",
    "i:business_side_effect",
}

def ensure_dir(p: str):
    """Create directory *p* (and missing parents); do nothing if it already exists."""
    os.makedirs(name=p, exist_ok=True)

def load_gold_map(path: str):
    """Read gold_labels.csv into a dict episode_id -> gold_label.

    Rows with an empty episode_id, or whose gold_label is not in I_IDS, are
    ignored. Each kept row is registered under two keys so that lookups work
    with either id spelling: the id as written, and its counterpart with the
    "BUGS-" prefix removed (if present) or added (if absent). The returned
    dict therefore holds up to two keys per GOLD row.
    """
    prefix = "BUGS-"
    gold = {}
    with open(path, newline="", encoding="utf-8") as handle:
        for row in csv.DictReader(handle):
            episode = str(row.get("episode_id", "")).strip()
            label = str(row.get("gold_label", "")).strip()
            if episode and label in I_IDS:
                # BUGS-12345 <-> 12345
                if episode.startswith(prefix):
                    alias = episode[len(prefix):]
                else:
                    alias = f"BUGS-{episode}"
                gold[episode] = label
                gold[alias] = label
    return gold

def eval_gate(audit_csv: str, gold_map: dict, out_dir: str):
    """Join the gate audit with the GOLD labels, then write metrics and figures.

    Reads eligibility_audit.csv (columns episode_id, p_top, accepted), looks
    each episode up in *gold_map* and computes:
      - abstention_rate_all: share of all audited episodes not accepted,
      - coverage_on_labeled: share of GOLD-labelled episodes that are accepted,
      - coverage_per_label: the same ratio per GOLD label (0.0 when a label
        has no episode in this split).
    The metrics are saved to <out_dir>/metrics_gate.json and printed; two
    figures are produced on a best-effort basis.

    Args:
        audit_csv: path of the audit CSV.
        gold_map: dict episode_id -> gold label, with labels drawn from I_IDS.
        out_dir: output directory (created if missing).
    """
    ensure_dir(out_dir)
    rows = []
    with open(audit_csv, newline="", encoding="utf-8") as f:
        for r in csv.DictReader(f):
            # Strip the id: the GOLD ids are stripped when the map is built,
            # so stray whitespace here would silently break the join.
            eid = (r["episode_id"] or "").strip()
            p = float(r["p_top"])
            acc = r["accepted"] in {"1", "True", "true", "TRUE"}
            rows.append((eid, p, acc, gold_map.get(eid)))

    n_all = len(rows)
    n_acc = sum(1 for (_, _, acc, _) in rows if acc)
    labeled = [row for row in rows if row[3] is not None]
    n_lab = len(labeled)
    covered = [row for row in labeled if row[2]]
    n_cov = len(covered)

    # Global abstention rate (over every ticket of the split).
    abst_all = (1.0 - n_acc / n_all) if n_all else 0.0
    # Among GOLD-labelled tickets, how many went through the gate?
    coverage_on_labeled = (n_cov / n_lab) if n_lab else 0.0

    # Per-label coverage. The labels are sorted because iterating the I_IDS
    # set directly yields a different order in every process (str hashing is
    # salted), which made metrics_gate.json differ between identical runs.
    labels = sorted(I_IDS)
    per_label_tot = {lab: 0 for lab in labels}
    per_label_cov = {lab: 0 for lab in labels}
    for (_, _, acc, lab) in labeled:
        per_label_tot[lab] += 1
        if acc:
            per_label_cov[lab] += 1
    coverage_per_label = {
        lab: (per_label_cov[lab] / per_label_tot[lab]) if per_label_tot[lab] > 0 else 0.0
        for lab in labels
    }

    metrics = {
        "episodes_all": n_all,
        "accepted_all": n_acc,
        "abstention_rate_all": round(abst_all, 4),
        "episodes_labeled": n_lab,
        "covered_non_abstain": n_cov,
        "coverage_on_labeled": round(coverage_on_labeled, 4),
        "coverage_per_label": {k: round(v, 4) for k, v in coverage_per_label.items()},
    }

    # Save the metrics as JSON.
    with open(os.path.join(out_dir, "metrics_gate.json"), "w", encoding="utf-8") as fw:
        json.dump(metrics, fw, ensure_ascii=False, indent=2)

    # Simple figures (best effort: any failure only prints a warning).
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        # 1) Coverage bars.
        plt.figure(figsize=(5, 4))
        plt.bar(["labeled", "covered_non_abstain"], [n_lab, n_cov])
        plt.title("Gate coverage")
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, "fig_gate_coverage.png"), dpi=160)
        plt.close()

        # 2) p_top distribution over covered (labelled and accepted) tickets.
        pts = [p for (_, p, _, _) in covered]
        plt.figure(figsize=(8, 4.5))
        if pts:
            plt.hist(pts, bins=30)
        plt.xlabel("p_top (covered & non-abstain)")
        plt.ylabel("count")
        plt.title("p_top on covered tickets")
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, "fig_p_top_on_covered.png"), dpi=160)
        plt.close()
    except Exception as e:
        print("[FIG][WARN]", e)

    print(f"[EVAL] {out_dir} -> {metrics}")

def main():
    """Load the GOLD map once and evaluate the gate on DEV, then on HOLDOUT-H."""
    gold_map = load_gold_map(GOLD)
    # NOTE: len(gold_map) counts dict keys; load_gold_map() stores each GOLD
    # row under two id spellings, so this figure can be up to twice the
    # number of GOLD rows.
    print(f"[INFO] GOLD chargés : {len(gold_map)} épisodes")

    # DEV first, HOLDOUT-H second.
    for audit_path, destination in ((AUD_DEV, OUT_DEV), (AUD_H, OUT_H)):
        eval_gate(audit_path, gold_map, destination)

if __name__ == "__main__":
    main()

[INFO] GOLD chargés : 400 épisodes
[EVAL] /content/drive/MyDrive/protocol_bmo_eval_DEV -> {'episodes_all': 157, 'accepted_all': 126, 'abstention_rate_all': 0.1975, 'episodes_labeled': 157, 'covered_non_abstain': 126, 'coverage_on_labeled': 0.8025, 'coverage_per_label': {'i:insufficient_info': 0.822, 'i:release_regression': 0.0, 'i:security_threat': 0.0, 'i:infra_instability': 0.0, 'i:false_positive': 0.7391, 'i:minor_degradation': 1.0, 'i:business_side_effect': 0.7333}}
[EVAL] /content/drive/MyDrive/protocol_bmo_eval_H -> {'episodes_all': 43, 'accepted_all': 37, 'abstention_rate_all': 0.1395, 'episodes_labeled': 43, 'covered_non_abstain': 37, 'coverage_on_labeled': 0.8605, 'coverage_per_label': {'i:insufficient_info': 0.9032, 'i:release_regression': 0.0, 'i:security_threat': 0.0, 'i:infra_instability': 1.0, 'i:false_positive': 0.8333, 'i:minor_degradation': 0.0, 'i:business_side_effect': 0.75}}
