# In [None]:  (notebook cell prompt, kept as a comment so the file parses as Python)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Étape 2 — Silver labels (BugAda-BMO)
Entrée  : episodes_raw.jsonl (Étape 1, BMO)
Sorties :
  - episodes_with_silver.jsonl
  - silver_labels.csv
  - silver_stats.json
  - fig_silver_dist.png
  - fig_silver_coverage.png
  - fig_labeled_ratio_by_component_top10.png
  - fig_tickets_per_year.png
"""

import os, json, csv
from collections import Counter

# ====== PATHS (BMO only for now) ======
# Input: raw episodes (one JSON object per line).
IN_JSONL = "/content/drive/MyDrive/bugada_cards_clean/episodes_raw.jsonl"
# Directory under which every output of this step is written.
OUT_DIR = "/content/drive/MyDrive/cleansilver_bmo"

# Output files, all located directly in OUT_DIR.
OUT_JSONL = os.path.join(OUT_DIR, "episodes_with_silver.jsonl")
OUT_CSV = os.path.join(OUT_DIR, "silver_labels.csv")
OUT_STATS = os.path.join(OUT_DIR, "silver_stats.json")
# ======================================

def ensure_dir(p):
    """Create directory *p* together with any missing parents.

    Nothing happens (and no error is raised) when the directory already
    exists.
    """
    os.makedirs(p, exist_ok=True)

def silver_label_from_card(card) -> str | None:
    """
    Règles déterministes pour attribuer un label i:...
    (OK d'utiliser status + resolution ici : c'est pour les labels, pas pour les features du modèle)
    """
    r   = (card.get("resolution") or "").lower().strip()
    st  = (card.get("status_current") or "").lower().strip()
    txt = " ".join([
        str(card.get("summary_text") or ""),
        str(card.get("desc_blob") or ""),
        st, r,
        " ".join(card.get("keywords") or []),
    ]).lower()
    sev = (card.get("severity") or "unknown").lower()

    # 1) Faux positifs
    if r in {"invalid","worksforme","works as intended","notabug","duplicate","moved"}:
        return "i:false_positive"

    # 2) Effet de bord business
    if r in {"wontfix","by design","policy decision"}:
        return "i:business_side_effect"

    # 3) Info insuffisante
    if (st in {"needinfo","unconfirmed"} or
        any(k in txt for k in ["cannot reproduce","missing steps","need more info",
                               "incomplete","no steps"])):
        return "i:insufficient_info"

    # 4) Sécurité
    if any(k in txt for k in ["xss","csrf","vulnerability","sql injection"]):
        return "i:security_threat"

    # 5) Régression de release
    if any(k in txt for k in ["after update","after upgrade","after release",
                              "after deploy","introduced in","since version",
                              "since build","since release","regression"]):
        return "i:release_regression"

    # 6) Instabilité infra
    if any(k in txt for k in ["timeout","latency","slowdown","slow","5xx",
                              "server error","network","intermittent",
                              "connection reset","502","503","bad gateway"]):
        return "i:infra_instability"

    # 7) Petites dégradations
    if sev in {"low","trivial"}:
        return "i:minor_degradation"

    return None  # pas de label

def _year_from_ts(ts: str) -> str | None:
    if not ts:
        return None
    return ts[:4] if len(ts) >= 4 and ts[:4].isdigit() else None

# --- Figures (matplotlib simple, sans style forcé) ---
def _plt():
    """Select the "Agg" matplotlib backend and return ``matplotlib.pyplot``.

    matplotlib is imported inside the function, so it is only loaded once a
    figure is actually requested.
    """
    import matplotlib as mpl
    mpl.use("Agg")
    from matplotlib import pyplot
    return pyplot

def _plot_bar(counter: Counter, title: str, xlabel: str, out_png: str, rotation=45):
    """Draw *counter* as a bar chart and save it to *out_png*.

    One bar is drawn per entry, in the iteration order of *counter*: keys
    become the x tick labels and values the bar heights.

    Args:
        counter: mapping of label -> count.
        title: figure title.
        xlabel: x-axis label (the y axis is always labeled "count").
        out_png: destination path of the image.
        rotation: rotation of the x tick labels, in degrees.
    """
    plt = _plt()
    tick_labels = list(counter.keys())
    heights = list(counter.values())
    positions = range(len(tick_labels))

    plt.figure(figsize=(10, 5))
    plt.bar(positions, heights)
    plt.xticks(positions, tick_labels, rotation=rotation, ha="right")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(out_png, dpi=160)
    plt.close()

def _plot_simple(values_dict: dict, title: str, xlabel: str, out_png: str, rotation=0):
    """Bar-chart a plain dict by converting it to a Counter for _plot_bar().

    Same arguments as _plot_bar(), except that tick labels are not rotated
    by default.
    """
    as_counter = Counter(values_dict)
    _plot_bar(as_counter, title, xlabel, out_png, rotation=rotation)

# ----------------- MAIN -----------------
def main():
    """
    Run step 2: attach a silver label to every episode and export the results.

    IN_JSONL is read line by line (blank lines are skipped) and every card is
    labeled with silver_label_from_card(). The following is written under
    OUT_DIR:
      - OUT_JSONL: every card, with an added "_silver_label" field
        (null when no rule matched);
      - OUT_CSV: one (episode_id, silver_label) row per labeled card;
      - OUT_STATS: counts, coverage, label/severity distributions, top-10
        components and per-year counts;
      - four PNG figures: label distribution, coverage, labeled ratio for the
        top-10 components, tickets per year.

    A short summary is also printed to stdout.
    """
    ensure_dir(OUT_DIR)

    n = 0                     # cards read
    n_labeled = 0             # cards that received a label
    dist = Counter()          # label -> number of cards
    sev_c = Counter()         # lower-cased severity -> number of cards
    comp_total = Counter()    # component -> number of cards
    comp_labeled = Counter()  # component -> number of labeled cards
    year_c = Counter()        # creation year -> number of cards

    with open(IN_JSONL, "r", encoding="utf-8") as fr, \
         open(OUT_JSONL, "w", encoding="utf-8") as fw, \
         open(OUT_CSV, "w", newline="", encoding="utf-8") as fc:

        cw = csv.writer(fc)
        cw.writerow(["episode_id", "silver_label"])

        for line in fr:
            line = line.strip()
            if not line:
                continue
            card = json.loads(line)
            n += 1

            # Auxiliary statistics
            sev_c[(card.get("severity") or "unknown").lower()] += 1
            comp = card.get("component") or "General"
            comp_total[comp] += 1
            y = _year_from_ts(card.get("created_at", ""))
            if y:
                year_c[y] += 1

            # Silver label
            label = silver_label_from_card(card)
            if label:
                n_labeled += 1
                dist[label] += 1
                comp_labeled[comp] += 1
                cw.writerow([card.get("key") or card.get("ticket_id"), label])

            card["_silver_label"] = label  # may be None
            fw.write(json.dumps(card, ensure_ascii=False) + "\n")

    coverage = (n_labeled / n) if n else 0.0
    print(f"[SILVER] épisodes: {n}")
    print(f"[SILVER] couverts: {n_labeled}  ({coverage:.2%})")
    if dist:
        print("[SILVER] distribution:")
        # Most frequent label first, ties broken alphabetically.
        for k, v in sorted(dist.items(), key=lambda kv: (-kv[1], kv[0])):
            print(f"  - {k}: {v}")
    print(f"- JSONL (avec labels) : {OUT_JSONL}")
    print(f"- CSV labels          : {OUT_CSV}")

    # Years in ascending order, shared by the JSON stats and the figure.
    years_sorted = dict(sorted(year_c.items()))

    # ----- Stats JSON -----
    stats = {
        "episodes": n,
        "labeled": n_labeled,
        "coverage": round(coverage, 4),
        "label_distribution": dict(dist),
        "severity_distribution": dict(sev_c),
        "component_total_top10": dict(comp_total.most_common(10)),
        "component_labeled_top10": dict(comp_labeled.most_common(10)),
        "year_counts": years_sorted,
    }
    with open(OUT_STATS, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)

    # ----- Figures -----
    # 1) Label distribution
    _plot_bar(dist, "Silver label distribution", "label",
              os.path.join(OUT_DIR, "fig_silver_dist.png"))

    # 2) Coverage (labeled vs. unlabeled)
    cov_dict = {"labeled": n_labeled, "unlabeled": max(0, n - n_labeled)}
    _plot_simple(cov_dict, "Silver coverage", "class",
                 os.path.join(OUT_DIR, "fig_silver_coverage.png"), rotation=0)

    # 3) Labeled ratio per component (top-10 components by volume).
    #    Kept best-effort: a failure here is reported and does not abort the run.
    try:
        # Go through _plt() like the other figures so the backend selection
        # does not depend on an earlier plotting call.
        plt = _plt()
        top10 = [c for c, _ in comp_total.most_common(10)]
        ratios = []
        for c in top10:
            tot = comp_total[c]
            lab = comp_labeled.get(c, 0)
            ratios.append((lab / tot) if tot else 0.0)
        plt.figure(figsize=(10, 5))
        plt.bar(range(len(top10)), ratios)
        plt.xticks(range(len(top10)), top10, rotation=45, ha="right")
        plt.title("Labeled ratio by component (top-10 by volume)")
        plt.xlabel("component")
        plt.ylabel("labeled_ratio")
        plt.tight_layout()
        plt.savefig(os.path.join(OUT_DIR,
                                 "fig_labeled_ratio_by_component_top10.png"),
                    dpi=160)
        plt.close()
    except Exception as e:
        print("[FIG][WARN] component ratio:", e)

    # 4) Tickets per year (temporal quality).
    #    Bars follow the mapping's order, so the sorted years are passed to
    #    keep the x axis chronological whatever the order of the input file.
    _plot_bar(Counter(years_sorted), "Tickets per year", "year",
              os.path.join(OUT_DIR, "fig_tickets_per_year.png"), rotation=0)

# Script entry point: run the pipeline only when the file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()