Dans models/ :

features_tfidf_runtime_full_YYYYMMDD_HHMMSS.joblib
contient {"vectorizer_word", "vectorizer_char", "created", "params"}.

sentiment_bundle_runtime_YYYYMMDD_HHMMSS.joblib
contient {"model","selector_chi2","vectorizer_word","vectorizer_char","threshold","meta"}.

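sentiment_metrics_YYYYMMDD_HHMMSS.json
contient AP/ROC/F1, le seuil t*, les tailles des splits, etc. (petit JSON). Si aucun fichier de métriques n'est trouvé, un stub minimal sentiment_metrics_stub_YYYYMMDD_HHMMSS.json est généré à partir du bundle (seuil, n_rows, split).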

(optionnel) registry.jsonl — un journal au format JSON Lines : une ligne (tâche, chemin, hash SHA-256, date de création, version de Python) par artefact enregistré.

In [9]:
# === C5.3.1 — util d'enregistrement inline ===
from pathlib import Path
import json, hashlib, platform, datetime, os

def _sha256(p: Path, buf=1024*1024):
    """Return the hexadecimal SHA-256 digest of the file at ``p``.

    The file is read in binary mode, ``buf`` bytes at a time, so large
    artifacts are hashed without being loaded fully into memory.
    """
    digest = hashlib.sha256()
    with p.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: stream.read(buf), b""):
            digest.update(chunk)
    return digest.hexdigest()

def register_artifact(task: str, filepath: Path, extra: dict | None = None):
    filepath = Path(filepath); assert filepath.exists(), filepath
    rec = {
        "task": task,
        "artifact": filepath.as_posix(),
        "hash": _sha256(filepath),
        "created": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "python": platform.python_version(),
        "git": os.environ.get("GIT_COMMIT", "NA"),
        **(extra or {})
    }
    reg = Path("models/registry.jsonl"); reg.parent.mkdir(parents=True, exist_ok=True)
    with reg.open("a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print("Registered:", rec)

In [10]:
# === C5.3.1 — post-enregistrement des artefacts déjà présents dans models/ ===
from pathlib import Path
import json, time
from joblib import load

MODELS = Path("models")

# File names embed a YYYYMMDD_HHMMSS stamp, so the lexicographically last
# match of each glob is also the most recent artifact.

# 1) TF-IDF (if available)
tfidf = sorted(MODELS.glob("features_tfidf_runtime_full_*.joblib"))
if tfidf:
    register_artifact("sentiment/tfidf", tfidf[-1], {"kind":"tfidf"})
else:
    print("⚠️ Aucun TF-IDF runtime trouvé (features_tfidf_runtime_full_*.joblib).")

# 2) Sentiment bundle (required)
bundles = sorted(MODELS.glob("sentiment_bundle_runtime_*.joblib"))
assert bundles, "Aucun bundle sentiment trouvé dans models/."
bundle_path = bundles[-1]
# NOTE(review): k_out is hard-coded here, not read from the bundle — confirm it
# matches the feature selector actually stored in the artifact.
register_artifact("sentiment/bundle", bundle_path, {"kind":"bundle","k_out": 50000})

# 3) Metrics JSON: take the most recent real metrics file; otherwise build a
#    minimal stub from the bundle's metadata.
#    Stub files also match "sentiment_metrics_*.json" and sort after every
#    timestamped name ("s" > any digit), so they must be filtered out here or a
#    previously generated stub would be registered as real metrics.
METRICS_STUB_PREFIX = "sentiment_metrics_stub_"
metrics = sorted(
    p for p in MODELS.glob("sentiment_metrics_*.json")
    if not p.name.startswith(METRICS_STUB_PREFIX)
)
if metrics:
    register_artifact("sentiment/metrics", metrics[-1], {"kind":"metrics"})
else:
    # joblib.load unpickles arbitrary objects: only use it on trusted,
    # locally produced artifacts.
    b = load(bundle_path)
    meta = b.get("meta") or {}  # tolerate a missing or None "meta" entry
    stub = {
        "threshold": float(b.get("threshold", 0.5)),
        "n_rows": meta.get("n_rows"),
        "split": meta.get("split"),
        "note": "metrics minimal auto (AP/F1 non disponibles car non trouvés)."
    }
    stamp = time.strftime("%Y%m%d_%H%M%S")
    mjson = MODELS / f"{METRICS_STUB_PREFIX}{stamp}.json"
    mjson.write_text(json.dumps(stub, indent=2), encoding="utf-8")
    register_artifact("sentiment/metrics", mjson, {"kind":"metrics_stub"})

Registered: {'task': 'sentiment/tfidf', 'artifact': 'models/features_tfidf_runtime_full_20250922_045623.joblib', 'hash': '15484e9de2da0f0a48d8d6d1996b362d5b94499deb2dc6ef56155dfe67b4bd90', 'created': '2025-09-22 15:04:29', 'python': '3.11.5', 'git': 'NA', 'kind': 'tfidf'}
Registered: {'task': 'sentiment/bundle', 'artifact': 'models/sentiment_bundle_runtime_20250922_045812.joblib', 'hash': 'dae940e788d6defcd5a4b8c79b96754b02151ec67a3ba558653e05d497de43b6', 'created': '2025-09-22 15:04:29', 'python': '3.11.5', 'git': 'NA', 'kind': 'bundle', 'k_out': 50000}
Registered: {'task': 'sentiment/metrics', 'artifact': 'models/sentiment_metrics_stub_20250922_150431.json', 'hash': '646c7be45496fb5a241fd75b4bf03e5302b89343f5f0dbdeff526f0a96505f6e', 'created': '2025-09-22 15:04:31', 'python': '3.11.5', 'git': 'NA', 'kind': 'metrics_stub'}


In [12]:
# === C5.3.1 — contrôle du registre ===
from pathlib import Path
import json
import pandas as pd

# Load every record of the JSON-lines registry and show the newest first.
reg = Path("models/registry.jsonl")
assert reg.exists(), "Aucun registre trouvé (models/registry.jsonl)."
# Iterate over the file object instead of str.splitlines(): splitlines() also
# breaks on U+2028/U+2029/U+0085, which json.dumps(ensure_ascii=False) writes
# unescaped inside strings, so a record containing one would be cut in half.
# Blank lines (e.g. a stray trailing newline) are skipped rather than parsed.
with reg.open(encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]
df = pd.DataFrame(rows).sort_values("created", ascending=False)
display(df[["task","artifact","hash","created","python"]])

Unnamed: 0,task,artifact,hash,created,python
2,sentiment/metrics,models/sentiment_metrics_stub_20250922_150431....,646c7be45496fb5a241fd75b4bf03e5302b89343f5f0db...,2025-09-22 15:04:31,3.11.5
0,sentiment/tfidf,models/features_tfidf_runtime_full_20250922_04...,15484e9de2da0f0a48d8d6d1996b362d5b94499deb2dc6...,2025-09-22 15:04:29,3.11.5
1,sentiment/bundle,models/sentiment_bundle_runtime_20250922_04581...,dae940e788d6defcd5a4b8c79b96754b02151ec67a3ba5...,2025-09-22 15:04:29,3.11.5
