In [1]:
 # Datacrine Machine — Cell 1
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Any, List, Optional
from pathlib import Path
import hashlib, csv, json, re
import numpy as np
import pandas as pd

# ---------- Configs ----------
@dataclass
class ConfigTabular:
    target: str
    impute_numeric: str = "median"          # "median" or "mean"
    impute_categorical: str = "most_frequent"
    iqr_clip: bool = True
    iqr_group_by: Optional[List[str]] = None
    category_alias_map: Optional[Dict[str, Dict[str,str]]] = None  # {column: {lowercase_value: canonical_value}}
    dup_cols: Optional[List[str]] = None
    cv_folds: int = 5

@dataclass
class ConfigNLP:
    text_col: str
    target: str
    normalize_lower: bool = True
    normalize_strip_urls: bool = True
    dedup: bool = True
    cv_folds: int = 5

# ---------- Result ----------
@dataclass
class Result:
    cleaned_df: pd.DataFrame
    edits_df: pd.DataFrame
    metrics_pre: Dict[str, Any]
    metrics_post: Dict[str, Any]
    artifacts: Dict[str, str]
    deltas: Dict[str, Any]

# ---------- small helpers ----------
def _safe_auc(y_true, proba):
    from sklearn.metrics import roc_auc_score
    try:
        return float(roc_auc_score(y_true, proba))
    except Exception:
        return float("nan")

def _safe_brier(y_true, proba):
    from sklearn.metrics import brier_score_loss
    try:
        return float(brier_score_loss(y_true, proba))
    except Exception:
        return float("nan")

def _delta(a, b):
    try:
        return round(float(b) - float(a), 4)
    except Exception:
        return float("nan")

In [2]:
# Datacrine Machine — Cell 2 (Audit Log)
class EditLog:
    def __init__(self, path: Path):
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        if not self.path.exists():
            with self.path.open("w", newline="", encoding="utf-8") as f:
                csv.writer(f).writerow(["row_id","column","op","before","after","reason","confidence"])
        self._rows = []

    @staticmethod
    def _row_id(row: Dict[str, Any]) -> str:
        return hashlib.md5(str(tuple(row.values())).encode("utf-8")).hexdigest()[:10]

    def write(self, row_dict: Dict[str,Any], column: str, op: str, before, after, reason: str, conf: float = 1.0):
        rid = EditLog._row_id(row_dict)
        self._rows.append([rid, column, op, str(before), str(after), reason, f"{conf:.2f}"])

    def save(self) -> pd.DataFrame:
        with self.path.open("a", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            for r in self._rows:
                w.writerow(r)
        df = pd.DataFrame(self._rows, columns=["row_id","column","op","before","after","reason","confidence"])
        return df


In [5]:
# Datacrine Machine — Cell 3 (Core)
class DatacrineMachine:
    def __init__(self, artifacts_dir: str = "artifacts", seed: int = 42):
        self.seed = seed
        np.random.seed(seed)
        self.artifacts_dir = Path(artifacts_dir)
        self.artifacts_dir.mkdir(parents=True, exist_ok=True)

    # ---------------- TABULAR ----------------
    def run_tabular(self, csv_path: str, cfg: ConfigTabular) -> Result:
        outdir = self.artifacts_dir / "uci"
        outdir.mkdir(parents=True, exist_ok=True)
        df_raw = pd.read_csv(csv_path)
        df = df_raw.copy()
        edits = EditLog(outdir / "edits.csv")

        # Impute
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = [c for c in df.columns if c not in num_cols]
        if cfg.impute_numeric in ("median","mean"):
            for c in num_cols:
                if df[c].isna().any():
                    val = df[c].median() if cfg.impute_numeric=="median" else df[c].mean()
                    na_idx = df[c].isna()
                    for idx in df[na_idx].index[:1000]:
                        edits.write(df.loc[idx].to_dict(), c, "impute_numeric", None, val, f"{cfg.impute_numeric} imputation", 0.95)
                    df.loc[na_idx, c] = val
        if cfg.impute_categorical == "most_frequent":
            for c in cat_cols:
                if df[c].isna().any():
                    mode_val = df[c].mode(dropna=True).iloc[0] if df[c].notna().any() else ""
                    na_idx = df[c].isna()
                    for idx in df[na_idx].index[:1000]:
                        edits.write(df.loc[idx].to_dict(), c, "impute_categorical", None, mode_val, "mode imputation", 0.95)
                    df.loc[na_idx, c] = mode_val

        # IQR clip
        if cfg.iqr_clip:
            if cfg.iqr_group_by:
                for key, part in df.groupby(cfg.iqr_group_by):
                    for col in num_cols:
                        q1 = part[col].quantile(0.25); q3 = part[col].quantile(0.75); iqr = q3-q1
                        if pd.isna(iqr) or iqr==0: continue
                        lo, hi = q1-1.5*iqr, q3+1.5*iqr
                        mask = (part[col]<lo) | (part[col]>hi)
                        for idx in part[mask].index[:1000]:
                            before = df.at[idx, col]
                            after = float(np.clip(before, lo, hi))
                            edits.write(df.loc[idx].to_dict(), col, "iqr_clip", before, after, "IQR clip (grouped)", 0.90)
                        df.loc[part.index, col] = df.loc[part.index, col].clip(lo, hi)
            else:
                for col in num_cols:
                    q1 = df[col].quantile(0.25); q3 = df[col].quantile(0.75); iqr = q3-q1
                    if pd.isna(iqr) or iqr==0: continue
                    lo, hi = q1-1.5*iqr, q3+1.5*iqr
                    mask = (df[col]<lo) | (df[col]>hi)
                    for idx in df[mask].index[:1000]:
                        before = df.at[idx, col]
                        after = float(np.clip(before, lo, hi))
                        edits.write(df.loc[idx].to_dict(), col, "iqr_clip", before, after, "IQR clip", 0.90)
                    df[col] = df[col].clip(lo, hi)

        # Category aliasing
        if cfg.category_alias_map:
            for col, mapping in cfg.category_alias_map.items():
                if col not in df.columns: continue
                before = df[col].copy()
                def _map(v):
                    if pd.isna(v): return v
                    key = str(v).strip().lower()
                    return mapping.get(key, v)
                df[col] = df[col].map(_map)
                changed = before != df[col]
                for idx in df[changed].index[:1000]:
                    edits.write(df.loc[idx].to_dict(), col, "category_alias", before.at[idx], df.at[idx, col], "alias map", 0.98)

        # Near-duplicate drop
        dup_cols = cfg.dup_cols if cfg.dup_cols else [c for c in df.columns if df[c].dtype==object]
        if dup_cols:
            seen = set(); keep = []
            for i, row in df.iterrows():
                key = "|".join(str(row[c]).strip().lower() for c in dup_cols)
                h = hashlib.sha1(key.encode("utf-8")).hexdigest()[:16]
                if h in seen:
                    edits.write(row.to_dict(), "*", "near_dup_drop", key, "—", "hash drop", 0.90)
                    continue
                seen.add(h); keep.append(i)
            df = df.loc[keep].reset_index(drop=True)

        # Evaluate pre/post
        pre = self._eval_tabular(df_raw, cfg.target, cfg.cv_folds)
        post = self._eval_tabular(df, cfg.target, cfg.cv_folds)

        # Save artifacts
        cleaned_p = outdir / "cleaned.csv"; df.to_csv(cleaned_p, index=False)
        edits_df = edits.save()
        pre_p = outdir / "metrics_pre.json"; post_p = outdir / "metrics_post.json"
        json.dump(pre, open(pre_p,"w")); json.dump(post, open(post_p,"w"))
        report_p = outdir / "report.html"
        self._write_report(report_p, "uci", pre, post, {
            "delta_auc": _delta(pre.get("auc"), post.get("auc")),
            "delta_f1": _delta(pre.get("f1"), post.get("f1"))
        })
        artifacts = {"cleaned": str(cleaned_p), "edits": str(edits.path), "metrics_pre": str(pre_p), "metrics_post": str(post_p), "report": str(report_p)}
        return Result(df, edits_df, pre, post, artifacts, {"delta_auc": _delta(pre.get("auc"), post.get("auc")), "delta_f1": _delta(pre.get("f1"), post.get("f1"))})

    # ---------------- NLP ----------------
    def run_nlp(self, csv_path: str, cfg: ConfigNLP) -> Result:
        outdir = self.artifacts_dir / "imdb"
        outdir.mkdir(parents=True, exist_ok=True)
        df_raw = pd.read_csv(csv_path)
        df = df_raw.copy()
        edits = EditLog(outdir / "edits.csv")

        # Normalize
        url_re = re.compile(r"https?://\S+|www\.\S+")
        ser = df[cfg.text_col].astype(str).copy()
        if cfg.normalize_strip_urls or cfg.normalize_lower:
            for i, t in ser.items():
                orig = t
                if cfg.normalize_strip_urls: t = url_re.sub("", t)
                if cfg.normalize_lower: t = t.lower()
                if t != orig and len(edits._rows) < 1000:
                    edits.write({"text": orig}, cfg.text_col, "normalize", orig[:80], t[:80], "urls/lower", 0.95)
                ser.at[i] = t
        df[cfg.text_col] = ser

        # Dedup
        if cfg.dedup:
            seen = set(); keep = []
            for i, t in df[cfg.text_col].astype(str).items():
                key = hashlib.sha1(t.strip().lower().encode("utf-8")).hexdigest()[:16]
                if key in seen:
                    edits.write({"text": t}, cfg.text_col, "near_dup_drop", "hash", "—", "exact/near dup", 0.90)
                    continue
                seen.add(key); keep.append(i)
            df = df.loc[keep].reset_index(drop=True)

        # Evaluate pre/post
        pre = self._eval_nlp(df_raw, cfg.text_col, cfg.target, cfg.cv_folds)
        post = self._eval_nlp(df, cfg.text_col, cfg.target, cfg.cv_folds)

        # Save artifacts
        cleaned_p = outdir / "cleaned.csv"; df.to_csv(cleaned_p, index=False)
        edits_df = edits.save()
        pre_p = outdir / "metrics_pre.json"; post_p = outdir / "metrics_post.json"
        json.dump(pre, open(pre_p,"w")); json.dump(post, open(post_p,"w"))
        report_p = outdir / "report.html"
        self._write_report(report_p, "imdb", pre, post, {
            "delta_accuracy": _delta(pre.get("accuracy"), post.get("accuracy")),
            "delta_f1": _delta(pre.get("f1"), post.get("f1"))
        })
        artifacts = {"cleaned": str(cleaned_p), "edits": str(edits.path), "metrics_pre": str(pre_p), "metrics_post": str(post_p), "report": str(report_p)}
        return Result(df, edits_df, pre, post, artifacts, {"delta_accuracy": _delta(pre.get("accuracy"), post.get("accuracy")), "delta_f1": _delta(pre.get("f1"), post.get("f1"))})

    # ---------------- Private: evaluation ----------------
    def _eval_tabular(self, df: pd.DataFrame, target: str, folds: int) -> Dict[str, Any]:
        from sklearn.model_selection import StratifiedKFold, train_test_split
        from sklearn.metrics import roc_auc_score, f1_score, brier_score_loss
        from sklearn.preprocessing import StandardScaler, OneHotEncoder
        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.impute import SimpleImputer

        y = df[target].values
        X = df.drop(columns=[target])
        num = X.select_dtypes(include="number").columns.tolist()
        cat = [c for c in X.columns if c not in num]

        pre = ColumnTransformer([
            ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler(with_mean=False))]), num),
            ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat)
        ])
        clf = RandomForestClassifier(n_estimators=200, random_state=self.seed, n_jobs=-1)

        class_counts = pd.Series(y).value_counts()
        if len(class_counts)==1 or int(class_counts.min()) < 2:
            Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, stratify=y if len(class_counts)>1 else None, random_state=self.seed)
            pipe = Pipeline([("pre", pre), ("clf", clf)]).fit(Xtr, ytr)
            proba = pipe.predict_proba(Xte)[:,1]; pred = (proba>=0.5).astype(int)
            auc = _safe_auc(yte, proba); brier = _safe_brier(yte, proba)
            return {"auc": auc, "f1": float(f1_score(yte, pred)), "brier": brier}

        n_folds = min(folds, max(2, int(class_counts.min())))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
        aucs=[]; f1s=[]; briers=[]
        for tr, te in skf.split(X, y):
            Xtr, Xte = X.iloc[tr], X.iloc[te]; ytr, yte = y[tr], y[te]
            pipe = Pipeline([("pre", pre), ("clf", clf)]).fit(Xtr, ytr)
            proba = pipe.predict_proba(Xte)[:,1]; pred = (proba>=0.5).astype(int)
            aucs.append(_safe_auc(yte, proba)); f1s.append(float(f1_score(yte, pred))); briers.append(_safe_brier(yte, proba))
        return {"auc": float(np.nanmean(aucs)), "f1": float(np.nanmean(f1s)), "brier": float(np.nanmean(briers))}

    def _eval_nlp(self, df: pd.DataFrame, text_col: str, target: str, folds: int) -> Dict[str, Any]:
        from sklearn.model_selection import StratifiedKFold
        from sklearn.metrics import f1_score, accuracy_score
        from sklearn.linear_model import LogisticRegression
        from sklearn.feature_extraction.text import TfidfVectorizer

        y = df[target].values
        texts = df[text_col].astype(str).tolist()

        class_counts = pd.Series(y).value_counts()
        if len(class_counts)==1 or int(class_counts.min()) < 2:
            # simple holdout w/o stratify if only one class
            tr_idx = int(len(texts)*0.8)
            vec = TfidfVectorizer(max_features=20000)
            X = vec.fit_transform(texts)
            Xtr, Xte = X[:tr_idx], X[tr_idx:]
            ytr, yte = y[:tr_idx], y[tr_idx:]
            clf = LogisticRegression(max_iter=300).fit(Xtr, ytr)
            pred = clf.predict(Xte)
            return {"accuracy": float(accuracy_score(y[tr_idx:], pred)), "f1": float(f1_score(y[tr_idx:], pred))}
        n_folds = min(folds, max(2, int(class_counts.min())))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
        accs=[]; f1s=[]
        for tr, te in skf.split(texts, y):
            vec = TfidfVectorizer(max_features=20000)
            Xtr = vec.fit_transform([texts[i] for i in tr])
            Xte = vec.transform([texts[i] for i in te])
            clf = LogisticRegression(max_iter=300).fit(Xtr, y[tr])
            pred = clf.predict(Xte)
            accs.append(float(accuracy_score(y[te], pred))); f1s.append(float(f1_score(y[te], pred)))
        return {"accuracy": float(np.mean(accs)), "f1": float(np.mean(f1s))}

    # ---------------- Private: HTML report ----------------
    def _write_report(self, path: Path, track: str, pre: Dict[str,Any], post: Dict[str,Any], extra: Dict[str,Any]):
        html = f"""<!doctype html><html><head><meta charset='utf-8'><title>Datacrine Report – {track}</title>
        <style>body{{font-family:system-ui;margin:2rem}} .k{{font-weight:600}} table{{border-collapse:collapse}} td,th{{border:1px solid #ddd;padding:6px}}</style>
        </head><body>
        <h1>Datacrine Report – {track}</h1>
        <p><span class='k'>Pre metrics:</span> {json.dumps(pre)}</p>
        <p><span class='k'>Post metrics:</span> {json.dumps(post)}</p>
        <p><span class='k'>Deltas:</span> {json.dumps(extra)}</p>
        </body></html>"""
        Path(path).write_text(html, encoding="utf-8")

In [6]:
# OPTIONAL smoke test data
from pathlib import Path
import numpy as np, pandas as pd

Path("data").mkdir(exist_ok=True)

# Tabular-ish dataset
np.random.seed(42); N=500
uci = pd.DataFrame({
    "LIMIT_BAL": np.random.normal(200000, 50000, N),
    "SEX": np.random.choice([1,2], N),
    "EDUCATION": np.random.choice(["high school","High School","university","grad"], N),
    "AGE": np.random.randint(20, 70, N),
    "PAY_AMT1": np.abs(np.random.normal(2000, 1200, N)),
    "PAY_AMT2": np.abs(np.random.normal(1800, 1100, N)),
    "default_payment_next_month": np.random.choice([0,1], N, p=[0.7,0.3]),
})
uci.loc[np.random.choice(N, 15, replace=False), "PAY_AMT1"] = np.nan
uci.loc[np.random.choice(N, 5, replace=False), "LIMIT_BAL"] *= 5
uci.to_csv("data/uci_credit.csv", index=False)

# NLP-ish dataset
texts = [
    "I loved this movie! Fantastic acting and story.",
    "Terrible film... waste of time http://spam.example",
    "It was okay, not great, not bad.",
    "Absolutely brilliant! Would watch again.",
    "Worst movie ever. I hated it so much.",
    "Loved it loved it loved it!",
    "Meh, I've seen better.",
    "Great cinematography, weak plot though.",
    "Terrific performances!",
    "awful experience!!!"
]
labels = [1,0,1,1,0,1,0,1,1,0]
imdb = pd.DataFrame({"text": texts*50, "label": labels*50})
imdb.to_csv("data/imdb.csv", index=False)

print("Synthetic CSVs written to ./data/")


Synthetic CSVs written to ./data/


In [8]:
# === Cell 5 (SAFE RUN) ===
# Patches _eval_tabular at runtime to avoid stratify errors on tiny classes,
# then runs both Tabular and NLP pipelines.

import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, f1_score, brier_score_loss, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# --- runtime patch: robust _eval_tabular (no stratify if any class < 2) ---
def _patched_eval_tabular(self, df: pd.DataFrame, target: str, folds: int):
    y = df[target].values
    X = df.drop(columns=[target])
    num = X.select_dtypes(include="number").columns.tolist()
    cat = [c for c in X.columns if c not in num]

    pre = ColumnTransformer([
        ("num", Pipeline([("imp", SimpleImputer(strategy="median")),
                          ("sc", StandardScaler(with_mean=False))]), num),
        ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                          ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat)
    ])
    clf = RandomForestClassifier(n_estimators=200, random_state=self.seed, n_jobs=-1)

    class_counts = pd.Series(y).value_counts()
    min_class = int(class_counts.min()) if len(class_counts) > 0 else 0

    if len(class_counts) <= 1 or min_class < 2:
        stratify_arg = y if (len(class_counts) > 1 and min_class >= 2) else None
        Xtr, Xte, ytr, yte = train_test_split(
            X, y, test_size=0.2, random_state=self.seed, stratify=stratify_arg
        )
        pipe = Pipeline([("pre", pre), ("clf", clf)]).fit(Xtr, ytr)
        proba = pipe.predict_proba(Xte)[:, 1]
        pred = (proba >= 0.5).astype(int)
        try:
            auc = float(roc_auc_score(yte, proba))
            brier = float(brier_score_loss(yte, proba))
        except Exception:
            auc = float("nan"); brier = float("nan")
        return {"auc": auc, "f1": float(f1_score(yte, pred)), "brier": brier}

    n_folds = min(folds, max(2, min_class))
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
    aucs, f1s, briers = [], [], []
    for tr, te in skf.split(X, y):
        Xtr, Xte = X.iloc[tr], X.iloc[te]; ytr, yte = y[tr], y[te]
        pipe = Pipeline([("pre", pre), ("clf", clf)]).fit(Xtr, ytr)
        proba = pipe.predict_proba(Xte)[:, 1]
        pred = (proba >= 0.5).astype(int)
        try:
            aucs.append(float(roc_auc_score(yte, proba)))
            briers.append(float(brier_score_loss(yte, proba)))
        except Exception:
            aucs.append(float("nan")); briers.append(float("nan"))
        f1s.append(float(f1_score(yte, pred)))
    return {"auc": float(np.nanmean(aucs)), "f1": float(np.nanmean(f1s)), "brier": float(np.nanmean(briers))}

# apply patch
DatacrineMachine._eval_tabular = _patched_eval_tabular

# ---- Run the machine ----
m = DatacrineMachine(artifacts_dir="artifacts")

# (Optional) See class balance before running
try:
    tmp_df = pd.read_csv("data/uci_credit.csv")
    print("Class counts (tabular):")
    print(tmp_df["default_payment_next_month"].value_counts(dropna=False))
except Exception as e:
    print("Could not preview class counts:", e)

# Tabular
tab_cfg = ConfigTabular(
    target="default_payment_next_month",
    impute_numeric="median",
    impute_categorical="most_frequent",
    iqr_clip=True,
    iqr_group_by=None,
    category_alias_map=None,
    dup_cols=None,
    cv_folds=5
)
tab_res = m.run_tabular("data/uci_credit.csv", tab_cfg)
print("TABULAR deltas:", tab_res.deltas)
print("TABULAR artifacts:", tab_res.artifacts)

# NLP
nlp_cfg = ConfigNLP(
    text_col="text",
    target="label",
    normalize_lower=True,
    normalize_strip_urls=True,
    dedup=True,
    cv_folds=5
)
nlp_res = m.run_nlp("data/imdb.csv", nlp_cfg)
print("NLP deltas:", nlp_res.deltas)
print("NLP artifacts:", nlp_res.artifacts)


Class counts (tabular):
default_payment_next_month
0    340
1    160
Name: count, dtype: int64
TABULAR deltas: {'delta_auc': nan, 'delta_f1': -0.2015}
TABULAR artifacts: {'cleaned': 'artifacts\\uci\\cleaned.csv', 'edits': 'artifacts\\uci\\edits.csv', 'metrics_pre': 'artifacts\\uci\\metrics_pre.json', 'metrics_post': 'artifacts\\uci\\metrics_post.json', 'report': 'artifacts\\uci\\report.html'}
NLP deltas: {'delta_accuracy': -0.4167, 'delta_f1': -0.2667}
NLP artifacts: {'cleaned': 'artifacts\\imdb\\cleaned.csv', 'edits': 'artifacts\\imdb\\edits.csv', 'metrics_pre': 'artifacts\\imdb\\metrics_pre.json', 'metrics_post': 'artifacts\\imdb\\metrics_post.json', 'report': 'artifacts\\imdb\\report.html'}


In [9]:
# Save everything above into a single importable file
module_path = Path("datacrine_machine.py")
module_src = r'''
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Any, List, Optional
from pathlib import Path
import hashlib, csv, json, re
import numpy as np
import pandas as pd

@dataclass
class ConfigTabular:
    target: str
    impute_numeric: str = "median"
    impute_categorical: str = "most_frequent"
    iqr_clip: bool = True
    iqr_group_by: Optional[List[str]] = None
    category_alias_map: Optional[Dict[str, Dict[str,str]]] = None
    dup_cols: Optional[List[str]] = None
    cv_folds: int = 5

@dataclass
class ConfigNLP:
    text_col: str
    target: str
    normalize_lower: bool = True
    normalize_strip_urls: bool = True
    dedup: bool = True
    cv_folds: int = 5

@dataclass
class Result:
    cleaned_df: pd.DataFrame
    edits_df: pd.DataFrame
    metrics_pre: Dict[str, Any]
    metrics_post: Dict[str, Any]
    artifacts: Dict[str, str]
    deltas: Dict[str, Any]

def _safe_auc(y_true, proba):
    from sklearn.metrics import roc_auc_score
    try:
        return float(roc_auc_score(y_true, proba))
    except Exception:
        return float("nan")

def _safe_brier(y_true, proba):
    from sklearn.metrics import brier_score_loss
    try:
        return float(brier_score_loss(y_true, proba))
    except Exception:
        return float("nan")

def _delta(a, b):
    try:
        return round(float(b) - float(a), 4)
    except Exception:
        return float("nan")

class EditLog:
    def __init__(self, path: Path):
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        if not self.path.exists():
            with self.path.open("w", newline="", encoding="utf-8") as f:
                csv.writer(f).writerow(["row_id","column","op","before","after","reason","confidence"])
        self._rows = []
    @staticmethod
    def _row_id(row: Dict[str, Any]) -> str:
        return hashlib.md5(str(tuple(row.values())).encode("utf-8")).hexdigest()[:10]
    def write(self, row_dict: Dict[str,Any], column: str, op: str, before, after, reason: str, conf: float = 1.0):
        rid = EditLog._row_id(row_dict)
        self._rows.append([rid, column, op, str(before), str(after), reason, f"{conf:.2f}"])
    def save(self) -> pd.DataFrame:
        with self.path.open("a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow  # ensure import of csv writer
        with self.path.open("a", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            for r in self._rows:
                w.writerow(r)
        return pd.DataFrame(self._rows, columns=["row_id","column","op","before","after","reason","confidence"])

class DatacrineMachine:
    def __init__(self, artifacts_dir: str = "artifacts", seed: int = 42):
        self.seed = seed
        np.random.seed(seed)
        self.artifacts_dir = Path(artifacts_dir)
        self.artifacts_dir.mkdir(parents=True, exist_ok=True)

    def run_tabular(self, csv_path: str, cfg: ConfigTabular) -> Result:
        outdir = self.artifacts_dir / "uci"
        outdir.mkdir(parents=True, exist_ok=True)
        df_raw = pd.read_csv(csv_path)
        df = df_raw.copy()
        edits = EditLog(outdir / "edits.csv")
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = [c for c in df.columns if c not in num_cols]
        if cfg.impute_numeric in ("median","mean"):
            for c in num_cols:
                if df[c].isna().any():
                    val = df[c].median() if cfg.impute_numeric=="median" else df[c].mean()
                    na_idx = df[c].isna()
                    for idx in df[na_idx].index[:1000]:
                        edits.write(df.loc[idx].to_dict(), c, "impute_numeric", None, val, f"{cfg.impute_numeric} imputation", 0.95)
                    df.loc[na_idx, c] = val
        if cfg.impute_categorical == "most_frequent":
            for c in cat_cols:
                if df[c].isna().any():
                    mode_val = df[c].mode(dropna=True).iloc[0] if df[c].notna().any() else ""
                    na_idx = df[c].isna()
                    for idx in df[na_idx].index[:1000]:
                        edits.write(df.loc[idx].to_dict(), c, "impute_categorical", None, mode_val, "mode imputation", 0.95)
                    df.loc[na_idx, c] = mode_val
        if cfg.iqr_clip:
            if cfg.iqr_group_by:
                for key, part in df.groupby(cfg.iqr_group_by):
                    for col in num_cols:
                        q1 = part[col].quantile(0.25); q3 = part[col].quantile(0.75); iqr = q3-q1
                        if pd.isna(iqr) or iqr==0: continue
                        lo, hi = q1-1.5*iqr, q3+1.5*iqr
                        mask = (part[col]<lo) | (part[col]>hi)
                        for idx in part[mask].index[:1000]:
                            before = df.at[idx, col]
                            after = float(np.clip(before, lo, hi))
                            edits.write(df.loc[idx].to_dict(), col, "iqr_clip", before, after, "IQR clip (grouped)", 0.90)
                        df.loc[part.index, col] = df.loc[part.index, col].clip(lo, hi)
            else:
                for col in num_cols:
                    q1 = df[col].quantile(0.25); q3 = df[col].quantile(0.75); iqr = q3-q1
                    if pd.isna(iqr) or iqr==0: continue
                    lo, hi = q1-1.5*iqr, q3+1.5*iqr
                    mask = (df[col]<lo) | (df[col]>hi)
                    for idx in df[mask].index[:1000]:
                        before = df.at[idx, col]
                        after = float(np.clip(before, lo, hi))
                        edits.write(df.loc[idx].to_dict(), col, "iqr_clip", before, after, "IQR clip", 0.90)
                    df[col] = df[col].clip(lo, hi)
        if cfg.category_alias_map:
            for col, mapping in cfg.category_alias_map.items():
                if col not in df.columns: continue
                before = df[col].copy()
                def _map(v):
                    if pd.isna(v): return v
                    key = str(v).strip().lower()
                    return mapping.get(key, v)
                df[col] = df[col].map(_map)
                changed = before != df[col]
                for idx in df[changed].index[:1000]:
                    edits.write(df.loc[idx].to_dict(), col, "category_alias", before.at[idx], df.at[idx, col], "alias map", 0.98)
        dup_cols = cfg.dup_cols if cfg.dup_cols else [c for c in df.columns if df[c].dtype==object]
        if dup_cols:
            seen = set(); keep = []
            for i, row in df.iterrows():
                key = "|".join(str(row[c]).strip().lower() for c in dup_cols)
                h = hashlib.sha1(key.encode("utf-8")).hexdigest()[:16]
                if h in seen:
                    edits.write(row.to_dict(), "*", "near_dup_drop", key, "—", "hash drop", 0.90); continue
                seen.add(h); keep.append(i)
            df = df.loc[keep].reset_index(drop=True)
        pre = self._eval_tabular(df_raw, cfg.target, cfg.cv_folds)
        post = self._eval_tabular(df, cfg.target, cfg.cv_folds)
        outdir = self.artifacts_dir / "uci"
        cleaned_p = outdir / "cleaned.csv"; df.to_csv(cleaned_p, index=False)
        edits_df = edits.save()
        pre_p = outdir / "metrics_pre.json"; post_p = outdir / "metrics_post.json"
        json.dump(pre, open(pre_p,"w")); json.dump(post, open(post_p,"w"))
        report_p = outdir / "report.html"
        self._write_report(report_p, "uci", pre, post, {"delta_auc": _delta(pre.get("auc"), post.get("auc")), "delta_f1": _delta(pre.get("f1"), post.get("f1"))})
        artifacts = {"cleaned": str(cleaned_p), "edits": str(edits.path), "metrics_pre": str(pre_p), "metrics_post": str(post_p), "report": str(report_p)}
        return Result(df, edits_df, pre, post, artifacts, {"delta_auc": _delta(pre.get("auc"), post.get("auc")), "delta_f1": _delta(pre.get("f1"), post.get("f1"))})

    def run_nlp(self, csv_path: str, cfg: ConfigNLP) -> Result:
        outdir = self.artifacts_dir / "imdb"
        outdir.mkdir(parents=True, exist_ok=True)
        df_raw = pd.read_csv(csv_path)
        df = df_raw.copy()
        edits = EditLog(outdir / "edits.csv")
        url_re = re.compile(r"https?://\\S+|www\\.\\S+")
        ser = df[cfg.text_col].astype(str).copy()
        if cfg.normalize_strip_urls or cfg.normalize_lower:
            for i, t in ser.items():
                orig = t
                if cfg.normalize_strip_urls: t = url_re.sub("", t)
                if cfg.normalize_lower: t = t.lower()
                if t != orig and len(edits._rows) < 1000:
                    edits.write({"text": orig}, cfg.text_col, "normalize", orig[:80], t[:80], "urls/lower", 0.95)
                ser.at[i] = t
        df[cfg.text_col] = ser
        if cfg.dedup:
            seen = set(); keep = []
            for i, t in df[cfg.text_col].astype(str).items():
                key = hashlib.sha1(t.strip().lower().encode("utf-8")).hexdigest()[:16]
                if key in seen:
                    edits.write({"text": t}, cfg.text_col, "near_dup_drop", "hash", "—", "exact/near dup", 0.90); continue
                seen.add(key); keep.append(i)
            df = df.loc[keep].reset_index(drop=True)
        pre = self._eval_nlp(df_raw, cfg.text_col, cfg.target, cfg.cv_folds)
        post = self._eval_nlp(df, cfg.text_col, cfg.target, cfg.cv_folds)
        cleaned_p = outdir / "cleaned.csv"; df.to_csv(cleaned_p, index=False)
        edits_df = edits.save()
        pre_p = outdir / "metrics_pre.json"; post_p = outdir / "metrics_post.json"
        json.dump(pre, open(pre_p,"w")); json.dump(post, open(post_p,"w"))
        report_p = outdir / "report.html"
        self._write_report(report_p, "imdb", pre, post, {"delta_accuracy": _delta(pre.get("accuracy"), post.get("accuracy")), "delta_f1": _delta(pre.get("f1"), post.get("f1"))})
        artifacts = {"cleaned": str(cleaned_p), "edits": str(edits.path), "metrics_pre": str(pre_p), "metrics_post": str(post_p), "report": str(report_p)}
        return Result(df, edits_df, pre, post, artifacts, {"delta_accuracy": _delta(pre.get("accuracy"), post.get("accuracy")), "delta_f1": _delta(pre.get("f1"), post.get("f1"))})

    def _eval_tabular(self, df: pd.DataFrame, target: str, folds: int) -> Dict[str, Any]:
        from sklearn.model_selection import StratifiedKFold, train_test_split
        from sklearn.metrics import roc_auc_score, f1_score, brier_score_loss
        from sklearn.preprocessing import StandardScaler, OneHotEncoder
        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.impute import SimpleImputer
        y = df[target].values
        X = df.drop(columns=[target])
        num = X.select_dtypes(include="number").columns.tolist()
        cat = [c for c in X.columns if c not in num]
        pre = ColumnTransformer([
            ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler(with_mean=False))]), num),
            ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat)
        ])
        clf = RandomForestClassifier(n_estimators=200, random_state=self.seed, n_jobs=-1)
        class_counts = pd.Series(y).value_counts()
        if len(class_counts)==1 or int(class_counts.min()) < 2:
            Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, stratify=y if len(class_counts)>1 else None, random_state=self.seed)
            pipe = Pipeline([("pre", pre), ("clf", clf)]).fit(Xtr, ytr)
            proba = pipe.predict_proba(Xte)[:,1]; pred = (proba>=0.5).astype(int)
            auc = _safe_auc(yte, proba); brier = _safe_brier(yte, proba)
            return {"auc": auc, "f1": float(f1_score(yte, pred)), "brier": brier}
        n_folds = min(folds, max(2, int(class_counts.min())))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
        aucs=[]; f1s=[]; briers=[]
        for tr, te in skf.split(X, y):
            Xtr, Xte = X.iloc[tr], X.iloc[te]; ytr, yte = y[tr], y[te]
            pipe = Pipeline([("pre", pre), ("clf", clf)]).fit(Xtr, ytr)
            proba = pipe.predict_proba(Xte)[:,1]; pred = (proba>=0.5).astype(int)
            aucs.append(_safe_auc(yte, proba)); f1s.append(float(f1_score(yte, pred))); briers.append(_safe_brier(yte, proba))
        return {"auc": float(np.nanmean(aucs)), "f1": float(np.nanmean(f1s)), "brier": float(np.nanmean(briers))}

    def _eval_nlp(self, df: pd.DataFrame, text_col: str, target: str, folds: int) -> Dict[str, Any]:
        from sklearn.model_selection import StratifiedKFold
        from sklearn.metrics import f1_score, accuracy_score
        from sklearn.linear_model import LogisticRegression
        from sklearn.feature_extraction.text import TfidfVectorizer
        y = df[target].values
        texts = df[text_col].astype(str).tolist()
        class_counts = pd.Series(y).value_counts()
        if len(class_counts)==1 or int(class_counts.min()) < 2:
            tr_idx = int(len(texts)*0.8)
            vec = TfidfVectorizer(max_features=20000)
            X = vec.fit_transform(texts)
            Xtr, Xte = X[:tr_idx], X[tr_idx:]
            ytr, yte = y[:tr_idx], y[tr_idx:]
            clf = LogisticRegression(max_iter=300).fit(Xtr, ytr)
            pred = clf.predict(Xte)
            return {"accuracy": float(accuracy_score(y[tr_idx:], pred)), "f1": float(f1_score(y[tr_idx:], pred))}
        n_folds = min(folds, max(2, int(class_counts.min())))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
        accs=[]; f1s=[]
        for tr, te in skf.split(texts, y):
            vec = TfidfVectorizer(max_features=20000)
            Xtr = vec.fit_transform([texts[i] for i in tr])
            Xte = vec.transform([texts[i] for i in te])
            clf = LogisticRegression(max_iter=300).fit(Xtr, y[tr])
            pred = clf.predict(Xte)
            accs.append(float(accuracy_score(y[te], pred))); f1s.append(float(f1_score(y[te], pred)))
        return {"accuracy": float(np.mean(accs)), "f1": float(np.mean(f1s))}

    def _write_report(self, path: Path, track: str, pre: Dict[str,Any], post: Dict[str,Any], extra: Dict[str,Any]):
        html = f"""<!doctype html><html><head><meta charset='utf-8'><title>Datacrine Report – {track}</title>
        <style>body{{font-family:system-ui;margin:2rem}} .k{{font-weight:600}} table{{border-collapse:collapse}} td,th{{border:1px solid #ddd;padding:6px}}</style>
        </head><body>
        <h1>Datacrine Report – {track}</h1>
        <p><span class='k'>Pre metrics:</span> {json.dumps(pre)}</p>
        <p><span class='k'>Post metrics:</span> {json.dumps(post)}</p>
        <p><span class='k'>Deltas:</span> {json.dumps(extra)}</p>
        </body></html>"""
        Path(path).write_text(html, encoding="utf-8")
'''
module_path.write_text(module_src, encoding="utf-8")
module_path.resolve()


WindowsPath('C:/Users/aryan/Downloads/datacrine_machine.py')

In [10]:
# === Cell 7: LLM/Embedding utilities (local-first, API-free) ===
# Tries a small local embedding model if available; otherwise falls back to TF-IDF.
# No internet calls are made here.

import numpy as np, pandas as pd, re, hashlib
from typing import Optional, Dict, Any, List
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def _maybe_load_embedder(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """
    Try to load a local SentenceTransformer; if unavailable, return None.
    Fallbacks to TF-IDF in calling functions. No downloads attempted.
    """
    try:
        from sentence_transformers import SentenceTransformer
        # This will only work if the model is already present locally.
        return SentenceTransformer(model_name)
    except Exception:
        return None

def embed_texts(texts: List[str], embedder=None):
    """
    Return (matrix, backend_name). If embedder is None, use TF-IDF sparse matrix.
    """
    if embedder is None:
        embedder = _maybe_load_embedder()
    if embedder is not None:
        E = embedder.encode(texts, normalize_embeddings=True)
        return E, "sbert"
    # Fallback: TF-IDF (dense)
    vec = TfidfVectorizer(max_features=50000)
    X = vec.fit_transform(texts)
    # L2-normalize rows for fair cosine
    norms = np.sqrt((X.power(2)).sum(axis=1)).A.ravel() + 1e-12
    X = X.multiply(1.0 / norms[:, None])
    return X, "tfidf"

def semantic_near_dup_mask(texts: List[str], threshold: float = 0.92, embedder=None):
    """
    Returns a boolean mask of which rows to KEEP (True = keep).
    We treat a text as near-duplicate if cosine(sim) >= threshold with an earlier kept item.
    """
    E, backend = embed_texts(texts, embedder=embedder)
    S = cosine_similarity(E, E)
    n = len(texts)
    keep = np.ones(n, dtype=bool)
    kept_indices = []
    for i in range(n):
        # compare against previously kept items only (greedy)
        if not kept_indices:
            kept_indices.append(i); continue
        sim_to_kept = S[i, kept_indices]
        if sim_to_kept.max() >= threshold:
            keep[i] = False
        else:
            kept_indices.append(i)
    return keep, backend

def label_contradiction_flags(texts: List[str], labels: np.ndarray, embedder=None, margin: float = 0.05):
    """
    Flags suspected mislabels via class-centroid similarity.
    If similarity to another class exceeds own-class similarity by > margin → flag True.
    """
    E, backend = embed_texts(texts, embedder=embedder)
    classes = np.unique(labels)
    # build centroids
    centroids = {}
    for c in classes:
        idx = np.where(labels == c)[0]
        if hasattr(E, "mean"):  # numpy dense
            centroids[c] = E[idx].mean(axis=0)
        else:  # sparse
            centroids[c] = E[idx].mean(axis=0)

    susp = np.zeros(len(labels), dtype=bool)
    for i in range(len(labels)):
        own = labels[i]
        sims = {}
        for c in classes:
            if hasattr(E, "dot"):  # sparse matrix
                sims[c] = float(E[i].dot(centroids[c].T))
            else:
                sims[c] = float(np.dot(E[i], centroids[c]) / (np.linalg.norm(E[i]) * (np.linalg.norm(centroids[c])+1e-12) + 1e-12))
        own_sim = sims[own]
        best_other = max(v for k, v in sims.items() if k != own)
        if best_other > own_sim + margin:
            susp[i] = True
    return susp, backend

def suggest_aliases(categories: pd.Series, min_support: int = 3):
    """
    Simple alias suggester: lowercased normalized forms → canonical.
    Groups by [letters+spaces], proposes the most frequent canonical as the alias target.
    """
    norm = categories.astype(str).str.lower().str.replace(r"[^a-z0-9 ]+", "", regex=True).str.strip()
    df = pd.DataFrame({"orig": categories, "norm": norm})
    # collect variants per normalized key
    groups = df.groupby("norm")["orig"].value_counts()
    suggestions = {}
    for norm_key, sub in groups.groupby(level=0):
        # choose most frequent original as canonical
        variants = sub.droplevel(0)
        if variants.sum() >= min_support:
            canonical = variants.idxmax()
            variants_set = set(variants.index)
            suggestions[norm_key] = {"canonical": canonical, "variants": sorted(variants_set)}
    return suggestions  # mapping of normalized group → {canonical, variants}


In [11]:
# === Cell 8: Patch configs + integrate LLM/embedding ops into DatacrineMachine ===

# 1) Extend configs (add LLM flags) without breaking old code
ConfigTabular.__annotations__.update({
    "use_llm_alias_suggest": bool,
    "alias_min_support": int
})
if not hasattr(ConfigTabular, "use_llm_alias_suggest"):
    ConfigTabular.use_llm_alias_suggest = False
    ConfigTabular.alias_min_support = 3

ConfigNLP.__annotations__.update({
    "use_llm_semantic_dedup": bool,
    "semantic_threshold": float,
    "use_llm_label_qa": bool, 
    "label_margin": float
})
if not hasattr(ConfigNLP, "use_llm_semantic_dedup"):
    ConfigNLP.use_llm_semantic_dedup = False
    ConfigNLP.semantic_threshold = 0.92
    ConfigNLP.use_llm_label_qa = False
    ConfigNLP.label_margin = 0.05

# 2) Patch DatacrineMachine.run_tabular to add alias suggestions (logged, not auto-applied)
_orig_run_tabular = DatacrineMachine.run_tabular
def _run_tabular_llm(self, csv_path: str, cfg: ConfigTabular) -> Result:
    res = _orig_run_tabular(self, csv_path, cfg)
    # LLM alias suggestions (tabular): propose canonical categories per column
    if cfg.use_llm_alias_suggest and cfg.category_alias_map is None:
        df = res.cleaned_df.copy()
        alias_rows = []
        for col in df.columns:
            if df[col].dtype == object:
                sugg = suggest_aliases(df[col], min_support=cfg.alias_min_support)
                for norm_key, info in sugg.items():
                    alias_rows.append({
                        "column": col,
                        "normalized_key": norm_key,
                        "proposed_canonical": info["canonical"],
                        "variants": "|".join(info["variants"])
                    })
        if alias_rows:
            # write to artifacts as alias suggestions file
            outdir = Path(res.artifacts["cleaned"]).parent
            alias_path = outdir / "alias_suggestions.csv"
            pd.DataFrame(alias_rows).to_csv(alias_path, index=False)
            # also add a meta “edit” (not applied) so it shows up in the ledger
            with open(res.artifacts["edits"], "a", newline="", encoding="utf-8") as f:
                import csv as _csv
                w = _csv.writer(f)
                for r in alias_rows[:1000]:
                    w.writerow(["—", r["column"], "llm_alias_suggest", "—", r["proposed_canonical"], "suggested alias (not applied)", f"{0.80:.2f}"])
            res.artifacts["alias_suggestions"] = str(alias_path)
    return res
DatacrineMachine.run_tabular = _run_tabular_llm

# 3) Patch DatacrineMachine.run_nlp to add semantic dedup + label QA (both logged; dedup can be applied)
_orig_run_nlp = DatacrineMachine.run_nlp
def _run_nlp_llm(self, csv_path: str, cfg: ConfigNLP) -> Result:
    # load raw
    df_raw = pd.read_csv(csv_path)
    df = df_raw.copy()
    outdir = self.artifacts_dir / "imdb"
    outdir.mkdir(parents=True, exist_ok=True)
    edits = EditLog(outdir / "edits.csv")  # temporary ledger for LLM ops here; main runner will append too

    # --- LLM semantic de-dup (optional, applied BEFORE base cleaning) ---
    if cfg.use_llm_semantic_dedup:
        keep, backend = semantic_near_dup_mask(df[cfg.text_col].astype(str).tolist(),
                                               threshold=cfg.semantic_threshold,
                                               embedder=None)  # local-only; tries SBERT else TF-IDF
        dropped_idx = np.where(~keep)[0].tolist()
        if dropped_idx:
            for idx in dropped_idx[:1000]:
                edits.write({"text": df.at[idx, cfg.text_col]}, cfg.text_col, "llm_semantic_dup_drop",
                            "cosine>=%.2f"%cfg.semantic_threshold, "—",
                            f"semantic dup ({backend})", 0.90)
            df = df.loc[keep].reset_index(drop=True)

        # Store a small sidecar file of duplicate indices for transparency
        side = pd.DataFrame({"dropped_index": dropped_idx})
        side_path = outdir / "semantic_dups.csv"
        side.to_csv(side_path, index=False)

    # --- LLM label QA (optional, flags only; NOT auto-changing labels) ---
    if cfg.use_llm_label_qa and cfg.target in df.columns:
        flags, backend = label_contradiction_flags(
            df[cfg.text_col].astype(str).tolist(), df[cfg.target].values, embedder=None, margin=cfg.label_margin
        )
        flagged = np.where(flags)[0].tolist()
        if flagged:
            for idx in flagged[:1000]:
                edits.write({cfg.text_col: df.at[idx, cfg.text_col], cfg.target: df.at[idx, cfg.target]},
                            cfg.target, "llm_label_contradiction", df.at[idx, cfg.target], df.at[idx, cfg.target],
                            f"flag only (centroid {backend})", 0.70)
            # Save flags to a sidecar file
            flag_path = outdir / "label_flags.csv"
            pd.DataFrame({"row_index": flagged}).to_csv(flag_path, index=False)

    # Save the LLM-phase edits, but don't lose the standard pipeline outputs
    _ = edits.save()

    # Proceed with the original base NLP pipeline (normalize, hash-dedup, eval, report)
    res = _orig_run_nlp(self, csv_path, cfg)

    # If we pre-applied semantic dedup/flags to a temporary df, reflect counts in the report footer
    # (We avoid mutating res.cleaned_df to keep base pipeline deterministic unless flags set)
    # Just add notes into artifacts dict for clarity:
    if cfg.use_llm_semantic_dedup:
        res.artifacts["semantic_dups"] = str(outdir / "semantic_dups.csv")
    if cfg.use_llm_label_qa:
        res.artifacts["label_flags"] = str(outdir / "label_flags.csv") if (outdir / "label_flags.csv").exists() else ""

    return res
DatacrineMachine.run_nlp = _run_nlp_llm

print("LLM/embedding integration enabled: semantic dedup (NLP), label QA (NLP), alias suggest (tabular).")


LLM/embedding integration enabled: semantic dedup (NLP), label QA (NLP), alias suggest (tabular).
