# Reformed-Audit Demo (Split Cells, ROW_CAP=10k)
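This notebook splits the audit into one cell per stage so that it can be debugged cell by cell. Run the cells in order from a fresh kernel: each stage relies on variables defined by the stages before it.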

### 1) Imports & Config

In [11]:

import os, re, json, math, hashlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import matplotlib.pyplot as plt

# Input CSV, resolved relative to the current working directory.
# (The previous conditional had identical branches, so it is a plain constant.)
CSV_PATH = "train.csv"
# Write artifacts to ./data when that directory already exists, otherwise to the
# working directory. isdir (not exists) so a regular *file* named "data" is not
# mistaken for the output directory, which would make makedirs fail.
ART_DIR = "data" if os.path.isdir("data") else "."
# Maximum number of rows kept after loading; larger inputs are down-sampled.
ROW_CAP = 10000
os.makedirs(ART_DIR, exist_ok=True)
print("CSV_PATH:", CSV_PATH)
print("ART_DIR:", ART_DIR)


CSV_PATH: train.csv
ART_DIR: .


### 2) Load Data & Composite Score

In [12]:

# Load the CSV, cap its size, locate the text/prompt/score columns and reduce the
# frame to the canonical columns "prompt" (optional), "text" and "score".
assert os.path.exists(CSV_PATH), f"Missing file: {CSV_PATH}"
df = pd.read_csv(CSV_PATH)
if len(df) > ROW_CAP:
    # Fixed random_state so repeated runs keep the same subset of rows.
    df = df.sample(ROW_CAP, random_state=42).reset_index(drop=True)
    print(f"Capped rows to {ROW_CAP}.")

prompt_col = "prompt" if "prompt" in df.columns else None
text_col   = "response" if "response" in df.columns else None

subscores_order = ["helpfulness","correctness","coherence","complexity","verbosity"]
subs_present = [c for c in subscores_order if c in df.columns]
# An existing score-like column (case-insensitive name match) takes precedence
# over the composite built from sub-scores.
score_like = [c for c in df.columns if c.lower() in {"score","rating","label","quality"}]

if not score_like and subs_present:
    weights = {"helpfulness":0.35,"correctness":0.30,"coherence":0.25,"complexity":0.05,"verbosity":0.05}
    # Re-normalise so the weights of the sub-scores actually present sum to 1.
    w = np.array([weights.get(c, 0.05) for c in subs_present], dtype=float)
    w = w / w.sum()
    # skipna=False: a row missing any sub-score gets a NaN composite and is removed
    # by the dropna() below. The default (skipna=True) would silently treat the
    # missing sub-score as 0 and keep the row with a deflated score.
    df["score"] = (df[subs_present] * w).sum(axis=1, skipna=False)
    score_col = "score"
else:
    score_col = score_like[0] if score_like else None

assert text_col is not None and score_col is not None, f"Need text & score. Found text={text_col}, score={score_col}"
keep = [c for c in [prompt_col, text_col, score_col] if c]
# prompt_col, when present, is already named "prompt", so only text and score
# need renaming.
df = df[keep].dropna().rename(columns={text_col: "text", score_col: "score"})
# Report the columns that were actually detected rather than a hard-coded name.
print("Detected:", {"text": text_col, "prompt": prompt_col, "score": score_col})
print("Rows:", len(df))
df.head(3)


AssertionError: Missing file: train.csv

### 3) Labels & Baseline Model (TF-IDF + LR)

In [10]:

# Build a binary task from the score extremes (top vs bottom quartile) and fit a
# TF-IDF + logistic-regression baseline on the response text.
high_thr = df["score"].quantile(0.75)
low_thr  = df["score"].quantile(0.25)
df_bin = df[(df["score"] >= high_thr) | (df["score"] <= low_thr)].copy()
# Label by the same threshold that selected the "high" rows. Cutting at the median
# of df_bin is only equivalent when both extremes are equally populated; with tied
# scores the median can fall on low_thr and mark bottom-quartile rows as "high".
df_bin["y"] = (df_bin["score"] >= high_thr).astype(int)
assert df_bin["y"].nunique() == 2, "Need both high and low examples to train a classifier."

tfidf = TfidfVectorizer(lowercase=True, stop_words="english", min_df=5, max_df=0.9, max_features=5000)
X = tfidf.fit_transform(df_bin["text"].astype(str))
y = df_bin["y"].values

# The raw text is split alongside X/y so later cells can perturb the exact test texts.
X_tr, X_te, y_tr, y_te, raw_tr, raw_te = train_test_split(
    X, y, df_bin["text"].astype(str), test_size=0.25, random_state=42, stratify=y
)

clf = LogisticRegression(max_iter=200, solver="liblinear")
clf.fit(X_tr, y_tr)

# Probability of the "high" class, thresholded at 0.5 for hard predictions.
proba = clf.predict_proba(X_te)[:,1]
pred  = (proba >= 0.5).astype(int)

baseline_metrics = {
    "accuracy": float(accuracy_score(y_te, pred)),
    "f1": float(f1_score(y_te, pred)),
    "roc_auc": float(roc_auc_score(y_te, proba)),
    "n_test": int(len(y_te)),
    "n_rows_used": int(len(df_bin))
}
baseline_metrics


NameError: name 'df' is not defined

### 4) Top Words (High vs Low)

In [None]:

# Rank the vocabulary by logistic-regression coefficient: the 25 largest are the
# strongest "HIGH" indicators, the 25 smallest the strongest "LOW" indicators.
coefs = clf.coef_.ravel()
vocab = np.array(tfidf.get_feature_names_out())
coef_order = np.argsort(coefs)  # ascending by coefficient
top_neg_idx = coef_order[:25]
top_pos_idx = coef_order[::-1][:25]  # descending, strongest first
top_pos_words = [(word, float(c)) for word, c in zip(vocab[top_pos_idx], coefs[top_pos_idx])]
top_neg_words = [(word, float(c)) for word, c in zip(vocab[top_neg_idx], coefs[top_neg_idx])]
print("Top HIGH words:", top_pos_words[:10])
print("Top LOW words :", top_neg_words[:10])


### 5) Gaming Demo (append high-weight words + boilerplate)

In [None]:

# Gaming vocabulary: the 12 strongest HIGH-associated words, or a fixed fallback
# list when no ranked words are available.
HIGH_WORDS = [word for word, _coef in top_pos_words[:12]]
if not HIGH_WORDS:
    HIGH_WORDS = ["fair","transparent","privacy","safety","evidence"]

def game_texts(texts, high_words=HIGH_WORDS, min_tokens_add=30):
    """Return a copy of each text with keyword filler and boilerplate appended.

    ``high_words`` is repeated as a whole list (at least once) until, for a
    non-empty list, at least ``min_tokens_add`` words have been produced. Each
    input is stripped of surrounding whitespace first; falsy inputs such as
    ``None`` are treated as the empty string.
    """
    repeats = max(1, int(np.ceil(min_tokens_add / max(1, len(high_words)))))
    filler = " ".join(high_words * repeats)
    suffix = (" In practice, we will carefully evaluate fairness, privacy, and transparency, "
              "justify trade-offs, provide evidence, document limitations, and ensure accountability.")
    # The appended tail is the same for every text, so build it once.
    tail = " " + filler + "." + suffix
    gamed = []
    for t in texts:
        gamed.append((t or "").strip() + tail)
    return gamed

# Take the (up to) 10 test texts the baseline scores lowest, append the gaming
# filler, and compare predicted "high" probability before and after.
n_lowest = min(10, len(proba))
lowest = np.argsort(proba)[:n_lowest]
low_texts = raw_te.iloc[lowest].tolist()
gamed_texts = game_texts(low_texts)

# Reuse the fitted baseline vectorizer so both variants share one vocabulary.
X_low, X_game = tfidf.transform(low_texts), tfidf.transform(gamed_texts)

proba_low, proba_game = (clf.predict_proba(m)[:, 1] for m in (X_low, X_game))

gaming_df = pd.DataFrame(
    {"orig_proba_high": proba_low, "gamed_proba_high": proba_game}
).assign(delta=lambda d: d["gamed_proba_high"] - d["orig_proba_high"])
gaming_df.describe()


### 6) Fragility Check (strip top-K words)

In [None]:

# Number of words to strip from EACH end of the coefficient ranking.
TOPK = 50
# Rank straight from the model coefficients. top_pos_words/top_neg_words hold only
# 25 entries each, so slicing them with [:TOPK] silently capped the set at 25 words
# per side and made any TOPK above 25 a no-op.
_frag_order = np.argsort(coefs)  # ascending: most "LOW" first, most "HIGH" last
_frag_low = vocab[_frag_order[:TOPK]].tolist()
_frag_high = vocab[_frag_order[max(0, len(_frag_order) - TOPK):]].tolist()
frag_set = set(_frag_high) | set(_frag_low)

def strip_topk_bulk(texts):
    """Remove every whole-word, case-insensitive occurrence of a ``frag_set`` word.

    Returns a new list of cleaned strings, or ``texts`` itself (unchanged) when
    ``frag_set`` is empty.
    """
    if frag_set:
        # One alternation over all words, escaped so they match literally.
        alternation = "|".join(map(re.escape, frag_set))
        patt = re.compile(r"\b(" + alternation + r")\b", flags=re.IGNORECASE)
        return [patt.sub("", t) for t in texts]
    return texts

# Re-score the test texts after removing the most influential words, using the
# already-fitted baseline vectorizer and classifier.
stripped_texts = strip_topk_bulk(raw_te.tolist())
X_te_stripped = tfidf.transform(stripped_texts)
proba_stripped = clf.predict_proba(X_te_stripped)[:, 1]
pred_stripped = (proba_stripped >= 0.5).astype(int)

fragility_metrics = dict(
    accuracy=float(accuracy_score(y_te, pred_stripped)),
    f1=float(f1_score(y_te, pred_stripped)),
    roc_auc=float(roc_auc_score(y_te, proba_stripped)),
)
fragility_metrics


### 7) Context Randomization (token shuffle)

In [None]:

def shuffle_tokens(t: str) -> str:
    """Return the tokens of ``t`` in shuffled order, joined by single spaces.

    A token is a run of word characters or a single non-whitespace character.
    A new generator seeded with 0 is created on every call, so the result is
    deterministic for a given input. Falsy input (``None`` or ``""``) yields ``""``.
    """
    pieces = re.findall(r"\w+|\S", t if t else "")
    np.random.default_rng(0).shuffle(pieces)  # in-place shuffle of the token list
    return " ".join(pieces)

# Re-score the test texts with their token order shuffled, using the
# already-fitted baseline vectorizer and classifier.
shuffled_texts = list(map(shuffle_tokens, raw_te.tolist()))
X_te_shuffled = tfidf.transform(shuffled_texts)
proba_shuf = clf.predict_proba(X_te_shuffled)[:, 1]
pred_shuf = (proba_shuf >= 0.5).astype(int)

shuffle_metrics = dict(
    accuracy=float(accuracy_score(y_te, pred_shuf)),
    f1=float(f1_score(y_te, pred_shuf)),
    roc_auc=float(roc_auc_score(y_te, proba_shuf)),
)
shuffle_metrics


### 8) Prompt-Bucket Distribution Shift (fresh model/vectorizer)

In [None]:

# Fresh TF-IDF/model so baseline vocab isn't mutated.
# TfidfVectorizer and LogisticRegression come from the imports cell at the top.

def bucket_prompt(p: str) -> int:
    """Assign a prompt to one of four buckets from the MD5 hash of its UTF-8 text.

    Non-string values are converted with ``str`` first. The hash is content-based,
    so identical prompts always land in the same bucket.
    """
    p = p if isinstance(p, str) else str(p)
    h = int(hashlib.md5(p.encode("utf-8")).hexdigest(), 16)
    return h % 4

if "prompt" in df_bin.columns and df_bin["prompt"].notna().any():
    df_pb = df_bin.copy()
    df_pb["_pbkt"] = df_pb["prompt"].apply(bucket_prompt)

    # Buckets 0-2 train, bucket 3 is held out, so no prompt appears on both sides.
    tr_pb = df_pb["_pbkt"].isin([0,1,2])
    te_pb = df_pb["_pbkt"].isin([3])
    y_pb_tr = df_pb.loc[tr_pb, "y"].values
    y_pb_te = df_pb.loc[te_pb, "y"].values

    if len(np.unique(y_pb_tr)) < 2:
        # Fitting on an empty or single-class split would raise; report it instead.
        prompt_shift_metrics = {"note": "Training prompt-buckets lack both classes; skipped."}
    elif not len(y_pb_te):
        prompt_shift_metrics = {"note": "No held-out prompt-bucket rows."}
    else:
        tfidf_pb = TfidfVectorizer(lowercase=True, stop_words="english", min_df=5, max_df=0.9, max_features=5000)
        X_pb_tr = tfidf_pb.fit_transform(df_pb.loc[tr_pb, "text"].astype(str))
        X_pb_te = tfidf_pb.transform(df_pb.loc[te_pb, "text"].astype(str))

        clf_pb = LogisticRegression(max_iter=200, solver="liblinear")
        clf_pb.fit(X_pb_tr, y_pb_tr)

        pr_pb = clf_pb.predict_proba(X_pb_te)[:,1]
        yhat_pb = (pr_pb >= 0.5).astype(int)
        # ROC AUC is undefined when the held-out labels contain a single class
        # (roc_auc_score raises), so record None (JSON null) in that case.
        has_both_classes = len(np.unique(y_pb_te)) == 2
        prompt_shift_metrics = {
            "accuracy": float(accuracy_score(y_pb_te, yhat_pb)),
            "f1": float(f1_score(y_pb_te, yhat_pb)),
            "roc_auc": float(roc_auc_score(y_pb_te, pr_pb)) if has_both_classes else None,
            "n": int(len(y_pb_te))
        }
else:
    prompt_shift_metrics = {"note": "No prompt column available; skipped."}

prompt_shift_metrics


### 9) Save Plots & Markdown Report

In [None]:

def plot_top_words(pairs, title, fname):
    """Save a horizontal bar chart of ``(word, coefficient)`` pairs to ``fname``.

    Bars are drawn in the order given, first pair at the top. The figure is
    closed after saving, so nothing is displayed inline.
    """
    labels, heights = [], []
    for word, coef in pairs:
        labels.append(word)
        heights.append(float(coef))
    positions = np.arange(len(labels))

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(positions, heights)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # first pair at the top
    ax.set_title(title)
    ax.set_xlabel("logistic coef (higher => 'High')")
    fig.tight_layout()
    fig.savefig(fname, bbox_inches="tight")
    plt.close(fig)

# Bar charts of the 15 strongest words in each direction.
plot_top_words(
    pairs=top_pos_words[:15],
    title="Top words associated with HIGH scores",
    fname=f"{ART_DIR}/top_pos_words.png",
)
plot_top_words(
    pairs=top_neg_words[:15],
    title="Top words associated with LOW scores",
    fname=f"{ART_DIR}/top_neg_words.png",
)

# Histogram of the per-text probability change produced by the gaming demo.
fig, ax = plt.subplots(figsize=(7, 5))
ax.hist(gaming_df["delta"], bins=10)
ax.set_title("Predicted High-Score Probability Increase After Gaming (low texts)")
ax.set_xlabel("Δ probability (gamed - original)")
ax.set_ylabel("count")
fig.tight_layout()
fig.savefig(f"{ART_DIR}/gaming_delta_hist.png", bbox_inches="tight")
plt.close(fig)

# Write the Markdown summary of every audit stage next to the saved plots.
ARTIFACT = f"{ART_DIR}/README_audit_results.md"

# Render the gaming table before opening the report so a failure here cannot leave
# a truncated file behind. DataFrame.to_markdown needs the optional `tabulate`
# package; without it, fall back to fixed-width text inside a code fence.
gaming_summary = gaming_df.describe()
try:
    gaming_table = gaming_summary.to_markdown()
except ImportError:
    gaming_table = "```\n" + gaming_summary.to_string() + "\n```"

with open(ARTIFACT, "w", encoding="utf-8") as f:
    f.write("# Reformed-Audit Demo Results (Split-Cells, with ROW_CAP)\n\n")
    f.write("## Baseline (surface features)\n")
    f.write(json.dumps(baseline_metrics, indent=2) + "\n\n")
    f.write("## Gaming Demo (lowest-probability samples)\n")
    f.write(gaming_table + "\n\n")
    f.write("## Fragility Check (strip top-50 words)\n")
    f.write(json.dumps(fragility_metrics, indent=2) + "\n\n")
    # The token-shuffle stage was computed but previously missing from the report.
    # Guarded like the prompt-shift section in case that cell was not run.
    f.write("## Context Randomization (token shuffle)\n")
    try:
        f.write(json.dumps(shuffle_metrics, indent=2) + "\n\n")
    except NameError:
        f.write("{\"note\": \"token shuffle not computed\"}\n\n")
    f.write("## Distribution Shift by Prompt-Bucket (fresh model)\n")
    try:
        f.write(json.dumps(prompt_shift_metrics, indent=2) + "\n")
    except NameError:
        f.write("{\"note\": \"prompt shift not computed\"}\n")

print("Saved:")
print(" -", f"{ART_DIR}/top_pos_words.png")
print(" -", f"{ART_DIR}/top_neg_words.png")
print(" -", f"{ART_DIR}/gaming_delta_hist.png")
print(" -", ARTIFACT)
