<a href="https://colab.research.google.com/github/Dey313/ResEthiq/blob/main/P4_Test_9_AI_CS2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install openpyxl reportlab datasketch scikit-learn scipy


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.0 MB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m29.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.1/96.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import io, os, re, math, datetime, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from google.colab import files

from datasketch import MinHash, MinHashLSH

from scipy.stats import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest, HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, brier_score_loss

from reportlab.lib.pagesizes import A4
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak
from reportlab.lib.enums import TA_CENTER


In [3]:
ART_DIR = "artifacts"
os.makedirs(ART_DIR, exist_ok=True)

CONFIG = {
    # speed caps
    "max_rows_for_heavy": 5000,
    "max_num_cols": 40,

    # near-dup / templating
    "lsh_threshold": 0.90,
    "min_cluster_size": 5,

    # likert detection
    "min_likert_items": 8,

    # geometry
    "knn_k": 5,

    # permutation tests
    "perm_runs": 120,   # increase for stronger p-values (250–1000)

    # bootstrap for confidence
    "bootstrap_iters": 150,
    "bootstrap_sample_frac": 0.85,

    # calibration
    "cv_folds": 5,

    # strict gating to reduce false positives
    "p_strict": 0.82,      # only call AI_GENERATED or MANIPULATED if >= this
    "p_medium": 0.65,      # otherwise report low confidence / lean ORIGINAL

    # multi-signal requirement
    "min_synth_signals": 2,
    "min_manip_signals": 2,
}

EXCLUDE_COL_HINTS = {"__row_id__", "id", "respondent_id", "response_id", "name", "phone", "email"}

def clamp01(x: float) -> float:
    return float(max(0.0, min(1.0, x)))

def safe_str(x, max_len=140):
    s = str(x)
    return s if len(s) <= max_len else s[:max_len-3] + "..."

def add_row_id(df: pd.DataFrame, col="__row_id__"):
    df = df.copy()
    if col not in df.columns:
        df[col] = [f"r_{i:06d}" for i in range(len(df))]
    return df

def _sample_df(df: pd.DataFrame, nmax: int):
    if len(df) <= nmax:
        return df
    return df.sample(nmax, random_state=42)

def detect_likert_cols(df: pd.DataFrame, exclude=None):
    exclude = set(exclude or [])
    lik = []
    for c in df.columns:
        if c in exclude:
            continue
        s = df[c].dropna()
        if len(s) < 30:
            continue
        if pd.api.types.is_numeric_dtype(s):
            vals = np.asarray(s)
            ivals = np.round(vals).astype(int)
            uniq = sorted(set(ivals.tolist()))
            if 4 <= len(uniq) <= 7 and (max(uniq) - min(uniq) <= 6):
                lik.append(c)
    return lik

def _numeric_frame(df: pd.DataFrame, exclude=None, max_cols=40):
    exclude = set(exclude or [])
    Xn = df.select_dtypes(include="number").drop(columns=[c for c in exclude if c in df.columns], errors="ignore")
    if Xn.shape[1] > max_cols:
        vars_ = Xn.var(numeric_only=True).sort_values(ascending=False)
        Xn = Xn[vars_.index[:max_cols]]
    return Xn

def fdr_bh(pvals):
    """Benjamini-Hochberg FDR. Returns q-values aligned with pvals."""
    p = np.array(pvals, dtype=float)
    n = len(p)
    order = np.argsort(p)
    ranked = p[order]
    q = np.empty(n)
    prev = 1.0
    for i in range(n-1, -1, -1):
        rank = i+1
        val = ranked[i] * n / rank
        prev = min(prev, val)
        q[order[i]] = prev
    return q


In [4]:
def feat_missingness(df):
    miss_cols = df.isna().mean()
    return {
        "miss_overall": float(miss_cols.mean()),
        "miss_maxcol": float(miss_cols.max()) if len(miss_cols) else 0.0,
        "miss_spread": float(miss_cols.std(ddof=1)) if len(miss_cols) > 1 else 0.0,
    }

def feat_duplicates_exact(df, exclude=None):
    exclude = set(exclude or [])
    cols = [c for c in df.columns if c not in exclude]
    X = df[cols].fillna("__MISSING__")
    dup_any = X.duplicated(keep=False)
    return {
        "dup_exact_rate": float(dup_any.mean()),
        "dup_exact_excess": float(X.duplicated(keep="first").mean())
    }

def feat_near_duplicates_lsh(df, exclude=None, threshold=0.90, min_cluster=5):
    exclude = set(exclude or [])
    cols = [c for c in df.columns if c not in exclude]
    if len(cols) < 8 or len(df) < 80:
        return {"near_dup_rate": np.nan, "near_dup_clusters": 0, "near_dup_max_cluster": 0}

    df_s = _sample_df(df, min(len(df), 2500)).copy()
    lsh = MinHashLSH(threshold=threshold, num_perm=64)
    mhs, keys = {}, []

    for i, (_, row) in enumerate(df_s.iterrows()):
        mh = MinHash(num_perm=64)
        for c in cols:
            v = row[c]
            if pd.isna(v): v = "__MISSING__"
            mh.update(f"{c}={v}".encode("utf8"))
        k = f"k{i}"
        lsh.insert(k, mh)
        mhs[k] = mh
        keys.append(k)

    visited, clusters = set(), []
    for k in keys:
        if k in visited:
            continue
        nbrs = lsh.query(mhs[k])
        if len(nbrs) >= min_cluster:
            clusters.append(nbrs)
            visited.update(nbrs)

    clustered = sum(len(c) for c in clusters)
    near_rate = float(clustered / max(1, len(df_s)))
    max_cluster = int(max([len(c) for c in clusters], default=0))
    return {
        "near_dup_rate": near_rate,
        "near_dup_clusters": int(len(clusters)),
        "near_dup_max_cluster": max_cluster
    }

def feat_digit_preference_suite(df, exclude=None):
    """
    Digit preference forensic suite:
    - last-digit chi-square per numeric column (rounded ints)
    - FDR q-values across columns
    - report max chi2, min q, max digit spike
    """
    exclude = set(exclude or [])
    Xn = _numeric_frame(df, exclude=exclude, max_cols=CONFIG["max_num_cols"])
    if Xn.shape[1] == 0:
        return {
            "digit_chi2_max": np.nan, "digit_q_min": np.nan,
            "digit_spike_max": np.nan, "digit_sig_cols": 0
        }

    chi2_list, spike_list, cols_used = [], [], []
    for c in Xn.columns:
        s = Xn[c].dropna()
        if len(s) < 250:
            continue
        vals = np.abs(np.round(s.values).astype(int))
        digits = vals % 10
        counts = np.bincount(digits, minlength=10).astype(float)
        n = counts.sum()
        if n <= 0:
            continue
        exp = np.ones(10) * (n/10.0)
        chi2_stat = float(((counts-exp)**2/(exp+1e-9)).sum())
        pval = float(1.0 - chi2.cdf(chi2_stat, df=9))
        p_obs = counts / n
        spike = float(p_obs.max())

        chi2_list.append((c, chi2_stat, pval))
        spike_list.append(spike)
        cols_used.append(c)

    if not chi2_list:
        return {
            "digit_chi2_max": np.nan, "digit_q_min": np.nan,
            "digit_spike_max": np.nan, "digit_sig_cols": 0
        }

    pvals = [x[2] for x in chi2_list]
    qvals = fdr_bh(pvals)
    # count significant columns after FDR
    sig = int(np.sum(np.array(qvals) < 0.05))
    chi2_max = float(max(x[1] for x in chi2_list))
    q_min = float(min(qvals))
    spike_max = float(max(spike_list)) if spike_list else np.nan

    return {
        "digit_chi2_max": chi2_max,
        "digit_q_min": q_min,
        "digit_spike_max": spike_max,
        "digit_sig_cols": sig
    }

def feat_heaping(df, exclude=None):
    exclude = set(exclude or [])
    Xn = _numeric_frame(df, exclude=exclude, max_cols=CONFIG["max_num_cols"])
    if Xn.shape[1] == 0:
        return {"heap_05_max": np.nan, "heap_intshare_max": np.nan}

    heap05 = 0.0
    intshare = 0.0
    for c in Xn.columns:
        s = Xn[c].dropna()
        if len(s) < 250:
            continue
        x = s.values.astype(float)
        # share of near-integers
        int_share = float(np.mean(np.isclose(x, np.round(x), atol=1e-8)))
        intshare = max(intshare, int_share)

        vals = np.abs(np.round(x).astype(int))
        last = vals % 10
        share05 = float(np.mean(np.isin(last, [0,5])))
        heap05 = max(heap05, share05)

    return {"heap_05_max": heap05, "heap_intshare_max": intshare}

def feat_distribution_heterogeneity(df, exclude=None):
    exclude = set(exclude or [])
    Xn = _numeric_frame(df, exclude=exclude, max_cols=CONFIG["max_num_cols"])
    if Xn.shape[1] < 4:
        return {"het_skew_std": np.nan, "het_kurt_std": np.nan, "het_entropy_std": np.nan, "het_unique_std": np.nan}

    skews, kurts, ents, uniqs = [], [], [], []
    for c in Xn.columns:
        x = Xn[c].dropna().values
        x = x[np.isfinite(x)]
        if len(x) < 200:
            continue

        m = x.mean()
        sd = x.std(ddof=1) + 1e-9
        z = (x - m) / sd
        skew = float(np.mean(z**3))
        kurt = float(np.mean(z**4) - 3.0)

        bins = np.histogram_bin_edges(x, bins="auto")
        hist, _ = np.histogram(x, bins=bins)
        p = hist / max(1.0, hist.sum())
        p = p[p > 0]
        ent = float(-(p * np.log(p)).sum())

        uniq = float(len(np.unique(np.round(x, 6))) / max(1, len(x)))

        skews.append(skew); kurts.append(kurt); ents.append(ent); uniqs.append(uniq)

    def _std(a):
        return float(np.std(a, ddof=1)) if len(a) > 1 else np.nan

    return {
        "het_skew_std": _std(skews),
        "het_kurt_std": _std(kurts),
        "het_entropy_std": _std(ents),
        "het_unique_std": _std(uniqs)
    }

def feat_knn_geometry(df, exclude=None, k=5):
    exclude = set(exclude or [])
    Xn = _numeric_frame(df, exclude=exclude, max_cols=CONFIG["max_num_cols"])
    if Xn.shape[1] < 6 or len(df) < 300:
        return {"knn_dist_cv": np.nan, "knn_dist_skew": np.nan}

    df_s = _sample_df(df, CONFIG["max_rows_for_heavy"])
    X = _numeric_frame(df_s, exclude=exclude, max_cols=CONFIG["max_num_cols"]).copy()
    X = X.fillna(X.median(numeric_only=True))

    Z = StandardScaler().fit_transform(X.values)
    nn = NearestNeighbors(n_neighbors=min(k+1, len(Z)))
    nn.fit(Z)
    dists, _ = nn.kneighbors(Z)
    dk = dists[:, 1:].mean(axis=1)

    mu = float(dk.mean())
    sd = float(dk.std(ddof=1) + 1e-9)
    cv = float(sd / (mu + 1e-9))
    z = (dk - mu) / sd
    skew = float(np.mean(z**3))

    return {"knn_dist_cv": cv, "knn_dist_skew": skew}

def feat_corr_spectrum(df, exclude=None):
    exclude = set(exclude or [])
    Xn = _numeric_frame(df, exclude=exclude, max_cols=CONFIG["max_num_cols"])
    if Xn.shape[1] < 8 or len(df) < 300:
        return {"spec_entropy": np.nan, "spec_top_share": np.nan}

    df_s = _sample_df(df, CONFIG["max_rows_for_heavy"])
    X = _numeric_frame(df_s, exclude=exclude, max_cols=CONFIG["max_num_cols"]).copy()
    X = X.fillna(X.median(numeric_only=True))

    C = np.corrcoef(X.values, rowvar=False)
    C = np.nan_to_num(C, nan=0.0, posinf=0.0, neginf=0.0)
    w = np.linalg.eigvalsh(C)
    w = np.clip(w, 1e-9, None)
    p = w / w.sum()
    ent = float(-(p * np.log(p)).sum())
    top_share = float(p.max())

    return {"spec_entropy": ent, "spec_top_share": top_share}

def feat_anomaly_ensemble(df, exclude=None):
    exclude = set(exclude or [])
    Xn = _numeric_frame(df, exclude=exclude, max_cols=CONFIG["max_num_cols"])
    if Xn.shape[1] < 6 or len(df) < 300:
        return {"anom_rate": np.nan, "anom_score_p95": np.nan}

    df_s = _sample_df(df, CONFIG["max_rows_for_heavy"])
    X = _numeric_frame(df_s, exclude=exclude, max_cols=CONFIG["max_num_cols"]).copy()
    X = X.fillna(X.median(numeric_only=True))

    iso = IsolationForest(n_estimators=300, contamination=0.02, random_state=42)
    iso.fit(X)
    s = -iso.score_samples(X)
    p95 = float(np.percentile(s, 95))
    thr = float(np.percentile(s, 98))
    rate = float(np.mean(s >= thr))

    return {"anom_rate": rate, "anom_score_p95": p95}

def feat_survey_fraud(df, exclude=None, min_items=8):
    exclude = set(exclude or [])
    lik = detect_likert_cols(df, exclude=exclude)
    if len(lik) < min_items:
        return {"likert_count": len(lik), "straight_rate": np.nan, "longstring_norm_p95": np.nan, "resp_entropy_p10": np.nan}

    B = df[lik]
    answered = B.notna().sum(axis=1) >= min_items
    nun = B.nunique(axis=1)
    straight = (nun == 1) & answered
    straight_rate = float(straight.mean())

    arr = B.to_numpy(dtype=float)
    runs, entropies = [], []
    for i in range(arr.shape[0]):
        row = arr[i, :]
        r = row[np.isfinite(row)]
        if len(r) >= min_items:
            vals, cnt = np.unique(r, return_counts=True)
            p = cnt / cnt.sum()
            ent = float(-(p * np.log(p + 1e-12)).sum())
            entropies.append(ent)

        best = run = 1
        for j in range(1, len(row)):
            if np.isnan(row[j]) or np.isnan(row[j-1]):
                run = 1
            elif row[j] == row[j-1]:
                run += 1
                best = max(best, run)
            else:
                run = 1
        runs.append(best)

    p95 = float(np.percentile(runs, 95))
    long_norm = float(p95 / max(1, len(lik)))
    ent_p10 = float(np.percentile(entropies, 10)) if len(entropies) else np.nan

    return {"likert_count": len(lik), "straight_rate": straight_rate, "longstring_norm_p95": long_norm, "resp_entropy_p10": ent_p10}


In [5]:
def _prep_numeric_matrix(df, exclude=None):
    exclude = set(exclude or [])
    df_s = _sample_df(df, CONFIG["max_rows_for_heavy"])
    X = _numeric_frame(df_s, exclude=exclude, max_cols=CONFIG["max_num_cols"]).copy()
    if X.shape[1] < 6 or len(X) < 250:
        return None
    X = X.fillna(X.median(numeric_only=True))
    Z = StandardScaler().fit_transform(X.values)
    return Z.astype(np.float32)

def rbf_kernel(X, Y, gamma):
    # ||x-y||^2 = x^2 + y^2 - 2xy
    X2 = np.sum(X*X, axis=1, keepdims=True)
    Y2 = np.sum(Y*Y, axis=1, keepdims=True).T
    dist2 = X2 + Y2 - 2.0*np.dot(X, Y.T)
    return np.exp(-gamma * np.maximum(dist2, 0.0))

def mmd_rbf(X, Y, gamma=None):
    n, m = len(X), len(Y)
    if gamma is None:
        # median heuristic on a subsample
        S = X[np.random.choice(n, min(n, 300), replace=False)]
        T = Y[np.random.choice(m, min(m, 300), replace=False)]
        d = np.sum((S[:,None,:]-T[None,:,:])**2, axis=2).ravel()
        med = np.median(d[d>0]) if np.any(d>0) else 1.0
        gamma = 1.0/(2.0*med + 1e-9)

    Kxx = rbf_kernel(X, X, gamma)
    Kyy = rbf_kernel(Y, Y, gamma)
    Kxy = rbf_kernel(X, Y, gamma)

    # unbiased-ish (remove diagonal)
    np.fill_diagonal(Kxx, 0.0)
    np.fill_diagonal(Kyy, 0.0)
    mmd2 = (Kxx.sum()/(n*(n-1)+1e-9)) + (Kyy.sum()/(m*(m-1)+1e-9)) - 2.0*(Kxy.mean())
    return float(max(mmd2, 0.0)), float(gamma)

def energy_distance(X, Y):
    # Energy distance using Euclidean norms
    # E = 2E||X-Y|| - E||X-X'|| - E||Y-Y'||
    def pdist_mean(A, B):
        # mean pairwise distance
        # for speed: sample if large
        na, nb = len(A), len(B)
        if na*nb > 120000:
            ia = np.random.choice(na, min(350, na), replace=False)
            ib = np.random.choice(nb, min(350, nb), replace=False)
            A, B = A[ia], B[ib]
        d = np.sqrt(np.maximum(0.0, np.sum((A[:,None,:]-B[None,:,:])**2, axis=2)))
        return float(d.mean())

    exy = pdist_mean(X, Y)
    exx = pdist_mean(X, X)
    eyy = pdist_mean(Y, Y)
    return float(max(0.0, 2*exy - exx - eyy))

def perm_test_stat(X, Y, stat_fn, runs=120, seed=42):
    rng = np.random.default_rng(seed)
    stat_obs = stat_fn(X, Y)
    Z = np.vstack([X, Y])
    n = len(X)
    cnt = 0
    for _ in range(runs):
        idx = rng.permutation(len(Z))
        Xp = Z[idx[:n]]
        Yp = Z[idx[n:]]
        sp = stat_fn(Xp, Yp)
        if sp >= stat_obs:
            cnt += 1
    pval = (cnt + 1) / (runs + 1)
    return float(stat_obs), float(pval)

def c2st_auc(X, Y):
    # AI-based two-sample test (Classifier Two-Sample Test)
    # return AUC where 0.5 means indistinguishable; >0.65 indicates separability
    n = min(len(X), len(Y))
    Xs = X[np.random.choice(len(X), n, replace=False)]
    Ys = Y[np.random.choice(len(Y), n, replace=False)]
    Z = np.vstack([Xs, Ys])
    y = np.array([0]*n + [1]*n)
    # robust tree model
    clf = HistGradientBoostingClassifier(max_depth=4, learning_rate=0.08, max_iter=250, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []
    for tr, te in cv.split(Z, y):
        clf.fit(Z[tr], y[tr])
        p = clf.predict_proba(Z[te])[:,1]
        aucs.append(roc_auc_score(y[te], p))
    return float(np.mean(aucs)), float(np.std(aucs, ddof=1))

def gaussian_copula_sample(Z, n_out, seed=42):
    # Z: standardized numeric matrix (rows x dims)
    rng = np.random.default_rng(seed)
    n, d = Z.shape
    # rank->normal scores
    U = np.zeros_like(Z, dtype=np.float64)
    for j in range(d):
        r = pd.Series(Z[:,j]).rank(method="average").to_numpy()
        u = (r - 0.5) / n
        u = np.clip(u, 1e-6, 1-1e-6)
        U[:,j] = scipy_norm_ppf(u)
    # correlation in normal space
    C = np.corrcoef(U, rowvar=False)
    C = np.nan_to_num(C, nan=0.0)
    # sample MVN
    L = np.linalg.cholesky(C + 1e-6*np.eye(d))
    G = rng.standard_normal((n_out, d)) @ L.T
    # invert to empirical marginals via rank mapping (approx)
    Xout = np.zeros_like(G, dtype=np.float64)
    for j in range(d):
        # map G[:,j] to ranks using source U[:,j]
        src = np.sort(U[:,j])
        # rank positions in src
        ranks = np.searchsorted(src, G[:,j], side="left")
        ranks = np.clip(ranks, 0, n-1)
        # use original Z marginal order stats
        z_sorted = np.sort(Z[:,j])
        Xout[:,j] = z_sorted[ranks]
    return Xout.astype(np.float32)

def scipy_norm_ppf(u):
    # inverse CDF approx via scipy if available; fallback to numpy erfinv
    try:
        from scipy.stats import norm
        return norm.ppf(u)
    except:
        # approx: ppf(u) = sqrt(2)*erfinv(2u-1)
        from scipy.special import erfinv
        return np.sqrt(2) * erfinv(2*u - 1)

def two_sample_forensics(df):
    """
    Compare dataset vs a null distribution:
    - Null1: marginal shuffle (break dependence)
    - Null2: Gaussian copula sample (over-regularization baseline)
    Output: MMD p-value, Energy p-value, C2ST AUC
    """
    exclude = set(EXCLUDE_COL_HINTS)
    Z = _prep_numeric_matrix(df, exclude=exclude)
    if Z is None:
        return {
            "mmd_p_shuffle": np.nan, "mmd_stat_shuffle": np.nan,
            "energy_p_shuffle": np.nan, "energy_stat_shuffle": np.nan,
            "c2st_auc_shuffle": np.nan,
            "mmd_p_copula": np.nan, "energy_p_copula": np.nan, "c2st_auc_copula": np.nan
        }

    # Null1: marginal shuffle
    Zs = Z.copy()
    rng = np.random.default_rng(123)
    for j in range(Zs.shape[1]):
        rng.shuffle(Zs[:,j])

    # MMD permutation
    mmd_stat, gamma = mmd_rbf(Z, Zs)
    mmd_s, mmd_p = perm_test_stat(Z, Zs, lambda A,B: mmd_rbf(A,B,gamma=gamma)[0], runs=CONFIG["perm_runs"], seed=1)

    # Energy permutation
    e_s, e_p = perm_test_stat(Z, Zs, energy_distance, runs=CONFIG["perm_runs"], seed=2)

    # C2ST
    auc_s, auc_sd = c2st_auc(Z, Zs)

    # Null2: copula sample
    Zc = gaussian_copula_sample(Z, n_out=len(Z), seed=77)
    mmd_stat2, gamma2 = mmd_rbf(Z, Zc)
    _, mmd_p2 = perm_test_stat(Z, Zc, lambda A,B: mmd_rbf(A,B,gamma=gamma2)[0], runs=CONFIG["perm_runs"], seed=3)
    _, e_p2 = perm_test_stat(Z, Zc, energy_distance, runs=CONFIG["perm_runs"], seed=4)
    auc2, _ = c2st_auc(Z, Zc)

    return {
        "mmd_p_shuffle": mmd_p, "mmd_stat_shuffle": mmd_s,
        "energy_p_shuffle": e_p, "energy_stat_shuffle": e_s,
        "c2st_auc_shuffle": auc_s,
        "mmd_p_copula": mmd_p2,
        "energy_p_copula": e_p2,
        "c2st_auc_copula": auc2
    }


In [6]:
def build_feature_vector(df):
    df = add_row_id(df)
    exclude = set(EXCLUDE_COL_HINTS)

    feats = {}
    feats.update(feat_missingness(df))
    feats.update(feat_duplicates_exact(df, exclude=exclude))
    feats.update(feat_near_duplicates_lsh(df, exclude=exclude, threshold=CONFIG["lsh_threshold"], min_cluster=CONFIG["min_cluster_size"]))
    feats.update(feat_digit_preference_suite(df, exclude=exclude))
    feats.update(feat_heaping(df, exclude=exclude))
    feats.update(feat_distribution_heterogeneity(df, exclude=exclude))
    feats.update(feat_knn_geometry(df, exclude=exclude, k=CONFIG["knn_k"]))
    feats.update(feat_corr_spectrum(df, exclude=exclude))
    feats.update(feat_anomaly_ensemble(df, exclude=exclude))
    feats.update(feat_survey_fraud(df, exclude=exclude, min_items=CONFIG["min_likert_items"]))

    # formal two-sample tests + AI C2ST
    feats.update(two_sample_forensics(df))

    feats["n_rows"] = float(len(df))
    feats["n_cols"] = float(df.shape[1])
    feats["num_cols"] = float(df.select_dtypes(include="number").shape[1])

    return feats


In [7]:
def make_variants(df):
    """
    Stronger internal calibration variants (still prototype).
    For mandate-grade: replace/augment with a true reference corpus of trusted real datasets.
    """
    df0 = add_row_id(df.copy())
    exclude = set(EXCLUDE_COL_HINTS)
    num_cols = [c for c in df0.select_dtypes(include="number").columns if c not in exclude]
    lik = detect_likert_cols(df0, exclude=exclude)

    variants = []

    # ORIGINAL-like: bootstraps (preserve organic irregularity)
    for _ in range(14):
        d = df0.sample(frac=1.0, replace=True, random_state=random.randint(0, 999999)).reset_index(drop=True)
        variants.append(("ORIGINAL", d))

    # MANIPULATED-like: inject classic artifacts
    for _ in range(12):
        d = df0.copy()

        # (1) block copy
        if len(d) >= 400:
            b = d.sample(n=min(220, len(d)//4), random_state=random.randint(0, 999999))
            idx = np.random.choice(d.index, size=len(b), replace=False)
            d.loc[idx, b.columns] = b.values

        # (2) rounding to 5/10 and digit spikes
        for c in random.sample(num_cols, k=min(8, len(num_cols))):
            s = d[c].astype(float).copy()
            mask = np.random.rand(len(d)) < 0.35
            s[mask] = np.round(s[mask] / 10.0) * 10.0
            d[c] = s

        # (3) missingness block
        if len(num_cols) >= 3:
            cols_m = random.sample(num_cols, k=min(4, len(num_cols)))
            start = random.randint(0, max(0, len(d)-250))
            end = min(len(d), start + random.randint(120, 280))
            d.loc[start:end, cols_m] = np.nan

        # (4) Likert straight-lining
        if len(lik) >= CONFIG["min_likert_items"]:
            rows = np.random.choice(d.index, size=max(10, int(0.07*len(d))), replace=False)
            for r in rows:
                v = random.choice([1,2,3,4,5])
                d.loc[r, lik] = v

        variants.append(("MANIPULATED", d))

    # AI_GENERATED-like: over-regularization + geometry smoothing
    for _ in range(12):
        d = df0.copy()

        for c in random.sample(num_cols, k=min(14, len(num_cols))):
            x = d[c].astype(float)
            med = np.nanmedian(x)
            mad = np.nanmedian(np.abs(x - med)) + 1e-9
            z = (x - med) / (1.4826*mad + 1e-9)
            z = np.clip(z, -2.0, 2.0)                 # tail suppression
            x2 = med + z*(1.4826*mad)
            x2 = x2 + np.random.normal(0, 0.03*np.nanstd(x2), size=len(x2))  # small jitter
            d[c] = x2

        # uniform-ish missingness
        if len(num_cols) >= 4:
            cols_m = random.sample(num_cols, k=min(4, len(num_cols)))
            mask = np.random.rand(len(d), len(cols_m)) < 0.05
            for j, c in enumerate(cols_m):
                s = d[c].astype(float).copy()
                s[mask[:, j]] = np.nan
                d[c] = s

        # reduce Likert heterogeneity (pull to middle)
        if len(lik) >= CONFIG["min_likert_items"]:
            for c in random.sample(lik, k=min(10, len(lik))):
                s = d[c].copy()
                s = s.where(pd.isna(s), np.clip(np.round((s.astype(float) + 3.0)/2.0), 1, 5))
                d[c] = s

        variants.append(("AI_GENERATED", d))

    return variants

def build_training_matrix(df):
    variants = make_variants(df)
    rows = []
    for label, d in variants:
        f = build_feature_vector(d)
        f["__label__"] = label
        rows.append(f)
    T = pd.DataFrame(rows)
    y = T["__label__"].values
    X = T.drop(columns=["__label__"]).copy()
    for c in X.columns:
        if X[c].isna().any():
            X[c] = X[c].fillna(X[c].median())
    return X, y


In [8]:
def train_model(X, y):
    base = HistGradientBoostingClassifier(
        max_depth=4, learning_rate=0.06, max_iter=350, random_state=42
    )
    cv = StratifiedKFold(n_splits=CONFIG["cv_folds"], shuffle=True, random_state=42)
    calib = CalibratedClassifierCV(base, method="sigmoid", cv=cv)
    calib.fit(X, y)
    return calib

def bootstrap_probs(df, model, feature_cols):
    probs = []
    for b in range(CONFIG["bootstrap_iters"]):
        d = df.sample(frac=CONFIG["bootstrap_sample_frac"], replace=True, random_state=1000+b)
        f = build_feature_vector(d)
        x = pd.DataFrame([f])[feature_cols].copy()
        for c in x.columns:
            if x[c].isna().any():
                x[c] = x[c].fillna(np.nanmedian(x[c].values))
        probs.append(model.predict_proba(x)[0])
    return np.vstack(probs)

def decision_gating(p_mean, classes, feats):
    """
    Reduce false positives:
    - Require high probability + multiple independent supporting signals
    - Otherwise lean to ORIGINAL or low-confidence output
    """
    p_map = dict(zip(classes, p_mean))
    best = max(p_map, key=p_map.get)
    p_best = float(p_map[best])

    # Synthetic supporting signals (independent)
    synth_signals = 0
    if pd.notna(feats.get("c2st_auc_copula")) and feats["c2st_auc_copula"] >= 0.67:
        synth_signals += 1
    if pd.notna(feats.get("mmd_p_copula")) and feats["mmd_p_copula"] <= 0.05:
        synth_signals += 1
    if pd.notna(feats.get("knn_dist_cv")) and feats["knn_dist_cv"] <= 0.20:
        synth_signals += 1
    if pd.notna(feats.get("het_kurt_std")) and feats["het_kurt_std"] <= 0.30:
        synth_signals += 1

    manip_signals = 0
    if pd.notna(feats.get("digit_q_min")) and feats["digit_q_min"] <= 0.05:
        manip_signals += 1
    if pd.notna(feats.get("heap_05_max")) and feats["heap_05_max"] >= 0.35:
        manip_signals += 1
    if pd.notna(feats.get("near_dup_rate")) and feats["near_dup_rate"] >= 0.05:
        manip_signals += 1
    if pd.notna(feats.get("straight_rate")) and feats["straight_rate"] >= 0.08:
        manip_signals += 1

    # strict calls
    if best == "AI_GENERATED" and p_best >= CONFIG["p_strict"] and synth_signals >= CONFIG["min_synth_signals"]:
        return "AI_GENERATED", "HIGH"
    if best == "MANIPULATED" and p_best >= CONFIG["p_strict"] and manip_signals >= CONFIG["min_manip_signals"]:
        return "MANIPULATED", "HIGH"

    # medium confidence
    if p_best >= CONFIG["p_medium"]:
        # choose the best but keep medium confidence
        return best, "MEDIUM"

    # conservative: lean ORIGINAL with low confidence if nothing strong
    return "ORIGINAL", "LOW"


In [9]:
print("Upload your dataset (CSV/XLSX):")
up = files.upload()
ds_name = next(iter(up.keys()))

def load_df(name, b):
    if name.lower().endswith(".csv"):
        return pd.read_csv(io.BytesIO(b))
    if name.lower().endswith(".xlsx"):
        return pd.read_excel(io.BytesIO(b))
    raise ValueError("Upload CSV or XLSX")

df = load_df(ds_name, up[ds_name])
df = add_row_id(df)
print("Dataset loaded:", df.shape)

# Build training set (prototype calibration)
X_train, y_train = build_training_matrix(df)
feature_cols = list(X_train.columns)

model = train_model(X_train, y_train)

# Predict for the dataset
feats_main = build_feature_vector(df)
x_main = pd.DataFrame([feats_main])[feature_cols].copy()
for c in x_main.columns:
    if x_main[c].isna().any():
        x_main[c] = x_main[c].fillna(X_train[c].median())

Pboot = bootstrap_probs(df, model, feature_cols)
p_mean = Pboot.mean(axis=0)
p_lo = np.quantile(Pboot, 0.025, axis=0)
p_hi = np.quantile(Pboot, 0.975, axis=0)

classes = list(model.classes_)
prob_df = pd.DataFrame({
    "Class": classes,
    "P_mean": p_mean,
    "P_2.5%": p_lo,
    "P_97.5%": p_hi
}).sort_values("P_mean", ascending=False)

label, confidence = decision_gating(p_mean, classes, feats_main)

label, confidence, prob_df


Upload your dataset (CSV/XLSX):


Saving Ad-data-Assignment-1 Dataset.csv to Ad-data-Assignment-1 Dataset.csv
Dataset loaded: (16834, 17)


  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].values))
  x[c] = x[c].fillna(np.nanmedian(x[c].v

('ORIGINAL',
 'LOW',
           Class    P_mean    P_2.5%   P_97.5%
 2      ORIGINAL  0.361302  0.361302  0.361302
 1   MANIPULATED  0.320527  0.320527  0.320527
 0  AI_GENERATED  0.318170  0.318170  0.318170)

In [10]:
# Evidence: robust Z against calibration baseline
Xref = X_train.copy()
ref_med = Xref.median()
ref_iqr = (Xref.quantile(0.75) - Xref.quantile(0.25)).replace(0, np.nan)

xv = x_main.iloc[0]
z = (xv - ref_med) / (ref_iqr + 1e-9)
z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)

evidence = pd.DataFrame({
    "Feature": feature_cols,
    "Value": [float(xv[c]) if pd.notna(xv[c]) else np.nan for c in feature_cols],
    "RobustZ": [float(z[c]) for c in feature_cols],
})
evidence["AbsZ"] = evidence["RobustZ"].abs()
top_evidence = evidence.sort_values("AbsZ", ascending=False).head(14)[["Feature","Value","RobustZ"]]

FEATURE_EXPLAIN = {
    "dup_exact_rate": "Exact duplicate rate",
    "near_dup_rate": "Near-duplicate template clustered share (MinHash/LSH)",
    "digit_q_min": "Digit-preference min q-value (FDR corrected)",
    "digit_chi2_max": "Digit-preference chi-square max",
    "heap_05_max": "Max share ending in 0/5 (heaping)",
    "heap_intshare_max": "Max near-integer share (decimal stripping)",
    "het_skew_std": "Skew heterogeneity across columns",
    "het_kurt_std": "Kurtosis heterogeneity across columns",
    "knn_dist_cv": "kNN distance CV (geometry uniformity)",
    "spec_entropy": "Correlation eigen-spectrum entropy",
    "anom_rate": "Anomaly concentration (IForest)",
    "straight_rate": "Straight-lining rate (Likert)",
    "resp_entropy_p10": "Low respondent entropy (Likert)",
    "mmd_p_copula": "MMD p-value vs copula baseline",
    "energy_p_copula": "Energy-distance p-value vs copula baseline",
    "c2st_auc_copula": "C2ST AUC vs copula baseline",
}

def explain_feat(k): return FEATURE_EXPLAIN.get(k, k)

# Prob chart
plt.figure(figsize=(7,4))
plt.bar(prob_df["Class"], prob_df["P_mean"])
plt.title("Compliance Probabilities (Bootstrap Mean)")
plt.ylabel("Probability")
plt.ylim(0,1)
plt.tight_layout()
prob_png = os.path.join(ART_DIR, "probabilities.png")
plt.savefig(prob_png, dpi=220)
plt.close()

# Drivers chart
tp = top_evidence.copy()
tp["Label"] = tp["Feature"].apply(explain_feat)
plt.figure(figsize=(9,5))
plt.barh(tp["Label"][::-1], tp["RobustZ"][::-1])
plt.title("Top Statistical Drivers (Robust Z vs Calibration Baseline)")
plt.xlabel("Robust Z (IQR-scaled)")
plt.tight_layout()
drivers_png = os.path.join(ART_DIR, "drivers.png")
plt.savefig(drivers_png, dpi=220)
plt.close()

prob_png, drivers_png


('artifacts/probabilities.png', 'artifacts/drivers.png')

In [11]:
def conf_color_hex(conf):
    return {"HIGH":"#2e7d32","MEDIUM":"#ef6c00","LOW":"#c62828"}.get(conf, "#000000")

styles = getSampleStyleSheet()
title_style = ParagraphStyle("T", parent=styles["Title"], fontName="Helvetica-Bold", fontSize=20,
                             alignment=TA_CENTER, textColor=colors.HexColor("#1f2a44"), spaceAfter=10)
h_style = ParagraphStyle("H", parent=styles["Heading2"], fontName="Helvetica-Bold", fontSize=13,
                         textColor=colors.HexColor("#1f2a44"), spaceBefore=10, spaceAfter=6)
body_style = ParagraphStyle("B", parent=styles["BodyText"], fontName="Helvetica", fontSize=10, leading=14)
small_style = ParagraphStyle("S", parent=styles["BodyText"], fontName="Helvetica", fontSize=9, leading=12,
                             textColor=colors.HexColor("#444444"))

def make_table(df, col_widths):
    data = [list(df.columns)] + df.astype(str).values.tolist()
    t = Table(data, colWidths=col_widths)
    t.setStyle(TableStyle([
        ("BACKGROUND", (0,0), (-1,0), colors.HexColor("#e8edf6")),
        ("TEXTCOLOR", (0,0), (-1,0), colors.HexColor("#1f2a44")),
        ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
        ("FONTNAME", (0,1), (-1,-1), "Helvetica"),
        ("FONTSIZE", (0,0), (-1,-1), 9),
        ("GRID", (0,0), (-1,-1), 0.4, colors.HexColor("#b8c2d6")),
        ("ROWBACKGROUNDS", (0,1), (-1,-1), [colors.white, colors.HexColor("#f7f9fd")]),
        ("VALIGN", (0,0), (-1,-1), "TOP"),
        ("LEFTPADDING", (0,0), (-1,-1), 6),
        ("RIGHTPADDING", (0,0), (-1,-1), 6),
        ("TOPPADDING", (0,0), (-1,-1), 4),
        ("BOTTOMPADDING", (0,0), (-1,-1), 4),
    ]))
    return t

# Format tables
prob_pdf = prob_df.copy()
for c in ["P_mean","P_2.5%","P_97.5%"]:
    prob_pdf[c] = prob_pdf[c].map(lambda x: f"{x:.3f}")

ev = top_evidence.copy()
ev["Meaning"] = ev["Feature"].apply(explain_feat)
ev = ev[["Feature","Meaning","Value","RobustZ"]]
ev["Value"] = ev["Value"].map(lambda x: f"{x:.4g}" if pd.notna(x) else "NA")
ev["RobustZ"] = ev["RobustZ"].map(lambda x: f"{x:.3f}")

# Key forensic stats summary
forensic_summary = pd.DataFrame([
    {"Test/Signal":"C2ST AUC vs Copula baseline", "Value": safe_str(feats_main.get("c2st_auc_copula"), 30)},
    {"Test/Signal":"MMD p-value vs Copula baseline", "Value": safe_str(feats_main.get("mmd_p_copula"), 30)},
    {"Test/Signal":"Energy p-value vs Copula baseline", "Value": safe_str(feats_main.get("energy_p_copula"), 30)},
    {"Test/Signal":"Digit preference min q (FDR)", "Value": safe_str(feats_main.get("digit_q_min"), 30)},
    {"Test/Signal":"Heaping (0/5) max share", "Value": safe_str(feats_main.get("heap_05_max"), 30)},
    {"Test/Signal":"Near-duplicate template rate", "Value": safe_str(feats_main.get("near_dup_rate"), 30)},
])

pdf_path = "Dataset_Integrity_Compliance_Report_Advanced.pdf"
doc = SimpleDocTemplate(pdf_path, pagesize=A4, leftMargin=2*cm, rightMargin=2*cm, topMargin=1.6*cm, bottomMargin=1.6*cm)

elements = []
elements.append(Paragraph("Dataset Integrity Compliance Report (Advanced • Dataset-only)", title_style))
elements.append(Paragraph(f"<b>Date:</b> {datetime.datetime.now().strftime('%d %b %Y, %H:%M')}", body_style))
elements.append(Paragraph(f"<b>Dataset:</b> {safe_str(ds_name, 90)}", body_style))
elements.append(Paragraph(f"<b>Rows × Cols:</b> {df.shape[0]} × {df.shape[1]}", body_style))
elements.append(Spacer(1, 10))

# decision box
c_hex = conf_color_hex(confidence)
decision = Table([[
    Paragraph(f"<b>Compliance Label:</b> {label}", body_style),
    Paragraph(f"<b>Confidence:</b> <font color='{c_hex}'>{confidence}</font>", body_style),
]], colWidths=[9.0*cm, 6.5*cm])
decision.setStyle(TableStyle([
    ("BACKGROUND", (0,0), (-1,-1), colors.HexColor("#f0f4fb")),
    ("BOX", (0,0), (-1,-1), 1.0, colors.HexColor("#b8c2d6")),
    ("LEFTPADDING", (0,0), (-1,-1), 10),
    ("RIGHTPADDING", (0,0), (-1,-1), 10),
    ("TOPPADDING", (0,0), (-1,-1), 10),
    ("BOTTOMPADDING", (0,0), (-1,-1), 10),
]))
elements.append(decision)
elements.append(Spacer(1, 8))

elements.append(Paragraph(
    "This system uses an evidence-first forensic methodology with calibrated AI scoring. "
    "False positives are reduced via multi-signal gating: high-confidence calls require both a high posterior probability "
    "and multiple independent supporting signals (e.g., two-sample tests + geometry/spectrum indicators for AI-generated; "
    "digit preference + templating/heaping signals for manipulation).",
    body_style
))
elements.append(Spacer(1, 10))

elements.append(Paragraph("Calibrated Probabilities (95% Bootstrap Interval)", h_style))
elements.append(make_table(prob_pdf, [4.2*cm, 3.6*cm, 3.6*cm, 3.6*cm]))
elements.append(Spacer(1, 8))
elements.append(Image(prob_png, width=16*cm, height=9*cm))
elements.append(PageBreak())

elements.append(Paragraph("Key Forensic Test Summary", h_style))
elements.append(make_table(forensic_summary, [9.0*cm, 7.0*cm]))
elements.append(Spacer(1, 10))

elements.append(Paragraph("Top Statistical Drivers (Audit Evidence)", h_style))
elements.append(make_table(ev, [3.0*cm, 6.0*cm, 3.0*cm, 3.0*cm]))
elements.append(Spacer(1, 10))
elements.append(Image(drivers_png, width=16*cm, height=8*cm))
elements.append(PageBreak())

elements.append(Paragraph("Mandated Review Checklist", h_style))
checklist = [
    "<b>Provenance:</b> Obtain raw collection logs/exports, questionnaire/codebook, and dataset version history.",
    "<b>Uniqueness & templating:</b> Audit exact/near-duplicate clusters; investigate repeat submissions and merge artifacts.",
    "<b>Editing artifacts:</b> Review digit preference (FDR-significant), heaping, and integer-only patterns; request pre-cleaning dataset.",
    "<b>Two-sample evidence:</b> If MMD/Energy p-values are significant or C2ST AUC is high, require explanation of generation/augmentation methods.",
    "<b>Reproducibility:</b> Verify transformations and inclusion/exclusion rules; regenerate key tables from raw.",
    "<b>Decision trail:</b> Record pass/request-clarification/fail with evidence snapshots."
]
for item in checklist:
    elements.append(Paragraph("• " + item, body_style))
    elements.append(Spacer(1, 4))

elements.append(Spacer(1, 10))
elements.append(Paragraph(
    "<i>Note:</i> For mandate-grade deployment, the strongest enhancement is adding a trusted reference corpus "
    "(historical real datasets from the same instrument/domain). Two-sample tests and C2ST against that corpus "
    "significantly improve separation of ORIGINAL vs AI_GENERATED while minimizing false positives.",
    small_style
))

doc.build(elements)
print("PDF created:", pdf_path)
files.download(pdf_path)


PDF created: Dataset_Integrity_Compliance_Report_Advanced.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>