<a href="https://colab.research.google.com/github/Dey313/ResEthiq/blob/main/P3_Test_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install openpyxl reportlab datasketch scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m31.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.1/96.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import io, os, re, math, datetime, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from google.colab import files

from datasketch import MinHash, MinHashLSH
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

from reportlab.lib.pagesizes import A4
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak
from reportlab.lib.enums import TA_CENTER

# -------- CELL 3: Config --------
# Directory that receives generated artifacts (chart PNGs); created on import
# of this cell if it does not exist yet.
ART_DIR = "artifacts"
os.makedirs(ART_DIR, exist_ok=True)

# Central tuning knobs read by the feature extractors, the variant generator,
# the calibrated model and the confidence banding below.
CONFIG = {
    # sampling / speed
    # row cap applied (via _sample_df) before the kNN, correlation-spectrum
    # and IsolationForest features are computed
    "max_rows_for_heavy": 5000,
    # numeric-column cap applied by _numeric_frame (highest-variance columns win)
    "max_num_cols": 40,

    # duplication / templating
    # threshold handed to MinHashLSH in feat_near_duplicates_lsh
    "lsh_threshold": 0.90,
    # minimum number of LSH neighbours for a group to count as a cluster
    "min_cluster_size": 5,

    # survey fraud (if Likert detected)
    # minimum number of Likert-like columns before survey features are computed
    "min_likert_items": 8,

    # kNN geometry
    # number of neighbours averaged per row in feat_knn_geometry
    "knn_k": 5,

    # bootstrap for confidence
    "bootstrap_iters": 120,   # increase later for stronger CI
    # fraction of rows drawn (with replacement) per bootstrap iteration
    "bootstrap_sample_frac": 0.85,

    # calibration CV
    # number of StratifiedKFold splits used by CalibratedClassifierCV
    "cv_folds": 5,

    # report thresholds for qualitative bands
    # minimum top-class probability for the HIGH / MEDIUM bands in confidence_band
    "p_high": 0.80,
    "p_mid": 0.60,
}

# Column names skipped by the feature extractors. Matching is by exact column
# name (membership test), not by substring.
EXCLUDE_COL_HINTS = {"__row_id__", "id", "respondent_id", "response_id", "name", "phone", "email"}

def clamp01(x: float) -> float:
    """Clamp *x* into the closed interval [0, 1] and return it as a float."""
    capped = min(1.0, x)
    floored = max(0.0, capped)
    return float(floored)

def safe_str(x, max_len=120):
    """Return ``str(x)``, shortened to ``max_len`` characters with a trailing "..." when longer."""
    text = str(x)
    if len(text) > max_len:
        # keep room for the three-character ellipsis
        return text[:max_len - 3] + "..."
    return text

def add_row_id(df: pd.DataFrame, col="__row_id__"):
    """Return a copy of *df* that carries a synthetic row-id column.

    The input frame is never modified. If *col* already exists the copy is
    returned unchanged; otherwise *col* is filled with ``r_000000``,
    ``r_000001``, ... in row order.
    """
    out = df.copy()
    if col in out.columns:
        return out
    out[col] = [f"r_{i:06d}" for i in range(len(out))]
    return out

def detect_likert_cols(df: pd.DataFrame, exclude=None):
    """Return the names of columns that look like Likert-scale items.

    A column qualifies when, after dropping missing values, it

    * is not listed in *exclude*,
    * has at least 30 observations,
    * has a numeric dtype whose values are all finite and integer-valued,
    * takes between 4 and 7 distinct levels spanning a range of at most 6.

    The integer-valued requirement keeps continuous measurements with a
    narrow range (e.g. floats between 1 and 5) from being reported as Likert
    items merely because rounding collapses them onto a handful of levels.

    Parameters
    ----------
    df : pd.DataFrame
        Table to inspect.
    exclude : iterable of column names, optional
        Columns to skip.

    Returns
    -------
    list
        Qualifying column names in the order they appear in *df*.
    """
    exclude = set(exclude or [])
    likert_cols = []
    for c in df.columns:
        if c in exclude:
            continue
        s = df[c].dropna()
        if len(s) < 30 or not pd.api.types.is_numeric_dtype(s):
            continue
        vals = s.to_numpy(dtype=float)
        # +/-inf cannot be a scale level; treat the column as non-Likert
        if not np.isfinite(vals).all():
            continue
        rounded = np.round(vals)
        # reject continuous data: every observation must sit on an integer
        if not np.allclose(vals, rounded, rtol=0.0, atol=1e-9):
            continue
        levels = np.unique(rounded)
        if 4 <= len(levels) <= 7 and (levels[-1] - levels[0] <= 6):
            likert_cols.append(c)
    return likert_cols

def _numeric_frame(df: pd.DataFrame, exclude=None, max_cols=40):
    exclude = set(exclude or [])
    Xn = df.select_dtypes(include="number").drop(columns=[c for c in exclude if c in df.columns], errors="ignore")
    # cap to max columns (largest variance)
    if Xn.shape[1] > max_cols:
        vars_ = Xn.var(numeric_only=True).sort_values(ascending=False)
        Xn = Xn[vars_.index[:max_cols]]
    return Xn

def _sample_df(df: pd.DataFrame, nmax: int):
    if len(df) <= nmax:
        return df
    return df.sample(nmax, random_state=42)

In [3]:
def feat_missingness(df):
    """Summarise missing-value rates across the columns of *df*.

    Returns a dict with:
      - ``miss_overall``: mean of the per-column missing fractions
      - ``miss_maxcol``: largest per-column missing fraction (0.0 with no columns)
      - ``miss_spread``: sample std (ddof=1) of the per-column fractions,
        0.0 when there are fewer than two columns
    """
    col_rates = df.isna().mean()
    n_cols = len(col_rates)
    return {
        "miss_overall": float(col_rates.mean()),
        "miss_maxcol": float(col_rates.max()) if n_cols else 0.0,
        # spread of missingness across columns; needs at least two columns
        "miss_spread": float(col_rates.std(ddof=1)) if n_cols > 1 else 0.0,
    }

def feat_duplicates_exact(df, exclude=None):
    """Measure exact row duplication over the non-excluded columns.

    Missing values are replaced by the sentinel ``"__MISSING__"`` before rows
    are compared.

    Returns a dict with:
      - ``dup_exact_rate``: fraction of rows belonging to any duplicated group
      - ``dup_exact_excess``: fraction of rows that repeat an earlier row
    """
    skip = set(exclude or [])
    kept = [name for name in df.columns if name not in skip]
    filled = df[kept].fillna("__MISSING__")
    in_dup_group = filled.duplicated(keep=False)
    repeats = filled.duplicated(keep="first")
    return {
        "dup_exact_rate": float(in_dup_group.mean()),
        "dup_exact_excess": float(repeats.mean()),
    }

def feat_near_duplicates_lsh(df, exclude=None, threshold=0.90, min_cluster=5):
    """Estimate how much of the table sits in near-duplicate (templated) clusters.

    Every row is turned into a MinHash signature over its ``"column=value"``
    tokens (missing values become ``"__MISSING__"``) and inserted into a
    MinHash LSH index. Rows are then grouped greedily: each not-yet-clustered
    row queries the index, and its not-yet-clustered neighbours form a cluster
    when there are at least *min_cluster* of them.

    Clusters are kept disjoint. A row already assigned to a cluster is never
    counted again, so ``near_dup_rate`` cannot exceed 1.0 even though LSH
    neighbourhoods overlap.

    Parameters
    ----------
    df : pd.DataFrame
        Table to inspect.
    exclude : iterable of column names, optional
        Columns left out of the row signature.
    threshold : float
        Similarity threshold handed to ``MinHashLSH``.
    min_cluster : int
        Minimum cluster size (the query row counts towards it).

    Returns
    -------
    dict
        ``near_dup_rate`` (share of sampled rows in a cluster; NaN when the
        table has fewer than 8 usable columns or fewer than 80 rows),
        ``near_dup_clusters`` (number of clusters) and
        ``near_dup_max_cluster`` (largest cluster size).
    """
    exclude = set(exclude or [])
    cols = [c for c in df.columns if c not in exclude]
    if len(cols) < 8 or len(df) < 80:
        return {"near_dup_rate": np.nan, "near_dup_clusters": 0, "near_dup_max_cluster": 0}

    # sample for speed (at most 2500 rows)
    df_s = _sample_df(df, 2500)

    # build MinHash LSH
    lsh = MinHashLSH(threshold=threshold, num_perm=64)
    mhs = {}
    keys = []
    for i, (_, row) in enumerate(df_s.iterrows()):
        mh = MinHash(num_perm=64)
        for c in cols:
            v = row[c]
            if pd.isna(v):
                v = "__MISSING__"
            mh.update(f"{c}={v}".encode("utf8"))
        k = f"k{i}"
        lsh.insert(k, mh)
        mhs[k] = mh
        keys.append(k)

    visited = set()
    clusters = []
    for k in keys:
        if k in visited:
            continue
        # drop neighbours that already belong to an earlier cluster so that
        # no row is counted twice
        members = [n for n in lsh.query(mhs[k]) if n not in visited]
        if len(members) >= min_cluster:
            clusters.append(members)
            visited.update(members)

    clustered = sum(len(c) for c in clusters)
    near_rate = float(clustered / max(1, len(df_s)))
    max_cluster = int(max((len(c) for c in clusters), default=0))
    return {
        "near_dup_rate": near_rate,
        "near_dup_clusters": int(len(clusters)),
        "near_dup_max_cluster": max_cluster,
    }

def feat_digit_preference(df, exclude=None):
    """Last-digit preference across numeric columns.

    For every numeric column with at least 200 non-missing values, values are
    rounded to integers and the distribution of the last digit is compared
    with a uniform distribution over 0-9 via a chi-square statistic (the raw
    statistic is used; no p-value is computed).

    Returns a dict with the largest statistic (``digit_chi2_max``) and the
    largest single-digit share (``digit_spike_max``) over all such columns.
    Both are NaN when there is no numeric column, and 0.0 when no column has
    enough observations.
    """
    skip = set(exclude or [])
    numeric = _numeric_frame(df, exclude=skip, max_cols=CONFIG["max_num_cols"])
    if numeric.shape[1] == 0:
        return {"digit_chi2_max": np.nan, "digit_spike_max": np.nan}

    uniform = np.ones(10) / 10.0
    worst_chi2 = 0.0
    worst_spike = 0.0
    for name in numeric.columns:
        col = numeric[name].dropna()
        if len(col) < 200:
            continue
        last_digits = np.abs(np.round(col.values).astype(int)) % 10
        counts = np.bincount(last_digits, minlength=10).astype(float)
        total = counts.sum()
        expected = total * uniform
        # small epsilon guards the division
        stat = float(((counts - expected) ** 2 / (expected + 1e-9)).sum())
        shares = counts / max(1.0, total)
        worst_chi2 = max(worst_chi2, stat)
        worst_spike = max(worst_spike, float(shares.max()))

    return {
        "digit_chi2_max": worst_chi2,
        "digit_spike_max": worst_spike,
    }

def feat_heaping(df, exclude=None):
    """Largest share of values ending in 0 or 5 among the numeric columns.

    Values are rounded to integers first. Only columns with at least 200
    non-missing values are considered. Returns ``{"heap_05_max": ...}``, which
    is NaN when there is no numeric column and 0.0 when no column qualifies.
    """
    skip = set(exclude or [])
    numeric = _numeric_frame(df, exclude=skip, max_cols=CONFIG["max_num_cols"])
    if numeric.shape[1] == 0:
        return {"heap_05_max": np.nan}

    # seed with 0.0 so the result is 0.0 when no column has enough data
    shares = [0.0]
    for name in numeric.columns:
        col = numeric[name].dropna()
        if len(col) >= 200:
            last_digit = np.abs(np.round(col.values).astype(int)) % 10
            shares.append(float(np.mean(np.isin(last_digit, [0, 5]))))
    return {"heap_05_max": max(shares)}

def feat_distribution_heterogeneity(df, exclude=None):
    """Dispersion of per-column shape statistics across numeric columns.

    For every numeric column with at least 150 finite values, four statistics
    are computed: skewness, excess kurtosis, histogram entropy (numpy "auto"
    bins) and the share of distinct values (after rounding to 6 decimals).
    The function reports the sample standard deviation (ddof=1) of each
    statistic across columns.

    Returns NaN for every key when fewer than 4 numeric columns exist, and NaN
    for a key when fewer than two columns contributed a value.
    """
    skip = set(exclude or [])
    numeric = _numeric_frame(df, exclude=skip, max_cols=CONFIG["max_num_cols"])
    if numeric.shape[1] < 4:
        return {"het_skew_std": np.nan, "het_kurt_std": np.nan, "het_entropy_std": np.nan, "het_unique_std": np.nan}

    per_col = {"skew": [], "kurt": [], "entropy": [], "unique": []}
    for name in numeric.columns:
        values = numeric[name].dropna().values
        if len(values) < 150:
            continue
        values = values[np.isfinite(values)]
        if len(values) < 150:
            continue

        # standardise (epsilon avoids division by zero for constant columns)
        centre = values.mean()
        scale = values.std(ddof=1) + 1e-9
        standardized = (values - centre) / scale
        per_col["skew"].append(float(np.mean(standardized**3)))
        per_col["kurt"].append(float(np.mean(standardized**4) - 3.0))

        # entropy via coarse binning
        edges = np.histogram_bin_edges(values, bins="auto")
        freq, _ = np.histogram(values, bins=edges)
        prob = freq / max(1.0, freq.sum())
        prob = prob[prob > 0]
        per_col["entropy"].append(float(-(prob * np.log(prob)).sum()))

        distinct = len(np.unique(np.round(values, 6)))
        per_col["unique"].append(float(distinct / max(1, len(values))))

    def _spread(samples):
        # sample std needs at least two contributing columns
        if len(samples) > 1:
            return float(np.std(samples, ddof=1))
        return np.nan

    return {
        "het_skew_std": _spread(per_col["skew"]),
        "het_kurt_std": _spread(per_col["kurt"]),
        "het_entropy_std": _spread(per_col["entropy"]),
        "het_unique_std": _spread(per_col["unique"]),
    }

def feat_knn_geometry(df, exclude=None, k=5):
    """Describe the distribution of mean k-nearest-neighbour distances.

    Rows are sampled down to ``CONFIG["max_rows_for_heavy"]``, numeric columns
    are median-imputed and standardised, and for every row the mean distance
    to its *k* nearest neighbours is computed.

    Infinite values are treated as missing, and columns without a single
    observed value are dropped before imputation. Otherwise their median is
    NaN and the NaN would reach the scaler / neighbour search, which reject
    non-finite input.

    Parameters
    ----------
    df : pd.DataFrame
        Table to inspect.
    exclude : iterable of column names, optional
        Columns left out of the numeric frame.
    k : int
        Number of neighbours averaged per row.

    Returns
    -------
    dict
        ``knn_dist_cv`` (coefficient of variation of the per-row mean
        distance) and ``knn_dist_skew`` (its skewness). Both are NaN when the
        table has fewer than 6 numeric columns, fewer than 250 rows, or no
        usable column after cleaning.
    """
    exclude = set(exclude or [])
    not_available = {"knn_dist_cv": np.nan, "knn_dist_skew": np.nan}

    Xn = _numeric_frame(df, exclude=exclude, max_cols=CONFIG["max_num_cols"])
    if Xn.shape[1] < 6 or len(df) < 250:
        return not_available

    df_s = _sample_df(df, CONFIG["max_rows_for_heavy"])
    X = _numeric_frame(df_s, exclude=exclude, max_cols=CONFIG["max_num_cols"]).copy()
    # treat +/-inf as missing and drop columns with nothing observed, so the
    # median imputation below leaves no NaN behind
    X = X.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how="all")
    if X.shape[1] == 0:
        return not_available
    X = X.fillna(X.median(numeric_only=True))

    scaler = StandardScaler()
    Z = scaler.fit_transform(X.values)

    nn = NearestNeighbors(n_neighbors=min(k + 1, len(Z)), algorithm="auto")
    nn.fit(Z)
    dists, _ = nn.kneighbors(Z)
    # exclude self-distance at idx 0
    dk = dists[:, 1:].mean(axis=1)
    mu = float(dk.mean())
    sd = float(dk.std(ddof=1) + 1e-9)
    cv = float(sd / (mu + 1e-9))

    z = (dk - mu) / sd
    skew = float(np.mean(z**3))

    return {"knn_dist_cv": cv, "knn_dist_skew": skew}

def feat_corr_spectrum(df, exclude=None):
    """Summarise the eigenvalue spectrum of the numeric correlation matrix.

    Rows are sampled down to ``CONFIG["max_rows_for_heavy"]`` and numeric
    columns are median-imputed. Non-finite correlation entries are replaced by
    0, eigenvalues are floored at 1e-9 and normalised to sum to one.

    Returns ``spec_entropy`` (Shannon entropy of the normalised eigenvalues)
    and ``spec_top_share`` (largest normalised eigenvalue). Both are NaN when
    the table has fewer than 8 numeric columns or fewer than 250 rows.
    """
    skip = set(exclude or [])
    col_cap = CONFIG["max_num_cols"]
    if _numeric_frame(df, exclude=skip, max_cols=col_cap).shape[1] < 8 or len(df) < 250:
        return {"spec_entropy": np.nan, "spec_top_share": np.nan}

    sampled = _sample_df(df, CONFIG["max_rows_for_heavy"])
    data = _numeric_frame(sampled, exclude=skip, max_cols=col_cap).copy()
    data = data.fillna(data.median(numeric_only=True))

    corr = np.nan_to_num(
        np.corrcoef(data.values, rowvar=False),
        nan=0.0, posinf=0.0, neginf=0.0,
    )
    # symmetric matrix -> eigvalsh; clip so the logarithm below is defined
    eigenvalues = np.clip(np.linalg.eigvalsh(corr), 1e-9, None)
    weights = eigenvalues / eigenvalues.sum()
    return {
        "spec_entropy": float(-(weights * np.log(weights)).sum()),
        "spec_top_share": float(weights.max()),
    }

def feat_anomaly_ensemble(df, exclude=None):
    """IsolationForest anomaly-score summary of the numeric columns.

    Rows are sampled down to ``CONFIG["max_rows_for_heavy"]`` and numeric
    columns are median-imputed before a 250-tree IsolationForest is fitted and
    scored on the same data (scores are negated so higher = more anomalous).

    Returns ``anom_score_p95`` (95th percentile of the scores) and
    ``anom_rate`` (share of rows at or above the 98th percentile of the
    scores). Both are NaN when the table has fewer than 6 numeric columns or
    fewer than 250 rows.

    NOTE(review): because the threshold is a percentile of the very scores it
    is compared with, ``anom_rate`` is about 0.02 by construction.
    """
    skip = set(exclude or [])
    col_cap = CONFIG["max_num_cols"]
    if _numeric_frame(df, exclude=skip, max_cols=col_cap).shape[1] < 6 or len(df) < 250:
        return {"anom_rate": np.nan, "anom_score_p95": np.nan}

    sampled = _sample_df(df, CONFIG["max_rows_for_heavy"])
    data = _numeric_frame(sampled, exclude=skip, max_cols=col_cap).copy()
    data = data.fillna(data.median(numeric_only=True))

    forest = IsolationForest(n_estimators=250, contamination=0.02, random_state=42)
    forest.fit(data)
    scores = -forest.score_samples(data)  # higher = more anomalous
    cut98 = float(np.percentile(scores, 98))
    return {
        "anom_rate": float(np.mean(scores >= cut98)),
        "anom_score_p95": float(np.percentile(scores, 95)),
    }

def feat_survey_fraud(df, exclude=None, min_items=8):
    """Survey-fraud indicators computed over the Likert-like columns of *df*.

    Returns a dict with:
      - ``likert_count``: number of Likert-like columns detected
      - ``straight_rate``: share of respondents who answered at least
        *min_items* items and used a single value for all of them
      - ``longstring_norm_p95``: 95th percentile of the longest run of
        identical consecutive answers per respondent, divided by the number
        of Likert columns
      - ``resp_entropy_p10``: 10th percentile of per-respondent answer entropy
        (only respondents with at least *min_items* finite answers)

    When fewer than *min_items* Likert columns are detected, all indicators
    except ``likert_count`` are NaN.
    """
    skip = set(exclude or [])
    likert_cols = detect_likert_cols(df, exclude=skip)
    n_items = len(likert_cols)
    if n_items < min_items:
        return {"likert_count": n_items, "straight_rate": np.nan, "longstring_norm_p95": np.nan, "resp_entropy_p10": np.nan}

    block = df[likert_cols]
    enough_answers = block.notna().sum(axis=1) >= min_items
    single_valued = block.nunique(axis=1) == 1
    straight_rate = float((single_valued & enough_answers).mean())

    def _longest_run(answers):
        # longest streak of identical consecutive answers; a missing answer
        # on either side of a pair resets the streak
        longest = current = 1
        for prev, cur in zip(answers[:-1], answers[1:]):
            if np.isnan(cur) or np.isnan(prev):
                current = 1
            elif cur == prev:
                current += 1
                longest = max(longest, current)
            else:
                current = 1
        return longest

    def _answer_entropy(answers):
        # entropy of response distribution (low entropy = templated)
        _, counts = np.unique(answers, return_counts=True)
        share = counts / counts.sum()
        return float(-(share * np.log(share + 1e-12)).sum())

    matrix = block.to_numpy(dtype=float)
    longest_runs = []
    entropies = []
    for answers in matrix:
        finite = answers[np.isfinite(answers)]
        if len(finite) >= min_items:
            entropies.append(_answer_entropy(finite))
        longest_runs.append(_longest_run(answers))

    run_p95 = float(np.percentile(longest_runs, 95))
    longstring_norm = float(run_p95 / max(1, n_items))
    if len(entropies):
        entropy_p10 = float(np.percentile(entropies, 10))
    else:
        entropy_p10 = np.nan

    return {"likert_count": n_items, "straight_rate": straight_rate, "longstring_norm_p95": longstring_norm, "resp_entropy_p10": entropy_p10}

def build_feature_vector(df):
    """Compute the full forensic feature dict for one dataset.

    Runs every ``feat_*`` extractor in a fixed order (the order determines the
    key order of the result) and appends three size descriptors: ``n_rows``,
    ``n_cols`` and ``num_cols``. The input frame is not modified; a row-id
    column is added to an internal copy.
    """
    df = add_row_id(df)
    exclude = set(EXCLUDE_COL_HINTS)

    # (extractor, extra keyword arguments) in the order features are emitted
    steps = (
        (feat_missingness, {}),
        (feat_duplicates_exact, {"exclude": exclude}),
        (feat_near_duplicates_lsh, {
            "exclude": exclude,
            "threshold": CONFIG["lsh_threshold"],
            "min_cluster": CONFIG["min_cluster_size"],
        }),
        (feat_digit_preference, {"exclude": exclude}),
        (feat_heaping, {"exclude": exclude}),
        (feat_distribution_heterogeneity, {"exclude": exclude}),
        (feat_knn_geometry, {"exclude": exclude, "k": CONFIG["knn_k"]}),
        (feat_corr_spectrum, {"exclude": exclude}),
        (feat_anomaly_ensemble, {"exclude": exclude}),
        (feat_survey_fraud, {"exclude": exclude, "min_items": CONFIG["min_likert_items"]}),
    )

    feats = {}
    for extractor, kwargs in steps:
        feats.update(extractor(df, **kwargs))

    # dataset meta
    n_rows, n_cols = len(df), df.shape[1]
    feats["n_rows"] = float(n_rows)
    feats["n_cols"] = float(n_cols)
    feats["num_cols"] = float(df.select_dtypes(include="number").shape[1])

    return feats

In [5]:
import io, random
import numpy as np
import pandas as pd

from google.colab import files

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# -------- CELL 5: Generate internal variants (ORIGINAL / MANIPULATED / AI_GENERATED) --------
def make_variants(df):
    """
    Creates internal baselines to calibrate a 3-class compliance model.
    - ORIGINAL-like: bootstrap resamples (preserve natural irregularities)
    - MANIPULATED-like: add rounding/heaping, block copying, partial templating, missingness blocks
    - AI_GENERATED-like: create over-regularized tables by smoothing distributions and reducing heterogeneity

    Parameters
    ----------
    df : pd.DataFrame
        The uploaded dataset. It is copied; the caller's frame is not modified.

    Returns
    -------
    list of (str, pd.DataFrame)
        12 ``"ORIGINAL"``, 10 ``"MANIPULATED"`` and 10 ``"AI_GENERATED"``
        variants, in that order.

    Notes
    -----
    Randomness comes from the global ``random`` and ``numpy.random`` state, so
    results differ between calls unless the caller seeds both beforehand.
    """
    df0 = df.copy()
    df0 = add_row_id(df0)

    # Work on numeric + (optional) likert subset for transformations
    exclude = set(EXCLUDE_COL_HINTS)
    num_cols = [c for c in df0.select_dtypes(include="number").columns if c not in exclude]
    lik = detect_likert_cols(df0, exclude=exclude)

    variants = []

    # ORIGINAL-like bootstraps
    for _ in range(12):
        d = df0.sample(
            frac=1.0,
            replace=True,
            random_state=random.randint(0, 10_000)
        ).reset_index(drop=True)
        variants.append(("ORIGINAL", d))

    # MANIPULATED-like transformations
    for _ in range(10):
        d = df0.copy()

        # 1) block copy (templating)
        if len(d) >= 400:
            b = d.sample(
                n=min(200, len(d)//4),
                random_state=random.randint(0, 10_000)
            )
            insert_idx = np.random.choice(range(len(d)), size=len(b), replace=False)
            d.loc[insert_idx, b.columns] = b.values

        # 2) rounding/heaping injection on numeric columns
        if len(num_cols) > 0:
            for c in random.sample(num_cols, k=min(6, len(num_cols))):
                s = d[c].astype(float)
                # force some proportion to end with 0/5 by rounding to nearest 5
                mask = np.random.rand(len(d)) < 0.30
                s2 = s.copy()
                s2[mask] = np.round(s2[mask] / 5.0) * 5.0
                d[c] = s2

        # 3) missingness block
        if len(num_cols) >= 3:
            cols_m = random.sample(num_cols, k=min(3, len(num_cols)))
            start = random.randint(0, max(0, len(d) - 200))
            end = min(len(d), start + random.randint(80, 220))
            # integer columns cannot hold NaN: cast to float first so the
            # assignment does not rely on pandas' deprecated silent upcast
            # ("Setting an item of incompatible dtype")
            d[cols_m] = d[cols_m].astype(float)
            d.loc[start:end, cols_m] = np.nan

        # 4) likert straight-lining injection
        if len(lik) >= CONFIG["min_likert_items"]:
            rows = np.random.choice(
                d.index,
                size=max(10, int(0.06 * len(d))),
                replace=False
            )
            for r in rows:
                v = random.choice([1, 2, 3, 4, 5])
                d.loc[r, lik] = v

        variants.append(("MANIPULATED", d))

    # AI_GENERATED-like transformations (over-regularization)
    for _ in range(10):
        d = df0.copy()

        # for numeric columns: shrink tails + add small Gaussian noise + enforce smoothness
        if len(num_cols) > 0:
            for c in random.sample(num_cols, k=min(12, len(num_cols))):
                x = d[c].astype(float)
                med = np.nanmedian(x)
                mad = np.nanmedian(np.abs(x - med)) + 1e-9
                z = (x - med) / (1.4826 * mad + 1e-9)
                # clamp extremes (reduces natural heavy tails)
                z = np.clip(z, -2.2, 2.2)
                x2 = med + z * (1.4826 * mad)
                # add small jitter for "plausible variability"
                x2 = x2 + np.random.normal(0, 0.05 * np.nanstd(x2), size=len(x2))
                d[c] = x2

        # missingness becomes more uniform-ish
        if len(num_cols) >= 4:
            cols_m = random.sample(num_cols, k=min(4, len(num_cols)))
            mask = np.random.rand(len(d), len(cols_m)) < 0.05
            for j, c in enumerate(cols_m):
                s = d[c].astype(float).copy()
                s[mask[:, j]] = np.nan
                d[c] = s

        # if Likert exists: reduce heterogeneity by smoothing response distribution
        if len(lik) >= CONFIG["min_likert_items"]:
            for c in random.sample(lik, k=min(10, len(lik))):
                s = d[c].copy()
                # pull towards middle (missing answers stay missing)
                s = s.where(
                    pd.isna(s),
                    np.clip(
                        np.round((s.astype(float) + 3.0) / 2.0),
                        1,
                        5
                    )
                )
                d[c] = s

        variants.append(("AI_GENERATED", d))

    return variants


# -------- CELL 6: Build training matrix (NaNs allowed; handled later by pipeline) --------
def build_training_matrix(df):
    """Build the calibration training set from synthetic variants of *df*.

    Every variant produced by ``make_variants`` is reduced to one feature
    vector. Returns ``(X, y)`` where ``X`` is a DataFrame with one row per
    variant (NaNs allowed) and ``y`` is the array of variant labels.
    """
    records = []
    for tag, variant in make_variants(df):
        feats = build_feature_vector(variant)
        feats["__label__"] = tag
        records.append(feats)

    table = pd.DataFrame(records)
    labels = table["__label__"].values
    features = table.drop(columns=["__label__"]).copy()  # may contain NaNs; OK
    return features, labels


# -------- CELL 7: Train calibrated compliance classifier with internal imputation --------
def train_calibrated_model(X, y):
    """Fit a sigmoid-calibrated multinomial logistic regression on (X, y).

    The base estimator is a pipeline of median imputation followed by
    logistic regression, so ``X`` may contain NaNs. Calibration uses
    stratified K-fold cross-validation with ``CONFIG["cv_folds"]`` splits.

    ``multi_class`` is deliberately not passed to ``LogisticRegression``:
    the parameter is deprecated in recent scikit-learn releases, and the
    lbfgs solver already optimises the multinomial loss for multiclass
    targets.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix (one row per training variant).
    y : array-like
        Class labels.

    Returns
    -------
    CalibratedClassifierCV
        The fitted, calibrated classifier.
    """
    # base model: imputer + multinomial logistic regression
    base = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # handles NaNs automatically
        ("logreg", LogisticRegression(
            max_iter=3000,
            solver="lbfgs",
        )),
    ])

    cv = StratifiedKFold(n_splits=CONFIG["cv_folds"], shuffle=True, random_state=42)

    # calibration: sigmoid is stable for small data; estimator can contain NaNs since imputer fixes them
    calib = CalibratedClassifierCV(base, method="sigmoid", cv=cv)
    calib.fit(X, y)
    return calib


# -------- CELL 8: Bootstrap confidence for the uploaded dataset --------
def bootstrap_probs(df, model, feature_cols):
    """Class probabilities for bootstrap resamples of *df*.

    For each of ``CONFIG["bootstrap_iters"]`` iterations a fraction
    ``CONFIG["bootstrap_sample_frac"]`` of the rows is drawn with replacement
    (seed ``1000 + iteration``), reduced to a feature vector restricted to
    *feature_cols*, and scored with ``model.predict_proba``.

    Returns a 2-D array with one row of probabilities per iteration.
    """
    def _one_draw(iteration):
        resampled = df.sample(
            frac=CONFIG["bootstrap_sample_frac"],
            replace=True,
            random_state=1000 + iteration,
        )
        feats = build_feature_vector(resampled)
        row = pd.DataFrame([feats])[feature_cols].copy()  # may contain NaNs; OK
        return model.predict_proba(row)[0]  # pipeline imputes internally

    draws = [_one_draw(b) for b in range(CONFIG["bootstrap_iters"])]
    return np.vstack(draws)


def confidence_band(p_max, ci_width):
    """Map a top-class probability and a CI width to "HIGH", "MEDIUM" or "LOW".

    A band is granted when the probability reaches the band's CONFIG threshold
    and the interval is no wider than the band's cap (0.18 for HIGH, 0.28 for
    MEDIUM). Anything else is "LOW".
    """
    # (band, CONFIG key of the probability floor, maximum CI width), strictest first
    tiers = (
        ("HIGH", "p_high", 0.18),
        ("MEDIUM", "p_mid", 0.28),
    )
    for band, floor_key, width_cap in tiers:
        if p_max >= CONFIG[floor_key] and ci_width <= width_cap:
            return band
    return "LOW"


# -------- CELL 9: Upload dataset and run end-to-end --------
# Prompt for a file through the Colab upload widget. `up` is indexed by file
# name below and its values are passed to load_df as raw bytes.
print("Upload your dataset (CSV/XLSX):")
up = files.upload()
# Only the first uploaded file is used.
ds_name = next(iter(up.keys()))

def load_df(name, b):
    """Parse uploaded bytes *b* into a DataFrame based on the extension of *name*.

    ``.csv`` is read with ``pd.read_csv`` and ``.xlsx`` with ``pd.read_excel``
    (extension match is case-insensitive). Any other extension raises
    ``ValueError``.
    """
    lowered = name.lower()
    readers = ((".csv", pd.read_csv), (".xlsx", pd.read_excel))
    for suffix, reader in readers:
        if lowered.endswith(suffix):
            return reader(io.BytesIO(b))
    raise ValueError("Upload CSV or XLSX")

# Parse the uploaded file and attach the synthetic row-id column.
df = load_df(ds_name, up[ds_name])
df = add_row_id(df)
print("Dataset loaded:", df.shape)

# Build internal calibration training set (prototype baseline)
# NOTE: the training rows are synthetic variants derived from this same dataset.
X_train, y_train = build_training_matrix(df)
feature_cols = list(X_train.columns)

model = train_calibrated_model(X_train, y_train)

# Predict on full dataset
f_main = build_feature_vector(df)
# Reorder/select the features to match the training column order.
x_main = pd.DataFrame([f_main])[feature_cols].copy()  # may contain NaNs; OK

p = model.predict_proba(x_main)[0]
classes = list(model.classes_)
p_map = dict(zip(classes, p))
# Label and p_max come from the single full-dataset prediction, not from the
# bootstrap mean computed below.
label = classes[int(np.argmax(p))]
p_max = float(np.max(p))

# Bootstrap confidence
Pboot = bootstrap_probs(df, model, feature_cols)
p_mean = Pboot.mean(axis=0)
# 95% percentile interval per class across bootstrap iterations.
p_lo = np.quantile(Pboot, 0.025, axis=0)
p_hi = np.quantile(Pboot, 0.975, axis=0)
# Widest per-class interval drives the confidence band.
ci_width = float(np.max(p_hi - p_lo))
conf = confidence_band(p_max, ci_width)

print("Predicted label:", label)
print("Class probabilities:", p_map)
print("Confidence band:", conf)

# Bare expression: shown as the notebook cell's result.
(label, p_map, conf)

Upload your dataset (CSV/XLSX):


Saving 10_Year_Road_Accident_Analysis_8_States.xlsx to 10_Year_Road_Accident_Analysis_8_States.xlsx
Dataset loaded: (80, 10)


  d.loc[start:end, cols_m] = np.nan
  d.loc[start:end, cols_m] = np.nan
  d.loc[start:end, cols_m] = np.nan
  d.loc[start:end, cols_m] = np.nan
  d.loc[start:end, cols_m] = np.nan
  d.loc[start:end, cols_m] = np.nan
  d.loc[start:end, cols_m] = np.nan
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rate'
 'anom_score_p95' 'straight_rate' 'longstring_norm_p95' 'resp_entropy_p10']. At least one non-missing value is needed for imputation with strategy='median'.
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rate'
 'anom_score_p95' 'straight_rate' 'longstring_norm_p95' 'resp_entropy_p10']. At least one non-missing value is needed for imputation with strategy='median'.
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rate'
 'anom_score_p95' 'straight_rate' 'longstring_norm_p95' 'resp_entropy_p10']. At least one non-missing value is needed for imputation with strategy='median'.
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_

Predicted label: AI_GENERATED
Class probabilities: {'AI_GENERATED': np.float64(0.5898807140546796), 'MANIPULATED': np.float64(0.15993278252367077), 'ORIGINAL': np.float64(0.25018650342164983)}
Confidence band: LOW


 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rate'
 'anom_score_p95' 'straight_rate' 'longstring_norm_p95' 'resp_entropy_p10']. At least one non-missing value is needed for imputation with strategy='median'.
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rate'
 'anom_score_p95' 'straight_rate' 'longstring_norm_p95' 'resp_entropy_p10']. At least one non-missing value is needed for imputation with strategy='median'.
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rate'
 'anom_score_p95' 'straight_rate' 'longstring_norm_p95' 'resp_entropy_p10']. At least one non-missing value is needed for imputation with strategy='median'.
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rate'
 'anom_score_p95' 'straight_rate' 'longstring_norm_p95' 'resp_entropy_p10']. At least one non-missing value is needed for imputation with strategy='median'.
 'knn_dist_cv' 'knn_dist_skew' 'spec_entropy' 'spec_top_share' 'anom_rat

('AI_GENERATED',
 {'AI_GENERATED': np.float64(0.5898807140546796),
  'MANIPULATED': np.float64(0.15993278252367077),
  'ORIGINAL': np.float64(0.25018650342164983)},
 'LOW')

In [6]:
# -------- CELL 9: Build "evidence" + top drivers (feature z-scores vs training medians) --------
# Evidence as standardized deviation vs training distribution
Xref = X_train.copy()
ref_med = Xref.median()
# Zero IQRs become NaN so the corresponding features end up with z = 0 below
# instead of an exploding ratio.
ref_iqr = (Xref.quantile(0.75) - Xref.quantile(0.25)).replace(0, np.nan)

# Robust z-score of the uploaded dataset's features: (value - median) / IQR.
xv = x_main.iloc[0]
z = (xv - ref_med) / (ref_iqr + 1e-9)
# Undefined scores (NaN feature value, NaN/zero IQR) are reported as 0.
z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)

evidence = pd.DataFrame({
    "Feature": feature_cols,
    "Value": [float(xv[c]) if pd.notna(xv[c]) else np.nan for c in feature_cols],
    "RobustZ": [float(z[c]) for c in feature_cols],
})
evidence["AbsZ"] = evidence["RobustZ"].abs()
# Keep the 12 features with the largest absolute robust z-score.
top_evidence = evidence.sort_values("AbsZ", ascending=False).head(12)[["Feature", "Value", "RobustZ"]]

# Human-friendly mapping for the report
# Features without an entry here are shown under their raw name (see explain_feat).
FEATURE_EXPLAIN = {
    "dup_exact_rate": "Exact duplicate rate (copy/paste, merging errors)",
    "dup_exact_excess": "Excess duplicates beyond first occurrence",
    "near_dup_rate": "Near-duplicate (template) clustered share (MinHash/LSH)",
    "near_dup_max_cluster": "Largest near-duplicate cluster size",
    "digit_chi2_max": "Digit-preference chi-square max (editing/rounding artifact)",
    "digit_spike_max": "Largest last-digit proportion (spike indicator)",
    "heap_05_max": "Max share ending in 0/5 (heaping/rounding)",
    "het_skew_std": "Heterogeneity of skew across variables (organic irregularity)",
    "het_kurt_std": "Heterogeneity of kurtosis across variables",
    "het_entropy_std": "Heterogeneity of entropy across variables",
    "knn_dist_cv": "kNN geometry CV (uniformity / spread in feature space)",
    "spec_entropy": "Correlation spectrum entropy (higher-order structure)",
    "spec_top_share": "Top eigenvalue share (dominant dependence axis)",
    "anom_rate": "Anomaly concentration rate (ensemble outliers)",
    "straight_rate": "Straight-lining rate (survey fraud signal)",
    "longstring_norm_p95": "Long-string normalized p95 (templating signal)",
    "resp_entropy_p10": "Low respondent entropy p10 (templating/regularity)",
    "miss_overall": "Overall missingness rate",
    "miss_spread": "Spread of missingness across columns (patterned missingness)",
}

def explain_feat(k):
    """Return the human-readable description of feature *k*, or *k* itself when none is registered."""
    if k in FEATURE_EXPLAIN:
        return FEATURE_EXPLAIN[k]
    return k

# Probability table
# One row per class: bootstrap mean and 2.5% / 97.5% quantiles, sorted so the
# class with the highest bootstrap mean comes first.
prob_df = pd.DataFrame({
    "Class": classes,
    "P_mean": p_mean,
    "P_2.5%": p_lo,
    "P_97.5%": p_hi
}).sort_values("P_mean", ascending=False)

# Bare expression: shown as the notebook cell's result.
top_evidence, prob_df


(                 Feature  Value   RobustZ
 1            miss_maxcol    0.0 -0.068750
 0           miss_overall    0.0 -0.064583
 2            miss_spread    0.0 -0.055338
 3         dup_exact_rate    0.0  0.000000
 4       dup_exact_excess    0.0  0.000000
 5          near_dup_rate    0.0  0.000000
 6      near_dup_clusters    0.0  0.000000
 7   near_dup_max_cluster    0.0  0.000000
 8         digit_chi2_max    0.0  0.000000
 9        digit_spike_max    0.0  0.000000
 10           heap_05_max    0.0  0.000000
 11          het_skew_std    NaN  0.000000,
           Class    P_mean    P_2.5%   P_97.5%
 2      ORIGINAL  0.555989  0.484067  0.613937
 0  AI_GENERATED  0.290156  0.236447  0.357772
 1   MANIPULATED  0.153855  0.149617  0.158161)

In [7]:
# -------- CELL 10: Charts --------
# Both figures are written as PNG files into ART_DIR and closed right away.
# 1) Probability bars
plt.figure(figsize=(7,4))
plt.bar(prob_df["Class"], prob_df["P_mean"])
plt.title("Compliance Classification Probabilities (Bootstrap Mean)")
plt.ylabel("Probability")
plt.ylim(0, 1)
plt.tight_layout()
prob_png = os.path.join(ART_DIR, "probabilities.png")
plt.savefig(prob_png, dpi=220)
plt.close()

# 2) Top evidence |Z|
top_plot = top_evidence.copy()
top_plot["FeatureLabel"] = top_plot["Feature"].apply(explain_feat)
plt.figure(figsize=(9,4))
# Reverse the order so the strongest driver ends up at the top of the chart.
plt.barh(top_plot["FeatureLabel"][::-1], top_plot["RobustZ"][::-1])
plt.title("Top Statistical Drivers (Robust Z vs Calibration Baseline)")
plt.xlabel("Robust Z (IQR-scaled)")
plt.tight_layout()
drivers_png = os.path.join(ART_DIR, "top_drivers.png")
plt.savefig(drivers_png, dpi=220)
plt.close()

# Bare expression: shows the two output paths as the cell's result.
prob_png, drivers_png


('artifacts/probabilities.png', 'artifacts/top_drivers.png')

In [8]:
# -------- CELL 11: Generate Compliance PDF Report (Dataset Integrity Only) --------
def band_color_hex(conf):
    """Return the hex colour used to render a confidence band; black for unknown bands."""
    palette = {"HIGH":"#2e7d32","MEDIUM":"#ef6c00","LOW":"#c62828"}
    try:
        return palette[conf]
    except KeyError:
        return "#000000"

# Paragraph styles for the PDF report, each derived from reportlab's sample
# stylesheet: centred title, section heading, body text and small grey text.
styles = getSampleStyleSheet()
title_style = ParagraphStyle("T", parent=styles["Title"], fontName="Helvetica-Bold", fontSize=20,
                             alignment=TA_CENTER, textColor=colors.HexColor("#1f2a44"), spaceAfter=10)
h_style = ParagraphStyle("H", parent=styles["Heading2"], fontName="Helvetica-Bold", fontSize=13,
                         textColor=colors.HexColor("#1f2a44"), spaceBefore=10, spaceAfter=6)
body_style = ParagraphStyle("B", parent=styles["BodyText"], fontName="Helvetica", fontSize=10, leading=14)
small_style = ParagraphStyle("S", parent=styles["BodyText"], fontName="Helvetica", fontSize=9, leading=12,
                             textColor=colors.HexColor("#444444"))

def make_table(df, col_widths):
    """Render a DataFrame as a styled ReportLab ``Table``.

    The column names form a bold header row on a tinted background and every
    cell value is converted to its string form. Body rows alternate between
    white and a pale tint, and a thin grid is drawn around all cells.

    Args:
        df: DataFrame to render.
        col_widths: Passed through as the table's ``colWidths``.

    Returns:
        The styled ``Table`` flowable.
    """
    # (start, stop) cell ranges in ReportLab's (col, row) addressing.
    header = ((0, 0), (-1, 0))
    body = ((0, 1), (-1, -1))
    whole = ((0, 0), (-1, -1))

    rows = [list(df.columns), *df.astype(str).values.tolist()]

    commands = [
        ("BACKGROUND", *header, colors.HexColor("#e8edf6")),
        ("TEXTCOLOR", *header, colors.HexColor("#1f2a44")),
        ("FONTNAME", *header, "Helvetica-Bold"),
        ("FONTNAME", *body, "Helvetica"),
        ("FONTSIZE", *whole, 9),
        ("GRID", *whole, 0.4, colors.HexColor("#b8c2d6")),
        ("ROWBACKGROUNDS", *body, [colors.white, colors.HexColor("#f7f9fd")]),
        ("VALIGN", *whole, "TOP"),
        ("LEFTPADDING", *whole, 6),
        ("RIGHTPADDING", *whole, 6),
        ("TOPPADDING", *whole, 4),
        ("BOTTOMPADDING", *whole, 4),
    ]

    table = Table(rows, colWidths=col_widths)
    table.setStyle(TableStyle(commands))
    return table

# Pretty evidence table: add a "Meaning" column derived from the feature name
# via explain_feat and keep only the four columns shown in the PDF.
evi_pdf = top_evidence.assign(Meaning=top_evidence["Feature"].apply(explain_feat))[
    ["Feature", "Meaning", "Value", "RobustZ"]
]

# Probability table: the three numeric columns are rendered as 3-decimal strings.
prob_pdf = prob_df.assign(**{
    col: prob_df[col].map(lambda x: f"{x:.3f}")
    for col in ("P_mean", "P_2.5%", "P_97.5%")
})

# Output document: A4 page, 2 cm side margins, 1.6 cm top/bottom margins.
pdf_path = "Dataset_Integrity_Compliance_Report.pdf"
doc = SimpleDocTemplate(
    pdf_path,
    pagesize=A4,
    leftMargin=2 * cm,
    rightMargin=2 * cm,
    topMargin=1.6 * cm,
    bottomMargin=1.6 * cm,
)

# Flowables are collected in page order. The list starts with the title block:
# report title, generation timestamp, dataset name and dataset shape.
elements = [
    Paragraph("Dataset Integrity Compliance Report (Prototype)", title_style),
    Paragraph(f"<b>Date:</b> {datetime.datetime.now().strftime('%d %b %Y, %H:%M')}", body_style),
    Paragraph(f"<b>Dataset:</b> {safe_str(ds_name, 90)}", body_style),
    Paragraph(f"<b>Rows × Cols:</b> {df.shape[0]} × {df.shape[1]}", body_style),
    Spacer(1, 10),
]

# Decision box
# Single-row, two-cell table: compliance label on the left, confidence band on
# the right, with the band text tinted by the colour from band_color_hex.
conf_hex = band_color_hex(conf)
decision_box = Table(
    [[
        Paragraph(f"<b>Compliance Label:</b> {label}", body_style),
        Paragraph(f"<b>Confidence:</b> <font color='{conf_hex}'>{conf}</font>", body_style),
    ]],
    colWidths=[9.0 * cm, 6.5 * cm],
)
decision_box.setStyle(TableStyle(
    [
        ("BACKGROUND", (0, 0), (-1, -1), colors.HexColor("#f0f4fb")),
        ("BOX", (0, 0), (-1, -1), 1.0, colors.HexColor("#b8c2d6")),
    ]
    + [
        # Same 10 pt padding on all four sides of both cells.
        (padding, (0, 0), (-1, -1), 10)
        for padding in ("LEFTPADDING", "RIGHTPADDING", "TOPPADDING", "BOTTOMPADDING")
    ]
))
elements.extend([decision_box, Spacer(1, 8)])

# Scope statement: explains what the three labels mean and how the
# classification should be interpreted.
elements.append(Paragraph(
    "This report classifies the dataset into one of three compliance categories: <b>ORIGINAL</b>, <b>MANIPULATED</b>, or <b>AI_GENERATED</b>. "
    "Classification is produced using a calibrated statistical model trained on internal baseline variants (resamples and controlled perturbations) "
    "and validated through bootstrap uncertainty estimates. This is a risk/compliance indicator and should be interpreted alongside provenance evidence.",
    body_style
))
elements.append(Spacer(1, 10))

# Probabilities section: heading, the formatted probability table, then the bar chart.
elements.append(Paragraph("Calibrated Probabilities (with 95% Bootstrap Interval)", h_style))
elements.append(make_table(prob_pdf, [4.2*cm, 3.6*cm, 3.6*cm, 3.6*cm]))
elements.append(Spacer(1, 10))
# The chart PNG comes from a 7x4 figure, which does not match a 16x9 cm box.
# kind="proportional" scales the image to fit inside that box while keeping its
# own aspect ratio, instead of stretching it to the exact width/height given.
elements.append(Image(prob_png, width=16*cm, height=9*cm, kind="proportional"))
# The drivers section starts on a fresh page.
elements.append(PageBreak())

# Drivers section: heading, a short explanation of the Z-scores, the evidence
# table, then the horizontal bar chart.
elements.append(Paragraph("Top Statistical Drivers (Audit-Grade Evidence)", h_style))
elements.append(Paragraph(
    "Drivers are listed as robust deviations from the calibration baseline (IQR-scaled Z). "
    "Large absolute values indicate features that materially differ from baseline behavior.",
    body_style
))
elements.append(Spacer(1, 6))
elements.append(make_table(evi_pdf, [3.0*cm, 6.0*cm, 3.0*cm, 3.0*cm]))
elements.append(Spacer(1, 10))
# The chart PNG comes from a 9x4 figure, which does not match a 16x7 cm box.
# kind="proportional" scales the image to fit inside that box while keeping its
# own aspect ratio, instead of stretching it to the exact width/height given.
elements.append(Image(drivers_png, width=16*cm, height=7*cm, kind="proportional"))
# The checklist starts on a fresh page.
elements.append(PageBreak())

# Checklist section: a heading followed by one bulleted paragraph per review action.
elements.append(Paragraph("Compliance Checklist (Mandated Review Actions)", h_style))
checklist = [
    "<b>Provenance:</b> Obtain raw exports/logs, collection protocol, consent/IRB evidence, and version history of the dataset.",
    "<b>Uniqueness:</b> Verify unique respondent/entity IDs; audit duplicates and template clusters if present.",
    "<b>Editing Artifacts:</b> Review digit preference, rounding/heaping, and blockwise missingness patterns; request pre-cleaning version.",
    "<b>Structure & Geometry:</b> Inspect distribution heterogeneity, kNN geometry, and dependence spectrum; compare against domain reference if available.",
    "<b>Reproducibility:</b> Request codebook + scripts; verify transformations match claimed methodology.",
    "<b>Decision Trail:</b> Record decision (pass / request clarification / fail) with evidence from this report.",
]
for item in checklist:
    # Bullet paragraph plus a small vertical gap after it.
    elements += [Paragraph("• " + item, body_style), Spacer(1, 4)]

# Closing remark in the small grey style, separated from the checklist by a gap.
elements.extend([
    Spacer(1, 10),
    Paragraph(
        "<i>Important:</i> For strongest compliance, integrate a domain reference corpus (trusted historical datasets from the same instrument/population) "
        "and re-run two-sample distinguishability tests against that reference. This materially increases reliability of AI_GENERATED vs ORIGINAL separation.",
        small_style,
    ),
])

# Render all collected flowables to pdf_path, report the path, then hand the
# file to the Colab download helper.
doc.build(elements)
print("PDF created:", pdf_path)
files.download(pdf_path)


PDF created: Dataset_Integrity_Compliance_Report.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>