<a href="https://colab.research.google.com/github/Dey313/ResEthiq/blob/main/P2_Test_16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install openpyxl reportlab datasketch pypdf pdfplumber faiss-cpu sentence-transformers scikit-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.1/96.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install triton
import torch
print(torch.__version__)

Collecting triton
  Downloading triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Downloading triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (170.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.5/170.5 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.5.1
2.9.0+cpu


In [4]:
import io, os, re, zipfile, math, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from google.colab import files

import pdfplumber
from pypdf import PdfReader

import faiss
from sentence_transformers import SentenceTransformer

from sklearn.ensemble import IsolationForest
from datasketch import MinHash, MinHashLSH

from reportlab.lib.pagesizes import A4
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak
from reportlab.lib.enums import TA_CENTER


In [5]:
CONFIG = {
    # Dataset integrity thresholds
    "max_overall_missingness": 0.20,
    "exact_dup_rate_high": 0.02,

    "lsh_threshold": 0.90,
    "min_cluster_size": 5,
    "near_dup_rate_high": 0.05,

    "straight_line_items_min": 8,
    "straight_line_rate_high": 0.08,
    "long_string_norm_high": 0.60,

    "heaping_last_digit_high": 0.35,
    "likert_var_threshold": 0.25,
    "likert_low_var_share_high": 0.25,

    "corr_abs_high": 0.90,
    "corr_share_high": 0.08,

    "iforest_contamination": 0.02,
    "anomaly_rate_high": 0.05,

    # Document integrity thresholds
    "plag_semantic_high": 0.75,     # cosine sim
    "plag_exact_high": 0.08,        # exact fingerprint overlap (0–1)
    "ai_drift_high": 0.35,          # stylometry drift score (0–1-ish)
    "ai_repeat_high": 0.12,         # repetition ratio
    "ai_redundancy_high": 0.35,     # redundancy proxy

    # Overall weighting (dataset primary)
    "overall_weights": {
        "dataset": 0.55,
        "plagiarism": 0.25,
        "ai_writing": 0.20
    }
}

EXCLUDE = {"__row_id__", "id", "respondent_id", "response_id", "name", "phone", "email"}

ART_DIR = "artifacts"
os.makedirs(ART_DIR, exist_ok=True)


In [6]:
def clamp01(x: float) -> float:
    return float(max(0.0, min(1.0, x)))

def safe_str(x, max_len=140):
    s = str(x)
    return s if len(s) <= max_len else s[:max_len-3] + "..."

def add_row_id(df: pd.DataFrame, col="__row_id__"):
    df = df.copy()
    if col not in df.columns:
        df[col] = [f"r_{i:06d}" for i in range(len(df))]
    return df

def guess_likert_cols(df, exclude=None):
    exclude = set(exclude or [])
    likerts = []
    for c in df.columns:
        if c in exclude:
            continue
        s = df[c].dropna()
        if len(s) < 30:
            continue
        if pd.api.types.is_numeric_dtype(s):
            vals = np.asarray(s)
            uniq = sorted(set(np.round(vals).astype(int).tolist()))
            if len(uniq) in (4,5,6,7) and (max(uniq) - min(uniq) <= 6):
                likerts.append(c)
    return likerts

def last_digit_share(series, digits={0,5}):
    s = series.dropna()
    if len(s) == 0:
        return 0.0
    vals = np.abs(np.round(s.values).astype(int))
    last = vals % 10
    return float(np.mean(np.isin(last, list(digits))))

def split_paragraphs(text):
    # strong paragraph splitter
    parts = [p.strip() for p in re.split(r"\n\s*\n+", text) if p.strip()]
    # fallback for single-block: split by sentence groups
    if len(parts) < 3:
        parts = [p.strip() for p in re.split(r"(?<=[.!?])\s{2,}", text) if p.strip()]
    return parts

def band_from_score(score):
    if score >= 80: return "LOW"
    if score >= 55: return "MEDIUM"
    return "HIGH"

def verdict_from_band(band):
    if band == "LOW":
        return "Likely genuine/original (no strong synthetic/manipulation signals)"
    if band == "MEDIUM":
        return "Some risk signals detected; targeted review recommended"
    return "Strong signals consistent with manipulation/synthetic or unoriginal content"


In [7]:
def check_missingness(df, cfg):
    m = float(df.isna().mean().mean())
    risk = clamp01(m / cfg["max_overall_missingness"])
    flag = None
    if m >= cfg["max_overall_missingness"]:
        flag = ("High missingness", "Medium", "missingness", f"Overall missingness = {m:.1%}", "Audit missing blocks; verify export/collection.")
    return risk, {"overall_missingness": m}, flag

def check_exact_duplicates(df, cfg, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    cols = [c for c in df.columns if c not in exclude_cols]
    X = df[cols].fillna("__MISSING__")
    dup = X.duplicated(keep=False)
    rate = float(dup.mean())
    risk = clamp01(rate / cfg["exact_dup_rate_high"])
    flag = None
    if rate >= cfg["exact_dup_rate_high"]:
        flag = ("Exact duplicate rows", "High", "duplicates", f"Exact dup rate = {rate:.1%}", "Check duplicate submissions/merge errors.")
    return risk, {"exact_dup_rate": rate}, flag

def row_signature_tokens(row, cols):
    toks = []
    for c in cols:
        v = row[c]
        if pd.isna(v): v = "__MISSING__"
        toks.append(f"{c}={str(v)}")
    return toks

def check_near_duplicates_minhash(df, cfg, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    cols = [c for c in df.columns if c not in exclude_cols]
    if len(cols) < 8 or len(df) < 50:
        return 0.0, {"near_dup_rate": None, "n_clusters": 0, "skipped": True}, None

    df_s = df.sample(min(len(df), 2500), random_state=42).copy()
    sampled = len(df_s) < len(df)

    thr = cfg["lsh_threshold"]
    lsh = MinHashLSH(threshold=thr, num_perm=64)

    keys = []
    key_to_rowid = {}
    key_to_mh = {}

    for i, (_, row) in enumerate(df_s.iterrows()):
        mh = MinHash(num_perm=64)
        for t in row_signature_tokens(row, cols):
            mh.update(t.encode("utf8"))
        k = f"k{i}"
        lsh.insert(k, mh)
        keys.append(k)
        key_to_rowid[k] = row["__row_id__"]
        key_to_mh[k] = mh

    visited = set()
    clusters = []
    for k in keys:
        if k in visited:
            continue
        nbrs = lsh.query(key_to_mh[k])
        if len(nbrs) >= cfg["min_cluster_size"]:
            clusters.append([key_to_rowid[x] for x in nbrs])
            visited.update(nbrs)

    near_rate = sum(len(c) for c in clusters) / max(1, len(df_s))
    risk = clamp01(near_rate / cfg["near_dup_rate_high"])
    flag = None
    if near_rate >= cfg["near_dup_rate_high"]:
        top_sizes = sorted([len(c) for c in clusters], reverse=True)[:5]
        flag = ("Near-duplicate templates (MinHash/LSH)", "High", "near_dups",
                f"Near-dup clustered rate = {near_rate:.1%} (thr {cfg['near_dup_rate_high']:.1%}); top clusters {top_sizes}",
                "Inspect templated response blocks; verify provenance.")
    return risk, {"near_dup_rate": near_rate, "n_clusters": len(clusters), "sampled": sampled}, flag

def straight_lining_mask(df, likert_cols, min_items=8):
    if len(likert_cols) < min_items:
        return np.zeros(len(df), dtype=bool)
    B = df[likert_cols]
    same = B.nunique(axis=1) == 1
    answered = B.notna().sum(axis=1) >= min_items
    return (same & answered).values

def long_string_index(df, likert_cols):
    if len(likert_cols) == 0:
        return np.zeros(len(df))
    B = df[likert_cols].to_numpy(dtype=float)
    out = np.zeros(B.shape[0])
    for i in range(B.shape[0]):
        row = B[i, :]
        best = run = 1
        for j in range(1, len(row)):
            if np.isnan(row[j]) or np.isnan(row[j-1]):
                run = 1
            elif row[j] == row[j-1]:
                run += 1
                best = max(best, run)
            else:
                run = 1
        out[i] = best
    return out

def check_survey_fraud(df, cfg, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    lik = guess_likert_cols(df, exclude=exclude_cols)
    if len(lik) < cfg["straight_line_items_min"]:
        return 0.0, {"likert_cols_count": len(lik), "skipped": True}, None

    straight = straight_lining_mask(df, lik, min_items=cfg["straight_line_items_min"])
    straight_rate = float(np.mean(straight))

    lsi = long_string_index(df, lik)
    lsi_p95 = float(np.percentile(lsi, 95))
    lsi_norm = lsi_p95 / max(1, len(lik))

    risk = clamp01(0.8*(straight_rate/cfg["straight_line_rate_high"]) + 0.6*(lsi_norm/cfg["long_string_norm_high"]))

    flag = None
    if straight_rate >= cfg["straight_line_rate_high"] or lsi_norm >= cfg["long_string_norm_high"]:
        flag = ("Survey fraud patterns (straight-lining/templating)",
                "High" if straight_rate >= cfg["straight_line_rate_high"] else "Medium",
                "survey_fraud",
                f"Straight-lining={straight_rate:.1%} (thr {cfg['straight_line_rate_high']:.1%}); long-string p95={lsi_p95:.0f} items (norm {lsi_norm:.2f})",
                "Review respondent patterns; consider exclusion rules; re-run after cleaning.")
    return risk, {"likert_cols_count": len(lik), "straight_rate": straight_rate, "lsi_p95": lsi_p95, "lsi_norm": lsi_norm}, flag

def check_heaping(df, cfg, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    num_cols = [c for c in df.select_dtypes(include="number").columns if c not in exclude_cols]
    best_col, best_share = None, 0.0
    for c in num_cols:
        share = last_digit_share(df[c], digits={0,5})
        if share > best_share:
            best_share, best_col = share, c
    risk = clamp01(best_share / cfg["heaping_last_digit_high"]) if best_col else 0.0
    flag = None
    if best_col and best_share >= cfg["heaping_last_digit_high"]:
        flag = ("Rounding/heaping detected", "Medium", "heaping",
                f"Column {best_col}: last-digit(0/5) share={best_share:.1%} (thr {cfg['heaping_last_digit_high']:.0%})",
                "Check rounding in entry/cleaning; validate measurement precision.")
    return risk, {"heaping_col": best_col, "heaping_share": best_share}, flag

def check_likert_variance(df, cfg, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    lik = guess_likert_cols(df, exclude=exclude_cols)
    if not lik:
        return 0.0, {"likert_cols_count": 0, "low_var_share": 0.0, "skipped": True}, None
    low = []
    for c in lik:
        v = float(df[c].var(ddof=1))
        if v < cfg["likert_var_threshold"]:
            low.append(c)
    share = len(low) / max(1, len(lik))
    risk = clamp01(share / cfg["likert_low_var_share_high"])
    flag = None
    if share >= cfg["likert_low_var_share_high"]:
        flag = ("Likert items low variance", "Medium", "likert_var",
                f"{share:.1%} of Likert-like items have variance < {cfg['likert_var_threshold']}",
                "Check uniform responding/synthetic smoothing; verify instrument.")
    return risk, {"likert_cols_count": len(lik), "low_var_share": share}, flag

def check_dependence(df, cfg, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    Xn = df.select_dtypes(include="number").drop(columns=[c for c in exclude_cols if c in df.columns], errors="ignore")
    if Xn.shape[1] < 6 or len(df) < 150:
        return 0.0, {"corr_share_high": None, "skipped": True}, None
    Xn = Xn.fillna(Xn.median(numeric_only=True))
    corr = Xn.corr(method="spearman").abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    share_high = float((upper >= cfg["corr_abs_high"]).sum().sum() / max(1, upper.count().sum()))
    risk = clamp01(share_high / cfg["corr_share_high"])
    flag = None
    if share_high >= cfg["corr_share_high"]:
        flag = ("Dependence unusually strong", "Medium", "dependence",
                f"{share_high:.1%} pairs with |Spearman ρ|≥{cfg['corr_abs_high']}",
                "Review construct overlap and preprocessing; stronger if paired with duplication.")
    return risk, {"corr_share_high": share_high}, flag

def check_anomalies(df, cfg, exclude_cols=None):
    exclude_cols = set(exclude_cols or [])
    X = df.drop(columns=[c for c in exclude_cols if c in df.columns], errors="ignore")
    Xn = X.select_dtypes(include="number")
    if Xn.shape[1] < 5 or len(df) < 200:
        return 0.0, {"anomaly_rate": None, "skipped": True}, None
    Xn = Xn.fillna(Xn.median(numeric_only=True))
    iso = IsolationForest(contamination=cfg["iforest_contamination"], random_state=42)
    iso.fit(Xn)
    scores = -iso.score_samples(Xn)
    thr = np.quantile(scores, 1.0 - cfg["iforest_contamination"])
    rate = float((scores >= thr).mean())
    risk = clamp01(rate / cfg["anomaly_rate_high"])
    flag = None
    if rate >= cfg["anomaly_rate_high"]:
        flag = ("High anomaly concentration", "High", "anomaly",
                f"Anomaly rate={rate:.1%} (thr {cfg['anomaly_rate_high']:.1%})",
                "Inspect anomalous rows; verify provenance and constraints.")
    return risk, {"anomaly_rate": rate}, flag


In [8]:
print("Upload your dataset (CSV/XLSX):")
up = files.upload()
ds_name = next(iter(up.keys()))

def load_df(name):
    if name.lower().endswith(".csv"):
        return pd.read_csv(io.BytesIO(up[name]))
    if name.lower().endswith(".xlsx"):
        return pd.read_excel(io.BytesIO(up[name]))
    raise ValueError("Use CSV or XLSX")

df = load_df(ds_name)
df = add_row_id(df)
print("Dataset shape:", df.shape)

# run modules
ds_flags = []
ds_risks = {}
ds_metrics = {}

for fn, key in [
    (check_missingness, "missingness"),
    (check_exact_duplicates, "duplicates"),
    (check_near_duplicates_minhash, "near_dups"),
    (check_survey_fraud, "survey_fraud"),
    (check_heaping, "heaping"),
    (check_likert_variance, "likert_var"),
    (check_dependence, "dependence"),
    (check_anomalies, "anomaly"),
]:
    r, m, f = fn(df, CONFIG, exclude_cols=EXCLUDE) if fn not in [check_missingness] else fn(df, CONFIG)
    ds_risks[key] = float(r)
    ds_metrics.update(m)
    if f: ds_flags.append(f)

# dataset score (weighted by internal weights)
ds_weights = {
    "missingness": 0.12, "duplicates": 0.16, "near_dups": 0.18, "survey_fraud": 0.18,
    "heaping": 0.10, "likert_var": 0.08, "dependence": 0.10, "anomaly": 0.08
}
ds_total_w = sum(ds_weights.values())
ds_weighted_risk = sum(ds_weights[k]*ds_risks.get(k,0.0) for k in ds_weights) / ds_total_w
ds_weighted_risk = clamp01(ds_weighted_risk)

dataset_score = int(round(100*(1.0 - ds_weighted_risk)))
dataset_band = band_from_score(dataset_score)

dataset_score, dataset_band, ds_metrics.get("overall_missingness")


Upload your dataset (CSV/XLSX):


Saving 10_Year_Road_Accident_Analysis_8_States.xlsx to 10_Year_Road_Accident_Analysis_8_States.xlsx
Dataset shape: (80, 10)


(91, 'LOW', 0.0)

In [9]:
print("Upload a ZIP of reference PDFs (your plagiarism corpus):")
upc = files.upload()
zip_name = next(iter(upc.keys()))
corpus_dir = "corpus_pdfs"
os.makedirs(corpus_dir, exist_ok=True)

# unzip
with zipfile.ZipFile(io.BytesIO(upc[zip_name]), 'r') as z:
    z.extractall(corpus_dir)

# collect pdf paths
pdf_paths = []
for root, _, files_ in os.walk(corpus_dir):
    for f in files_:
        if f.lower().endswith(".pdf"):
            pdf_paths.append(os.path.join(root, f))

print("Corpus PDFs found:", len(pdf_paths))
pdf_paths[:5]


Upload a ZIP of reference PDFs (your plagiarism corpus):


Saving test.zip to test.zip
Corpus PDFs found: 38


['corpus_pdfs/__MACOSX/test/._Assessing the Shift Towards Entrepreneurship among Students in North East Indian Universities_ A Behavioral and Institutional Analysis.pdf',
 'corpus_pdfs/__MACOSX/test/._DESIGN-3.pdf',
 'corpus_pdfs/__MACOSX/test/._Ranjit Sir.pdf',
 'corpus_pdfs/__MACOSX/test/._1-s2.0-S2949948824000295-main.pdf',
 'corpus_pdfs/__MACOSX/test/._2006SystemsThinkingDissertation.pdf']

In [10]:
def extract_pdf_text(path):
    # best-effort extraction
    text = ""
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                t = page.extract_text() or ""
                text += t + "\n"
    except:
        try:
            reader = PdfReader(path)
            for p in reader.pages:
                text += (p.extract_text() or "") + "\n"
        except:
            text = ""
    return text.strip()

# Build corpus paragraphs
corpus_items = []  # (doc_id, doc_name, paragraph_text)
for i, p in enumerate(pdf_paths):
    t = extract_pdf_text(p)
    if len(t) < 200:
        continue
    paras = split_paragraphs(t)
    doc_name = os.path.basename(p)
    for para in paras:
        if len(para) >= 200:
            corpus_items.append((i, doc_name, para))

print("Corpus paragraphs:", len(corpus_items))

# Embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed in batches
batch = 64
embs = []
meta = []  # (doc_id, doc_name, para_idx)
for start in range(0, len(corpus_items), batch):
    chunk = corpus_items[start:start+batch]
    texts = [x[2] for x in chunk]
    v = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
    embs.append(v)
    meta.extend([(x[0], x[1], start+i) for i, x in enumerate(chunk)])

emb = np.vstack(embs).astype("float32")
dim = emb.shape[1]

# FAISS index (cosine via inner product on normalized vectors)
index = faiss.IndexFlatIP(dim)
index.add(emb)

print("FAISS index size:", index.ntotal, "dim:", dim)




Corpus paragraphs: 20


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index size: 20 dim: 384


In [12]:
print("Upload the target PDF to check (manuscript/thesis):")
upt = files.upload()
target_pdf = next(iter(upt.keys()))

# Save to disk for extraction
target_path = os.path.join(ART_DIR, target_pdf)
with open(target_path, "wb") as f:
    f.write(upt[target_pdf])

target_text = extract_pdf_text(target_path)
target_paras = [p for p in split_paragraphs(target_text) if len(p) >= 200]
print("Target paragraphs:", len(target_paras))

# --- Semantic plagiarism (top matches per paragraph) ---
top_k = 3
target_vec = model.encode(target_paras, normalize_embeddings=True, show_progress_bar=False).astype("float32")
D, I = index.search(target_vec, top_k)  # D is cosine sim (0..1)

# Build top match table
matches = []
for pi in range(len(target_paras)):
    for rank in range(top_k):
        mi = int(I[pi, rank])
        sim = float(D[pi, rank])
        doc_id, doc_name, meta_idx = meta[mi]
        matches.append({
            "TargetPara": pi,
            "Sim": round(sim, 3),
            "CorpusDoc": doc_name,
            "CorpusParaId": meta_idx
        })

match_df = pd.DataFrame(matches).sort_values(["TargetPara", "Sim"], ascending=[True, False])

# semantic risk = share of paragraphs with very high similarity
sem_thr = CONFIG["plag_semantic_high"]
high_sem = (D[:,0] >= sem_thr).mean() if len(D) else 0.0
semantic_risk = clamp01(high_sem / 0.12)  # if >12% paras are high-sim => strong risk

# --- Exact plagiarism proxy: fingerprint overlap using word 5-grams ---
def fingerprints(text, k=5):
    toks = re.findall(r"[A-Za-z0-9]+", text.lower())
    if len(toks) < k: return set()
    return set(" ".join(toks[i:i+k]) for i in range(len(toks)-k+1))

# Build a compact fingerprint set for corpus by sampling top N paragraphs
# (keeps Colab fast; for production you’d index fingerprints)
sample_corpus = corpus_items[:min(len(corpus_items), 3000)]
corpus_fp = set()
for _, _, para in sample_corpus:
    corpus_fp |= fingerprints(para, k=5)

target_fp = set()
for para in target_paras[:min(len(target_paras), 400)]:
    target_fp |= fingerprints(para, k=5)

overlap = (len(target_fp & corpus_fp) / max(1, len(target_fp))) if len(target_fp) else 0.0
exact_risk = clamp01(overlap / CONFIG["plag_exact_high"])

# Combined plagiarism risk
plagiarism_risk = clamp01(0.65*semantic_risk + 0.35*exact_risk)
plagiarism_score = int(round(100*(1.0 - plagiarism_risk)))
plagiarism_band = band_from_score(plagiarism_score)

high_sem, overlap, plagiarism_score, plagiarism_band


Upload the target PDF to check (manuscript/thesis):


Saving Final Thesis.pdf to Final Thesis.pdf
Target paragraphs: 1


(np.float64(0.0), 0.0015973985224063667, 99, 'LOW')

In [13]:
def stylometry_features(paras):
    feats = []
    for p in paras:
        sents = re.split(r"[.!?]+", p)
        sents = [s.strip() for s in sents if len(s.strip()) > 0]
        words = re.findall(r"[A-Za-z]+", p.lower())
        if not words:
            feats.append([0,0,0,0,0])
            continue
        avg_sent_len = np.mean([len(re.findall(r"[A-Za-z]+", s)) for s in sents]) if sents else 0
        word_len = np.mean([len(w) for w in words])
        uniq_ratio = len(set(words)) / max(1, len(words))
        punct_rate = len(re.findall(r"[,;:]", p)) / max(1, len(words))
        stop_like = sum(w in {"the","and","of","to","in","for","with","on","as","by","is","are"} for w in words)/max(1,len(words))
        feats.append([avg_sent_len, word_len, uniq_ratio, punct_rate, stop_like])
    return np.array(feats, dtype=float)

def repetition_ratio(text):
    words = re.findall(r"[A-Za-z]+", text.lower())
    if len(words) < 50: return 0.0
    # repeated bigrams ratio
    bigrams = [words[i]+"_"+words[i+1] for i in range(len(words)-1)]
    return 1.0 - (len(set(bigrams)) / max(1, len(bigrams)))

def redundancy_proxy(paras):
    # proxy: if many paragraphs are semantically very close to each other (self-sim)
    if len(paras) < 20:
        return 0.0
    v = model.encode(paras[:200], normalize_embeddings=True, show_progress_bar=False).astype("float32")
    sims = v @ v.T
    np.fill_diagonal(sims, 0.0)
    # share of paragraph pairs above 0.90 (very redundant)
    upper = sims[np.triu_indices_from(sims, k=1)]
    return float(np.mean(upper > 0.90))

# stylometry drift: compare first third vs last third
paras = target_paras
F = stylometry_features(paras)
if len(F) >= 30:
    a = F[:len(F)//3].mean(axis=0)
    b = F[-len(F)//3:].mean(axis=0)
    drift = float(np.linalg.norm(a-b) / (np.linalg.norm(a)+1e-9))
else:
    drift = 0.0

rep = repetition_ratio(target_text)
red = redundancy_proxy(paras)

# risk mapping
drift_risk = clamp01(drift / CONFIG["ai_drift_high"])
rep_risk = clamp01(rep / CONFIG["ai_repeat_high"])
red_risk = clamp01(red / CONFIG["ai_redundancy_high"])

ai_writing_risk = clamp01(0.45*drift_risk + 0.30*rep_risk + 0.25*red_risk)
ai_writing_score = int(round(100*(1.0 - ai_writing_risk)))
ai_writing_band = band_from_score(ai_writing_score)

drift, rep, red, ai_writing_score, ai_writing_band


(0.0, 0.4810509282916028, 0.0, 70, 'MEDIUM')

In [14]:
# Combine risks
overall_w = CONFIG["overall_weights"]
overall_risk = clamp01(
    overall_w["dataset"]*ds_weighted_risk +
    overall_w["plagiarism"]*plagiarism_risk +
    overall_w["ai_writing"]*ai_writing_risk
)
overall_score = int(round(100*(1.0 - overall_risk)))
overall_band = band_from_score(overall_score)
overall_verdict = verdict_from_band(overall_band)

overall_score, overall_band, overall_verdict


(89,
 'LOW',
 'Likely genuine/original (no strong synthetic/manipulation signals)')

In [15]:
# Tables for report
ds_driver_rows = []
for k,w in ds_weights.items():
    ds_driver_rows.append({
        "Module": k,
        "Risk (0-1)": round(ds_risks.get(k,0.0), 3),
        "Weight": round(w,3),
        "Impact": round(w*ds_risks.get(k,0.0), 3)
    })
ds_driver_df = pd.DataFrame(ds_driver_rows).sort_values("Impact", ascending=False)

ds_flags_df = pd.DataFrame([{
    "Flag": f[0], "Severity": f[1], "Module": f[2], "Evidence": f[3], "Action": f[4]
} for f in ds_flags]) if ds_flags else pd.DataFrame([{
    "Flag":"No major dataset flags triggered.", "Severity":"-", "Module":"-", "Evidence":"-", "Action":"-"
}])

doc_summary_df = pd.DataFrame([
    {"Component":"Dataset Integrity", "Score":dataset_score, "Band":dataset_band},
    {"Component":"Plagiarism Risk", "Score":plagiarism_score, "Band":plagiarism_band},
    {"Component":"AI-writing Risk", "Score":ai_writing_score, "Band":ai_writing_band},
    {"Component":"Overall Research Integrity", "Score":overall_score, "Band":overall_band},
])

plag_summary_df = pd.DataFrame([
    {"Metric":"High-similarity paragraph share (semantic)", "Value": f"{high_sem:.1%} (thr {CONFIG['plag_semantic_high']})"},
    {"Metric":"Exact fingerprint overlap (5-gram)", "Value": f"{overlap:.2%}"},
    {"Metric":"Plagiarism score", "Value": f"{plagiarism_score}/100 ({plagiarism_band})"},
])

ai_summary_df = pd.DataFrame([
    {"Metric":"Stylometry drift", "Value": f"{drift:.3f}"},
    {"Metric":"Repetition ratio", "Value": f"{rep:.3f}"},
    {"Metric":"Redundancy proxy", "Value": f"{red:.3f}"},
    {"Metric":"AI-writing score", "Value": f"{ai_writing_score}/100 ({ai_writing_band})"},
])

# Charts
os.makedirs("pdf_artifacts", exist_ok=True)

# dataset missingness
miss = df.isna().mean().sort_values(ascending=False).head(15)
plt.figure(figsize=(10,4))
miss.plot(kind="bar")
plt.title("Top 15 Columns by Missingness")
plt.ylabel("Missingness rate")
plt.tight_layout()
missingness_png = "pdf_artifacts/missingness.png"
plt.savefig(missingness_png, dpi=220)
plt.close()

# top drivers chart
top6 = ds_driver_df.head(6).copy()
plt.figure(figsize=(8,4))
plt.bar(top6["Module"], top6["Impact"])
plt.title("Top Dataset Integrity Drivers (Impact)")
plt.ylabel("Impact (weight × risk)")
plt.xticks(rotation=25, ha="right")
plt.tight_layout()
drivers_png = "pdf_artifacts/top_drivers.png"
plt.savefig(drivers_png, dpi=220)
plt.close()

# radar chart (3 components)
labels = ["Dataset", "Plagiarism", "AI-writing"]
vals = [ds_weighted_risk, plagiarism_risk, ai_writing_risk]
angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
vals_c = vals + vals[:1]
angles_c = angles + angles[:1]

plt.figure(figsize=(6,6))
ax = plt.subplot(111, polar=True)
ax.plot(angles_c, vals_c, linewidth=2)
ax.fill(angles_c, vals_c, alpha=0.25)
ax.set_thetagrids(np.degrees(angles), labels)
ax.set_title("Risk Radar (0 = low, 1 = high)")
ax.set_ylim(0, 1)
radar_png = "pdf_artifacts/radar.png"
plt.tight_layout()
plt.savefig(radar_png, dpi=220)
plt.close()

missingness_png, drivers_png, radar_png


('pdf_artifacts/missingness.png',
 'pdf_artifacts/top_drivers.png',
 'pdf_artifacts/radar.png')

In [16]:
def band_color_hex(band):
    return {"LOW":"#2e7d32","MEDIUM":"#ef6c00","HIGH":"#c62828"}.get(band, "#000000")

styles = getSampleStyleSheet()
title_style = ParagraphStyle("T", parent=styles["Title"], fontName="Helvetica-Bold", fontSize=20,
                             alignment=TA_CENTER, textColor=colors.HexColor("#1f2a44"), spaceAfter=10)
h_style = ParagraphStyle("H", parent=styles["Heading2"], fontName="Helvetica-Bold", fontSize=13,
                         textColor=colors.HexColor("#1f2a44"), spaceBefore=10, spaceAfter=6)
body_style = ParagraphStyle("B", parent=styles["BodyText"], fontName="Helvetica", fontSize=10, leading=14)
small_style = ParagraphStyle("S", parent=styles["BodyText"], fontName="Helvetica", fontSize=9, leading=12,
                             textColor=colors.HexColor("#444444"))

def make_table(df, col_widths):
    data = [list(df.columns)] + df.astype(str).values.tolist()
    t = Table(data, colWidths=col_widths)
    t.setStyle(TableStyle([
        ("BACKGROUND", (0,0), (-1,0), colors.HexColor("#e8edf6")),
        ("TEXTCOLOR", (0,0), (-1,0), colors.HexColor("#1f2a44")),
        ("FONTNAME", (0,0), (-1,0), "Helvetica-Bold"),
        ("FONTNAME", (0,1), (-1,-1), "Helvetica"),
        ("FONTSIZE", (0,0), (-1,-1), 9),
        ("GRID", (0,0), (-1,-1), 0.4, colors.HexColor("#b8c2d6")),
        ("ROWBACKGROUNDS", (0,1), (-1,-1), [colors.white, colors.HexColor("#f7f9fd")]),
        ("VALIGN", (0,0), (-1,-1), "TOP"),
        ("LEFTPADDING", (0,0), (-1,-1), 6),
        ("RIGHTPADDING", (0,0), (-1,-1), 6),
        ("TOPPADDING", (0,0), (-1,-1), 4),
        ("BOTTOMPADDING", (0,0), (-1,-1), 4),
    ]))
    return t

pdf_path = "Research_Integrity_Scorecard.pdf"
doc = SimpleDocTemplate(pdf_path, pagesize=A4, leftMargin=2*cm, rightMargin=2*cm, topMargin=1.6*cm, bottomMargin=1.6*cm)

elements = []
elements.append(Paragraph("Research Integrity Scorecard (Prototype)", title_style))
elements.append(Paragraph(f"<b>Date:</b> {datetime.datetime.now().strftime('%d %b %Y, %H:%M')}", body_style))
elements.append(Paragraph(f"<b>Dataset:</b> {safe_str(ds_name, 80)}", body_style))
elements.append(Paragraph(f"<b>Target PDF:</b> {safe_str(target_pdf, 80)}", body_style))
elements.append(Spacer(1, 10))

# Score banner
band_hex = band_color_hex(overall_band)
score_box = Table([[
    Paragraph(f"<b>Overall Integrity Score:</b> {overall_score}/100", body_style),
    Paragraph(f"<b>Risk Band:</b> <font color='{band_hex}'>{overall_band}</font>", body_style),
]], colWidths=[9.0*cm, 6.5*cm])
score_box.setStyle(TableStyle([
    ("BACKGROUND", (0,0), (-1,-1), colors.HexColor("#f0f4fb")),
    ("BOX", (0,0), (-1,-1), 1.0, colors.HexColor("#b8c2d6")),
    ("LEFTPADDING", (0,0), (-1,-1), 10),
    ("RIGHTPADDING", (0,0), (-1,-1), 10),
    ("TOPPADDING", (0,0), (-1,-1), 10),
    ("BOTTOMPADDING", (0,0), (-1,-1), 10),
]))
elements.append(score_box)
elements.append(Spacer(1, 6))
elements.append(Paragraph(f"<b>Verdict:</b> {overall_verdict}", body_style))
elements.append(Spacer(1, 10))

elements.append(Paragraph("Component Scores", h_style))
elements.append(make_table(doc_summary_df, [7.0*cm, 3.0*cm, 5.5*cm]))
elements.append(Spacer(1, 10))

elements.append(Paragraph("Risk Radar", h_style))
elements.append(Image(radar_png, width=12*cm, height=12*cm))
elements.append(Spacer(1, 10))

# Dataset section
elements.append(Paragraph("A) Dataset Integrity (Primary)", h_style))
elements.append(Paragraph(
    "Dataset Integrity Score reflects the likelihood the dataset is genuine vs manipulated/synthetic based on "
    "forensic signals (duplicates/template clusters, survey fraud patterns, distribution anomalies, dependence structure, and anomalies). "
    "Lower scores indicate stronger manipulation/synthetic signals.",
    body_style
))
elements.append(Spacer(1, 6))
elements.append(Paragraph(f"<b>Dataset Score:</b> {dataset_score}/100 ({dataset_band})", body_style))
elements.append(Spacer(1, 8))

elements.append(Paragraph("Top Drivers (Dataset)", h_style))
elements.append(make_table(ds_driver_df.head(8), [4.0*cm, 2.2*cm, 2.2*cm, 3.0*cm]))
elements.append(Spacer(1, 8))
elements.append(Image(drivers_png, width=16*cm, height=6.5*cm))
elements.append(Spacer(1, 8))

elements.append(Paragraph("Missingness Profile", h_style))
elements.append(Image(missingness_png, width=16*cm, height=6.5*cm))
elements.append(PageBreak())

elements.append(Paragraph("Dataset Flags & Evidence", h_style))
ds_flags_pdf = ds_flags_df.copy()
ds_flags_pdf["Evidence"] = ds_flags_pdf["Evidence"].apply(lambda x: safe_str(x, 140))
ds_flags_pdf["Action"] = ds_flags_pdf["Action"].apply(lambda x: safe_str(x, 120))
elements.append(make_table(ds_flags_pdf, [4.0*cm, 1.6*cm, 2.2*cm, 4.6*cm, 3.0*cm]))
elements.append(PageBreak())

# Plagiarism section
elements.append(Paragraph("B) Plagiarism Risk (PDF vs Your Corpus)", h_style))
elements.append(Paragraph(
    "Plagiarism Risk combines semantic similarity (embedding search over your corpus) and exact phrase overlap (5-gram fingerprint proxy). "
    "This prototype compares only against the provided corpus ZIP for legal/controlled coverage.",
    body_style
))
elements.append(Spacer(1, 8))
elements.append(make_table(plag_summary_df, [7.5*cm, 8.0*cm]))
elements.append(Spacer(1, 10))

elements.append(Paragraph("Top Semantic Matches (sample)", h_style))
top_matches = match_df.head(30).copy()
elements.append(make_table(top_matches, [2.2*cm, 1.8*cm, 7.0*cm, 3.0*cm]))
elements.append(PageBreak())

# AI-writing section
elements.append(Paragraph("C) AI-writing Risk (Forensic Signals)", h_style))
elements.append(Paragraph(
    "AI-writing Risk is estimated using an explainable ensemble (stylometry drift, repetition ratio, and semantic redundancy). "
    "This is a risk indicator, not a definitive AI-authorship claim.",
    body_style
))
elements.append(Spacer(1, 8))
elements.append(make_table(ai_summary_df, [7.5*cm, 8.0*cm]))
elements.append(PageBreak())

# Audit Checklist (1 page)
elements.append(Paragraph("Audit Checklist (1 page)", h_style))
checklist = [
    "Verify provenance: collection logs, export evidence, IRB/consent records.",
    "Confirm sampling: frame, inclusion/exclusion, unique respondent IDs.",
    "Inspect duplicates/template clusters: repeated rows or repeated response patterns.",
    "Survey fraud review: straight-liners, long-string patterns; document exclusion rules and rerun.",
    "Distribution checks: rounding/heaping; confirm measurement precision.",
    "Dependence review: unusually high correlations; confirm constructs distinct; review preprocessing.",
    "Plagiarism review: inspect top matched spans and sources; verify citations and originality.",
    "AI-writing review: check stylometry drift/redundancy; review sections with strongest signals.",
    "Reproducibility: codebook + scripts; verify variables and transformations match manuscript methods.",
    "Record editorial decision trail with evidence."
]
for item in checklist:
    elements.append(Paragraph("• " + item, body_style))
    elements.append(Spacer(1, 4))
elements.append(Spacer(1, 10))
elements.append(Paragraph(
    "<i>Note:</i> This prototype provides automated risk signals and does not constitute definitive fraud or plagiarism proof. "
    "Always interpret in study context and follow institutional policy.",
    small_style
))

doc.build(elements)
print("PDF created:", pdf_path)
files.download(pdf_path)


PDF created: Research_Integrity_Scorecard.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>