<a href="https://colab.research.google.com/github/Ashish-Abraham/LLM-Boilerplate/blob/main/ComparisonStudyParaphraser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets==3.6.0 pandas  tqdm

In [None]:
import os
import requests
import zipfile
import io
import json

# ---------------------------
# Step 1: Download the dataset
# ---------------------------
url = "https://storage.googleapis.com/gresearch/dipper/dipper-training-data.zip"
data_dir = "dipper_data"
# Sentinel written only after a successful extraction, so a re-run of this cell
# (or Restart & Run All) does not download the archive again, while a run that
# crashed half-way through is still retried.
done_marker = os.path.join(data_dir, ".extracted")
os.makedirs(data_dir, exist_ok=True)

if os.path.exists(done_marker):
    print("Dataset already downloaded and extracted; skipping download.")
else:
    print("Downloading dataset...")
    # Without a timeout a stalled connection would hang the cell forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # ---------------------------
    # Step 2: Extract contents
    # ---------------------------
    print("Extracting...")
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(data_dir)
    with open(done_marker, "w", encoding="utf-8") as marker:
        marker.write(url + "\n")

# ---------------------------
# Step 3: Inspect top-level folders
# ---------------------------
for root, dirs, files in os.walk(data_dir):
    print(root)
    # Stop after showing first few paths
    if len(dirs) == 0 or "sents_1" in root:
        break

# ---------------------------
# Step 4: Find one JSONL file dynamically
# ---------------------------
jsonl_path = None
for root, _, files in os.walk(data_dir):
    for f in files:
        if f.endswith(".jsonl"):
            jsonl_path = os.path.join(root, f)
            break
    if jsonl_path:
        break

if jsonl_path:
    print(f"\nFound sample file:\n{jsonl_path}\n")

    # ---------------------------
    # Step 5: Read first few examples
    # ---------------------------
    shown = 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not valid JSON; skip them instead of crashing.
            if not line.strip():
                continue
            example = json.loads(line)
            print(json.dumps(example, indent=2))
            shown += 1
            if shown >= 3:  # show first 3
                break
else:
    # Make the "nothing found" outcome visible instead of ending silently.
    print(f"\nNo .jsonl file found under {data_dir}; nothing to preview.")

Downloading dataset...
Extracting...
dipper_data
dipper_data/par3
dipper_data/par3/gt_translator
dipper_data/par3/gt_translator/sents_8


In [None]:
!apt-get install tree

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 38 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 1s (42.4 kB/s)
Selecting previously unselected package tree.
(Reading database ... 126718 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# Print the directory layout under /content/dipper_data using the `tree` utility.
!tree /content/dipper_data

## Intel/roberta-base-mrpc

In [None]:
# Cell: evaluate Intel/roberta-base-mrpc
# Run this cell (requires internet to download HF models & datasets)
import os, re, json, math, zipfile, io
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Optional, Tuple, List
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             matthews_corrcoef, confusion_matrix, roc_auc_score, roc_curve,
                             classification_report)
import matplotlib.pyplot as plt
import seaborn as sns

# ----- CONFIG -----
# Inference tunables.
BATCH_SIZE = 64
MAX_LENGTH = 256
USE_MIXED_PRECISION = True

# Hugging Face checkpoint evaluated by this cell.
MODEL_NAME = "Intel/roberta-base-mrpc"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Absolute directory that receives every metrics file and figure; created eagerly.
OUTPUT_ROOT = Path("./output/roberta_mrpc").resolve()
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# ----- cleaning/parsing helpers (unchanged except robust column handling) -----
def clean_sentence(sent: str) -> str:
    """Strip DIPPER control codes and markup from one TSV cell.

    Removes the ``lexical = <int>`` / ``order = <int>`` control codes (together
    with the comma that separates them), both the opening ``<sent>`` and the
    closing ``</sent>`` tag, and collapses runs of whitespace. If the cell
    contains double-quoted pieces, only the quoted content is kept.

    Args:
        sent: Raw cell value; anything is accepted and passed through ``str``.

    Returns:
        The cleaned sentence (possibly an empty string).
    """
    s = str(sent)
    # The optional trailing comma is consumed too, so "lexical = 40, order = 20 text"
    # does not leave a dangling ", " in front of the sentence.
    s = re.sub(r"lexical\s*=\s*\d+\s*,?", " ", s, flags=re.IGNORECASE)
    s = re.sub(r"order\s*=\s*\d+\s*,?", " ", s, flags=re.IGNORECASE)
    # Drop the closing tag as well as the opening one.
    s = re.sub(r"</?sent>", " ", s)
    # If the cell contains quoted pieces, keep the quoted content (like original code)
    matches = re.findall(r'"([^"]+)"', s)
    if matches:
        s = " ".join(matches)
    s = re.sub(r"\s+", " ", s)
    s = s.replace(", ,", ",").strip()
    return s

def parse_dipper_tsv(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw DIPPER TSV frame into a positive-only pair frame.

    Only the first two columns are used, whatever their names. Both are cleaned
    with ``clean_sentence``; rows where either cleaned sentence has 8 or fewer
    characters are discarded and the remaining rows are labelled 1.

    Args:
        df: Raw frame as read from disk, or ``None``.

    Returns:
        Frame with columns ``sentence1``, ``sentence2``, ``label``; empty when
        ``df`` is ``None`` or has fewer than two columns.
    """
    if df is None or df.shape[1] < 2:
        return pd.DataFrame(columns=["sentence1", "sentence2", "label"])

    # Positional access: the header (if any) is irrelevant.
    left_col, right_col = df.columns[0], df.columns[1]
    pairs = pd.DataFrame({
        "sentence1": df[left_col].astype(str).map(clean_sentence),
        "sentence2": df[right_col].astype(str).map(clean_sentence),
    })

    # Same minimum-length threshold on both sides of the pair.
    long_enough = (pairs["sentence1"].str.len() > 8) & (pairs["sentence2"].str.len() > 8)
    kept = pairs[long_enough].copy()
    kept["label"] = 1
    return kept.dropna().reset_index(drop=True)

def _read_tsv(path: str, nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
    """
    Read a TSV file defensively.
    Return DataFrame with whatever columns read (header=None fallback).
    """
    try:
        # try without header first (most DIPPER tsvs are headerless)
        return pd.read_csv(path, sep="\t", nrows=nrows, header=None, engine="python", quoting=3)
    except Exception:
        try:
            # fallback: let pandas infer header
            return pd.read_csv(path, sep="\t", nrows=nrows, engine="python")
        except Exception:
            # last resort: return None
            return None

# ----- New recursive gather function that matches your tree -----
def gather_dipper_pos_val(root: str, nrows_val: Optional[int] = 1500) -> List[pd.DataFrame]:
    """Collect validation-like DIPPER TSVs found under ``sents_*`` directories.

    Walks ``root`` recursively in sorted order, so the order of the returned
    frames does not depend on the filesystem. For every ``.tsv`` file that sits
    directly in a directory whose name starts with ``sents_`` and whose file
    name looks like a validation file, the file is read with ``_read_tsv`` and
    parsed with ``parse_dipper_tsv``.

    Args:
        root: Directory to search.
        nrows_val: Maximum number of rows read from each file (``None`` = all).

    Returns:
        A list of positive-only frames (label=1), one per usable file; empty
        when ``root`` does not exist or nothing usable is found.
    """
    out = []
    if not os.path.exists(root):
        print(f"[gather_dipper_pos_val] DIPPER root not found: {root}")
        return out

    def is_valid_file(fname: str) -> bool:
        # Accept names containing 'valid', 'dev' or 'ctx_all' (which also covers
        # 'no_ctx_all'); anything containing 'train' is rejected first.
        lower = fname.lower()
        if "train" in lower:
            return False
        return any(token in lower for token in ("valid", "dev", "ctx_all"))

    found_files = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend in a fixed order; downstream
        # seeded sampling depends on the row order produced here.
        dirnames.sort()
        # focus on directories named 'sents_*'
        if not os.path.basename(dirpath).startswith("sents_"):
            continue
        for fname in sorted(filenames):
            if not fname.lower().endswith(".tsv"):
                continue
            if not is_valid_file(fname):
                continue
            fpath = os.path.join(dirpath, fname)
            found_files.append(fpath)
            try:
                df_raw = _read_tsv(fpath, nrows=nrows_val)
                df = parse_dipper_tsv(df_raw) if df_raw is not None else None
                if df is not None and len(df) > 0:
                    out.append(df)
                    print(f"  ✓ found DIPPER valid-like: {os.path.relpath(fpath, start=root)} -> {len(df)}")
                else:
                    print(f"  ○ read but no usable rows: {os.path.relpath(fpath, start=root)}")
            except Exception as e:
                # One unreadable file must not abort the whole scan.
                print(f"  ! failed reading {fpath}: {e}")

    if not found_files:
        print(f"[gather_dipper_pos_val] No valid/dev files found under {root}.")
    else:
        print(f"[gather_dipper_pos_val] Total valid-like files found: {len(found_files)}")

    return out

# ----- Add negatives -----
def add_negatives(df: pd.DataFrame, ratio=1.0, seed=42) -> pd.DataFrame:
    """Augment a positive-only pair frame with mismatched (label 0) pairs.

    A negative pairs the ``sentence1`` of one row with the ``sentence2`` of a
    *different* row. ``int(len(df) * ratio)`` negatives are produced, so
    ``ratio=1.0`` gives a balanced set. Pairing a random permutation of the rows
    with a rotated copy of itself guarantees that no row is paired with itself.

    Args:
        df: Frame with at least ``sentence1`` and ``sentence2`` columns.
        ratio: Negatives per positive; must be non-negative. The number of
            negatives is capped at ``n * (n - 1)``, the number of distinct
            cross-row pairs.
        seed: Seed for the permutation and for the final shuffle.

    Returns:
        Positives (label 1) and negatives (label 0), shuffled, with a fresh
        index. Frames with fewer than 4 rows are returned as an unmodified copy.

    Raises:
        ValueError: If ``ratio`` is negative.
    """
    pos = df.copy().reset_index(drop=True)
    n = len(pos)
    if n < 4:
        return pos
    if ratio < 0:
        raise ValueError("ratio must be non-negative")

    rng = np.random.default_rng(seed)
    order = rng.permutation(n)
    first = pos["sentence1"].to_numpy()
    second = pos["sentence2"].to_numpy()

    remaining = min(int(n * ratio), n * (n - 1))
    neg_parts = []
    shift = 1
    while remaining > 0:
        # Rotating by 1..n-1 positions never maps a row onto itself; each
        # rotation contributes up to n new mismatched pairs.
        take = min(remaining, n)
        partner = np.roll(order, -shift)
        neg_parts.append(pd.DataFrame({
            "sentence1": first[order[:take]],
            "sentence2": second[partner[:take]],
            "label": 0,
        }))
        remaining -= take
        shift += 1

    pos["label"] = 1
    out = (
        pd.concat([pos, *neg_parts], ignore_index=True)
        .sample(frac=1.0, random_state=seed)
        .reset_index(drop=True)
    )
    return out

def load_dipper_val(dipper_root: str = "/content/dipper_data", val_size_limit: int = 8000) -> pd.DataFrame:
    """Build the DIPPER evaluation frame.

    Aggregates every positive validation-like file found under ``dipper_root``
    (via ``gather_dipper_pos_val``), adds negatives with ``add_negatives`` and
    draws a seeded sample from the result.

    Args:
        dipper_root: Directory holding the extracted DIPPER data.
        val_size_limit: Upper bound on the number of sampled rows.

    Returns:
        Frame with columns ``sentence1``, ``sentence2``, ``label``.

    Raises:
        RuntimeError: If no usable validation-like file is found.
    """
    print(f"[load_dipper_val] Searching for DIPPER valid/dev files under: {dipper_root}")
    pos_list = gather_dipper_pos_val(dipper_root)
    if not pos_list:
        raise RuntimeError("No DIPPER valid-like files found under dipper_root. Check path and filename conventions.")
    pos_all = pd.concat(pos_list, ignore_index=True)
    # deduplicate near-identical pairs (optional): uncomment if needed
    # pos_all = pos_all.drop_duplicates(subset=["sentence1","sentence2"]).reset_index(drop=True)
    full = add_negatives(pos_all, ratio=1.0)
    val_size = min(val_size_limit, max(2000, int(0.1 * len(full))))
    # The 2000-row floor can exceed what is available; sampling more rows than
    # exist (without replacement) raises, so clamp to the population size.
    val_size = min(val_size, len(full))
    val_df = full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"[load_dipper_val] Loaded DIPPER val: {len(val_df)} samples (label dist: {val_df['label'].value_counts().to_dict()})")
    return val_df

# HC3 helpers (adapted from your code)
def to_list_safely(x):
    """Normalise an answer cell into a list of non-empty, stripped strings.

    Accepts ``None``/NaN scalars, lists, tuples, numpy arrays or any other
    scalar. Missing elements inside a container are skipped rather than being
    stringified into ``"None"`` / ``"nan"``.

    Args:
        x: The raw cell value.

    Returns:
        A (possibly empty) list of strings.
    """
    def _is_missing(value) -> bool:
        # pd.isna returns an array for array-likes; bool() of a multi-element
        # array raises ValueError, and such a value is a real container, not a
        # missing scalar.
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    if isinstance(x, np.ndarray):
        x = x.tolist()
    if isinstance(x, (list, tuple)):
        stripped = (str(e).strip() for e in x if not _is_missing(e))
        return [s for s in stripped if s]
    if _is_missing(x):
        return []
    s = str(x).strip()
    return [s] if s else []

def make_pairs_from_hc3_split(ds_split, add_context: bool, cartesian: bool = True):
    """Build (human answer, ChatGPT answer) pairs from one HC3 split.

    Each output row has the prefixed human answer in column 0 (optionally with
    the question wrapped in ``<sent> … </sent>``) and the ChatGPT answer in
    column 1. Records missing either kind of answer are skipped.

    Args:
        ds_split: Object exposing ``to_pandas()``; the ``human_answers``,
            ``chatgpt_answers`` and (optionally) ``question`` columns are read.
        add_context: Prepend the question when it is non-empty.
        cartesian: Pair every human answer with every ChatGPT answer when True;
            otherwise pair them position by position up to the shorter list.

    Returns:
        DataFrame with integer column labels ``0`` and ``1``.
    """
    frame = ds_split.to_pandas()
    has_question = "question" in frame.columns
    prefix = "lexical = NA, order = NA"

    pairs = []
    for _, record in frame.iterrows():
        question = str(record["question"]).strip() if has_question else ""
        humans = to_list_safely(record.get("human_answers"))
        machines = to_list_safely(record.get("chatgpt_answers"))
        if not (humans and machines):
            continue

        if cartesian:
            combos = [(h, a) for h in humans for a in machines]
        else:
            # zip stops at the shorter list, i.e. the first min(len, len) positions.
            combos = list(zip(humans, machines))

        with_question = bool(add_context and question)
        for human, machine in combos:
            if with_question:
                source = f"{prefix}  <sent> {question} </sent> {human}"
            else:
                source = f"{prefix} {human}"
            pairs.append((source, machine))

    return pd.DataFrame(pairs, columns=[0, 1])

def build_hc3_tsvs(cartesian=True):
    """Build the HC3 evaluation frame.

    Loads ``Hello-SimpleAI/HC3`` (config ``all``), builds pairs with and without
    question context for every split, concatenates them, adds negatives with
    ``add_negatives`` and draws a seeded sample.

    Args:
        cartesian: Forwarded to ``make_pairs_from_hc3_split``.

    Returns:
        Frame with columns ``sentence1``, ``sentence2``, ``label``.

    Raises:
        RuntimeError: If the dataset exposes no splits.
    """
    print("Loading HC3 (Hello-SimpleAI/HC3, config='all') ...")
    hc3 = load_dataset("Hello-SimpleAI/HC3", "all")
    all_splits = {}
    for split in hc3.keys():
        sizes = {}
        # Same construction for both variants; only the context flag differs.
        for tag, with_ctx in (("no_ctx", False), ("ctx", True)):
            pairs = make_pairs_from_hc3_split(hc3[split], add_context=with_ctx, cartesian=cartesian)
            pairs.columns = ["sentence1", "sentence2"]
            pairs["label"] = 1
            all_splits[f"hc3_{split}_{tag}"] = pairs
            sizes[tag] = len(pairs)
        print(f"  HC3 {split}: no_ctx={sizes['no_ctx']} ctx={sizes['ctx']}")
    # We'll return concatenation of available splits for evaluation
    if len(all_splits) == 0:
        raise RuntimeError("HC3 had no splits.")
    combined = pd.concat(list(all_splits.values()), ignore_index=True)
    # Add negatives by shuffling similar to DIPPER
    combined_full = add_negatives(combined, ratio=1.0)
    # sample validation-size portion (keep reasonable)
    val_size = min(8000, max(2000, int(0.1 * len(combined_full))))
    # The 2000-row floor can exceed what is available; sampling more rows than
    # exist (without replacement) raises, so clamp to the population size.
    val_size = min(val_size, len(combined_full))
    val_df = combined_full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"Built HC3-eval: {len(val_df)} samples")
    return val_df

# ----- Torch Dataset -----
class PairEvalDataset(Dataset):
    """Torch dataset that tokenizes one sentence pair per item.

    The frame must provide ``sentence1``, ``sentence2`` and ``label`` columns.
    Every item is truncated/padded to ``max_length`` by the tokenizer.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length=256):
        # A fresh 0..n-1 index lets __getitem__ look rows up by position label.
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        encoded = self.tok(
            str(row["sentence1"]),
            str(row["sentence2"]),
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch axis of size 1; drop it.
        item = {}
        for key, tensor in encoded.items():
            item[key] = tensor.squeeze(0)
        item["label"] = torch.tensor(int(row["label"]), dtype=torch.long)
        return item

# ----- Evaluation functions -----
def save_confusion_matrix(cm: np.ndarray, dataset_name: str, output_dir: str, normalize: bool = False):
    """Render a 2x2 confusion matrix as a heatmap and save it as a PNG.

    Args:
        cm: Confusion matrix of counts (rows = true label, columns = predicted).
        dataset_name: Used in the figure title and, lower-cased with spaces
            replaced by underscores, in the file name.
        output_dir: Directory that receives the PNG; created if missing.
        normalize: When True, each row is divided by its sum. Rows that sum to
            zero are shown as 0.

    Returns:
        None. Writes ``<dataset>_confusion_matrix[_normalized].png``.
    """
    fmt = 'd'
    cbar_label = 'Count'
    title = f"{dataset_name} - Confusion Matrix"
    cm_plot = cm
    if normalize:
        counts = cm.astype(float)
        row_sums = counts.sum(axis=1, keepdims=True)
        # Without `out=`, entries masked out by `where` are left uninitialised,
        # so an all-zero row would be plotted with arbitrary values.
        cm_plot = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums != 0)
        fmt = '.2f'
        cbar_label = 'Proportion'
        title = f"{dataset_name} - Normalized Confusion Matrix"
    plt.figure(figsize=(5.5,4.5))
    sns.heatmap(cm_plot, annot=True, fmt=fmt, cmap='Blues',
                xticklabels=['Not Paraphrase','Paraphrase'], yticklabels=['Not Paraphrase','Paraphrase'],
                cbar_kws={'label': cbar_label})
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    suffix = '_normalized' if normalize else ''
    os.makedirs(output_dir, exist_ok=True)
    fname = os.path.join(output_dir, f"{dataset_name.lower().replace(' ','_')}_confusion_matrix{suffix}.png")
    plt.savefig(fname, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {fname}")

def evaluate_and_save(model, tokenizer, df: pd.DataFrame, dataset_name: str, output_dir: str, batch_size: int = 64, device: torch.device = DEVICE):
    """Run the classifier over ``df`` and persist metrics and confusion matrices.

    Class index 1 of the model's logits is treated as "paraphrase"; a pair is
    predicted positive when its softmax probability exceeds 0.5.

    Args:
        model: Sequence-classification model returning an object with ``.logits``.
        tokenizer: Tokenizer passed to ``PairEvalDataset``.
        df: Frame with ``sentence1``, ``sentence2`` and ``label`` columns.
        dataset_name: Label used in log lines, figure titles and file names.
        output_dir: Directory receiving the confusion-matrix PNGs and
            ``<dataset>_metrics.json``.
        batch_size: Inference batch size.
        device: Device used for the model and the batches.

    Returns:
        ``(metrics, (labels, probs))`` where ``metrics`` is a JSON-serialisable
        dict and ``labels`` / ``probs`` are lists in the row order of ``df``.
        ``(None, (None, None))`` when ``df`` is empty or ``None``.
    """
    if df is None or len(df) == 0:
        print(f"Skipping {dataset_name} - no data")
        return None, (None, None)
    print(f"\nEvaluating {dataset_name} ({len(df)} samples)")
    ds = PairEvalDataset(df, tokenizer, max_length=MAX_LENGTH)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False)
    device = torch.device(device)  # also accepts a plain string such as "cpu"
    model.to(device); model.eval()
    all_preds = []; all_labels = []; all_probs = []

    # Mixed precision only makes sense when the model actually runs on CUDA.
    use_amp = USE_MIXED_PRECISION and device.type == "cuda" and torch.cuda.is_available()
    use_bf16 = use_amp and torch.cuda.get_device_capability(0)[0] >= 8
    amp_dtype = torch.bfloat16 if use_bf16 else torch.float16
    with torch.no_grad():
        # torch.autocast("cuda", ...) is the non-deprecated spelling of torch.cuda.amp.autocast(...).
        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=use_amp):
            for batch in loader:
                model_inputs = {
                    'input_ids': batch['input_ids'].to(device),
                    'attention_mask': batch['attention_mask'].to(device),
                }
                # Not every tokenizer emits segment ids.
                token_type_ids = batch.get('token_type_ids', None)
                if token_type_ids is not None:
                    model_inputs['token_type_ids'] = token_type_ids.to(device)
                logits = model(**model_inputs).logits
                # Cast to float32 before leaving torch: numpy has no bfloat16.
                probs = F.softmax(logits.float(), dim=-1)[:, 1].cpu().numpy()
                preds = (probs > 0.5).astype(int)
                all_preds.extend(preds.tolist())
                all_labels.extend(batch['label'].cpu().numpy().tolist())
                all_probs.extend(probs.tolist())

    acc = accuracy_score(all_labels, all_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)
    mcc = matthews_corrcoef(all_labels, all_preds) if len(set(all_labels))>1 else 0.0
    cm = confusion_matrix(all_labels, all_preds, labels=[0,1])

    auc = None; fpr = None; tpr = None; tpr_1pct = None
    try:
        auc = float(roc_auc_score(all_labels, all_probs))
        fpr, tpr, _ = roc_curve(all_labels, all_probs)
        if len(fpr)>1:
            tpr_1pct = float(np.interp(0.01, fpr, tpr))
    except ValueError as exc:
        # e.g. only one class present, or non-finite scores: report it and keep
        # the ROC fields as None instead of hiding the reason.
        auc = None; fpr = None; tpr = None; tpr_1pct = None
        print(f"  ROC/AUC not available for {dataset_name}: {exc}")

    metrics = {
        'dataset': dataset_name,
        'accuracy': float(acc),
        'precision': float(p),
        'recall': float(r),
        'f1': float(f1),
        'mcc': float(mcc),
        'auc': auc,
        'confusion_matrix': cm.tolist(),
        'samples': len(df),
        'tpr_at_1pct_fpr': tpr_1pct,
        'fpr': fpr.tolist() if fpr is not None else None,
        'tpr': tpr.tolist() if tpr is not None else None
    }
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=False)
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=True)
    with open(os.path.join(output_dir, f"{dataset_name.lower().replace(' ','_')}_metrics.json"), "w") as fh:
        json.dump(metrics, fh, indent=2)
    # Fixing labels=[0, 1] keeps the report aligned with target_names even when
    # one class never occurs in this dataset (otherwise sklearn raises).
    print(classification_report(all_labels, all_preds, labels=[0, 1],
                                target_names=['Not Paraphrase','Paraphrase'], zero_division=0))
    return metrics, (all_labels, all_probs)

# ----- Load model & tokenizer -----
# Both artefacts come from the same checkpoint; the model is moved to DEVICE
# right after loading (nn.Module.to returns the module itself).
print(f"Loading model/tokenizer: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)

# ----- Prepare datasets -----
def _load_or_empty(loader, tag):
    """Call ``loader``; on any failure print a warning and return an empty pair frame."""
    try:
        return loader()
    except Exception as e:
        print(f"Warning: {tag} load failed:", e)
        return pd.DataFrame(columns=["sentence1","sentence2","label"])

# A dataset that fails to load is skipped later rather than aborting the cell.
dipper_val = _load_or_empty(lambda: load_dipper_val("/content/dipper_data"), "DIPPER")
hc3_df = _load_or_empty(lambda: build_hc3_tsvs(cartesian=True), "HC3")

# Custom CSV
# Optional user-supplied pairs; every row is treated as a positive and negatives
# are synthesised with add_negatives. Any problem leaves an empty frame behind.
CUSTOM_CSV_PATH = "/content/custom_paraphrases.csv"
custom_df = None
custom_eval_df = pd.DataFrame(columns=["sentence1","sentence2","label"])

if not os.path.isfile(CUSTOM_CSV_PATH):
    print(f"Custom CSV not found at {CUSTOM_CSV_PATH}. Skipping custom.")
else:
    cdf = pd.read_csv(CUSTOM_CSV_PATH)
    required_cols = {"original_sentence","paraphrased_sentence"}
    if not required_cols.issubset(set(cdf.columns)):
        print("Custom CSV missing required columns. Skipping custom.")
    else:
        custom_df = pd.DataFrame({
            "sentence1": cdf["original_sentence"].astype(str),
            "sentence2": cdf["paraphrased_sentence"].astype(str),
            "label": 1
        })
        custom_eval_df = add_negatives(custom_df, ratio=1.0)
        print(f"Loaded custom CSV: {len(custom_eval_df)} (pos/neg: {custom_eval_df['label'].value_counts().to_dict()})")

OUT = OUTPUT_ROOT
OUT.mkdir(parents=True, exist_ok=True)

# ----- Evaluate DIPPER and HC3 -----
# all_metrics feeds the summary files; roc_entries keeps only what the ROC plot needs.
all_metrics = []
roc_entries = []
for df, name in ((dipper_val, "DIPPER_VAL"), (hc3_df, "HC3_VAL")):
    if df is None or len(df)==0:
        print(f"Skipping {name} - no data")
        continue

    metrics, (labels, probs) = evaluate_and_save(model, tokenizer, df, name, str(OUT), batch_size=BATCH_SIZE)
    if not metrics:
        continue
    all_metrics.append(metrics)

    # A ROC curve exists only when both the AUC and the curve points were computed.
    roc_available = metrics.get('auc') is not None and metrics.get('fpr') is not None
    if roc_available:
        roc_entries.append({
            'dataset': name,
            'fpr': np.array(metrics['fpr']),
            'tpr': np.array(metrics['tpr']),
            'auc': metrics['auc'],
            'accuracy': metrics['accuracy']
        })

summary_json_path = os.path.join(OUT, "summary.json")
with open(summary_json_path, "w") as fh:
    json.dump(all_metrics, fh, indent=2)
print(f"Saved summary.json to {OUT}")

# ----- Plot ROC curves comparison -----
if len(roc_entries) == 0:
    print('No ROC data available for plotting.')
else:
    fig, ax = plt.subplots(figsize=(8,6))
    palette = plt.get_cmap('tab10')
    legend_handles = []
    legend_texts = []
    for position, entry in enumerate(roc_entries):
        curve_fpr, curve_tpr = entry['fpr'], entry['tpr']
        if len(curve_fpr) == 0 or len(curve_tpr) == 0:
            continue

        curve_color = palette(position % 10)
        # TPR at 1% FPR is read off the curve by linear interpolation.
        try:
            tpr_at_1pct = float(np.interp(0.01, curve_fpr, curve_tpr))
        except Exception:
            tpr_at_1pct = 0.0

        ax.plot(curve_fpr, curve_tpr, label=None, color=curve_color, linewidth=2)
        # Proxy handles keep the legend independent of the plotted artists.
        legend_handles.append(plt.Line2D([0],[0], color=curve_color, lw=2))
        legend_texts.append(
            f"{entry['dataset']} (AUC={entry['auc']:.3f}, TPR@1%FPR={tpr_at_1pct:.3f}, Acc={entry['accuracy']:.3f})"
        )

    # Chance diagonal for reference.
    ax.plot([0,1],[0,1], linestyle='--', color='gray', linewidth=1.5, label='Random')
    legend_handles.append(plt.Line2D([0],[0], color='gray', lw=1.5, linestyle='--'))
    legend_texts.append('Random')

    ax.set_xlim([0.0,1.0])
    ax.set_ylim([0.0,1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
    ax.legend(legend_handles, legend_texts, loc='lower right', fontsize=10)
    ax.grid(alpha=0.3)

    roc_path = os.path.join(OUT, 'roc_comparison.png')
    fig.tight_layout()
    fig.savefig(roc_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved {roc_path}")

# ----- Plot performance metrics comparison -----
# Only drawn when at least two datasets were evaluated.
if len(all_metrics) > 1:
    metric_cols = ['accuracy','precision','recall','f1','mcc','auc','tpr_at_1pct_fpr']
    by_dataset = pd.DataFrame(all_metrics).set_index('dataset')

    fig, ax = plt.subplots(figsize=(12,6))
    by_dataset[metric_cols].plot(kind='bar', ax=ax, width=0.8)
    ax.set_title('Performance Metrics Comparison Across Datasets', fontsize=14, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12)
    ax.set_xlabel('Dataset', fontsize=12)
    ax.set_ylim(0,1.05)
    # Legend sits outside the axes so it never covers the bars.
    ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()

    comp_path = os.path.join(OUT, 'metrics_comparison.png')
    fig.savefig(comp_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved {comp_path}")

# ----- Print and save summary table -----
summary_df = pd.DataFrame(all_metrics)
if not summary_df.empty:
    # Scalar columns only; the ROC point arrays and confusion matrix are left out of the printout.
    display_cols = ['dataset','accuracy','precision','recall','f1','mcc',
                'auc','tpr_at_1pct_fpr','samples']
    rule = '='*80
    summary_csv_path = os.path.join(OUT, 'summary.csv')

    print('\n' + rule)
    print('SUMMARY TABLE')
    print(rule)
    print(summary_df[display_cols].round(4).to_string(index=False))
    print(rule)

    # The CSV keeps every column, including the ones hidden above.
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"\nSaved summary table to {summary_csv_path}")

# ----- Evaluate custom CSV and create per-model analysis -----
if custom_eval_df is not None and len(custom_eval_df)>0:
    print("\n" + "="*80)
    print("CUSTOM CSV EVALUATION - PER-MODEL ANALYSIS")
    print("="*80)

    metrics_c, (labels_c, probs_c) = evaluate_and_save(model, tokenizer, custom_eval_df, "CUSTOM_CSV", str(OUT), batch_size=BATCH_SIZE)

    # Reuse the probabilities evaluate_and_save already computed (same row order,
    # shuffle=False) instead of running a second inference pass; this also keeps
    # the per-sample results consistent with the saved CUSTOM_CSV metrics.
    probs_arr = np.asarray(probs_c, dtype=float)
    preds_df = pd.DataFrame({
        'pred': (probs_arr > 0.5).astype(int),
        'prob_paraphrase': probs_arr,
    })

    # Combine with original data
    report_df = custom_eval_df.copy().reset_index(drop=True)
    display_df = pd.concat([report_df, preds_df], axis=1)

    # Attach model names. custom_eval_df has been shuffled and extended with
    # synthetic negatives, so its rows no longer line up with the CSV rows:
    # a positional assignment would mislabel rows (or fail on a length mismatch).
    # Each pair is instead attributed to the model that produced its sentence2.
    # NOTE: if the same paraphrase text occurs for several models, the first
    # occurrence in the CSV wins.
    model_names = []
    if os.path.isfile(CUSTOM_CSV_PATH):
        orig_csv = pd.read_csv(CUSTOM_CSV_PATH)
        if {'model_name', 'paraphrased_sentence'}.issubset(orig_csv.columns):
            s2_to_model = (
                orig_csv.assign(_s2=orig_csv['paraphrased_sentence'].astype(str))
                .dropna(subset=['model_name'])
                .drop_duplicates(subset='_s2', keep='first')
                .set_index('_s2')['model_name']
            )
            display_df['model_name'] = display_df['sentence2'].astype(str).map(s2_to_model)
            n_unmapped = int(display_df['model_name'].isna().sum())
            if n_unmapped:
                print(f"Warning: {n_unmapped} rows could not be attributed to a model and are excluded from per-model analysis.")
            model_names = list(display_df['model_name'].dropna().unique())

    if model_names:
        # One sub-frame per model, in order of first appearance.
        per_model = {name: display_df[display_df['model_name'] == name] for name in model_names}

        # Calculate per-model metrics
        model_metrics = []
        for model_name, model_data in per_model.items():
            labels_m = model_data['label'].values.astype(int)
            preds_m = model_data['pred'].values.astype(int)

            acc = accuracy_score(labels_m, preds_m)
            p, r, f1, _ = precision_recall_fscore_support(labels_m, preds_m, average='binary', zero_division=0)

            model_metrics.append({
                'Model': model_name,
                'Precision (%)': p * 100,
                'Recall (%)': r * 100,
                'F1-Score (%)': f1 * 100,
                'Accuracy (%)': acc * 100,
                'Samples': len(model_data)
            })

        model_perf_df = pd.DataFrame(model_metrics)

        # Display and save per-model table
        print("\nPer-Model Performance:")
        print(model_perf_df.round(2).to_string(index=False))
        model_perf_df.to_csv(os.path.join(OUT, "per_model_metrics.csv"), index=False)
        print(f"\nSaved per-model metrics to {os.path.join(OUT, 'per_model_metrics.csv')}")

        # Plot per-model performance
        fig, ax = plt.subplots(figsize=(10, 6))
        x = np.arange(len(model_perf_df))
        width = 0.2

        ax.bar(x - 1.5*width, model_perf_df['Precision (%)'], width, label='Precision', alpha=0.8)
        ax.bar(x - 0.5*width, model_perf_df['Recall (%)'], width, label='Recall', alpha=0.8)
        ax.bar(x + 0.5*width, model_perf_df['F1-Score (%)'], width, label='F1-Score', alpha=0.8)
        ax.bar(x + 1.5*width, model_perf_df['Accuracy (%)'], width, label='Accuracy', alpha=0.8)

        ax.set_ylabel('Score (%)', fontsize=12)
        ax.set_xlabel('Model', fontsize=12)
        ax.set_title('Paraphrase Detection Performance by LLM', fontsize=14, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(model_perf_df['Model'], rotation=45, ha='right')
        ax.legend()
        ax.set_ylim(0, 105)
        ax.grid(axis='y', alpha=0.3)
        plt.tight_layout()

        model_plot_path = os.path.join(OUT, 'per_model_performance.png')
        plt.savefig(model_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved per-model plot to {model_plot_path}")

        # Plot average probability scores by model
        fig, ax = plt.subplots(figsize=(10, 6))
        model_probs = display_df.groupby('model_name')['prob_paraphrase'].agg(['mean', 'std'])
        model_probs = model_probs.sort_values('mean', ascending=False)

        ax.barh(model_probs.index, model_probs['mean'], xerr=model_probs['std'],
               capsize=5, alpha=0.7, color='steelblue')
        ax.set_xlabel('Average Paraphrase Probability', fontsize=12)
        ax.set_ylabel('Model', fontsize=12)
        ax.set_title('Average Paraphrase Detection Confidence by LLM', fontsize=14, fontweight='bold')
        ax.set_xlim(0, 1)
        ax.grid(axis='x', alpha=0.3)
        plt.tight_layout()

        prob_plot_path = os.path.join(OUT, 'model_confidence_scores.png')
        plt.savefig(prob_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved confidence scores plot to {prob_plot_path}")

        # Heatmap of per-model confusion matrices
        n_models = len(model_names)
        # squeeze=False always yields a 2-D axes array, so a single model needs no special case.
        fig, axes_grid = plt.subplots(1, n_models, figsize=(5*n_models, 4), squeeze=False)
        axes = axes_grid[0]

        for idx, (model_name, model_data) in enumerate(per_model.items()):
            cm = confusion_matrix(model_data['label'].astype(int), model_data['pred'].astype(int), labels=[0,1])

            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                       xticklabels=['Not Para','Para'],
                       yticklabels=['Not Para','Para'],
                       cbar=False)
            axes[idx].set_title(f'{model_name}', fontsize=11, fontweight='bold')
            axes[idx].set_ylabel('True' if idx == 0 else '', fontsize=10)
            axes[idx].set_xlabel('Predicted', fontsize=10)

        plt.suptitle('Confusion Matrices by LLM', fontsize=14, fontweight='bold', y=1.02)
        plt.tight_layout()

        cm_plot_path = os.path.join(OUT, 'model_confusion_matrices.png')
        plt.savefig(cm_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved confusion matrices plot to {cm_plot_path}")

        # Create error analysis: false positives and false negatives by model
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        error_data = []
        for model_name, model_data in per_model.items():
            is_pos = model_data['label'] == 1
            said_pos = model_data['pred'] == 1
            error_data.append({
                'Model': model_name,
                'False Positives': int((~is_pos & said_pos).sum()),
                'False Negatives': int((is_pos & ~said_pos).sum()),
                'True Positives': int((is_pos & said_pos).sum()),
                'True Negatives': int((~is_pos & ~said_pos).sum())
            })

        error_df = pd.DataFrame(error_data)

        # Plot false positives and false negatives
        x = np.arange(len(error_df))
        width = 0.35

        ax1.bar(x - width/2, error_df['False Positives'], width, label='False Positives', color='salmon', alpha=0.8)
        ax1.bar(x + width/2, error_df['False Negatives'], width, label='False Negatives', color='lightcoral', alpha=0.8)
        ax1.set_ylabel('Count', fontsize=11)
        ax1.set_xlabel('Model', fontsize=11)
        ax1.set_title('Error Analysis: False Positives vs False Negatives', fontsize=12, fontweight='bold')
        ax1.set_xticks(x)
        ax1.set_xticklabels(error_df['Model'], rotation=45, ha='right')
        ax1.legend()
        ax1.grid(axis='y', alpha=0.3)

        # Plot true positives and true negatives
        ax2.bar(x - width/2, error_df['True Positives'], width, label='True Positives', color='mediumseagreen', alpha=0.8)
        ax2.bar(x + width/2, error_df['True Negatives'], width, label='True Negatives', color='lightgreen', alpha=0.8)
        ax2.set_ylabel('Count', fontsize=11)
        ax2.set_xlabel('Model', fontsize=11)
        ax2.set_title('Correct Predictions: True Positives vs True Negatives', fontsize=12, fontweight='bold')
        ax2.set_xticks(x)
        ax2.set_xticklabels(error_df['Model'], rotation=45, ha='right')
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        error_plot_path = os.path.join(OUT, 'model_error_analysis.png')
        plt.savefig(error_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved error analysis plot to {error_plot_path}")

        # Distribution of confidence scores by model
        fig, ax = plt.subplots(figsize=(12, 6))

        for model_name, model_data in per_model.items():
            ax.hist(model_data['prob_paraphrase'], bins=30, alpha=0.5, label=model_name, edgecolor='black')

        ax.set_xlabel('Paraphrase Probability', fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title('Distribution of Paraphrase Detection Confidence Scores by LLM', fontsize=14, fontweight='bold')
        # Draw the threshold line before building the legend so it is listed there.
        ax.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Decision Threshold')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)
        plt.tight_layout()

        dist_plot_path = os.path.join(OUT, 'confidence_distribution.png')
        plt.savefig(dist_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved confidence distribution plot to {dist_plot_path}")

    # Save detailed per-sample results
    display_df.to_csv(os.path.join(OUT, "custom_results_detailed.csv"), index=False)
    print(f"\nSaved detailed results to {os.path.join(OUT, 'custom_results_detailed.csv')}")
    print(f"Total samples evaluated: {len(display_df)}")
else:
    print("\nNo custom CSV results to display.")

# Closing banner: where the results were written and which artifacts to expect.
banner_rule = "="*80
generated_file_lines = (
    "  - summary.json, summary.csv: Overall metrics across datasets",
    "  - roc_comparison.png: ROC curves for DIPPER and HC3",
    "  - metrics_comparison.png: Bar chart comparing all metrics",
    "  - per_model_metrics.csv: Performance table by LLM",
    "  - per_model_performance.png: Bar chart of metrics by LLM",
    "  - model_confidence_scores.png: Average confidence by LLM",
    "  - model_confusion_matrices.png: Confusion matrices grid",
    "  - model_error_analysis.png: False positive/negative analysis",
    "  - confidence_distribution.png: Probability distributions",
    "  - custom_results_detailed.csv: Per-sample predictions",
)

print("\n" + banner_rule)
print("EVALUATION COMPLETE")
print(banner_rule)
print(f"All results saved to: {OUT}")
print("\nGenerated files:")
for generated_file_line in generated_file_lines:
    print(generated_file_line)
print(banner_rule)


Loading model/tokenizer: Intel/roberta-base-mrpc ...
[load_dipper_val] Searching for DIPPER valid/dev files under: /content/dipper_data
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx.tsv -> 1483
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx.tsv -> 1479
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_all.tsv -> 1484
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx.tsv -> 1478
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx.tsv -> 1474
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all_small.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_all_s

  if pd.isna(x): return []
  if pd.isna(x): return []


  HC3 train: no_ctx=60205 ctx=60205
Built HC3-eval: 8000 samples
Loaded custom CSV: 90 (pos/neg: {1: 60, 0: 30})

Evaluating DIPPER_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/roberta_mrpc/dipper_val_confusion_matrix.png
Saved /content/output/roberta_mrpc/dipper_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.39      1.00      0.56      2673
    Paraphrase       1.00      0.21      0.34      5327

      accuracy                           0.47      8000
     macro avg       0.69      0.60      0.45      8000
  weighted avg       0.79      0.47      0.42      8000


Evaluating HC3_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/roberta_mrpc/hc3_val_confusion_matrix.png
Saved /content/output/roberta_mrpc/hc3_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.33      1.00      0.50      2664
    Paraphrase       1.00      0.00      0.00      5336

      accuracy                           0.33      8000
     macro avg       0.67      0.50      0.25      8000
  weighted avg       0.78      0.33      0.17      8000

Saved summary.json to /content/output/roberta_mrpc

Evaluating CUSTOM_CSV (90 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/roberta_mrpc/custom_csv_confusion_matrix.png
Saved /content/output/roberta_mrpc/custom_csv_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.78      0.93      0.85        30
    Paraphrase       0.96      0.87      0.91        60

      accuracy                           0.89        90
     macro avg       0.87      0.90      0.88        90
  weighted avg       0.90      0.89      0.89        90



Unnamed: 0,sentence1,sentence2,label,pred,prob_paraphrase
0,The possibility of approximating a continuous ...,Several studies have examined the ability of a...,1,1,0.995276
1,State-of-the-art object detection networks dep...,Leading object detection networks rely on regi...,1,1,0.995997
2,I swear I wasn’t going to do a “Top Whatevers ...,"I had resolved not to write a ""Top Whatever"" b...",1,0,0.221370
3,Our evolutionary history suggests that there w...,"Indeed, science and technology are distinctly ...",0,0,0.011346
4,The possibility of approximating a continuous ...,Numerous research papers have investigated the...,1,1,0.996303
...,...,...,...,...,...
85,The possibility of approximating a continuous ...,A feedforward neural network with one hidden l...,1,1,0.996354
86,"In fact, science and technology are clearly di...","In a recent opinion piece in The Conversation,...",0,0,0.006926
87,I swear I wasn’t going to do a “Top Whatevers ...,We investigate a new machine learning problem ...,0,0,0.037536
88,Our evolutionary history suggests that there w...,Our evolutionary history indicates that there ...,1,1,0.996995


Saved custom_results.csv to /content/output/roberta_mrpc


## Intel/bert-base-uncased-mrpc

In [None]:
import gc
import torch

# Release the previous model/tokenizer before the next checkpoint is loaded.
# The names are popped from globals() rather than removed with `del`, so this
# cell stays re-runnable: a bare `del model` raises NameError when the cell is
# executed twice or on a kernel where the previous evaluation cell never ran.
for _stale_name in ("model", "tokenizer"):
    globals().pop(_stale_name, None)
gc.collect()
# Hand cached CUDA allocator blocks back so the next model can use them.
torch.cuda.empty_cache()


In [None]:
# Cell: evaluate Intel/bert-base-uncased-mrpc
import os, re, json, zipfile, io
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Optional, List
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             matthews_corrcoef, confusion_matrix, roc_auc_score, roc_curve,
                             classification_report)
import matplotlib.pyplot as plt
import seaborn as sns

# ----- Run configuration for this model -----
# Hugging Face hub id of the checkpoint evaluated by this cell.
MODEL_NAME = "Intel/bert-base-uncased-mrpc"
# All plots, metrics and CSVs of this run are written below this directory.
OUTPUT_ROOT = Path("./output/bert_mrpc").resolve()
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64
# Token budget per sentence pair; longer pairs are truncated by the tokenizer.
MAX_LENGTH = 256
# Only takes effect when CUDA is available (see evaluate_and_save).
USE_MIXED_PRECISION = True

# ----- cleaning/parsing helpers (unchanged except robust column handling) -----
def clean_sentence(sent: str) -> str:
    """
    Strip DIPPER formatting artifacts from one TSV cell and return plain text.

    Removes the control-code prefix ("lexical = 40, order = 60" as well as the
    "lexical = NA, order = NA" variant this notebook writes for HC3) together
    with the comma separating the two codes, removes both "<sent>" and
    "</sent>" markers, and collapses whitespace. If the cell contains
    double-quoted pieces, only the quoted content is kept.

    Args:
        sent: Raw cell value; anything non-string is converted with str().

    Returns:
        The cleaned sentence, stripped of leading/trailing whitespace.
    """
    s = str(sent)
    # Consume the optional trailing comma too; otherwise "lexical = 40, order = 60 X"
    # is reduced to ", X" and the stray comma ends up in the model input.
    s = re.sub(r"\b(?:lexical|order)\s*=\s*(?:\d+|NA)\b\s*,?", " ", s, flags=re.IGNORECASE)
    # Drop opening and closing sentence markers alike.
    s = re.sub(r"</?sent>", " ", s)
    # If the cell contains quoted pieces, keep the quoted content (like original code)
    matches = re.findall(r'"([^"]+)"', s)
    if matches:
        s = " ".join(matches)
    s = re.sub(r"\s+", " ", s)
    s = s.replace(", ,", ",").strip()
    return s

def parse_dipper_tsv(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn a raw DIPPER TSV frame into cleaned positive sentence pairs.

    Only the first two columns are used, whatever their header. Both are
    cleaned with clean_sentence(), pairs where either side has 8 characters
    or fewer are dropped, and every surviving row is labelled 1.

    Returns:
        DataFrame with columns sentence1, sentence2, label (empty when `df`
        is None or has fewer than two columns).
    """
    if df is None or df.shape[1] < 2:
        return pd.DataFrame(columns=["sentence1", "sentence2", "label"])

    first_col, second_col = df.columns[0], df.columns[1]
    pairs = pd.DataFrame({
        "sentence1": df[first_col].astype(str).map(clean_sentence),
        "sentence2": df[second_col].astype(str).map(clean_sentence),
    })
    # Same length threshold on both sides of the pair.
    long_enough = pairs["sentence1"].str.len().gt(8) & pairs["sentence2"].str.len().gt(8)
    return (
        pairs.loc[long_enough]
        .assign(label=1)
        .dropna()
        .reset_index(drop=True)
    )

def _read_tsv(path: str, nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
    """
    Best-effort TSV reader.

    First tries a headerless read with quoting disabled; if that raises,
    retries letting pandas infer the header. Returns None when both attempts
    fail, so callers must handle a missing frame.
    """
    read_attempts = (
        {"header": None, "quoting": 3},  # headerless, QUOTE_NONE
        {},                              # fallback: pandas defaults
    )
    for extra_options in read_attempts:
        try:
            return pd.read_csv(path, sep="\t", nrows=nrows, engine="python", **extra_options)
        except Exception:
            continue
    return None

# ----- New recursive gather function that matches your tree -----
def gather_dipper_pos_val(root: str, nrows_val: Optional[int] = 1500) -> List[pd.DataFrame]:
    """
    Recursively search `root` for `sents_*` directories and collect valid/dev-like TSVs.

    Directories and files are visited in sorted order, so the returned list
    (and everything sampled from it with a fixed seed) does not depend on the
    order in which the filesystem happens to enumerate entries.

    Args:
        root: Directory to search.
        nrows_val: Maximum number of rows read from each file (None = all).

    Returns:
        A list of positive-only DataFrames (label=1), one per usable file.
        Empty when `root` does not exist or nothing matches.
    """
    out: List[pd.DataFrame] = []
    if not os.path.exists(root):
        print(f"[gather_dipper_pos_val] DIPPER root not found: {root}")
        return out

    def is_valid_file(fname: str) -> bool:
        """Accept valid/dev/ctx_all-style names; always reject train files."""
        lower = fname.lower()
        if "train" in lower:
            return False
        # "ctx_all" also covers "no_ctx_all".
        return any(tag in lower for tag in ("valid", "dev", "ctx_all"))

    found_files = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place steers os.walk's top-down descent deterministically.
        dirnames.sort()
        # focus on directories named 'sents_*'
        if not os.path.basename(dirpath).startswith("sents_"):
            continue
        for fname in sorted(filenames):
            if not fname.lower().endswith(".tsv"):
                continue
            if not is_valid_file(fname):
                continue
            fpath = os.path.join(dirpath, fname)
            found_files.append(fpath)
            try:
                df_raw = _read_tsv(fpath, nrows=nrows_val)
                df = parse_dipper_tsv(df_raw) if df_raw is not None else None
                if df is not None and len(df) > 0:
                    out.append(df)
                    print(f"  ✓ found DIPPER valid-like: {os.path.relpath(fpath, start=root)} -> {len(df)}")
                else:
                    print(f"  ○ read but no usable rows: {os.path.relpath(fpath, start=root)}")
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"  ! failed reading {fpath}: {e}")

    if not found_files:
        print(f"[gather_dipper_pos_val] No valid/dev files found under {root}.")
    else:
        print(f"[gather_dipper_pos_val] Total valid-like files found: {len(found_files)}")

    return out

# ----- Add negatives (unchanged) -----
def add_negatives(df: pd.DataFrame, ratio=1.0, seed=42) -> pd.DataFrame:
    """
    Append synthetic non-paraphrase pairs (label 0) to a frame of positive pairs.

    A negative is built by pairing sentence1 of one row with sentence2 of a
    different row. Candidates whose two source rows share the same sentence1
    or the same sentence2 are discarded: such a pair is a real paraphrase
    (duplicate rows, or several paraphrases of one original) and labelling
    it 0 would inject label noise.

    Args:
        df: Positive pairs with columns sentence1 and sentence2.
        ratio: Upper bound on negatives as a fraction of the positives. At most
            len(df) // 2 negatives are ever generated, so values above 0.5 do
            not add more.
        seed: Seed for the pairing permutation and the final shuffle.

    Returns:
        Shuffled frame of positives (label 1) plus negatives (label 0). Frames
        with fewer than 4 rows are returned as a copy without negatives.
    """
    pos = df.copy().reset_index(drop=True)
    n = len(pos)
    if n < 4:
        return pos
    rng = np.random.default_rng(seed)
    idx = rng.permutation(n)
    half = n // 2
    # Disjoint halves of the permutation: row k of `left` is paired with row k of `right`.
    left = pos.loc[idx[:half]]
    right = pos.loc[idx[-half:]]
    left_s1 = left["sentence1"].to_numpy()
    right_s2 = right["sentence2"].to_numpy()
    genuine = (left_s1 != right["sentence1"].to_numpy()) & (left["sentence2"].to_numpy() != right_s2)
    neg = pd.DataFrame({
        "sentence1": left_s1[genuine],
        "sentence2": right_s2[genuine],
        "label": 0
    })
    max_neg = int(len(pos) * ratio)
    if len(neg) > max_neg:
        neg = neg.sample(max_neg, random_state=seed)
    pos["label"] = 1
    parts = [pos] if neg.empty else [pos, neg]
    out = pd.concat(parts, ignore_index=True).sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return out

def load_dipper_val(dipper_root: str = "/content/dipper_data", val_size_limit: int = 8000) -> pd.DataFrame:
    """
    Build a DIPPER validation DF by aggregating all positive valid-like files found under dipper_root,
    then adding negatives and sampling a validation set.

    Args:
        dipper_root: Directory searched by gather_dipper_pos_val().
        val_size_limit: Maximum number of rows in the returned frame.

    Returns:
        Shuffled frame with columns sentence1, sentence2, label.

    Raises:
        RuntimeError: If no usable valid-like file is found under dipper_root.
    """
    print(f"[load_dipper_val] Searching for DIPPER valid/dev files under: {dipper_root}")
    pos_list = gather_dipper_pos_val(dipper_root)
    if not pos_list:
        raise RuntimeError("No DIPPER valid-like files found under dipper_root. Check path and filename conventions.")
    pos_all = pd.concat(pos_list, ignore_index=True)
    # deduplicate near-identical pairs (optional): uncomment if needed
    # pos_all = pos_all.drop_duplicates(subset=["sentence1","sentence2"]).reset_index(drop=True)
    full = add_negatives(pos_all, ratio=1.0)
    # Target 10% of the pool but at least 2000 rows, never more than the limit
    # or the pool itself: sampling more rows than exist raises ValueError.
    val_size = min(val_size_limit, max(2000, int(0.1 * len(full))), len(full))
    val_df = full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"[load_dipper_val] Loaded DIPPER val: {len(val_df)} samples (label dist: {val_df['label'].value_counts().to_dict()})")
    return val_df

# HC3 helpers (adapted from your code)
def to_list_safely(x):
    """
    Normalise a cell value into a list of non-empty, stripped strings.

    Accepts None/NaN (-> []), lists, tuples and numpy arrays (one string per
    element; missing or blank elements are skipped) and any other scalar
    (-> single-element list, or [] when blank).

    Containers are handled before any missing-value test: calling pd.isna()
    on an array returns an array, whose truth value is ambiguous.
    """
    def _is_missing(value) -> bool:
        # Scalar-only missing check; containers are never "missing" themselves.
        if value is None:
            return True
        if isinstance(value, (list, tuple, dict, set, np.ndarray)):
            return False
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    if isinstance(x, np.ndarray):
        x = x.tolist()
    if isinstance(x, (list, tuple)):
        return [str(e).strip() for e in x if not _is_missing(e) and str(e).strip()]
    if _is_missing(x):
        return []
    s = str(x).strip()
    return [s] if s else []

def make_pairs_from_hc3_split(ds_split, add_context: bool, cartesian: bool = True):
    """
    Build (human answer, ChatGPT answer) pairs from one HC3 split in DIPPER TSV layout.

    Column 0 holds the human answer behind the "lexical = NA, order = NA"
    prefix (with the question wrapped in <sent> ... </sent> when `add_context`
    is set and a question exists); column 1 holds the ChatGPT answer.

    Args:
        ds_split: Object exposing to_pandas(); rows are read from the
            "question", "human_answers" and "chatgpt_answers" columns.
        add_context: Prepend the question to the human answer.
        cartesian: Pair every human answer with every ChatGPT answer of the
            same row; otherwise pair them position by position.

    Returns:
        DataFrame with integer columns 0 and 1. Rows lacking either kind of
        answer contribute nothing.
    """
    df = ds_split.to_pandas()
    human_col = "human_answers"
    ai_col = "chatgpt_answers"
    has_question = "question" in df.columns
    prefix = "lexical = NA, order = NA"

    def _source_text(question: str, human_answer: str) -> str:
        # Single place that defines the column-0 layout for both pairing modes.
        if add_context and question:
            return f"{prefix}  <sent> {question} </sent> {human_answer}"
        return f"{prefix} {human_answer}"

    rows = []
    for _, r in df.iterrows():
        q = ""
        if has_question:
            raw_q = r["question"]
            # A missing question must not become the literal text "None"/"nan".
            is_missing = raw_q is None or (isinstance(raw_q, float) and raw_q != raw_q)
            q = "" if is_missing else str(raw_q).strip()
        human_list = to_list_safely(r.get(human_col))
        ai_list = to_list_safely(r.get(ai_col))
        if not human_list or not ai_list:
            continue
        if cartesian:
            pairs = [(h, a) for h in human_list for a in ai_list]
        else:
            # zip stops at the shorter list, i.e. min(len(human), len(ai)) pairs.
            pairs = list(zip(human_list, ai_list))
        rows.extend((_source_text(q, h), a) for h, a in pairs)
    return pd.DataFrame(rows, columns=[0, 1])

def build_hc3_tsvs(cartesian=True):
    """
    Download HC3 and build a labelled evaluation frame from it.

    For every split, human/ChatGPT answer pairs are generated twice (without
    and with the question as context) and labelled 1; all of them are
    concatenated, negatives are added with add_negatives(), and a sample of
    at most 8000 rows is returned.

    Args:
        cartesian: Forwarded to make_pairs_from_hc3_split().

    Returns:
        Shuffled frame with columns sentence1, sentence2, label.

    Raises:
        RuntimeError: If the dataset exposes no splits.
    """
    print("Loading HC3 (Hello-SimpleAI/HC3, config='all') ...")
    hc3 = load_dataset("Hello-SimpleAI/HC3", "all")
    all_splits = {}
    for split in hc3.keys():
        # NOTE(review): sentence1 keeps the "lexical = NA, order = NA" prefix and
        # <sent> markers written by make_pairs_from_hc3_split; unlike the DIPPER
        # path nothing cleans them before evaluation - confirm this is intended.
        df_no_ctx = make_pairs_from_hc3_split(hc3[split], add_context=False, cartesian=cartesian)
        df_no_ctx.columns = ["sentence1", "sentence2"]
        df_no_ctx["label"] = 1
        all_splits[f"hc3_{split}_no_ctx"] = df_no_ctx
        df_ctx = make_pairs_from_hc3_split(hc3[split], add_context=True, cartesian=cartesian)
        df_ctx.columns = ["sentence1", "sentence2"]
        df_ctx["label"] = 1
        all_splits[f"hc3_{split}_ctx"] = df_ctx
        print(f"  HC3 {split}: no_ctx={len(df_no_ctx)} ctx={len(df_ctx)}")
    # We'll return concatenation of available splits for evaluation
    if len(all_splits) == 0:
        raise RuntimeError("HC3 had no splits.")
    combined = pd.concat(list(all_splits.values()), ignore_index=True)
    # Add negatives by shuffling similar to DIPPER
    combined_full = add_negatives(combined, ratio=1.0)
    # Target 10% of the pool, at least 2000 and at most 8000 rows, never more
    # than the pool holds: sampling more rows than exist raises ValueError.
    val_size = min(8000, max(2000, int(0.1 * len(combined_full))), len(combined_full))
    val_df = combined_full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"Built HC3-eval: {len(val_df)} samples")
    return val_df

# ----- Torch Dataset -----
class PairEvalDataset(Dataset):
    """
    Map-style dataset that tokenizes one sentence pair per item.

    Each item is a dict of the tokenizer's tensors with the leading batch
    dimension squeezed away, plus a "label" tensor of dtype long.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length=256):
        # Positional 0..n-1 index so integer lookups in __getitem__ are valid.
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        first_text = str(row["sentence1"])
        second_text = str(row["sentence2"])
        encoded = self.tok(
            first_text,
            second_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        item = {}
        for key, tensor in encoded.items():
            item[key] = tensor.squeeze(0)
        item["label"] = torch.tensor(int(row["label"]), dtype=torch.long)
        return item

# ----- Evaluation functions -----
def save_confusion_matrix(cm: np.ndarray, dataset_name: str, output_dir: str, normalize: bool = False):
    """
    Render a 2x2 confusion matrix as a heatmap and save it as a PNG.

    Args:
        cm: Confusion matrix with rows = true label, columns = predicted label.
        dataset_name: Used in the plot title and (lower-cased, spaces replaced
            by underscores) in the file name.
        output_dir: Directory the PNG is written to.
        normalize: If True, plot row-wise proportions instead of counts and
            add a "_normalized" suffix to the file name.
    """
    fmt = 'd'
    cbar_label = 'Count'
    title = f"{dataset_name} - Confusion Matrix"
    cm_plot = cm
    if normalize:
        counts = np.asarray(cm, dtype=float)
        row_sums = counts.sum(axis=1, keepdims=True)
        # `out` must be supplied together with `where`: without it, entries of
        # rows that sum to zero are left uninitialised. Such rows now plot as 0.
        cm_plot = np.divide(counts, row_sums, out=np.zeros_like(counts), where=row_sums != 0)
        fmt = '.2f'
        cbar_label = 'Proportion'
        title = f"{dataset_name} - Normalized Confusion Matrix"
    plt.figure(figsize=(5.5,4.5))
    sns.heatmap(cm_plot, annot=True, fmt=fmt, cmap='Blues',
                xticklabels=['Not Paraphrase','Paraphrase'], yticklabels=['Not Paraphrase','Paraphrase'],
                cbar_kws={'label': cbar_label})
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    suffix = '_normalized' if normalize else ''
    fname = os.path.join(output_dir, f"{dataset_name.lower().replace(' ','_')}_confusion_matrix{suffix}.png")
    plt.savefig(fname, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {fname}")

def evaluate_and_save(model, tokenizer, df: pd.DataFrame, dataset_name: str, output_dir: str, batch_size: int = 64, device: torch.device = DEVICE):
    """
    Run the classifier over `df`, compute metrics, and write artifacts to `output_dir`.

    Rows are scored in their given order (no shuffling), so the returned
    probabilities line up with `df` after reset_index(drop=True). A pair is
    predicted as paraphrase when the softmax probability of class 1 exceeds 0.5.

    Writes two confusion-matrix PNGs and "<dataset_name>_metrics.json", and
    prints a classification report.

    Args:
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer handed to PairEvalDataset.
        df: Frame with columns sentence1, sentence2, label.
        dataset_name: Label used in logs and output file names.
        output_dir: Directory for the generated files.
        batch_size: Evaluation batch size.
        device: Device the model and batches are moved to.

    Returns:
        (metrics, (labels, probs)); `(None, (None, None))` when `df` is None
        or empty. `metrics['auc']`, `['fpr']`, `['tpr']` and
        `['tpr_at_1pct_fpr']` are None when the ROC could not be computed.
    """
    if df is None or len(df) == 0:
        print(f"Skipping {dataset_name} - no data")
        return None, (None, None)
    print(f"\nEvaluating {dataset_name} ({len(df)} samples)")
    ds = PairEvalDataset(df, tokenizer, max_length=MAX_LENGTH)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False)
    model.to(device); model.eval()
    all_preds = []; all_labels = []; all_probs = []
    amp_enabled = USE_MIXED_PRECISION and torch.cuda.is_available()
    use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    amp_dtype = torch.bfloat16 if use_bf16 else torch.float16
    with torch.no_grad():
        # torch.autocast("cuda", ...) replaces torch.cuda.amp.autocast(...), which
        # emits a FutureWarning on current PyTorch.
        with torch.autocast(device_type="cuda", enabled=amp_enabled, dtype=amp_dtype):
            for batch in loader:
                model_inputs = {
                    'input_ids': batch['input_ids'].to(device),
                    'attention_mask': batch['attention_mask'].to(device),
                }
                # Only some tokenizers emit segment ids; forward them when present.
                token_type_ids = batch.get('token_type_ids', None)
                if token_type_ids is not None:
                    model_inputs['token_type_ids'] = token_type_ids.to(device)
                logits = model(**model_inputs).logits
                # .float() guards against a reduced-precision dtype numpy cannot hold.
                probs = F.softmax(logits, dim=-1)[:, 1].float().cpu().numpy()
                preds = (probs > 0.5).astype(int)
                all_preds.extend(preds.tolist())
                all_labels.extend(batch['label'].cpu().numpy().tolist())
                all_probs.extend(probs.tolist())
    acc = accuracy_score(all_labels, all_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)
    mcc = matthews_corrcoef(all_labels, all_preds) if len(set(all_labels))>1 else 0.0
    cm = confusion_matrix(all_labels, all_preds, labels=[0,1])
    auc = None; fpr = None; tpr = None; tpr_1pct = None
    try:
        auc = float(roc_auc_score(all_labels, all_probs))
        fpr, tpr, _ = roc_curve(all_labels, all_probs)
        if len(fpr)>1:
            tpr_1pct = float(np.interp(0.01, fpr, tpr))
    except Exception as roc_error:
        # Best effort: keep the remaining metrics, but say why ROC is missing.
        print(f"  ! ROC/AUC unavailable for {dataset_name}: {roc_error}")
    metrics = {
        'dataset': dataset_name,
        'accuracy': float(acc),
        'precision': float(p),
        'recall': float(r),
        'f1': float(f1),
        'mcc': float(mcc),
        'auc': auc,
        'confusion_matrix': cm.tolist(),
        'samples': len(df),
        'tpr_at_1pct_fpr': tpr_1pct,
        'fpr': fpr.tolist() if fpr is not None else None,
        'tpr': tpr.tolist() if tpr is not None else None
    }
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=False)
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=True)
    with open(os.path.join(output_dir, f"{dataset_name.lower().replace(' ','_')}_metrics.json"), "w") as fh:
        json.dump(metrics, fh, indent=2)
    print(classification_report(all_labels, all_preds, target_names=['Not Paraphrase','Paraphrase']))
    return metrics, (all_labels, all_probs)

# ----- Load model & tokenizer -----
print(f"Loading model/tokenizer: {MODEL_NAME} ...")
# Tokenizer and model come from the same checkpoint id; the model is moved to
# DEVICE right away (Module.to returns the module itself).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)

# ----- Prepare datasets -----
# Each benchmark is loaded best-effort: a failure is reported and replaced by an
# empty frame so the remaining evaluations in this cell can still run.
benchmark_loaders = (
    ("Warning: DIPPER load failed:", lambda: load_dipper_val("/content/dipper_data")),
    ("Warning: HC3 load failed:", lambda: build_hc3_tsvs(cartesian=True)),
)
loaded_frames = []
for failure_message, load_benchmark in benchmark_loaders:
    try:
        loaded_frames.append(load_benchmark())
    except Exception as e:
        print(failure_message, e)
        loaded_frames.append(pd.DataFrame(columns=["sentence1","sentence2","label"]))
dipper_val, hc3_df = loaded_frames

# Custom CSV
CUSTOM_CSV_PATH = "/content/custom_paraphrases.csv"
custom_df = None
# Default to an empty evaluation frame; it is replaced only when the CSV exists
# and provides both required columns.
custom_eval_df = pd.DataFrame(columns=["sentence1","sentence2","label"])
if not os.path.isfile(CUSTOM_CSV_PATH):
    print(f"Custom CSV not found at {CUSTOM_CSV_PATH}. Skipping custom.")
else:
    cdf = pd.read_csv(CUSTOM_CSV_PATH)
    required_columns = {"original_sentence","paraphrased_sentence"}
    if not required_columns.issubset(cdf.columns):
        print("Custom CSV missing required columns. Skipping custom.")
    else:
        # Every CSV row is a known paraphrase pair; negatives are synthesised.
        custom_df = pd.DataFrame({
            "sentence1": cdf["original_sentence"].astype(str),
            "sentence2": cdf["paraphrased_sentence"].astype(str),
            "label": 1
        })
        custom_eval_df = add_negatives(custom_df, ratio=1.0)
        print(f"Loaded custom CSV: {len(custom_eval_df)} (pos/neg: {custom_eval_df['label'].value_counts().to_dict()})")

OUT = OUTPUT_ROOT
OUT.mkdir(parents=True, exist_ok=True)

# ----- Evaluate DIPPER and HC3 -----
# all_metrics feeds summary.json / summary.csv; roc_entries feeds the ROC plot.
all_metrics = []
roc_entries = []
benchmark_sets = [(dipper_val, "DIPPER_VAL"), (hc3_df, "HC3_VAL")]
for df, name in benchmark_sets:
    if df is None or len(df)==0:
        print(f"Skipping {name} - no data")
        continue
    metrics, (labels, probs) = evaluate_and_save(model, tokenizer, df, name, str(OUT), batch_size=BATCH_SIZE)
    if not metrics:
        continue
    all_metrics.append(metrics)
    # A ROC entry needs both the AUC and the curve points.
    has_roc = metrics.get('auc') is not None and metrics.get('fpr') is not None
    if has_roc:
        roc_entries.append({
            'dataset': name,
            'fpr': np.array(metrics['fpr']),
            'tpr': np.array(metrics['tpr']),
            'auc': metrics['auc'],
            'accuracy': metrics['accuracy']
        })

summary_json_path = os.path.join(OUT, "summary.json")
with open(summary_json_path, "w") as fh:
    json.dump(all_metrics, fh, indent=2)
print(f"Saved summary.json to {OUT}")

# ----- Plot ROC curves comparison -----
if len(roc_entries) > 0:
    # One ROC curve per dataset. Legend entries are assembled by hand so each
    # can carry AUC, TPR at 1% FPR and accuracy next to the dataset name.
    fig, ax = plt.subplots(figsize=(8,6))
    cmap = plt.get_cmap('tab10')
    legend_handles = []
    legend_labels = []
    for i, entry in enumerate(roc_entries):
        fpr, tpr = entry['fpr'], entry['tpr']
        auc_val, acc_val = entry['auc'], entry['accuracy']
        color = cmap(i % 10)

        if len(fpr) == 0 or len(tpr) == 0:
            continue

        try:
            tpr_at_1pct = float(np.interp(0.01, fpr, tpr))
        except Exception:
            tpr_at_1pct = 0.0
        ax.plot(fpr, tpr, label=None, color=color, linewidth=2)
        legend_handles.append(plt.Line2D([0],[0], color=color, lw=2))
        legend_labels.append(
            f"{entry['dataset']} (AUC={auc_val:.3f}, TPR@1%FPR={tpr_at_1pct:.3f}, Acc={acc_val:.3f})"
        )

    # Chance-level diagonal for reference.
    ax.plot([0,1],[0,1], linestyle='--', color='gray', linewidth=1.5, label='Random')
    ax.set_xlim([0.0,1.0])
    ax.set_ylim([0.0,1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
    random_handle = plt.Line2D([0],[0], color='gray', lw=1.5, linestyle='--')
    ax.legend(legend_handles + [random_handle],
              legend_labels + ['Random'], loc='lower right', fontsize=10)
    ax.grid(alpha=0.3)
    roc_path = os.path.join(OUT, 'roc_comparison.png')
    fig.tight_layout()
    fig.savefig(roc_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved {roc_path}")
else:
    print('No ROC data available for plotting.')

# ----- Plot performance metrics comparison -----
if len(all_metrics) > 1:
    # Grouped bars: one group per dataset, one bar per metric.
    comp_df = pd.DataFrame(all_metrics)
    comp_df_sorted = comp_df.set_index('dataset')
    plotted_metrics = ['accuracy','precision','recall','f1','mcc','auc','tpr_at_1pct_fpr']

    fig, ax = plt.subplots(figsize=(12,6))
    comp_df_sorted[plotted_metrics].plot.bar(ax=ax, width=0.8)
    ax.set_title('Performance Metrics Comparison Across Datasets', fontsize=14, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12)
    ax.set_xlabel('Dataset', fontsize=12)
    ax.set_ylim(0,1.05)
    # Legend sits outside the axes so it never covers the bars.
    ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    comp_path = os.path.join(OUT, 'metrics_comparison.png')
    fig.savefig(comp_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved {comp_path}")

# ----- Print and save summary table -----
summary_df = pd.DataFrame(all_metrics)
if not summary_df.empty:
    # Scalar columns only for the console table; the CSV keeps every column.
    display_cols = ['dataset','accuracy','precision','recall','f1','mcc',
                'auc','tpr_at_1pct_fpr','samples']
    table_rule = '='*80
    summary_csv_path = os.path.join(OUT, 'summary.csv')
    print('\n' + table_rule)
    print('SUMMARY TABLE')
    print(table_rule)
    print(summary_df[display_cols].round(4).to_string(index=False))
    print(table_rule)
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"\nSaved summary table to {summary_csv_path}")

# ----- Evaluate custom CSV and create per-model analysis -----
if custom_eval_df is not None and len(custom_eval_df)>0:
    print("\n" + "="*80)
    print("CUSTOM CSV EVALUATION - PER-MODEL ANALYSIS")
    print("="*80)

    metrics_c, (labels_c, probs_c) = evaluate_and_save(model, tokenizer, custom_eval_df, "CUSTOM_CSV", str(OUT), batch_size=BATCH_SIZE)

    # Per-sample predictions: reuse the probabilities evaluate_and_save() just
    # produced instead of running a second forward pass. They are in the row
    # order of custom_eval_df (the loader does not shuffle) and come from the
    # same call that produced the reported metrics. The previous second pass
    # dropped token_type_ids and autocast, so its predictions could disagree
    # with those metrics.
    preds_list = [
        {'pred': int(prob > 0.5), 'prob_paraphrase': float(prob)}
        for prob in probs_c
    ]

    # Combine with original data
    report_df = custom_eval_df.copy().reset_index(drop=True)
    preds_df = pd.DataFrame(preds_list)
    display_df = pd.concat([report_df, preds_df], axis=1)

    # Load original CSV to get model names
    if os.path.isfile(CUSTOM_CSV_PATH):
        orig_csv = pd.read_csv(CUSTOM_CSV_PATH)
        if 'model_name' in orig_csv.columns:
            # display_df was shuffled and extended with synthetic negatives by
            # add_negatives(), so its rows no longer line up with the CSV rows
            # (a positional copy also fails on the length mismatch). Attribute
            # each row, positive or negative, to the model that generated its
            # sentence2 by looking the paraphrase text up in the CSV.
            paraphrase_to_model = dict(zip(
                orig_csv['paraphrased_sentence'].astype(str),
                orig_csv['model_name'],
            ))
            display_df['model_name'] = display_df['sentence2'].map(paraphrase_to_model)

            # Calculate per-model metrics
            model_metrics = []
            for model_name in display_df['model_name'].unique():
                model_data = display_df[display_df['model_name'] == model_name]
                labels_m = model_data['label'].values
                preds_m = model_data['pred'].values

                acc = accuracy_score(labels_m, preds_m)
                p, r, f1, _ = precision_recall_fscore_support(labels_m, preds_m, average='binary', zero_division=0)

                model_metrics.append({
                    'Model': model_name,
                    'Precision (%)': p * 100,
                    'Recall (%)': r * 100,
                    'F1-Score (%)': f1 * 100,
                    'Accuracy (%)': acc * 100,
                    'Samples': len(model_data)
                })

            model_perf_df = pd.DataFrame(model_metrics)

            # Display and save per-model table
            print("\nPer-Model Performance:")
            print(model_perf_df.round(2).to_string(index=False))
            model_perf_df.to_csv(os.path.join(OUT, "per_model_metrics.csv"), index=False)
            print(f"\nSaved per-model metrics to {os.path.join(OUT, 'per_model_metrics.csv')}")

            # Plot per-model performance
            fig, ax = plt.subplots(figsize=(10, 6))
            x = np.arange(len(model_perf_df))
            width = 0.2

            ax.bar(x - 1.5*width, model_perf_df['Precision (%)'], width, label='Precision', alpha=0.8)
            ax.bar(x - 0.5*width, model_perf_df['Recall (%)'], width, label='Recall', alpha=0.8)
            ax.bar(x + 0.5*width, model_perf_df['F1-Score (%)'], width, label='F1-Score', alpha=0.8)
            ax.bar(x + 1.5*width, model_perf_df['Accuracy (%)'], width, label='Accuracy', alpha=0.8)

            ax.set_ylabel('Score (%)', fontsize=12)
            ax.set_xlabel('Model', fontsize=12)
            ax.set_title('Paraphrase Detection Performance by LLM', fontsize=14, fontweight='bold')
            ax.set_xticks(x)
            ax.set_xticklabels(model_perf_df['Model'], rotation=45, ha='right')
            ax.legend()
            ax.set_ylim(0, 105)
            ax.grid(axis='y', alpha=0.3)
            plt.tight_layout()

            model_plot_path = os.path.join(OUT, 'per_model_performance.png')
            plt.savefig(model_plot_path, dpi=300, bbox_inches='tight')
            plt.close()
            print(f"Saved per-model plot to {model_plot_path}")

            # Plot average probability scores by model
            fig, ax = plt.subplots(figsize=(10, 6))
            model_probs = display_df.groupby('model_name')['prob_paraphrase'].agg(['mean', 'std'])
            model_probs = model_probs.sort_values('mean', ascending=False)

            ax.barh(model_probs.index, model_probs['mean'], xerr=model_probs['std'],
                   capsize=5, alpha=0.7, color='steelblue')
            ax.set_xlabel('Average Paraphrase Probability', fontsize=12)
            ax.set_ylabel('Model', fontsize=12)
            ax.set_title('Average Paraphrase Detection Confidence by LLM', fontsize=14, fontweight='bold')
            ax.set_xlim(0, 1)
            ax.grid(axis='x', alpha=0.3)
            plt.tight_layout()

            prob_plot_path = os.path.join(OUT, 'model_confidence_scores.png')
            plt.savefig(prob_plot_path, dpi=300, bbox_inches='tight')
            plt.close()
            print(f"Saved confidence scores plot to {prob_plot_path}")

            # Heatmap of per-model confusion matrices
            n_models = len(model_perf_df)
            fig, axes = plt.subplots(1, n_models, figsize=(5*n_models, 4))
            if n_models == 1:
                axes = [axes]

            for idx, model_name in enumerate(display_df['model_name'].unique()):
                model_data = display_df[display_df['model_name'] == model_name]
                cm = confusion_matrix(model_data['label'], model_data['pred'], labels=[0,1])

                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                           xticklabels=['Not Para','Para'],
                           yticklabels=['Not Para','Para'],
                           cbar=False)
                axes[idx].set_title(f'{model_name}', fontsize=11, fontweight='bold')
                axes[idx].set_ylabel('True' if idx == 0 else '', fontsize=10)
                axes[idx].set_xlabel('Predicted', fontsize=10)

            plt.suptitle('Confusion Matrices by LLM', fontsize=14, fontweight='bold', y=1.02)
            plt.tight_layout()

            cm_plot_path = os.path.join(OUT, 'model_confusion_matrices.png')
            plt.savefig(cm_plot_path, dpi=300, bbox_inches='tight')
            plt.close()
            print(f"Saved confusion matrices plot to {cm_plot_path}")

            # Create error analysis: false positives and false negatives by model
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

            error_data = []
            for model_name in display_df['model_name'].unique():
                model_data = display_df[display_df['model_name'] == model_name]
                fp = ((model_data['label'] == 0) & (model_data['pred'] == 1)).sum()
                fn = ((model_data['label'] == 1) & (model_data['pred'] == 0)).sum()
                tp = ((model_data['label'] == 1) & (model_data['pred'] == 1)).sum()
                tn = ((model_data['label'] == 0) & (model_data['pred'] == 0)).sum()
                error_data.append({
                    'Model': model_name,
                    'False Positives': fp,
                    'False Negatives': fn,
                    'True Positives': tp,
                    'True Negatives': tn
                })

            error_df = pd.DataFrame(error_data)

            # Plot false positives and false negatives
            x = np.arange(len(error_df))
            width = 0.35

            ax1.bar(x - width/2, error_df['False Positives'], width, label='False Positives', color='salmon', alpha=0.8)
            ax1.bar(x + width/2, error_df['False Negatives'], width, label='False Negatives', color='lightcoral', alpha=0.8)
            ax1.set_ylabel('Count', fontsize=11)
            ax1.set_xlabel('Model', fontsize=11)
            ax1.set_title('Error Analysis: False Positives vs False Negatives', fontsize=12, fontweight='bold')
            ax1.set_xticks(x)
            ax1.set_xticklabels(error_df['Model'], rotation=45, ha='right')
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)

            # Plot true positives and true negatives
            ax2.bar(x - width/2, error_df['True Positives'], width, label='True Positives', color='mediumseagreen', alpha=0.8)
            ax2.bar(x + width/2, error_df['True Negatives'], width, label='True Negatives', color='lightgreen', alpha=0.8)
            ax2.set_ylabel('Count', fontsize=11)
            ax2.set_xlabel('Model', fontsize=11)
            ax2.set_title('Correct Predictions: True Positives vs True Negatives', fontsize=12, fontweight='bold')
            ax2.set_xticks(x)
            ax2.set_xticklabels(error_df['Model'], rotation=45, ha='right')
            ax2.legend()
            ax2.grid(axis='y', alpha=0.3)

            plt.tight_layout()
            error_plot_path = os.path.join(OUT, 'model_error_analysis.png')
            plt.savefig(error_plot_path, dpi=300, bbox_inches='tight')
            plt.close()
            print(f"Saved error analysis plot to {error_plot_path}")

            # Distribution of confidence scores by model
            fig, ax = plt.subplots(figsize=(12, 6))

            for model_name in display_df['model_name'].unique():
                model_data = display_df[display_df['model_name'] == model_name]
                ax.hist(model_data['prob_paraphrase'], bins=30, alpha=0.5, label=model_name, edgecolor='black')

            ax.set_xlabel('Paraphrase Probability', fontsize=12)
            ax.set_ylabel('Frequency', fontsize=12)
            ax.set_title('Distribution of Paraphrase Detection Confidence Scores by LLM', fontsize=14, fontweight='bold')
            ax.legend()
            ax.grid(axis='y', alpha=0.3)
            ax.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Decision Threshold')
            plt.tight_layout()

            dist_plot_path = os.path.join(OUT, 'confidence_distribution.png')
            plt.savefig(dist_plot_path, dpi=300, bbox_inches='tight')
            plt.close()
            print(f"Saved confidence distribution plot to {dist_plot_path}")

    # Save detailed per-sample results
    display_df.to_csv(os.path.join(OUT, "custom_results_detailed.csv"), index=False)
    print(f"\nSaved detailed results to {os.path.join(OUT, 'custom_results_detailed.csv')}")
    print(f"Total samples evaluated: {len(display_df)}")
else:
    print("\nNo custom CSV results to display.")

# Closing banner: report the output directory and list the artifact names this
# evaluation can produce. Several entries (the per-model files) are only written
# when the custom-CSV branch above ran. One print call; sep="\n" keeps each
# entry on its own line.
print(
    "\n" + "=" * 80,
    "EVALUATION COMPLETE",
    "=" * 80,
    f"All results saved to: {OUT}",
    "\nGenerated files:",
    "  - summary.json, summary.csv: Overall metrics across datasets",
    "  - roc_comparison.png: ROC curves for DIPPER and HC3",
    "  - metrics_comparison.png: Bar chart comparing all metrics",
    "  - per_model_metrics.csv: Performance table by LLM",
    "  - per_model_performance.png: Bar chart of metrics by LLM",
    "  - model_confidence_scores.png: Average confidence by LLM",
    "  - model_confusion_matrices.png: Confusion matrices grid",
    "  - model_error_analysis.png: False positive/negative analysis",
    "  - confidence_distribution.png: Probability distributions",
    "  - custom_results_detailed.csv: Per-sample predictions",
    "=" * 80,
    sep="\n",
)


Loading model/tokenizer: Intel/bert-base-uncased-mrpc ...
[load_dipper_val] Searching for DIPPER valid/dev files under: /content/dipper_data
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx.tsv -> 1483
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx.tsv -> 1479
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_all.tsv -> 1484
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx.tsv -> 1478
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx.tsv -> 1474
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all_small.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_

  if pd.isna(x): return []
  if pd.isna(x): return []


  HC3 train: no_ctx=60205 ctx=60205
Built HC3-eval: 8000 samples
Loaded custom CSV: 90 (pos/neg: {1: 60, 0: 30})

Evaluating DIPPER_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/bert_mrpc/dipper_val_confusion_matrix.png
Saved /content/output/bert_mrpc/dipper_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.39      1.00      0.56      2673
    Paraphrase       0.99      0.23      0.37      5327

      accuracy                           0.49      8000
     macro avg       0.69      0.61      0.47      8000
  weighted avg       0.79      0.49      0.44      8000


Evaluating HC3_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/bert_mrpc/hc3_val_confusion_matrix.png
Saved /content/output/bert_mrpc/hc3_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.34      1.00      0.51      2664
    Paraphrase       1.00      0.04      0.08      5336

      accuracy                           0.36      8000
     macro avg       0.67      0.52      0.30      8000
  weighted avg       0.78      0.36      0.22      8000

Saved summary.json to /content/output/bert_mrpc

Evaluating CUSTOM_CSV (90 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/bert_mrpc/custom_csv_confusion_matrix.png
Saved /content/output/bert_mrpc/custom_csv_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.82      0.93      0.88        30
    Paraphrase       0.96      0.90      0.93        60

      accuracy                           0.91        90
     macro avg       0.89      0.92      0.90        90
  weighted avg       0.92      0.91      0.91        90



Unnamed: 0,sentence1,sentence2,label,pred,prob_paraphrase
0,The possibility of approximating a continuous ...,Several studies have examined the ability of a...,1,0,0.419090
1,State-of-the-art object detection networks dep...,Leading object detection networks rely on regi...,1,1,0.909879
2,I swear I wasn’t going to do a “Top Whatevers ...,"I had resolved not to write a ""Top Whatever"" b...",1,0,0.007163
3,Our evolutionary history suggests that there w...,"Indeed, science and technology are distinctly ...",0,0,0.013192
4,The possibility of approximating a continuous ...,Numerous research papers have investigated the...,1,0,0.295652
...,...,...,...,...,...
85,The possibility of approximating a continuous ...,A feedforward neural network with one hidden l...,1,0,0.432861
86,"In fact, science and technology are clearly di...","In a recent opinion piece in The Conversation,...",0,0,0.006711
87,I swear I wasn’t going to do a “Top Whatevers ...,We investigate a new machine learning problem ...,0,0,0.004467
88,Our evolutionary history suggests that there w...,Our evolutionary history indicates that there ...,1,1,0.897419


Saved custom_results.csv to /content/output/bert_mrpc


## Intel/deberta-v3-base-mrpc

In [None]:
import gc
import torch

# Release the previously evaluated model/tokenizer before the next checkpoint is loaded.
# `globals().pop(name, None)` replaces `del name` so this cell is idempotent: it can be
# re-run, or executed on a fresh kernel where the earlier section was skipped, without
# raising NameError.
globals().pop("model", None)
globals().pop("tokenizer", None)
gc.collect()
# Hand cached CUDA blocks back to the driver now that the references are gone.
torch.cuda.empty_cache()


In [None]:
# Checkpoint evaluated in this section and the directory its artifacts are written to.
MODEL_NAME = "Intel/deberta-v3-base-mrpc"
OUTPUT_ROOT = Path("./output/deberta_v3_mrpc").resolve()

OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
# Prefer the GPU when one is visible; this is also the default `device` of evaluate_and_save.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64             # DataLoader batch size used for every evaluation pass
MAX_LENGTH = 256            # tokenizer truncation/padding length for each sentence pair
USE_MIXED_PRECISION = True  # enables autocast in evaluate_and_save when CUDA is available

# ----- cleaning/parsing helpers (unchanged except robust column handling) -----
def clean_sentence(sent: str) -> str:
    """
    Strip DIPPER control markup from one TSV cell and normalise its whitespace.

    Steps, in order:
      1. remove ``lexical = <digits>`` and ``order = <digits>`` control codes (case-insensitive);
      2. replace both ``<sent>`` and ``</sent>`` tags with a space;
      3. if the cell contains double-quoted pieces, keep only the quoted content,
         joined by single spaces;
      4. collapse whitespace runs, fold ``", ,"`` into ``","`` and trim;
      5. drop separators (commas/whitespace) left at the start once the control
         codes are gone, e.g. the comma from ``lexical = 40, order = 60``.

    Args:
        sent: raw cell value; it is converted with ``str()`` first.

    Returns:
        The cleaned sentence (possibly empty).
    """
    s = str(sent)
    s = re.sub(r"lexical\s*=\s*\d+", "", s, flags=re.IGNORECASE)
    s = re.sub(r"order\s*=\s*\d+", "", s, flags=re.IGNORECASE)
    # Opening and closing tags are both markup; previously only "<sent>" was removed,
    # leaving a literal "</sent>" inside the text handed to the classifier.
    s = re.sub(r"</?sent>", " ", s)
    # If the cell contains quoted pieces, keep the quoted content (like original code)
    matches = re.findall(r'"([^"]+)"', s)
    if matches:
        s = " ".join(matches)
    s = re.sub(r"\s+", " ", s)
    s = s.replace(", ,", ",").strip()
    # Removing the control codes leaves their separating comma at the front.
    s = re.sub(r"^[\s,]+", "", s)
    return s

def parse_dipper_tsv(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn a raw DIPPER TSV frame into a positive-only sentence-pair frame.

    Only the first two columns are used, whatever their header. Each cell is
    passed through ``clean_sentence``; rows where either cleaned sentence has
    8 characters or fewer are discarded. Every surviving row gets ``label = 1``.

    Args:
        df: frame as read from disk, or ``None``.

    Returns:
        Frame with columns ``sentence1``, ``sentence2``, ``label`` and a fresh
        0..n-1 index. Empty (same columns) when ``df`` is ``None`` or has fewer
        than two columns.
    """
    if df is None or df.shape[1] < 2:
        return pd.DataFrame(columns=["sentence1", "sentence2", "label"])

    # Positional pick of the two leading columns; header names are irrelevant.
    left_col, right_col = df.columns[0], df.columns[1]
    pairs = pd.DataFrame({
        "sentence1": df[left_col].astype(str).map(clean_sentence),
        "sentence2": df[right_col].astype(str).map(clean_sentence),
    })

    # Keep only pairs where both sides are longer than 8 characters.
    long_enough = pairs["sentence1"].str.len().gt(8) & pairs["sentence2"].str.len().gt(8)
    kept = pairs.loc[long_enough].copy()
    kept["label"] = 1
    return kept.dropna().reset_index(drop=True)

def _read_tsv(path: str, nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
    """
    Read a TSV file defensively.
    Return DataFrame with whatever columns read (header=None fallback).
    """
    try:
        # try without header first (most DIPPER tsvs are headerless)
        return pd.read_csv(path, sep="\t", nrows=nrows, header=None, engine="python", quoting=3)
    except Exception:
        try:
            # fallback: let pandas infer header
            return pd.read_csv(path, sep="\t", nrows=nrows, engine="python")
        except Exception:
            # last resort: return None
            return None

# ----- New recursive gather function that matches your tree -----
def gather_dipper_pos_val(root: str, nrows_val: Optional[int] = 1500) -> List[pd.DataFrame]:
    """
    Recursively search `root` for any sents_* directories and collect valid/dev-like TSVs.

    A file qualifies when it ends in ``.tsv``, its lower-cased name does not contain
    ``train`` and does contain ``valid``, ``dev`` or ``ctx_all`` (which also covers
    ``no_ctx_all``). Directories and files are visited in sorted order so the returned
    list, and therefore any seeded sampling done on its concatenation, does not depend
    on filesystem enumeration order.

    Args:
        root: directory to walk.
        nrows_val: maximum number of rows read from each file (``None`` reads all).

    Returns:
        A list of positive-only DataFrames (label=1), one per file that yielded
        at least one usable row. Empty when `root` does not exist.
    """
    out = []
    if not os.path.exists(root):
        print(f"[gather_dipper_pos_val] DIPPER root not found: {root}")
        return out

    def is_valid_file(fname: str) -> bool:
        """Accept valid/dev/ctx_all-style names, always rejecting training files."""
        lower = fname.lower()
        if "train" in lower:
            return False
        # "no_ctx_all" contains "ctx_all", so a single substring test covers both.
        return any(tag in lower for tag in ("valid", "dev", "ctx_all"))

    found_files = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes the top-down walk descend in a stable order.
        dirnames.sort()
        # focus on directories named 'sents_*'
        if not os.path.basename(dirpath).startswith("sents_"):
            continue
        for fname in sorted(filenames):
            if not fname.lower().endswith(".tsv") or not is_valid_file(fname):
                continue
            fpath = os.path.join(dirpath, fname)
            found_files.append(fpath)
            try:
                df_raw = _read_tsv(fpath, nrows=nrows_val)
                df = parse_dipper_tsv(df_raw) if df_raw is not None else None
                if df is not None and len(df) > 0:
                    out.append(df)
                    print(f"  ✓ found DIPPER valid-like: {os.path.relpath(fpath, start=root)} -> {len(df)}")
                else:
                    print(f"  ○ read but no usable rows: {os.path.relpath(fpath, start=root)}")
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"  ! failed reading {fpath}: {e}")

    if not found_files:
        print(f"[gather_dipper_pos_val] No valid/dev files found under {root}.")
    else:
        print(f"[gather_dipper_pos_val] Total valid-like files found: {len(found_files)}")

    return out

# ----- Add negatives (unchanged) -----
def add_negatives(df: pd.DataFrame, ratio=1.0, seed=42) -> pd.DataFrame:
    """
    Append synthetic non-paraphrase pairs to a frame of positive pairs.

    Rows are permuted with a seeded generator; ``sentence1`` of the first
    ``n // 2`` permuted rows is paired with ``sentence2`` of the last ``n // 2``
    permuted rows and labelled 0. At most ``n // 2`` negatives are therefore
    produced; `ratio` only has an effect when ``int(n * ratio)`` is smaller than
    that, in which case the negatives are down-sampled to ``int(n * ratio)``.

    Args:
        df: positive pairs with ``sentence1`` / ``sentence2`` columns.
        ratio: upper bound on negatives expressed as a fraction of ``len(df)``.
        seed: seed for the permutation, the optional down-sampling and the final shuffle.

    Returns:
        For fewer than 4 rows: an index-reset copy of `df`, untouched otherwise.
        Else: positives (label forced to 1) plus negatives, shuffled, index reset.
    """
    positives = df.copy().reset_index(drop=True)
    total = len(positives)
    if total < 4:
        return positives

    order = np.random.default_rng(seed).permutation(total)
    k = total // 2
    # The two slices of the permutation are disjoint, so each negative mixes
    # sentences taken from two different rows.
    negatives = pd.DataFrame({
        "sentence1": positives.loc[order[:k], "sentence1"].values,
        "sentence2": positives.loc[order[-k:], "sentence2"].values,
        "label": 0,
    })

    max_negatives = int(len(positives) * ratio)
    if len(negatives) > max_negatives:
        negatives = negatives.sample(max_negatives, random_state=seed)

    positives["label"] = 1
    combined = pd.concat([positives, negatives], ignore_index=True)
    return combined.sample(frac=1.0, random_state=seed).reset_index(drop=True)

def load_dipper_val(dipper_root: str = "/content/dipper_data", val_size_limit: int = 8000) -> pd.DataFrame:
    """
    Build a DIPPER validation DF by aggregating all positive valid-like files found under dipper_root,
    then adding negatives and sampling a validation set.

    The sample size is ``max(2000, 10% of the pool)`` capped by `val_size_limit` and by
    the pool size itself, so a small pool is returned whole (shuffled) instead of making
    ``DataFrame.sample`` raise for requesting more rows than exist.

    Args:
        dipper_root: directory searched by ``gather_dipper_pos_val``.
        val_size_limit: upper bound on the number of rows returned.

    Returns:
        Shuffled frame with ``sentence1``, ``sentence2``, ``label`` columns.

    Raises:
        RuntimeError: when no valid-like file yields any rows.
    """
    print(f"[load_dipper_val] Searching for DIPPER valid/dev files under: {dipper_root}")
    pos_list = gather_dipper_pos_val(dipper_root)
    if not pos_list:
        raise RuntimeError("No DIPPER valid-like files found under dipper_root. Check path and filename conventions.")
    pos_all = pd.concat(pos_list, ignore_index=True)
    # deduplicate near-identical pairs (optional): uncomment if needed
    # pos_all = pos_all.drop_duplicates(subset=["sentence1","sentence2"]).reset_index(drop=True)
    full = add_negatives(pos_all, ratio=1.0)
    # Never ask for more rows than the pool holds (sample() without replacement would raise).
    val_size = min(val_size_limit, max(2000, int(0.1 * len(full))), len(full))
    val_df = full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"[load_dipper_val] Loaded DIPPER val: {len(val_df)} samples (label dist: {val_df['label'].value_counts().to_dict()})")
    return val_df

# HC3 helpers (adapted from your code)
def to_list_safely(x):
    """
    Coerce an HC3 answer cell into a list of non-empty, stripped strings.

    Containers (list, tuple, ndarray) are handled before any missing-value test, so
    ``pd.isna`` is never evaluated on an array in boolean context (that produced
    "truth value of an array is ambiguous" warnings/errors). Missing entries
    (``None`` / NaN / ``pd.NA``) are dropped rather than stringified to "None"/"nan".

    Args:
        x: ``None``, a scalar, or a list/tuple/ndarray of values.

    Returns:
        List of stripped strings; empty for missing or blank input.
    """
    def _missing(value) -> bool:
        """True for None / NaN-like scalars; containers are never 'missing'."""
        if value is None:
            return True
        if isinstance(value, (list, tuple, dict, set, np.ndarray)):
            return False
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    if isinstance(x, np.ndarray):
        x = x.tolist()
    if isinstance(x, (list, tuple)):
        return [str(e).strip() for e in x if not _missing(e) and str(e).strip()]
    if _missing(x):
        return []
    s = str(x).strip()
    return [s] if s else []

def make_pairs_from_hc3_split(ds_split, add_context: bool, cartesian: bool = True):
    """
    Build (source, ChatGPT answer) text pairs from one HC3 split.

    Each source string is a human answer prefixed with the fixed control string
    ``lexical = NA, order = NA``; when `add_context` is true and the row has a
    non-empty question, the question is inserted between ``<sent>`` / ``</sent>``.

    Args:
        ds_split: object exposing ``to_pandas()``; the resulting frame is read
            through its ``human_answers``, ``chatgpt_answers`` and (optional)
            ``question`` columns.
        add_context: include the question in the source string.
        cartesian: pair every human answer with every ChatGPT answer; otherwise
            pair them index-by-index up to the shorter list.

    Returns:
        DataFrame with columns ``0`` (source) and ``1`` (ChatGPT answer). Rows
        lacking either kind of answer contribute nothing.
    """
    frame = ds_split.to_pandas()
    control_prefix = "lexical = NA, order = NA"
    has_question = "question" in frame.columns

    def build_source(question, human_answer):
        # Two spaces after the prefix in the context variant are intentional (kept as-is).
        if add_context and question:
            return f"{control_prefix}  <sent> {question} </sent> {human_answer}"
        return f"{control_prefix} {human_answer}"

    pairs = []
    for _, record in frame.iterrows():
        question = str(record["question"]).strip() if has_question else ""
        humans = to_list_safely(record.get("human_answers"))
        machines = to_list_safely(record.get("chatgpt_answers"))
        if not (humans and machines):
            continue
        if cartesian:
            matched = [(h, a) for h in humans for a in machines]
        else:
            matched = list(zip(humans, machines))
        pairs.extend((build_source(question, h), a) for h, a in matched)
    return pd.DataFrame(pairs, columns=[0, 1])

def build_hc3_tsvs(cartesian=True):
    """
    Load HC3 and build a labelled evaluation frame of human-vs-ChatGPT answer pairs.

    For every split, pairs are generated twice (without and with the question as
    context), labelled 1, concatenated across splits, extended with shuffled
    negatives via ``add_negatives`` and finally sub-sampled.

    The sample size is ``max(2000, 10% of the pool)`` capped at 8000 and at the pool
    size, so a small pool no longer makes ``DataFrame.sample`` raise.

    NOTE(review): ``sentence1`` is used exactly as produced by
    ``make_pairs_from_hc3_split`` - it still carries the ``lexical = NA, order = NA``
    prefix (and ``<sent>`` tags in the context variant) because ``clean_sentence`` is
    not applied here. Confirm that is intended for classifier evaluation.

    Args:
        cartesian: forwarded to ``make_pairs_from_hc3_split``.

    Returns:
        Shuffled frame with ``sentence1``, ``sentence2``, ``label`` columns.

    Raises:
        RuntimeError: if the dataset exposes no splits.
    """
    print("Loading HC3 (Hello-SimpleAI/HC3, config='all') ...")
    hc3 = load_dataset("Hello-SimpleAI/HC3", "all")
    all_splits = {}
    for split in hc3.keys():
        df_no_ctx = make_pairs_from_hc3_split(hc3[split], add_context=False, cartesian=cartesian)
        df_no_ctx.columns = ["sentence1", "sentence2"]
        df_no_ctx["label"] = 1
        all_splits[f"hc3_{split}_no_ctx"] = df_no_ctx
        df_ctx = make_pairs_from_hc3_split(hc3[split], add_context=True, cartesian=cartesian)
        df_ctx.columns = ["sentence1", "sentence2"]
        df_ctx["label"] = 1
        all_splits[f"hc3_{split}_ctx"] = df_ctx
        print(f"  HC3 {split}: no_ctx={len(df_no_ctx)} ctx={len(df_ctx)}")
    # We'll return concatenation of available splits for evaluation
    if len(all_splits) == 0:
        raise RuntimeError("HC3 had no splits.")
    combined = pd.concat(list(all_splits.values()), ignore_index=True)
    # Add negatives by shuffling similar to DIPPER
    combined_full = add_negatives(combined, ratio=1.0)
    # sample validation-size portion, never requesting more rows than exist
    val_size = min(8000, max(2000, int(0.1 * len(combined_full))), len(combined_full))
    val_df = combined_full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"Built HC3-eval: {len(val_df)} samples")
    return val_df

# ----- Torch Dataset -----
class PairEvalDataset(Dataset):
    """
    Map-style dataset that tokenizes one sentence pair per item.

    Each item is the tokenizer output for (``sentence1``, ``sentence2``),
    truncated and padded to ``max_length``, with the leading batch dimension
    squeezed away, plus a ``label`` entry holding the row's label as a long tensor.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length=256):
        # Reset the index so integer positions 0..n-1 are valid labels for .loc.
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        encoded = self.tok(
            str(row["sentence1"]),
            str(row["sentence2"]),
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch dimension of 1; drop it.
        item = {}
        for key, value in encoded.items():
            item[key] = value.squeeze(0)
        item["label"] = torch.tensor(int(row["label"]), dtype=torch.long)
        return item

# ----- Evaluation functions -----
def save_confusion_matrix(cm: np.ndarray, dataset_name: str, output_dir: str, normalize: bool = False):
    """
    Render a 2x2 confusion matrix as a heatmap and save it as a PNG.

    Args:
        cm: confusion matrix with true labels on rows and predictions on columns.
        dataset_name: used in the title and (lower-cased, spaces -> underscores)
            in the file name.
        output_dir: directory the PNG is written to.
        normalize: when true, each row is divided by its sum and cells are shown
            as proportions; a row whose sum is 0 is shown as all zeros.

    Side effects:
        Writes ``<dataset>_confusion_matrix[_normalized].png`` and prints its path.
    """
    fmt = 'd'
    cbar_label = 'Count'
    title = f"{dataset_name} - Confusion Matrix"
    cm_plot = cm
    if normalize:
        cm_plot = cm.astype(float)
        row_sums = cm_plot.sum(axis=1, keepdims=True)
        # `where=` without `out=` leaves the skipped cells uninitialised, so rows with
        # no samples used to contain arbitrary values; pre-fill the output with zeros.
        cm_plot = np.divide(cm_plot, row_sums, out=np.zeros_like(cm_plot), where=row_sums != 0)
        fmt = '.2f'
        cbar_label = 'Proportion'
        title = f"{dataset_name} - Normalized Confusion Matrix"
    fig, ax = plt.subplots(figsize=(5.5, 4.5))
    sns.heatmap(cm_plot, annot=True, fmt=fmt, cmap='Blues', ax=ax,
                xticklabels=['Not Paraphrase','Paraphrase'], yticklabels=['Not Paraphrase','Paraphrase'],
                cbar_kws={'label': cbar_label})
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    fig.tight_layout()
    suffix = '_normalized' if normalize else ''
    fname = os.path.join(output_dir, f"{dataset_name.lower().replace(' ','_')}_confusion_matrix{suffix}.png")
    fig.savefig(fname, dpi=300, bbox_inches='tight')
    # Close explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"Saved {fname}")

def evaluate_and_save(model, tokenizer, df: pd.DataFrame, dataset_name: str, output_dir: str, batch_size: int = 64, device: torch.device = DEVICE):
    """
    Score every sentence pair in `df` with `model`, then compute and persist metrics.

    The paraphrase probability is the softmax of the logits at class index 1; a pair
    is predicted positive when that probability exceeds 0.5.

    Args:
        model: sequence-classification model; moved to `device` and put in eval mode.
        tokenizer: tokenizer handed to ``PairEvalDataset``.
        df: frame with ``sentence1``, ``sentence2`` and ``label`` columns.
        dataset_name: label used in logs and in output file names.
        output_dir: directory receiving the confusion-matrix PNGs and metrics JSON.
        batch_size: DataLoader batch size.
        device: device inference runs on.

    Returns:
        ``(metrics, (labels, probs))`` where `metrics` is a JSON-serialisable dict and
        `labels` / `probs` are per-sample lists; ``(None, (None, None))`` when `df`
        is ``None`` or empty.

    Side effects:
        Writes two confusion-matrix images and ``<dataset>_metrics.json`` to
        `output_dir` and prints a classification report.
    """
    if df is None or len(df) == 0:
        print(f"Skipping {dataset_name} - no data")
        return None, (None, None)
    print(f"\nEvaluating {dataset_name} ({len(df)} samples)")
    ds = PairEvalDataset(df, tokenizer, max_length=MAX_LENGTH)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False)
    model.to(device)
    model.eval()
    all_preds, all_labels, all_probs = [], [], []

    cuda_ok = torch.cuda.is_available()
    # bfloat16 on GPUs with compute capability >= 8, float16 otherwise.
    use_bf16 = cuda_ok and torch.cuda.get_device_capability(0)[0] >= 8
    amp_dtype = torch.bfloat16 if use_bf16 else torch.float16
    # torch.autocast(device_type="cuda", ...) is the non-deprecated spelling of
    # torch.cuda.amp.autocast(...), which emits a FutureWarning on recent PyTorch.
    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=USE_MIXED_PRECISION and cuda_ok, dtype=amp_dtype):
        for batch in loader:
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            # Only some tokenizers emit segment ids; forward them when present.
            token_type_ids = batch.get('token_type_ids', None)
            if token_type_ids is not None:
                inputs['token_type_ids'] = token_type_ids.to(device)
            logits = model(**inputs).logits
            # Cast to float32 before leaving torch: NumPy has no bfloat16 dtype.
            probs = F.softmax(logits.float(), dim=-1)[:, 1].cpu().numpy()
            preds = (probs > 0.5).astype(int)
            all_preds.extend(preds.tolist())
            all_labels.extend(batch['label'].cpu().numpy().tolist())
            all_probs.extend(probs.tolist())

    acc = accuracy_score(all_labels, all_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)
    mcc = matthews_corrcoef(all_labels, all_preds) if len(set(all_labels)) > 1 else 0.0
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    auc = None; fpr = None; tpr = None; tpr_1pct = None
    try:
        auc = float(roc_auc_score(all_labels, all_probs))
        fpr, tpr, _ = roc_curve(all_labels, all_probs)
        if len(fpr) > 1:
            tpr_1pct = float(np.interp(0.01, fpr, tpr))
    except Exception as err:
        # Still best-effort (e.g. a single-class sample has no ROC), but say so
        # instead of silently leaving the ROC fields as None.
        print(f"  ! ROC/AUC not available for {dataset_name}: {err}")

    metrics = {
        'dataset': dataset_name,
        'accuracy': float(acc),
        'precision': float(p),
        'recall': float(r),
        'f1': float(f1),
        'mcc': float(mcc),
        'auc': auc,
        'confusion_matrix': cm.tolist(),
        'samples': len(df),
        'tpr_at_1pct_fpr': tpr_1pct,
        'fpr': fpr.tolist() if fpr is not None else None,
        'tpr': tpr.tolist() if tpr is not None else None
    }
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=False)
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=True)
    with open(os.path.join(output_dir, f"{dataset_name.lower().replace(' ','_')}_metrics.json"), "w") as fh:
        json.dump(metrics, fh, indent=2)
    # labels=[0, 1] keeps the two target names valid even if only one class occurs.
    print(classification_report(all_labels, all_preds, labels=[0, 1],
                                target_names=['Not Paraphrase','Paraphrase'], zero_division=0))
    return metrics, (all_labels, all_probs)

# ----- Load model & tokenizer -----
# Both come from the checkpoint named by MODEL_NAME. The names `model` and `tokenizer`
# are bound at notebook level and are what the evaluation calls below pass along.
print(f"Loading model/tokenizer: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# ----- Prepare datasets -----
# Each loader is best-effort: on any failure a warning is printed and an empty frame
# with the expected columns is substituted, which the evaluation loop below skips.
try:
    dipper_val = load_dipper_val("/content/dipper_data")
except Exception as e:
    print("Warning: DIPPER load failed:", e)
    dipper_val = pd.DataFrame(columns=["sentence1","sentence2","label"])

try:
    hc3_df = build_hc3_tsvs(cartesian=True)
except Exception as e:
    print("Warning: HC3 load failed:", e)
    hc3_df = pd.DataFrame(columns=["sentence1","sentence2","label"])

# Custom CSV
# Optional user-supplied pairs. The file must provide `original_sentence` and
# `paraphrased_sentence`; every row is treated as a positive pair and negatives are
# synthesised with add_negatives. On every path `custom_eval_df` ends up defined
# (empty when the file is missing or lacks the required columns).
# NOTE(review): add_negatives appends rows and shuffles, so `custom_eval_df` no longer
# lines up row-for-row with the CSV; other CSV columns cannot be re-attached by position.
CUSTOM_CSV_PATH = "/content/custom_paraphrases.csv"
custom_df = None
if os.path.isfile(CUSTOM_CSV_PATH):
    cdf = pd.read_csv(CUSTOM_CSV_PATH)
    if {"original_sentence","paraphrased_sentence"}.issubset(set(cdf.columns)):
        custom_df = pd.DataFrame({
            "sentence1": cdf["original_sentence"].astype(str),
            "sentence2": cdf["paraphrased_sentence"].astype(str),
            "label": 1
        })
        custom_eval_df = add_negatives(custom_df, ratio=1.0)
        print(f"Loaded custom CSV: {len(custom_eval_df)} (pos/neg: {custom_eval_df['label'].value_counts().to_dict()})")
    else:
        print("Custom CSV missing required columns. Skipping custom.")
        custom_eval_df = pd.DataFrame(columns=["sentence1","sentence2","label"])
else:
    print(f"Custom CSV not found at {CUSTOM_CSV_PATH}. Skipping custom.")
    custom_eval_df = pd.DataFrame(columns=["sentence1","sentence2","label"])

# OUT is an alias of OUTPUT_ROOT; the rest of the cell writes every artifact beneath it.
OUT = OUTPUT_ROOT
OUT.mkdir(parents=True, exist_ok=True)

# ----- Evaluate DIPPER and HC3 -----
# all_metrics collects one metrics dict per evaluated dataset (written to summary.json);
# roc_entries keeps the curve data only for datasets where an AUC could be computed.
all_metrics = []
roc_entries = []
for df, name in [(dipper_val, "DIPPER_VAL"), (hc3_df, "HC3_VAL")]:
    if df is None or len(df)==0:
        print(f"Skipping {name} - no data")
        continue
    metrics, (labels, probs) = evaluate_and_save(model, tokenizer, df, name, str(OUT), batch_size=BATCH_SIZE)
    if metrics:
        all_metrics.append(metrics)
        # fpr/tpr are stored as plain lists in `metrics`; convert back to arrays for plotting.
        if metrics.get('auc') is not None and metrics.get('fpr') is not None:
            roc_entries.append({
                'dataset': name,
                'fpr': np.array(metrics['fpr']),
                'tpr': np.array(metrics['tpr']),
                'auc': metrics['auc'],
                'accuracy': metrics['accuracy']
            })

with open(os.path.join(OUT, "summary.json"), "w") as fh:
    json.dump(all_metrics, fh, indent=2)
print(f"Saved summary.json to {OUT}")

# ----- Plot ROC curves comparison -----
# One curve per entry in roc_entries plus the chance diagonal, saved as roc_comparison.png.
# The legend is assembled by hand from proxy Line2D handles so each label can carry
# AUC, TPR at 1% FPR and accuracy.
if len(roc_entries) > 0:
    plt.figure(figsize=(8,6))
    cmap = plt.get_cmap('tab10')
    lines = []
    labels_list = []
    for i, entry in enumerate(roc_entries):
        color = cmap(i % 10)
        fpr = entry['fpr']
        tpr = entry['tpr']
        auc_val = entry['auc']
        acc_val = entry['accuracy']

        # Nothing to draw for an empty curve.
        if len(fpr) == 0 or len(tpr) == 0:
            continue

        # TPR at 1% FPR, linearly interpolated on the curve; falls back to 0.0 on failure.
        try:
            tpr_at_1pct = float(np.interp(0.01, fpr, tpr))
        except Exception:
            tpr_at_1pct = 0.0
        plt.plot(fpr, tpr, label=None, color=color, linewidth=2)
        legend_label = f"{entry['dataset']} (AUC={auc_val:.3f}, TPR@1%FPR={tpr_at_1pct:.3f}, Acc={acc_val:.3f})"
        lines.append(plt.Line2D([0],[0], color=color, lw=2))
        labels_list.append(legend_label)

    # Chance-level reference line.
    plt.plot([0,1],[0,1], linestyle='--', color='gray', linewidth=1.5, label='Random')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves Comparison', fontsize=14, fontweight='bold')
    plt.legend(lines + [plt.Line2D([0],[0], color='gray', lw=1.5, linestyle='--')],
               labels_list + ['Random'], loc='lower right', fontsize=10)
    plt.grid(alpha=0.3)
    roc_path = os.path.join(OUT, 'roc_comparison.png')
    plt.tight_layout()
    plt.savefig(roc_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {roc_path}")
else:
    print('No ROC data available for plotting.')

# ----- Plot performance metrics comparison -----
# Grouped bar chart (one group per dataset, one bar per metric), saved as
# metrics_comparison.png. Only drawn when at least two datasets were evaluated.
if len(all_metrics) > 1:
    comp_df = pd.DataFrame(all_metrics)
    # Despite the name, this is only re-indexed by dataset; no sorting is applied.
    comp_df_sorted = comp_df.set_index('dataset')

    fig, ax = plt.subplots(figsize=(12,6))
    comp_df_sorted[['accuracy','precision','recall','f1','mcc','auc','tpr_at_1pct_fpr']].plot(
        kind='bar', ax=ax, width=0.8)
    plt.title('Performance Metrics Comparison Across Datasets', fontsize=14, fontweight='bold')
    plt.ylabel('Score', fontsize=12)
    plt.xlabel('Dataset', fontsize=12)
    plt.ylim(0,1.05)
    # Legend placed outside the axes, to the right.
    plt.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    comp_path = os.path.join(OUT, 'metrics_comparison.png')
    plt.savefig(comp_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {comp_path}")

# ----- Print and save summary table -----
summary_df = pd.DataFrame(all_metrics)
if not summary_df.empty:
    # Columns shown in the console table; the CSV below keeps every column.
    display_cols = [
        'dataset', 'accuracy', 'precision', 'recall', 'f1', 'mcc',
        'auc', 'tpr_at_1pct_fpr', 'samples',
    ]
    summary_rule = '='*80
    summary_table_text = summary_df[display_cols].round(4).to_string(index=False)
    print('\n' + summary_rule, 'SUMMARY TABLE', summary_rule, summary_table_text, summary_rule, sep='\n')

    summary_csv_path = os.path.join(OUT, 'summary.csv')
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"\nSaved summary table to {summary_csv_path}")

# ----- Evaluate custom CSV and create per-model analysis -----
if custom_eval_df is not None and len(custom_eval_df)>0:
    print("\n" + "="*80)
    print("CUSTOM CSV EVALUATION - PER-MODEL ANALYSIS")
    print("="*80)

    # Aggregate evaluation of the whole custom set.
    metrics_c, (labels_c, probs_c) = evaluate_and_save(model, tokenizer, custom_eval_df, "CUSTOM_CSV", str(OUT), batch_size=BATCH_SIZE)

    # Per-sample predictions, in the same row order as custom_eval_df (shuffle=False).
    # NOTE(review): evaluate_and_save above presumably already produced these
    # probabilities (probs_c); reusing them would avoid a second forward pass --
    # confirm its return order before switching.
    ds_custom = PairEvalDataset(custom_eval_df, tokenizer, max_length=MAX_LENGTH)
    loader = DataLoader(ds_custom, batch_size=BATCH_SIZE, shuffle=False)
    preds_list = []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            out = model(input_ids=input_ids, attention_mask=attention_mask)
            # Probability of class index 1, thresholded at 0.5.
            probs = F.softmax(out.logits, dim=-1)[:,1].cpu().numpy()
            preds = (probs > 0.5).astype(int)
            preds_list.extend(
                {'pred': int(pred), 'prob_paraphrase': float(prob)}
                for pred, prob in zip(preds, probs)
            )

    # Combine predictions with the original rows (both are 0..n-1 indexed).
    report_df = custom_eval_df.copy().reset_index(drop=True)
    preds_df = pd.DataFrame(preds_list)
    display_df = pd.concat([report_df, preds_df], axis=1)

    # Attach the generating model's name from the original CSV. Alignment is
    # purely positional, so it is only done when the CSV has at least as many
    # rows as were evaluated; otherwise the per-model analysis is skipped
    # instead of failing with a length-mismatch error.
    has_model_names = False
    if os.path.isfile(CUSTOM_CSV_PATH):
        orig_csv = pd.read_csv(CUSTOM_CSV_PATH)
        if 'model_name' in orig_csv.columns:
            if len(orig_csv) < len(display_df):
                print(f"Warning: {CUSTOM_CSV_PATH} has {len(orig_csv)} rows but {len(display_df)} samples were evaluated; skipping per-model analysis.")
            else:
                if len(orig_csv) > len(display_df):
                    print(f"Warning: {CUSTOM_CSV_PATH} has {len(orig_csv)} rows but only {len(display_df)} samples were evaluated; using the first {len(display_df)} model names by position.")
                display_df['model_name'] = orig_csv['model_name'].values[:len(display_df)]
                has_model_names = True

    if has_model_names:
        # Model names in order of first appearance; every loop below uses this order.
        model_names = display_df['model_name'].unique()

        # Calculate per-model metrics
        model_metrics = []
        for model_name in model_names:
            model_data = display_df[display_df['model_name'] == model_name]
            labels_m = model_data['label'].values
            preds_m = model_data['pred'].values

            acc = accuracy_score(labels_m, preds_m)
            p, r, f1, _ = precision_recall_fscore_support(labels_m, preds_m, average='binary', zero_division=0)

            model_metrics.append({
                'Model': model_name,
                'Precision (%)': p * 100,
                'Recall (%)': r * 100,
                'F1-Score (%)': f1 * 100,
                'Accuracy (%)': acc * 100,
                'Samples': len(model_data)
            })

        model_perf_df = pd.DataFrame(model_metrics)

        # Display and save per-model table
        print("\nPer-Model Performance:")
        print(model_perf_df.round(2).to_string(index=False))
        model_perf_df.to_csv(os.path.join(OUT, "per_model_metrics.csv"), index=False)
        print(f"\nSaved per-model metrics to {os.path.join(OUT, 'per_model_metrics.csv')}")

        # Plot per-model performance (four bars per model, centred on each tick)
        fig, ax = plt.subplots(figsize=(10, 6))
        x = np.arange(len(model_perf_df))
        width = 0.2

        ax.bar(x - 1.5*width, model_perf_df['Precision (%)'], width, label='Precision', alpha=0.8)
        ax.bar(x - 0.5*width, model_perf_df['Recall (%)'], width, label='Recall', alpha=0.8)
        ax.bar(x + 0.5*width, model_perf_df['F1-Score (%)'], width, label='F1-Score', alpha=0.8)
        ax.bar(x + 1.5*width, model_perf_df['Accuracy (%)'], width, label='Accuracy', alpha=0.8)

        ax.set_ylabel('Score (%)', fontsize=12)
        ax.set_xlabel('Model', fontsize=12)
        ax.set_title('Paraphrase Detection Performance by LLM', fontsize=14, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(model_perf_df['Model'], rotation=45, ha='right')
        ax.legend()
        ax.set_ylim(0, 105)
        ax.grid(axis='y', alpha=0.3)
        plt.tight_layout()

        model_plot_path = os.path.join(OUT, 'per_model_performance.png')
        plt.savefig(model_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved per-model plot to {model_plot_path}")

        # Plot average probability scores by model (error bars = std per model)
        fig, ax = plt.subplots(figsize=(10, 6))
        model_probs = display_df.groupby('model_name')['prob_paraphrase'].agg(['mean', 'std'])
        model_probs = model_probs.sort_values('mean', ascending=False)

        ax.barh(model_probs.index, model_probs['mean'], xerr=model_probs['std'],
               capsize=5, alpha=0.7, color='steelblue')
        ax.set_xlabel('Average Paraphrase Probability', fontsize=12)
        ax.set_ylabel('Model', fontsize=12)
        ax.set_title('Average Paraphrase Detection Confidence by LLM', fontsize=14, fontweight='bold')
        ax.set_xlim(0, 1)
        ax.grid(axis='x', alpha=0.3)
        plt.tight_layout()

        prob_plot_path = os.path.join(OUT, 'model_confidence_scores.png')
        plt.savefig(prob_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved confidence scores plot to {prob_plot_path}")

        # Heatmap of per-model confusion matrices
        n_models = len(model_perf_df)
        fig, axes = plt.subplots(1, n_models, figsize=(5*n_models, 4))
        if n_models == 1:
            # plt.subplots returns a bare Axes for a 1x1 grid; wrap it so indexing works.
            axes = [axes]

        for idx, model_name in enumerate(model_names):
            model_data = display_df[display_df['model_name'] == model_name]
            cm = confusion_matrix(model_data['label'], model_data['pred'], labels=[0,1])

            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                       xticklabels=['Not Para','Para'],
                       yticklabels=['Not Para','Para'],
                       cbar=False)
            axes[idx].set_title(f'{model_name}', fontsize=11, fontweight='bold')
            axes[idx].set_ylabel('True' if idx == 0 else '', fontsize=10)
            axes[idx].set_xlabel('Predicted', fontsize=10)

        plt.suptitle('Confusion Matrices by LLM', fontsize=14, fontweight='bold', y=1.02)
        plt.tight_layout()

        cm_plot_path = os.path.join(OUT, 'model_confusion_matrices.png')
        plt.savefig(cm_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved confusion matrices plot to {cm_plot_path}")

        # Create error analysis: false positives and false negatives by model
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        error_data = []
        for model_name in model_names:
            model_data = display_df[display_df['model_name'] == model_name]
            fp = ((model_data['label'] == 0) & (model_data['pred'] == 1)).sum()
            fn = ((model_data['label'] == 1) & (model_data['pred'] == 0)).sum()
            tp = ((model_data['label'] == 1) & (model_data['pred'] == 1)).sum()
            tn = ((model_data['label'] == 0) & (model_data['pred'] == 0)).sum()
            error_data.append({
                'Model': model_name,
                'False Positives': fp,
                'False Negatives': fn,
                'True Positives': tp,
                'True Negatives': tn
            })

        error_df = pd.DataFrame(error_data)

        # Plot false positives and false negatives
        x = np.arange(len(error_df))
        width = 0.35

        ax1.bar(x - width/2, error_df['False Positives'], width, label='False Positives', color='salmon', alpha=0.8)
        ax1.bar(x + width/2, error_df['False Negatives'], width, label='False Negatives', color='lightcoral', alpha=0.8)
        ax1.set_ylabel('Count', fontsize=11)
        ax1.set_xlabel('Model', fontsize=11)
        ax1.set_title('Error Analysis: False Positives vs False Negatives', fontsize=12, fontweight='bold')
        ax1.set_xticks(x)
        ax1.set_xticklabels(error_df['Model'], rotation=45, ha='right')
        ax1.legend()
        ax1.grid(axis='y', alpha=0.3)

        # Plot true positives and true negatives
        ax2.bar(x - width/2, error_df['True Positives'], width, label='True Positives', color='mediumseagreen', alpha=0.8)
        ax2.bar(x + width/2, error_df['True Negatives'], width, label='True Negatives', color='lightgreen', alpha=0.8)
        ax2.set_ylabel('Count', fontsize=11)
        ax2.set_xlabel('Model', fontsize=11)
        ax2.set_title('Correct Predictions: True Positives vs True Negatives', fontsize=12, fontweight='bold')
        ax2.set_xticks(x)
        ax2.set_xticklabels(error_df['Model'], rotation=45, ha='right')
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        error_plot_path = os.path.join(OUT, 'model_error_analysis.png')
        plt.savefig(error_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved error analysis plot to {error_plot_path}")

        # Distribution of confidence scores by model
        fig, ax = plt.subplots(figsize=(12, 6))

        for model_name in model_names:
            model_data = display_df[display_df['model_name'] == model_name]
            ax.hist(model_data['prob_paraphrase'], bins=30, alpha=0.5, label=model_name, edgecolor='black')

        ax.set_xlabel('Paraphrase Probability', fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title('Distribution of Paraphrase Detection Confidence Scores by LLM', fontsize=14, fontweight='bold')
        ax.grid(axis='y', alpha=0.3)
        # Draw the threshold line BEFORE building the legend; a legend created
        # earlier would not include the 'Decision Threshold' entry.
        ax.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Decision Threshold')
        ax.legend()
        plt.tight_layout()

        dist_plot_path = os.path.join(OUT, 'confidence_distribution.png')
        plt.savefig(dist_plot_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved confidence distribution plot to {dist_plot_path}")

    # Save detailed per-sample results
    display_df.to_csv(os.path.join(OUT, "custom_results_detailed.csv"), index=False)
    print(f"\nSaved detailed results to {os.path.join(OUT, 'custom_results_detailed.csv')}")
    print(f"Total samples evaluated: {len(display_df)}")
else:
    print("\nNo custom CSV results to display.")

# Closing banner: where the results live and which artefacts this cell can write.
closing_rule = "="*80
generated_file_lines = (
    "  - summary.json, summary.csv: Overall metrics across datasets",
    "  - roc_comparison.png: ROC curves for DIPPER and HC3",
    "  - metrics_comparison.png: Bar chart comparing all metrics",
    "  - per_model_metrics.csv: Performance table by LLM",
    "  - per_model_performance.png: Bar chart of metrics by LLM",
    "  - model_confidence_scores.png: Average confidence by LLM",
    "  - model_confusion_matrices.png: Confusion matrices grid",
    "  - model_error_analysis.png: False positive/negative analysis",
    "  - confidence_distribution.png: Probability distributions",
    "  - custom_results_detailed.csv: Per-sample predictions",
)

print("\n" + closing_rule)
print("EVALUATION COMPLETE")
print(closing_rule)
print(f"All results saved to: {OUT}")
print("\nGenerated files:")
for generated_file_line in generated_file_lines:
    print(generated_file_line)
print(closing_rule)

Loading model/tokenizer: Intel/deberta-v3-base-mrpc ...


tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/738M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

[load_dipper_val] Searching for DIPPER valid/dev files under: /content/dipper_data
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx.tsv -> 1483
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx.tsv -> 1479
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_all.tsv -> 1484
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx.tsv -> 1478
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx.tsv -> 1474
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all_small.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_all_small.tsv -> 1484
  ✓ found DIPPER valid-like: par3/gt

  if pd.isna(x): return []
  if pd.isna(x): return []


  HC3 train: no_ctx=60205 ctx=60205
Built HC3-eval: 8000 samples
Loaded custom CSV: 90 (pos/neg: {1: 60, 0: 30})

Evaluating DIPPER_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/deberta_v3_mrpc/dipper_val_confusion_matrix.png
Saved /content/output/deberta_v3_mrpc/dipper_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.45      1.00      0.62      2673
    Paraphrase       0.99      0.38      0.55      5327

      accuracy                           0.58      8000
     macro avg       0.72      0.69      0.58      8000
  weighted avg       0.81      0.58      0.57      8000


Evaluating HC3_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/deberta_v3_mrpc/hc3_val_confusion_matrix.png
Saved /content/output/deberta_v3_mrpc/hc3_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.35      0.99      0.52      2664
    Paraphrase       0.93      0.10      0.17      5336

      accuracy                           0.39      8000
     macro avg       0.64      0.54      0.35      8000
  weighted avg       0.74      0.39      0.29      8000

Saved summary.json to /content/output/deberta_v3_mrpc

Evaluating CUSTOM_CSV (90 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/deberta_v3_mrpc/custom_csv_confusion_matrix.png
Saved /content/output/deberta_v3_mrpc/custom_csv_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       1.00      0.93      0.97        30
    Paraphrase       0.97      1.00      0.98        60

      accuracy                           0.98        90
     macro avg       0.98      0.97      0.97        90
  weighted avg       0.98      0.98      0.98        90



Unnamed: 0,sentence1,sentence2,label,pred,prob_paraphrase
0,The possibility of approximating a continuous ...,Several studies have examined the ability of a...,1,1,0.999547
1,State-of-the-art object detection networks dep...,Leading object detection networks rely on regi...,1,1,0.999545
2,I swear I wasn’t going to do a “Top Whatevers ...,"I had resolved not to write a ""Top Whatever"" b...",1,1,0.999384
3,Our evolutionary history suggests that there w...,"Indeed, science and technology are distinctly ...",0,0,0.002765
4,The possibility of approximating a continuous ...,Numerous research papers have investigated the...,1,1,0.999544
...,...,...,...,...,...
85,The possibility of approximating a continuous ...,A feedforward neural network with one hidden l...,1,1,0.999539
86,"In fact, science and technology are clearly di...","In a recent opinion piece in The Conversation,...",0,0,0.002277
87,I swear I wasn’t going to do a “Top Whatevers ...,We investigate a new machine learning problem ...,0,0,0.002738
88,Our evolutionary history suggests that there w...,Our evolutionary history indicates that there ...,1,1,0.999542


Saved custom_results.csv to /content/output/deberta_v3_mrpc


## viswadarshan06/pd-mpnet

In [None]:
import gc
import torch

# Drop the previous section's model and tokenizer so their memory can be reclaimed.
# The names are popped from globals() rather than removed with `del`, because a
# bare `del model` raises NameError when the cell is re-run or when the earlier
# section was never executed; popping with a default keeps this cell idempotent.
for _stale_name in ("model", "tokenizer"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()


In [None]:
import os
import re
import json
from pathlib import Path
from typing import Optional, List
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                            matthews_corrcoef, confusion_matrix, roc_auc_score,
                            roc_curve, classification_report)
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# Model identifier for this section (presumably loaded from the Hugging Face
# Hub via the AutoTokenizer / AutoModelForSequenceClassification imports above
# -- the loading code is further down the cell).
MODEL_NAME = "viswadarshan06/pd-mpnet"
# Output directory for this section, resolved to an absolute path and created
# (with parents) if it does not exist yet.
OUTPUT_ROOT = Path("./output/pd_mpnet").resolve()
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
# Use the GPU when one is available, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64
# Token budget per sentence pair; PairEvalDataset pads/truncates to this length.
MAX_LENGTH = 256
# When True (and CUDA is available) evaluate_and_save runs inference under autocast.
USE_MIXED_PRECISION = True

# ----- cleaning/parsing helpers -----
def clean_sentence(sent: str) -> str:
    """Strip DIPPER control codes and markup from one raw sentence field.

    Steps, in order: remove ``lexical = <digits>`` and ``order = <digits>``
    (case-insensitive), replace ``<sent>`` tags with a space, keep only the
    double-quoted segments if any exist (joined by spaces), collapse
    whitespace runs, and turn ``", ,"`` into ``","`` before trimming.

    Args:
        sent: Raw field value; converted with ``str()`` first.

    Returns:
        The cleaned text.
    """
    text = str(sent)
    # Control codes are removed one after the other, not in a single pass.
    for control_pattern in (r"lexical\s*=\s*\d+", r"order\s*=\s*\d+"):
        text = re.sub(control_pattern, "", text, flags=re.IGNORECASE)
    text = text.replace("<sent>", " ")

    quoted_parts = re.findall(r'"([^"]+)"', text)
    if quoted_parts:
        # Anything outside double quotes is discarded when quoted text exists.
        text = " ".join(quoted_parts)

    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.replace(", ,", ",").strip()

def parse_dipper_tsv(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw DIPPER TSV frame into cleaned positive sentence pairs.

    The first two columns are treated as the two sides of a pair; each value
    is passed through ``clean_sentence``. Pairs where either side has 8 or
    fewer characters after cleaning are dropped, and every surviving row is
    labelled ``1``.

    Args:
        df: Frame as returned by ``_read_tsv`` (may be ``None``).

    Returns:
        Frame with columns ``sentence1``, ``sentence2``, ``label`` and a fresh
        0..n-1 index; empty (same columns) when ``df`` is ``None`` or has
        fewer than two columns.
    """
    if df is None or df.shape[1] < 2:
        return pd.DataFrame(columns=["sentence1", "sentence2", "label"])

    first_col, second_col = df.columns[0], df.columns[1]
    pairs = pd.DataFrame({
        "sentence1": df[first_col].astype(str).map(clean_sentence),
        "sentence2": df[second_col].astype(str).map(clean_sentence),
    })

    # Keep only rows where both sides are longer than 8 characters.
    both_long_enough = pairs["sentence1"].str.len().gt(8) & pairs["sentence2"].str.len().gt(8)
    pairs = pairs[both_long_enough].copy()
    pairs["label"] = 1
    return pairs.dropna().reset_index(drop=True)

def _read_tsv(path: str, nrows: Optional[int] = None) -> Optional[pd.DataFrame]:
    try:
        return pd.read_csv(path, sep="\t", nrows=nrows, header=None, engine="python", quoting=3)
    except Exception:
        try:
            return pd.read_csv(path, sep="\t", nrows=nrows, engine="python")
        except Exception:
            return None

def gather_dipper_pos_val(root: str, nrows_val: Optional[int] = 1500) -> List[pd.DataFrame]:
    """Collect positive pairs from DIPPER validation-like TSV files under ``root``.

    Only ``.tsv`` files that sit directly inside a directory whose name starts
    with ``sents_`` are considered. A file qualifies when its lowercased name
    does not contain ``train`` and contains ``valid``, ``dev`` or ``ctx_all``.
    Directories and files are visited in sorted order so the returned list --
    and therefore any seeded sampling done on its concatenation -- does not
    depend on the filesystem's enumeration order.

    Args:
        root: Directory tree to search.
        nrows_val: Maximum number of rows read from each file (``None`` = all).

    Returns:
        One parsed frame (see ``parse_dipper_tsv``) per file that yielded at
        least one usable row; an empty list when ``root`` does not exist or
        nothing usable is found. Read/parse failures are reported on stdout
        and skipped.
    """
    out: List[pd.DataFrame] = []
    if not os.path.exists(root):
        print(f"[gather_dipper_pos_val] DIPPER root not found: {root}")
        return out

    def is_valid_file(fname: str) -> bool:
        lower = fname.lower()
        if "train" in lower:
            return False
        # "no_ctx_all" always contains "ctx_all", so a single check covers both.
        return any(marker in lower for marker in ("valid", "dev", "ctx_all"))

    found_files = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place fixes the order in which os.walk descends.
        dirnames.sort()
        if not os.path.basename(dirpath).startswith("sents_"):
            continue
        for fname in sorted(filenames):
            if not fname.lower().endswith(".tsv"):
                continue
            if not is_valid_file(fname):
                continue
            fpath = os.path.join(dirpath, fname)
            found_files.append(fpath)
            try:
                df_raw = _read_tsv(fpath, nrows=nrows_val)
                df = parse_dipper_tsv(df_raw) if df_raw is not None else None
                if df is not None and len(df) > 0:
                    out.append(df)
                    print(f"  ✓ found DIPPER valid-like: {os.path.relpath(fpath, start=root)} -> {len(df)}")
                else:
                    print(f"  ○ read but no usable rows: {os.path.relpath(fpath, start=root)}")
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"  ! failed reading {fpath}: {e}")

    if not found_files:
        print(f"[gather_dipper_pos_val] No valid/dev files found under {root}.")
    else:
        print(f"[gather_dipper_pos_val] Total valid-like files found: {len(found_files)}")

    return out

def add_negatives(df: pd.DataFrame, ratio=1.0, seed=42) -> pd.DataFrame:
    """Augment a frame of positive pairs with mismatched (negative) pairs.

    Rows are permuted with a seeded generator; ``sentence1`` from the first
    ``n // 2`` permuted rows is paired with ``sentence2`` from the last
    ``n // 2`` permuted rows and labelled ``0``. At most ``n // 2`` negatives
    are ever built, so ``ratio`` only acts as an upper cap
    (``int(n * ratio)``) and never increases their number.

    Args:
        df: Positive pairs with ``sentence1`` and ``sentence2`` columns.
        ratio: Cap on negatives, as a fraction of the number of positives.
        seed: Seed for the permutation, the optional down-sampling and the
            final shuffle.

    Returns:
        Shuffled frame of positives (label ``1``) and negatives (label ``0``)
        with a fresh index. Frames with fewer than 4 rows are returned as a
        re-indexed copy without any negatives added.
    """
    positives = df.reset_index(drop=True).copy()
    total = len(positives)
    if total < 4:
        return positives

    order = np.random.default_rng(seed).permutation(total)
    half = total // 2
    left_rows, right_rows = order[:half], order[-half:]
    negatives = pd.DataFrame({
        "sentence1": positives["sentence1"].iloc[left_rows].values,
        "sentence2": positives["sentence2"].iloc[right_rows].values,
        "label": 0,
    })

    cap = int(total * ratio)
    if len(negatives) > cap:
        negatives = negatives.sample(cap, random_state=seed)

    positives["label"] = 1
    combined = pd.concat([positives, negatives], ignore_index=True)
    return combined.sample(frac=1.0, random_state=seed).reset_index(drop=True)

def load_dipper_val(dipper_root: str = "/content/dipper_data", val_size_limit: int = 8000) -> pd.DataFrame:
    """Build the DIPPER evaluation set (positives plus synthetic negatives).

    Positive pairs are gathered with ``gather_dipper_pos_val``, negatives are
    added with ``add_negatives``, and a seeded random sample is drawn.

    Args:
        dipper_root: Root directory of the extracted DIPPER data.
        val_size_limit: Upper bound on the number of returned rows.

    Returns:
        Sampled frame with a fresh index. Its size is 10% of the pool, raised
        to at least 2000, then capped by ``val_size_limit`` and by the pool
        size itself.

    Raises:
        RuntimeError: If no usable DIPPER validation-like file is found.
    """
    print(f"[load_dipper_val] Searching for DIPPER valid/dev files under: {dipper_root}")
    pos_list = gather_dipper_pos_val(dipper_root)
    if not pos_list:
        raise RuntimeError("No DIPPER valid-like files found under dipper_root. Check path and filename conventions.")
    pos_all = pd.concat(pos_list, ignore_index=True)
    full = add_negatives(pos_all, ratio=1.0)
    # Capping at len(full) prevents DataFrame.sample from raising when the pool
    # is smaller than the 2000-row floor.
    val_size = min(val_size_limit, max(2000, int(0.1 * len(full))), len(full))
    val_df = full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"[load_dipper_val] Loaded DIPPER val: {len(val_df)} samples (label dist: {val_df['label'].value_counts().to_dict()})")
    # val_size already honours val_size_limit, so no further hard-coded
    # truncation is applied (the reported size matches what is returned).
    return val_df

# HC3 helpers
def to_list_safely(x):
    """Normalise a cell value into a list of non-empty, stripped strings.

    Sequences (list, tuple, NumPy array) are handled element by element:
    missing elements (``None``/NaN/``pd.NA``) and elements that are blank
    after stripping are dropped. A missing scalar yields ``[]``; any other
    scalar yields a one-element list, or ``[]`` if it is blank.

    Container types are checked *before* the scalar null test so that
    ``pd.isna`` is never evaluated in a boolean context on an array, which
    is ambiguous for multi-element arrays and deprecated for empty ones.

    Args:
        x: A scalar, list, tuple or ``numpy.ndarray``.

    Returns:
        List of stripped, non-empty strings.
    """
    def _is_missing(value) -> bool:
        # Scalar-only null test; containers and text are never "missing".
        if value is None:
            return True
        if isinstance(value, (str, bytes, list, tuple, dict, set, np.ndarray)):
            return False
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    if isinstance(x, np.ndarray):
        x = x.tolist()
    if isinstance(x, (list, tuple)):
        stripped = (str(e).strip() for e in x if not _is_missing(e))
        return [text for text in stripped if text]
    if _is_missing(x):
        return []
    s = str(x).strip()
    return [s] if s else []

def make_pairs_from_hc3_split(ds_split, add_context: bool, cartesian: bool = True):
    """Pair human answers with ChatGPT answers for every row of an HC3 split.

    Column 0 of the result holds the human answer prefixed with the literal
    ``lexical = NA, order = NA`` (plus the question wrapped in
    ``<sent> ... </sent>`` when ``add_context`` is true and a question is
    present); column 1 holds the ChatGPT answer.

    Args:
        ds_split: Object exposing ``to_pandas()``; the resulting frame is
            read through its ``human_answers``, ``chatgpt_answers`` and
            (optional) ``question`` columns.
        add_context: Whether to prepend the question to column 0.
        cartesian: If true, pair every human answer with every ChatGPT
            answer of the row; otherwise pair them position by position up
            to the shorter list.

    Returns:
        DataFrame with integer column labels ``0`` and ``1``. Rows lacking
        either kind of answer contribute nothing.
    """
    frame = ds_split.to_pandas()
    human_col = "human_answers"
    ai_col = "chatgpt_answers"
    q_col = "question" if "question" in frame.columns else None
    prefix = "lexical = NA, order = NA"

    rows = []
    for _, record in frame.iterrows():
        has_question = q_col is not None and q_col in record
        q = str(record[q_col]).strip() if has_question else ""

        human_list = to_list_safely(record.get(human_col))
        ai_list = to_list_safely(record.get(ai_col))
        if not (human_list and ai_list):
            continue

        if cartesian:
            answer_pairs = [(h, a) for h in human_list for a in ai_list]
        else:
            # zip stops at the shorter list, i.e. min(len(human), len(ai)) pairs.
            answer_pairs = list(zip(human_list, ai_list))

        for h, a in answer_pairs:
            if add_context and q:
                col0 = f"{prefix}  <sent> {q} </sent> {h}"
            else:
                col0 = f"{prefix} {h}"
            rows.append((col0, a))

    return pd.DataFrame(rows, columns=[0, 1])

def build_hc3_tsvs(cartesian=True):
    """Build the HC3 evaluation set from the ``Hello-SimpleAI/HC3`` dataset.

    For every split, human/ChatGPT answer pairs are generated twice (without
    and with the question as context) and labelled ``1``. All pairs are
    concatenated, negatives are added with ``add_negatives``, and a seeded
    random sample is returned.

    NOTE(review): ``sentence1`` keeps the ``lexical = NA, order = NA`` prefix
    (and ``<sent>`` markers in the context variant) emitted by
    ``make_pairs_from_hc3_split``, because ``clean_sentence`` is not applied
    here -- confirm that this is intended for evaluation.

    Args:
        cartesian: Forwarded to ``make_pairs_from_hc3_split``.

    Returns:
        Sampled frame with ``sentence1``, ``sentence2``, ``label`` columns.
        Its size is 10% of the pool, raised to at least 2000, then capped at
        8000 and at the pool size.

    Raises:
        RuntimeError: If the dataset exposes no splits.
    """
    print("Loading HC3 (Hello-SimpleAI/HC3, config='all') ...")
    hc3 = load_dataset("Hello-SimpleAI/HC3", "all")
    all_splits = {}
    for split in hc3.keys():
        df_no_ctx = make_pairs_from_hc3_split(hc3[split], add_context=False, cartesian=cartesian)
        df_no_ctx.columns = ["sentence1", "sentence2"]
        df_no_ctx["label"] = 1
        all_splits[f"hc3_{split}_no_ctx"] = df_no_ctx
        df_ctx = make_pairs_from_hc3_split(hc3[split], add_context=True, cartesian=cartesian)
        df_ctx.columns = ["sentence1", "sentence2"]
        df_ctx["label"] = 1
        all_splits[f"hc3_{split}_ctx"] = df_ctx
        print(f"  HC3 {split}: no_ctx={len(df_no_ctx)} ctx={len(df_ctx)}")
    if len(all_splits) == 0:
        raise RuntimeError("HC3 had no splits.")
    combined = pd.concat(list(all_splits.values()), ignore_index=True)
    combined_full = add_negatives(combined, ratio=1.0)
    # Capping at the pool size prevents DataFrame.sample from raising when
    # fewer rows are available than the 2000-row floor.
    val_size = min(8000, max(2000, int(0.1 * len(combined_full))), len(combined_full))
    val_df = combined_full.sample(val_size, random_state=42).reset_index(drop=True)
    print(f"Built HC3-eval: {len(val_df)} samples")
    return val_df

# ----- Torch Dataset -----
class PairEvalDataset(Dataset):
    """Torch dataset that tokenizes (sentence1, sentence2) pairs on access.

    Each item is the tokenizer's output for the pair, with the leading batch
    dimension squeezed away, plus a ``label`` tensor of dtype ``torch.long``.
    The frame's index is reset so items are addressed by position 0..n-1.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        # Every pair is padded/truncated to exactly max_length tokens.
        encoded = self.tok(
            str(row["sentence1"]),
            str(row["sentence2"]),
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        item = {}
        for field, tensor in encoded.items():
            # Drop the batch dimension added by return_tensors="pt".
            item[field] = tensor.squeeze(0)
        item["label"] = torch.tensor(int(row["label"]), dtype=torch.long)
        return item

# ----- Evaluation functions -----
def save_confusion_matrix(cm: np.ndarray, dataset_name: str, output_dir: str, normalize: bool = False):
    """Render a 2x2 confusion matrix as a heatmap and save it as a PNG.

    Args:
        cm: Confusion matrix with true labels on rows and predicted labels on
            columns, in the order (Not Paraphrase, Paraphrase).
        dataset_name: Used in the plot title and, lowercased with spaces
            replaced by underscores, in the file name.
        output_dir: Directory the PNG is written to.
        normalize: If true, each row is divided by its sum (per-true-class
            proportions) and the file name gets a ``_normalized`` suffix.
            Rows whose sum is zero are shown as all zeros.
    """
    fmt = 'd'
    cbar_label = 'Count'
    title = f"{dataset_name} - Confusion Matrix"
    cm_plot = cm
    if normalize:
        cm_plot = cm.astype(float)
        row_sums = cm_plot.sum(axis=1, keepdims=True)
        # `out` must be supplied together with `where`: without it, entries
        # skipped by the mask (rows with a zero sum) are left uninitialised
        # rather than set to 0.
        cm_plot = np.divide(cm_plot, row_sums, out=np.zeros_like(cm_plot), where=row_sums != 0)
        fmt = '.2f'
        cbar_label = 'Proportion'
        title = f"{dataset_name} - Normalized Confusion Matrix"
    plt.figure(figsize=(5.5,4.5))
    sns.heatmap(cm_plot, annot=True, fmt=fmt, cmap='Blues',
                xticklabels=['Not Paraphrase','Paraphrase'], yticklabels=['Not Paraphrase','Paraphrase'],
                cbar_kws={'label': cbar_label})
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    suffix = '_normalized' if normalize else ''
    fname = os.path.join(output_dir, f"{dataset_name.lower().replace(' ','_')}_confusion_matrix{suffix}.png")
    plt.savefig(fname, dpi=300, bbox_inches='tight')
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
    print(f"Saved {fname}")

def evaluate_and_save(model, tokenizer, df: pd.DataFrame, dataset_name: str, output_dir: str, batch_size: int = 64, device: torch.device = DEVICE):
    """Score every sentence pair in ``df`` with ``model`` and persist metrics and plots.

    Args:
        model: Sequence classifier whose output exposes ``.logits``; column 1 of the
            softmax is used as the paraphrase probability.
        tokenizer: Tokenizer handed to ``PairEvalDataset``.
        df: Frame with ``sentence1``, ``sentence2`` and ``label`` columns.
        dataset_name: Label used in log lines, plot titles and output file names.
        output_dir: Directory receiving the confusion-matrix PNGs and
            ``<dataset>_metrics.json``; created if missing.
        batch_size: Number of pairs per forward pass.
        device: Device the model and each batch are moved to.

    Returns:
        ``(metrics, (labels, probs))`` where ``metrics`` is a JSON-serialisable dict
        and ``labels`` / ``probs`` are per-row lists in the row order of ``df``.
        Returns ``(None, (None, None))`` when ``df`` is None or empty.

    Notes:
        ``auc``, ``fpr``, ``tpr`` and ``tpr_at_1pct_fpr`` stay None when the ROC
        computation fails; the reason is printed instead of being swallowed.
    """
    if df is None or len(df) == 0:
        print(f"Skipping {dataset_name} - no data")
        return None, (None, None)
    print(f"\nEvaluating {dataset_name} ({len(df)} samples)")

    ds = PairEvalDataset(df, tokenizer, max_length=MAX_LENGTH)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False)
    model.to(device)
    model.eval()
    all_preds, all_labels, all_probs = [], [], []

    cuda_ok = torch.cuda.is_available()
    # bfloat16 is chosen on devices with compute capability >= 8, float16 otherwise.
    use_bf16 = cuda_ok and torch.cuda.get_device_capability(0)[0] >= 8
    amp_dtype = torch.bfloat16 if use_bf16 else torch.float16
    # torch.autocast(device_type="cuda") is the supported spelling of the deprecated
    # torch.cuda.amp.autocast.
    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=bool(USE_MIXED_PRECISION) and cuda_ok, dtype=amp_dtype):
        for batch in loader:
            model_inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            # Only some tokenizers emit segment ids; forward them when present.
            token_type_ids = batch.get('token_type_ids', None)
            if token_type_ids is not None:
                model_inputs['token_type_ids'] = token_type_ids.to(device)
            logits = model(**model_inputs).logits
            # Cast to float32 before leaving torch: NumPy cannot represent bfloat16.
            probs = F.softmax(logits.float(), dim=-1)[:, 1].cpu().numpy()
            preds = (probs > 0.5).astype(int)
            all_preds.extend(preds.tolist())
            all_labels.extend(batch['label'].cpu().numpy().tolist())
            all_probs.extend(probs.tolist())

    acc = accuracy_score(all_labels, all_preds)
    p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)
    # MCC is reported as 0.0 when only one class is present in the labels.
    mcc = matthews_corrcoef(all_labels, all_preds) if len(set(all_labels)) > 1 else 0.0
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    auc = fpr = tpr = tpr_1pct = None
    try:
        auc = float(roc_auc_score(all_labels, all_probs))
        fpr, tpr, _ = roc_curve(all_labels, all_probs)
        if len(fpr) > 1:
            tpr_1pct = float(np.interp(0.01, fpr, tpr))
    except Exception as err:
        # Best effort: keep the remaining metrics, but say why ROC data is missing.
        print(f"Warning: ROC/AUC unavailable for {dataset_name}: {err}")

    metrics = {
        'dataset': dataset_name,
        'accuracy': float(acc),
        'precision': float(p),
        'recall': float(r),
        'f1': float(f1),
        'mcc': float(mcc),
        'auc': auc,
        'confusion_matrix': cm.tolist(),
        'samples': len(df),
        'tpr_at_1pct_fpr': tpr_1pct,
        'fpr': fpr.tolist() if fpr is not None else None,
        'tpr': tpr.tolist() if tpr is not None else None
    }

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=False)
    save_confusion_matrix(cm, dataset_name, output_dir, normalize=True)
    slug = dataset_name.lower().replace(' ', '_')
    with open(os.path.join(output_dir, f"{slug}_metrics.json"), "w") as fh:
        json.dump(metrics, fh, indent=2)
    # Pinning labels keeps the report aligned with target_names even when one class
    # never occurs in the labels or predictions.
    print(classification_report(all_labels, all_preds, labels=[0, 1],
                                target_names=['Not Paraphrase', 'Paraphrase'], zero_division=0))
    return metrics, (all_labels, all_probs)

# ----- Load model & tokenizer -----
# Tokenizer and classifier are both resolved from MODEL_NAME; the classifier is then moved to DEVICE.
print(f"Loading model/tokenizer: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)

# ----- Prepare datasets -----
def _empty_pair_frame():
    """Zero-row frame with the sentence1/sentence2/label columns, used as the 'no data' placeholder."""
    return pd.DataFrame(columns=["sentence1","sentence2","label"])


# Each loader is best-effort: on any failure, warn and continue with an empty frame
# so the remaining datasets can still be evaluated.
try:
    dipper_val = load_dipper_val("/content/dipper_data")
except Exception as err:
    print("Warning: DIPPER load failed:", err)
    dipper_val = _empty_pair_frame()

try:
    hc3_df = build_hc3_tsvs(cartesian=True)
except Exception as err:
    print("Warning: HC3 load failed:", err)
    hc3_df = _empty_pair_frame()

# Custom CSV
CUSTOM_CSV_PATH = "/content/custom_paraphrases.csv"
custom_df = None
if not os.path.isfile(CUSTOM_CSV_PATH):
    print(f"Custom CSV not found at {CUSTOM_CSV_PATH}. Skipping custom.")
    custom_eval_df = _empty_pair_frame()
else:
    cdf = pd.read_csv(CUSTOM_CSV_PATH)
    required_cols = {"original_sentence","paraphrased_sentence"}
    if required_cols.issubset(set(cdf.columns)):
        # Every CSV row is treated as a positive pair; add_negatives supplies the label-0 rows.
        custom_df = pd.DataFrame({
            "sentence1": cdf["original_sentence"].astype(str),
            "sentence2": cdf["paraphrased_sentence"].astype(str),
            "label": 1
        })
        custom_eval_df = add_negatives(custom_df, ratio=1.0)
        label_counts = custom_eval_df['label'].value_counts().to_dict()
        print(f"Loaded custom CSV: {len(custom_eval_df)} (pos/neg: {label_counts})")
    else:
        print("Custom CSV missing required columns. Skipping custom.")
        custom_eval_df = _empty_pair_frame()

# All artefacts produced below are written under this directory.
OUT = OUTPUT_ROOT
OUT.mkdir(parents=True, exist_ok=True)

# ----- Evaluate DIPPER and HC3 -----
# all_metrics collects one metrics dict per evaluated dataset; roc_entries keeps
# only the datasets for which ROC data could be computed.
all_metrics = []
roc_entries = []
eval_sets = [(dipper_val, "DIPPER_VAL"), (hc3_df, "HC3_VAL")]
for df, name in eval_sets:
    if df is None or len(df) == 0:
        print(f"Skipping {name} - no data")
        continue
    metrics, (labels, probs) = evaluate_and_save(model, tokenizer, df, name, str(OUT), batch_size=BATCH_SIZE)
    if not metrics:
        continue
    all_metrics.append(metrics)
    has_roc = metrics.get('auc') is not None and metrics.get('fpr') is not None
    if has_roc:
        roc_entries.append(dict(
            dataset=name,
            fpr=np.array(metrics['fpr']),
            tpr=np.array(metrics['tpr']),
            auc=metrics['auc'],
            accuracy=metrics['accuracy'],
        ))

summary_json_path = os.path.join(OUT, "summary.json")
with open(summary_json_path, "w") as fh:
    json.dump(all_metrics, fh, indent=2)
print(f"Saved summary.json to {OUT}")

# ----- Plot ROC curves comparison -----
# One curve per dataset in roc_entries plus the diagonal chance line. The legend is
# built from proxy handles so each entry can show AUC, TPR@1%FPR and accuracy.
if roc_entries:
    fig, ax = plt.subplots(figsize=(8, 6))
    cmap = plt.get_cmap('tab10')
    lines, labels_list = [], []
    for i, entry in enumerate(roc_entries):
        fpr, tpr = entry['fpr'], entry['tpr']
        if len(fpr) == 0 or len(tpr) == 0:
            continue

        color = cmap(i % 10)
        auc_val, acc_val = entry['auc'], entry['accuracy']
        try:
            tpr_at_1pct = float(np.interp(0.01, fpr, tpr))
        except Exception:
            # Falls back to 0.0 when the interpolation cannot be computed.
            tpr_at_1pct = 0.0

        ax.plot(fpr, tpr, label=None, color=color, linewidth=2)
        lines.append(plt.Line2D([0],[0], color=color, lw=2))
        labels_list.append(
            f"{entry['dataset']} (AUC={auc_val:.3f}, TPR@1%FPR={tpr_at_1pct:.3f}, Acc={acc_val:.3f})"
        )

    ax.plot([0,1],[0,1], linestyle='--', color='gray', linewidth=1.5, label='Random')
    ax.set_xlim([0.0,1.0])
    ax.set_ylim([0.0,1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
    random_handle = plt.Line2D([0],[0], color='gray', lw=1.5, linestyle='--')
    ax.legend(lines + [random_handle], labels_list + ['Random'], loc='lower right', fontsize=10)
    ax.grid(alpha=0.3)
    roc_path = os.path.join(OUT, 'roc_comparison.png')
    fig.tight_layout()
    fig.savefig(roc_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved {roc_path}")
else:
    print('No ROC data available for plotting.')

# ----- Plot performance metrics comparison -----
# Grouped bar chart with one group per dataset; drawn only when at least two
# datasets were evaluated.
if len(all_metrics) > 1:
    metric_cols = ['accuracy','precision','recall','f1','mcc','auc','tpr_at_1pct_fpr']
    comp_df = pd.DataFrame(all_metrics)
    comp_df_sorted = comp_df.set_index('dataset')

    fig, ax = plt.subplots(figsize=(12,6))
    comp_df_sorted[metric_cols].plot(kind='bar', ax=ax, width=0.8)
    ax.set_title('Performance Metrics Comparison Across Datasets', fontsize=14, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12)
    ax.set_xlabel('Dataset', fontsize=12)
    ax.set_ylim(0,1.05)
    # Legend is anchored outside the axes, to the right of the plot area.
    ax.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    comp_path = os.path.join(OUT, 'metrics_comparison.png')
    fig.savefig(comp_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved {comp_path}")

# ----- Print and save summary table -----
# The printed table shows the scalar columns only; the CSV receives every metrics field.
summary_df = pd.DataFrame(all_metrics)
if not summary_df.empty:
    display_cols = ['dataset','accuracy','precision','recall','f1','mcc',
                'auc','tpr_at_1pct_fpr','samples']
    rule = '='*80
    print('\n' + rule)
    print('SUMMARY TABLE')
    print(rule)
    print(summary_df[display_cols].round(4).to_string(index=False))
    print(rule)
    summary_csv_path = os.path.join(OUT, 'summary.csv')
    summary_df.to_csv(summary_csv_path, index=False)
    print(f"\nSaved summary table to {summary_csv_path}")

# ----- Evaluate custom CSV and create per-model analysis -----
def _lookup_model_names(results_df, source_csv):
    """Return one model name per row of ``results_df``, looked up by sentence content.

    The evaluation frame holds more rows than the source CSV once negatives have been
    added, so names cannot be attached by position. A row is matched first on its
    (sentence1, sentence2) pair and then on sentence2 alone; rows matching neither
    get 'unknown'.

    NOTE(review): the sentence2-only fallback assumes negatives reuse paraphrases
    from the CSV as their second sentence - confirm against add_negatives.
    """
    by_pair, by_candidate = {}, {}
    rows = zip(source_csv['original_sentence'].astype(str),
               source_csv['paraphrased_sentence'].astype(str),
               source_csv['model_name'].fillna('unknown'))
    for original, paraphrase, llm in rows:
        # setdefault keeps the first model seen when the same text occurs more than once.
        by_pair.setdefault((original, paraphrase), llm)
        by_candidate.setdefault(paraphrase, llm)
    pairs = zip(results_df['sentence1'].astype(str), results_df['sentence2'].astype(str))
    return [by_pair.get((s1, s2), by_candidate.get(s2, 'unknown')) for s1, s2 in pairs]


def _save_and_close(fig, filename, description):
    """Apply tight_layout, write ``fig`` under OUT at 300 dpi, close it and print the path."""
    fig.tight_layout()
    path = os.path.join(OUT, filename)
    fig.savefig(path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved {description} to {path}")
    return path


if custom_eval_df is not None and len(custom_eval_df) > 0:
    print("\n" + "="*80)
    print("CUSTOM CSV EVALUATION - PER-MODEL ANALYSIS")
    print("="*80)

    metrics_c, (labels_c, probs_c) = evaluate_and_save(model, tokenizer, custom_eval_df, "CUSTOM_CSV", str(OUT), batch_size=BATCH_SIZE)

    # Reuse the probabilities returned above instead of running inference again. The
    # earlier second pass dropped token_type_ids and autocast, so its predictions could
    # disagree with the metrics that were just reported. probs_c follows the row order
    # of custom_eval_df because the loader does not shuffle.
    report_df = custom_eval_df.copy().reset_index(drop=True)
    probs_arr = np.asarray(probs_c, dtype=float)
    preds_df = pd.DataFrame({
        'pred': (probs_arr > 0.5).astype(int),
        'prob_paraphrase': probs_arr,
    })
    display_df = pd.concat([report_df, preds_df], axis=1)

    # Load original CSV to get model names
    orig_csv = pd.read_csv(CUSTOM_CSV_PATH) if os.path.isfile(CUSTOM_CSV_PATH) else None
    needed_cols = {'model_name', 'original_sentence', 'paraphrased_sentence'}
    if orig_csv is not None and needed_cols.issubset(orig_csv.columns):
        display_df['model_name'] = _lookup_model_names(display_df, orig_csv)

        # Group once, in order of first appearance, and reuse the groups for every table and plot.
        model_groups = list(display_df.groupby('model_name', sort=False))

        model_metrics, error_data, model_cms = [], [], []
        for model_name, model_data in model_groups:
            labels_m = model_data['label'].astype(int).values
            preds_m = model_data['pred'].values

            acc = accuracy_score(labels_m, preds_m)
            p, r, f1, _ = precision_recall_fscore_support(labels_m, preds_m, average='binary', zero_division=0)
            cm = confusion_matrix(labels_m, preds_m, labels=[0, 1])
            # With labels=[0, 1], ravel() yields tn, fp, fn, tp in that order.
            tn, fp, fn, tp = (int(v) for v in cm.ravel())

            model_cms.append((model_name, cm))
            model_metrics.append({
                'Model': model_name,
                'Precision (%)': p * 100,
                'Recall (%)': r * 100,
                'F1-Score (%)': f1 * 100,
                'Accuracy (%)': acc * 100,
                'Samples': len(model_data)
            })
            error_data.append({
                'Model': model_name,
                'False Positives': fp,
                'False Negatives': fn,
                'True Positives': tp,
                'True Negatives': tn
            })

        model_perf_df = pd.DataFrame(model_metrics)
        error_df = pd.DataFrame(error_data)

        # Display and save per-model table
        print("\nPer-Model Performance:")
        print(model_perf_df.round(2).to_string(index=False))
        per_model_csv = os.path.join(OUT, "per_model_metrics.csv")
        model_perf_df.to_csv(per_model_csv, index=False)
        print(f"\nSaved per-model metrics to {per_model_csv}")

        # Plot per-model performance: four bars per model, centred on the tick
        fig, ax = plt.subplots(figsize=(10, 6))
        x = np.arange(len(model_perf_df))
        width = 0.2
        bar_specs = [(-1.5, 'Precision (%)', 'Precision'),
                     (-0.5, 'Recall (%)', 'Recall'),
                     (0.5, 'F1-Score (%)', 'F1-Score'),
                     (1.5, 'Accuracy (%)', 'Accuracy')]
        for offset, column, bar_label in bar_specs:
            ax.bar(x + offset * width, model_perf_df[column], width, label=bar_label, alpha=0.8)
        ax.set_ylabel('Score (%)', fontsize=12)
        ax.set_xlabel('Model', fontsize=12)
        ax.set_title('Paraphrase Detection Performance by LLM', fontsize=14, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(model_perf_df['Model'], rotation=45, ha='right')
        ax.legend()
        ax.set_ylim(0, 105)
        ax.grid(axis='y', alpha=0.3)
        model_plot_path = _save_and_close(fig, 'per_model_performance.png', 'per-model plot')

        # Plot average probability scores by model
        fig, ax = plt.subplots(figsize=(10, 6))
        model_probs = display_df.groupby('model_name')['prob_paraphrase'].agg(['mean', 'std'])
        model_probs = model_probs.sort_values('mean', ascending=False)
        # std is NaN for a model with a single sample; draw that as a zero-length error bar.
        ax.barh(model_probs.index, model_probs['mean'], xerr=model_probs['std'].fillna(0.0),
                capsize=5, alpha=0.7, color='steelblue')
        ax.set_xlabel('Average Paraphrase Probability', fontsize=12)
        ax.set_ylabel('Model', fontsize=12)
        ax.set_title('Average Paraphrase Detection Confidence by LLM', fontsize=14, fontweight='bold')
        ax.set_xlim(0, 1)
        ax.grid(axis='x', alpha=0.3)
        prob_plot_path = _save_and_close(fig, 'model_confidence_scores.png', 'confidence scores plot')

        # Heatmap of per-model confusion matrices
        n_models = len(model_cms)
        fig, axes = plt.subplots(1, n_models, figsize=(5*n_models, 4))
        # subplots returns a bare Axes for a single panel; normalise to an indexable array.
        axes = np.atleast_1d(axes)
        for idx, (model_name, cm) in enumerate(model_cms):
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                        xticklabels=['Not Para','Para'],
                        yticklabels=['Not Para','Para'],
                        cbar=False)
            axes[idx].set_title(f'{model_name}', fontsize=11, fontweight='bold')
            axes[idx].set_ylabel('True' if idx == 0 else '', fontsize=10)
            axes[idx].set_xlabel('Predicted', fontsize=10)
        fig.suptitle('Confusion Matrices by LLM', fontsize=14, fontweight='bold', y=1.02)
        cm_plot_path = _save_and_close(fig, 'model_confusion_matrices.png', 'confusion matrices plot')

        # Error analysis: false positives and false negatives by model
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        x = np.arange(len(error_df))
        width = 0.35

        ax1.bar(x - width/2, error_df['False Positives'], width, label='False Positives', color='salmon', alpha=0.8)
        ax1.bar(x + width/2, error_df['False Negatives'], width, label='False Negatives', color='lightcoral', alpha=0.8)
        ax1.set_title('Error Analysis: False Positives vs False Negatives', fontsize=12, fontweight='bold')

        ax2.bar(x - width/2, error_df['True Positives'], width, label='True Positives', color='mediumseagreen', alpha=0.8)
        ax2.bar(x + width/2, error_df['True Negatives'], width, label='True Negatives', color='lightgreen', alpha=0.8)
        ax2.set_title('Correct Predictions: True Positives vs True Negatives', fontsize=12, fontweight='bold')

        for panel in (ax1, ax2):
            panel.set_ylabel('Count', fontsize=11)
            panel.set_xlabel('Model', fontsize=11)
            panel.set_xticks(x)
            panel.set_xticklabels(error_df['Model'], rotation=45, ha='right')
            panel.legend()
            panel.grid(axis='y', alpha=0.3)
        error_plot_path = _save_and_close(fig, 'model_error_analysis.png', 'error analysis plot')

        # Distribution of confidence scores by model
        fig, ax = plt.subplots(figsize=(12, 6))
        for model_name, model_data in model_groups:
            ax.hist(model_data['prob_paraphrase'], bins=30, alpha=0.5, label=model_name, edgecolor='black')
        ax.set_xlabel('Paraphrase Probability', fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title('Distribution of Paraphrase Detection Confidence Scores by LLM', fontsize=14, fontweight='bold')
        # Draw the threshold before building the legend so its label is included.
        ax.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Decision Threshold')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)
        dist_plot_path = _save_and_close(fig, 'confidence_distribution.png', 'confidence distribution plot')

    # Save detailed per-sample results
    detailed_path = os.path.join(OUT, "custom_results_detailed.csv")
    display_df.to_csv(detailed_path, index=False)
    print(f"\nSaved detailed results to {detailed_path}")
    print(f"Total samples evaluated: {len(display_df)}")
else:
    print("\nNo custom CSV results to display.")

# Closing report: where the results were written and which artefacts the run can produce.
closing_rule = "="*80
generated_file_notes = (
    "  - summary.json, summary.csv: Overall metrics across datasets",
    "  - roc_comparison.png: ROC curves for DIPPER and HC3",
    "  - metrics_comparison.png: Bar chart comparing all metrics",
    "  - per_model_metrics.csv: Performance table by LLM",
    "  - per_model_performance.png: Bar chart of metrics by LLM",
    "  - model_confidence_scores.png: Average confidence by LLM",
    "  - model_confusion_matrices.png: Confusion matrices grid",
    "  - model_error_analysis.png: False positive/negative analysis",
    "  - confidence_distribution.png: Probability distributions",
    "  - custom_results_detailed.csv: Per-sample predictions",
)

print("\n" + closing_rule)
print("EVALUATION COMPLETE")
print(closing_rule)
print(f"All results saved to: {OUT}")
print("\nGenerated files:")
for note in generated_file_notes:
    print(note)
print(closing_rule)

Loading model/tokenizer: viswadarshan06/pd-mpnet ...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

[load_dipper_val] Searching for DIPPER valid/dev files under: /content/dipper_data
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_no_ctx.tsv -> 1483
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx.tsv -> 1479
  ✓ found DIPPER valid-like: par3/gt_translator/sents_8/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_all.tsv -> 1484
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx.tsv -> 1478
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx.tsv -> 1474
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all_small.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_ctx_all.tsv -> 1500
  ✓ found DIPPER valid-like: par3/gt_translator/sents_2/valid_ctrl_no_ctx_all_small.tsv -> 1484
  ✓ found DIPPER valid-like: par3/gt

  if pd.isna(x): return []
  if pd.isna(x): return []


  HC3 train: no_ctx=60205 ctx=60205
Built HC3-eval: 8000 samples
Loaded custom CSV: 90 (pos/neg: {1: 60, 0: 30})

Evaluating DIPPER_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/pd_mpnet/dipper_val_confusion_matrix.png
Saved /content/output/pd_mpnet/dipper_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.38      1.00      0.55      2673
    Paraphrase       0.99      0.17      0.29      5327

      accuracy                           0.45      8000
     macro avg       0.68      0.58      0.42      8000
  weighted avg       0.79      0.45      0.38      8000


Evaluating HC3_VAL (8000 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/pd_mpnet/hc3_val_confusion_matrix.png
Saved /content/output/pd_mpnet/hc3_val_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.33      1.00      0.50      2664
    Paraphrase       1.00      0.00      0.00      5336

      accuracy                           0.33      8000
     macro avg       0.67      0.50      0.25      8000
  weighted avg       0.78      0.33      0.17      8000

Saved summary.json to /content/output/pd_mpnet

Evaluating CUSTOM_CSV (90 samples)


  with torch.cuda.amp.autocast(enabled=USE_MIXED_PRECISION and torch.cuda.is_available(), dtype=amp_dtype):


Saved /content/output/pd_mpnet/custom_csv_confusion_matrix.png
Saved /content/output/pd_mpnet/custom_csv_confusion_matrix_normalized.png
                precision    recall  f1-score   support

Not Paraphrase       0.85      0.93      0.89        30
    Paraphrase       0.96      0.92      0.94        60

      accuracy                           0.92        90
     macro avg       0.91      0.93      0.91        90
  weighted avg       0.93      0.92      0.92        90



Unnamed: 0,sentence1,sentence2,label,pred,prob_paraphrase
0,The possibility of approximating a continuous ...,Several studies have examined the ability of a...,1,1,0.875344
1,State-of-the-art object detection networks dep...,Leading object detection networks rely on regi...,1,1,0.995918
2,I swear I wasn’t going to do a “Top Whatevers ...,"I had resolved not to write a ""Top Whatever"" b...",1,1,0.984598
3,Our evolutionary history suggests that there w...,"Indeed, science and technology are distinctly ...",0,0,0.000394
4,The possibility of approximating a continuous ...,Numerous research papers have investigated the...,1,1,0.998125
...,...,...,...,...,...
85,The possibility of approximating a continuous ...,A feedforward neural network with one hidden l...,1,1,0.997959
86,"In fact, science and technology are clearly di...","In a recent opinion piece in The Conversation,...",0,0,0.000352
87,I swear I wasn’t going to do a “Top Whatevers ...,We investigate a new machine learning problem ...,0,0,0.000281
88,Our evolutionary history suggests that there w...,Our evolutionary history indicates that there ...,1,1,0.998010


Saved custom_results.csv to /content/output/pd_mpnet


## Archive the evaluation outputs

In [None]:
# Recursively pack everything under /content/output into the single archive /content/output.zip.
!zip -r /content/output.zip /content/output

  adding: content/output/ (stored 0%)
  adding: content/output/deberta_v3_mrpc/ (stored 0%)
  adding: content/output/deberta_v3_mrpc/hc3_val_metrics.json (deflated 39%)
  adding: content/output/deberta_v3_mrpc/custom_csv_confusion_matrix.png (deflated 18%)
  adding: content/output/deberta_v3_mrpc/custom_csv_confusion_matrix_normalized.png (deflated 16%)
  adding: content/output/deberta_v3_mrpc/custom_results.csv (deflated 87%)
  adding: content/output/deberta_v3_mrpc/summary.json (deflated 59%)
  adding: content/output/deberta_v3_mrpc/dipper_val_metrics.json (deflated 37%)
  adding: content/output/deberta_v3_mrpc/dipper_val_confusion_matrix.png (deflated 17%)
  adding: content/output/deberta_v3_mrpc/hc3_val_confusion_matrix_normalized.png (deflated 17%)
  adding: content/output/deberta_v3_mrpc/dipper_val_confusion_matrix_normalized.png (deflated 16%)
  adding: content/output/deberta_v3_mrpc/hc3_val_confusion_matrix.png (deflated 17%)
  adding: content/output/deberta_v3_mrpc/custom_csv_