In [1]:
# Make the notebooks/ folder importable so the project-local `_utils` helper resolves.
# NOTE(review): this relies on the kernel's working directory being the parent of
# notebooks/ -- confirm how the notebook is launched.
import sys
from pathlib import Path
sys.path.append(str(Path.cwd() / "notebooks"))  # so we can import _utils from notebooks/

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from _utils import get_data_dir

# Data root used by every later cell, as returned by the project helper.
DATA_DIR = get_data_dir()
DATA_DIR  # bare expression so the notebook echoes the resolved path


WindowsPath('C:/Users/AdamR/OneDrive/UCSB/VIU/HonorsThesis/data')

In [2]:
# === Load human + model CSVs for 50_50, 80_20, and 100_0 datasets ===
# pathlib / pandas are imported again here; they are already imported in the first cell.
from pathlib import Path
import pandas as pd

ROOT = DATA_DIR  # data root computed in the first cell via get_data_dir()
FOLDERS = ["50_50", "80_20", "100_0"]  # sub-folders of ROOT, one per dataset to load

def safe_read_csv(path: Path, **kwargs) -> pd.DataFrame:
    """Read a CSV with pandas' default parser, retrying with delimiter sniffing.

    The first attempt calls ``pandas.read_csv`` with ``low_memory=False`` and
    ``encoding_errors="ignore"`` (both overridable through ``kwargs``). If that
    attempt fails for any reason other than an OS-level error, the file is
    re-read with ``engine="python"`` and ``sep=None`` so pandas sniffs the
    delimiter.

    Args:
        path: Location of the CSV file.
        **kwargs: Extra ``pandas.read_csv`` options; they take precedence over
            the defaults above.

    Returns:
        The parsed DataFrame.

    Raises:
        OSError: If the file is missing or unreadable; retrying cannot help,
            so the original error is propagated unchanged.
        Exception: Whatever the fallback parse raises if it fails as well.
    """
    options = {"low_memory": False, "encoding_errors": "ignore", **kwargs}
    try:
        return pd.read_csv(path, **options)
    except OSError:
        # Missing / unreadable file: a second parse attempt would only mask the cause.
        raise
    except Exception:
        # The python engine rejects C-parser-only options, so drop them before
        # retrying; otherwise the fallback itself fails with a ValueError.
        c_only = {"low_memory", "float_precision"}
        fallback = {key: value for key, value in options.items() if key not in c_only}
        # Set via update() so a caller-supplied sep/engine cannot trigger a
        # duplicate-keyword TypeError.
        fallback.update(engine="python", sep=None)
        return pd.read_csv(path, **fallback)

def load_dataset(folder: Path) -> dict:
    """Load human_data.csv and all model decision CSVs inside 'decisions/'.

    Args:
        folder: Dataset folder expected to contain ``human_data.csv`` and a
            ``decisions/`` sub-folder with one CSV per model.

    Returns:
        A dict with four entries:
          - ``"human"``: DataFrame read from ``human_data.csv``
          - ``"human_path"``: path of that file
          - ``"models"``: ``{file stem -> DataFrame}`` for every ``*.csv`` in
            ``decisions/``, in sorted path order
          - ``"model_paths"``: ``{file stem -> path}`` with the same keys

    Raises:
        FileNotFoundError: If ``human_data.csv`` or ``decisions/`` is missing.
    """
    human_path = folder / "human_data.csv"
    decisions_dir = folder / "decisions"

    if not human_path.exists():
        raise FileNotFoundError(f"Missing human_data.csv in {folder}")
    if not decisions_dir.exists():
        raise FileNotFoundError(f"Missing 'decisions/' subfolder in {folder}")

    # Load human data
    human_df = safe_read_csv(human_path)

    # Load each model file, recording its path in the same pass so the two
    # dicts can never disagree (no second directory scan).
    models = {}
    model_paths = {}
    for csv_path in sorted(decisions_dir.glob("*.csv")):
        model_name = csv_path.stem
        models[model_name] = safe_read_csv(csv_path)
        model_paths[model_name] = csv_path

    return {
        "human": human_df,
        "human_path": human_path,
        "models": models,
        "model_paths": model_paths,
    }

# === Main loading loop ===
# Loads every folder listed in FOLDERS that exists under ROOT, keeps the loaded
# bundles in `datasets`, and builds one index row per file that was read.
datasets: dict[str, dict] = {}
records = []

for name in FOLDERS:
    folder = ROOT / name
    if not folder.exists():
        print(f"⚠️ Warning: Folder '{name}' not found under {ROOT}")
        continue

    data_bundle = load_dataset(folder)
    datasets[name] = data_bundle

    # Record human file
    h = data_bundle["human"]
    records.append({
        "dataset": name,
        "kind": "human",
        "name": "human_data",
        "n_rows": len(h),
        "n_cols": h.shape[1],
        "path": str(data_bundle["human_path"].resolve())
    })

    # Record model files (use the path load_dataset recorded rather than rebuilding it)
    for mname, mdf in data_bundle["models"].items():
        records.append({
            "dataset": name,
            "kind": "model",
            "name": mname,
            "n_rows": len(mdf),
            "n_cols": mdf.shape[1],
            "path": str(data_bundle["model_paths"][mname].resolve())
        })

# === Summary table ===
# Explicit columns keep the sort below valid even when no folder was found
# (an empty `records` list would otherwise produce a frame without columns).
assignment_index = (
    pd.DataFrame(records, columns=["dataset", "kind", "name", "n_rows", "n_cols", "path"])
      .sort_values(["dataset", "kind", "name"])
      .reset_index(drop=True)
)

print(f"✅ Loaded datasets: {list(datasets.keys())}")


✅ Loaded datasets: ['50_50', '80_20', '100_0']


In [3]:
import pandas as pd
import glob
from pathlib import Path

# Stack every model-decision CSV into one long frame, tagging each row with the
# condition folder and the model (file stem) it came from.
root = DATA_DIR  # contains folders 50_50, 80_20, 100_0
conditions = ["50_50", "80_20", "100_0"]
all_conditions = {}

for cond in conditions:
    # sorted(): glob returns files in arbitrary order, which would make the
    # row order of the stacked frame differ between machines.
    files = sorted(glob.glob(str(root / cond / "decisions" / "*.csv")))
    if not files:
        # Fail with the offending folder instead of pandas' generic
        # "No objects to concatenate".
        raise FileNotFoundError(
            f"No model decision CSVs found in {root / cond / 'decisions'}"
        )
    dfs = []
    for f in files:
        model_name = Path(f).stem
        df = pd.read_csv(f)
        df["condition"] = cond
        df["model_name"] = model_name
        dfs.append(df)
    all_conditions[cond] = pd.concat(dfs, ignore_index=True)

# Optional single master DataFrame:
models_df = pd.concat(all_conditions.values(), ignore_index=True)

models_df.head()

Unnamed: 0,image_id,side_selected,cue_points,line1_angle,line2_angle,valid_cue,TP,GPT_response,condition,model_name
0,100,1,2,14.314827,1.921956,False,True,present,50_50,0responses_claude-3-5-haiku-20241022
1,845,1,2,15.054317,4.22223,False,True,absent,50_50,0responses_claude-3-5-haiku-20241022
2,245,1,1,14.314827,6.508956,True,True,absent,50_50,0responses_claude-3-5-haiku-20241022
3,72,2,2,8.775056,15.054317,True,True,present,50_50,0responses_claude-3-5-haiku-20241022
4,469,2,2,4.22223,19.885165,True,True,absent,50_50,0responses_claude-3-5-haiku-20241022


In [4]:
# Summary: Unique Value Counts per Column

def summarize_uniques(df: pd.DataFrame, max_display: int = 12):
    """
    Print a per-column profile of the distinct values in ``df``.

    For each column this reports:
    - the number of distinct non-null values
    - up to ``max_display`` of those values, in order of first appearance
    - the ``max_display`` most frequent values (nulls are counted too)

    Nothing is returned; output goes to stdout and the notebook display.
    """
    print(f"\n=== UNIQUE VALUE SUMMARY: n_rows = {len(df):,} ===\n")
    for col in df.columns:
        column = df[col]
        n_unique = column.nunique(dropna=True)
        shown = column.dropna().unique()[:max_display].tolist()

        print(f"\n▶ Column: '{col}'")
        print(f"   • Unique count: {n_unique}")
        if n_unique > max_display:
            # Too many to list in full: flag the truncation with a trailing ellipsis.
            print(f"   • First {max_display} unique values: {shown} ...")
        else:
            print(f"   • Unique values: {shown}")

        # Frequency table, truncated to the most common entries.
        print("   • Value counts:")
        display(column.value_counts(dropna=False).head(max_display))

# Example usage: profile every column of the stacked model-decision frame
# (models_df is the concatenation of all per-condition decision CSVs built above).
summarize_uniques(models_df)



=== UNIQUE VALUE SUMMARY: n_rows = 36,000 ===


▶ Column: 'image_id'
   • Unique count: 1000
   • First 12 unique values: [100, 845, 245, 72, 469, 468, 923, 646, 672, 275, 712, 448] ...
   • Value counts:


image_id
100    36
680    36
679    36
973    36
520    36
531    36
744    36
630    36
93     36
840    36
123    36
484    36
Name: count, dtype: int64


▶ Column: 'side_selected'
   • Unique count: 2
   • Unique values: [1, 2]
   • Value counts:


side_selected
1    18504
2    17496
Name: count, dtype: int64


▶ Column: 'cue_points'
   • Unique count: 2
   • Unique values: [2, 1]
   • Value counts:


cue_points
1    18492
2    17508
Name: count, dtype: int64


▶ Column: 'line1_angle'
   • Unique count: 40
   • First 12 unique values: [14.31482691040488, 15.05431655960486, 8.775055744479694, 4.222230206142679, 6.5089564405024944, 13.392497753751124, 10.40771131249005, 19.885165113855454, 2.6897703231505687, 16.735995851476087, 7.266954405811635, 4.986333771235535] ...
   • Value counts:


line1_angle
4.222230     2520
4.986334     2340
3.456619     2196
7.266954     2124
5.748663     2088
6.508956     2052
8.022404     1692
2.689770     1476
1.921956     1404
8.775056     1296
11.908300    1296
9.651833     1188
Name: count, dtype: int64


▶ Column: 'line2_angle'
   • Unique count: 42
   • First 12 unique values: [1.921955958931289, 4.222230206142679, 6.5089564405024944, 15.05431655960486, 19.885165113855454, 17.468023251277728, 8.775055744479694, -1.1534504511055996, 16.735995851476087, 2.6897703231505687, -3.4566191720419983, 14.31482691040488] ...
   • Value counts:


line2_angle
4.986334     2736
5.748663     2592
6.508956     2160
4.222230     2160
7.266954     2160
3.456619     2088
1.921956     1872
2.689770     1836
8.022404     1656
8.775056     1404
1.153450     1224
11.159944    1044
Name: count, dtype: int64


▶ Column: 'valid_cue'
   • Unique count: 2
   • Unique values: [False, True]
   • Value counts:


valid_cue
True     23076
False    12924
Name: count, dtype: int64


▶ Column: 'TP'
   • Unique count: 2
   • Unique values: [True, False]
   • Value counts:


TP
True     18000
False    18000
Name: count, dtype: int64


▶ Column: 'GPT_response'
   • Unique count: 2
   • Unique values: ['present', 'absent']
   • Value counts:


GPT_response
present    19149
absent     16851
Name: count, dtype: int64


▶ Column: 'condition'
   • Unique count: 3
   • Unique values: ['50_50', '80_20', '100_0']
   • Value counts:


condition
50_50    12000
80_20    12000
100_0    12000
Name: count, dtype: int64


▶ Column: 'model_name'
   • Unique count: 12
   • Unique values: ['0responses_claude-3-5-haiku-20241022', '0responses_claude-3-7-sonnet-20250219', '0responses_claude-opus-4-20250514', '0responses_claude-sonnet-4-20250514', '0responses_gemini-2.5-flash-lite-preview-06-17', '0responses_gemini-2.5-flash', '0responses_gemini-2.5-pro', '0responses_gpt-4.1-2025-04-14', '0responses_gpt-5-2025-08-07', '0responses_gpt-5-mini-2025-08-07', '0responses_o3-2025-04-16', '0responses_o4-mini-2025-04-16']
   • Value counts:


model_name
0responses_claude-3-5-haiku-20241022              3000
0responses_claude-3-7-sonnet-20250219             3000
0responses_claude-opus-4-20250514                 3000
0responses_claude-sonnet-4-20250514               3000
0responses_gemini-2.5-flash-lite-preview-06-17    3000
0responses_gemini-2.5-flash                       3000
0responses_gemini-2.5-pro                         3000
0responses_gpt-4.1-2025-04-14                     3000
0responses_gpt-5-2025-08-07                       3000
0responses_gpt-5-mini-2025-08-07                  3000
0responses_o3-2025-04-16                          3000
0responses_o4-mini-2025-04-16                     3000
Name: count, dtype: int64

In [None]:
# Compute model hit and false alarm rates

def compute_model_hit_fa(MODELS_DF: pd.DataFrame):
    """Compute per-model hit and false-alarm rates, plus a per-condition summary.

    A "hit" is a ``present`` response on a row where ``TP`` is 1; a "false
    alarm" is a ``present`` response on a row where ``TP`` is 0. Responses are
    lower-cased and stripped, and ``"true"``/``"false"`` are treated as
    ``"present"``/``"absent"``. Any other response value maps to NaN and is
    skipped by the means.

    Args:
        MODELS_DF: Long frame with columns ``condition``, ``model_name``,
            ``TP`` and ``GPT_response``. It is not modified.

    Returns:
        ``(results, summary)`` where
          - ``results`` has one row per (condition, model_name) with columns
            ``hit_rate`` and ``fa_rate`` (NaN if that model has no rows of the
            relevant truth value), and
          - ``summary`` has one row per condition with the mean and the
            standard error (ddof=1) of both rates across models.

    Side effects:
        Prints a header and displays ``summary`` rounded to 3 decimals.
    """
    df = MODELS_DF.copy()
    df["GPT_response"] = (
        df["GPT_response"].astype(str).str.strip().str.lower()
        .replace({"true": "present", "false": "absent"})
    )
    df["present_choice"] = df["GPT_response"].map({"present": 1, "absent": 0})
    df["TP"] = pd.to_numeric(df["TP"], errors="coerce").astype(int)

    # P(present | truth) per model: group by truth as well and pivot it into
    # columns. This replaces groupby.apply with a Series-returning lambda,
    # which pandas has deprecated for frames that still hold the grouping columns.
    rate_by_truth = (
        df.groupby(["condition", "model_name", "TP"])["present_choice"]
          .mean()
          .unstack("TP")
          .reindex(columns=[1, 0])  # guarantees both columns exist, hits first
    )
    results = (
        rate_by_truth
        .set_axis(["hit_rate", "fa_rate"], axis=1)
        .reset_index()
    )

    # Mean ± SEM across models per condition
    summary = (
        results.groupby("condition", as_index=False)
               .agg(hit_mean=("hit_rate", "mean"),
                    hit_sem=("hit_rate", "sem"),
                    fa_mean=("fa_rate", "mean"),
                    fa_sem=("fa_rate", "sem"))
    )

    print("=== Model Hit / False Alarm Rates ===")
    display(summary.round(3))
    return results, summary


# model_rates: one row per (condition, model); model_summary: mean and SEM across models per condition.
model_rates, model_summary = compute_model_hit_fa(models_df)


=== Model Hit / False Alarm Rates ===


  .apply(lambda g: pd.Series({


Unnamed: 0,condition,hit_mean,hit_sem,fa_mean,fa_sem
0,100_0,0.501,0.04,0.536,0.035
1,50_50,0.513,0.045,0.544,0.044
2,80_20,0.536,0.051,0.561,0.048


In [6]:
# === Majority-vote accuracy for MODEL ENSEMBLES (schema: image_id, TP, GPT_response) ===
# Uses the 'datasets' dict you created earlier.
# Each CSV in datasets[COND]["models"] is one "participant" in the ensemble.

def compute_model_majority_vote(
    model_dfs: dict[str, pd.DataFrame],
    condition_label: str,
    trial_col: str = "image_id",
    truth_col: str = "TP",
    response_col: str = "GPT_response",
    tie_rule: str = ">"   # use ">=" to break ties toward "present"
) -> pd.DataFrame:
    """
    Build a per-trial table by majority-voting across models.

    Each frame in ``model_dfs`` is one voter. Responses are stripped and
    lower-cased, then mapped to 1 ("present", "1", "yes", "true") or
    0 ("absent", "0", "no", "false"). A trial is predicted "present" when the
    proportion of present votes is above 0.5 (``tie_rule=">"``) or at least
    0.5 (``tie_rule=">="``).

    Args:
        model_dfs: ``{model name -> decisions frame}``; every frame must have
            ``trial_col``, ``truth_col`` and ``response_col``.
        condition_label: Label written to the ``condition`` column and used in messages.
        trial_col: Column identifying the trial.
        truth_col: Column holding the ground truth (converted to int).
        response_col: Column holding the model's response.
        tie_rule: ``">"`` sends an exact 50/50 split to "absent", ``">="`` to "present".

    Returns:
        DataFrame with columns [trial_col, pred, truth, condition, n_models, correct].

    Raises:
        ValueError: If ``model_dfs`` is empty, ``tie_rule`` is invalid, or a
            response value cannot be mapped.
        KeyError: If a frame lacks one of the required columns.

    Side effects:
        Prints the ensemble accuracy, and a warning when the truth value of a
        trial differs between files.
    """
    if not model_dfs:
        raise ValueError(f"No model CSVs found for condition {condition_label}.")
    # Validate up front so a typo fails before any frames are processed.
    if tie_rule not in {">", ">="}:
        raise ValueError("tie_rule must be '>' or '>='.")

    # 1) Stack all model decisions
    recs = []
    # "true"/"false" are accepted so boolean-coded responses are handled the
    # same way as in the other analysis functions of this notebook.
    mapping = {
        "present": 1, "absent": 0,
        "1": 1, "0": 0,
        "yes": 1, "no": 0,
        "true": 1, "false": 0,
    }
    for model_name, df in model_dfs.items():
        for col in (trial_col, truth_col, response_col):
            if col not in df.columns:
                raise KeyError(f"[{condition_label} / {model_name}] missing column: '{col}'")

        tmp = df[[trial_col, truth_col, response_col]].copy()
        # Normalize responses to binary
        tmp[response_col] = tmp[response_col].astype(str).str.strip().str.lower()
        tmp["model_decision"] = tmp[response_col].map(mapping)

        if tmp["model_decision"].isna().any():
            bad_vals = sorted(tmp.loc[tmp["model_decision"].isna(), response_col].unique().tolist())
            raise ValueError(f"[{condition_label} / {model_name}] Unmapped GPT_response values: {bad_vals}")

        tmp["model_name"] = model_name
        recs.append(tmp[[trial_col, truth_col, "model_name", "model_decision"]])

    stacked = pd.concat(recs, ignore_index=True)

    # 2) Majority vote across models per trial
    prop_present = stacked.groupby(trial_col)["model_decision"].mean()
    voted_present = (prop_present >= 0.5) if tie_rule == ">=" else (prop_present > 0.5)
    trial_pred = voted_present.astype(int).rename("pred")

    # 3) Truth per trial (mode fallback, warn if inconsistent)
    def _mode_or_rounded_mean(values):
        # Most frequent truth value; falls back to the rounded mean when the
        # mode is empty.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else int(round(values.mean()))

    tp_grp = stacked.groupby(trial_col)[truth_col]
    trial_truth = tp_grp.agg(_mode_or_rounded_mean).astype(int)

    n_inconsistent = int((tp_grp.nunique() > 1).sum())
    if n_inconsistent > 0:
        print(f"⚠️ {condition_label}: {n_inconsistent} trial(s) had inconsistent TP across files; using per-trial mode.")

    # 4) Assemble trial table and compute accuracy
    tt = pd.concat([trial_pred, trial_truth.rename("truth")], axis=1).reset_index()
    tt["condition"] = condition_label
    tt["n_models"] = len(model_dfs)
    tt["correct"] = (tt["pred"] == tt["truth"]).astype(int)

    acc = tt["correct"].mean()
    print(f"{condition_label}: model-ensemble majority accuracy = {acc:.4f} | trials={len(tt)} | models={len(model_dfs)}")
    return tt


# === Score the majority-vote ensemble for every dataset loaded earlier ===
# Column names and the tie rule are identical for all conditions, so they are
# collected once and reused for each call.
vote_settings = {
    "trial_col": "image_id",
    "truth_col": "TP",
    "response_col": "GPT_response",
    "tie_rule": ">",     # change to ">=" to tie-break toward "present"
}

model_results = []
for cond_label, bundle in datasets.items():  # e.g., "50_50", "80_20", "100_0"
    tt = compute_model_majority_vote(
        model_dfs=bundle["models"],
        condition_label=cond_label,
        **vote_settings,
    )
    model_results.append(tt)

# One row per (condition, trial) across all conditions.
combined_models = pd.concat(model_results, ignore_index=True)

# Summary: pooled accuracy, then trial count and accuracy per condition.
overall_acc = combined_models["correct"].mean()
per_condition_groups = combined_models.groupby("condition", as_index=False)
by_condition = per_condition_groups.agg(
    trials=("image_id", "nunique"),
    accuracy=("correct", "mean"),
).sort_values("condition")

print(f"\nCombined model-ensemble accuracy across all conditions: {overall_acc:.4f}")
display(by_condition)


50_50: model-ensemble majority accuracy = 0.4860 | trials=1000 | models=12
80_20: model-ensemble majority accuracy = 0.4770 | trials=1000 | models=12
100_0: model-ensemble majority accuracy = 0.4770 | trials=1000 | models=12

Combined model-ensemble accuracy across all conditions: 0.4800


Unnamed: 0,condition,trials,accuracy
0,100_0,1000,0.477
1,50_50,1000,0.486
2,80_20,1000,0.477


In [7]:
import numpy as np
import pandas as pd

# =========================
# WLC for MODELS (binary)
# =========================
def wlc_models(
    MODELS_DF: pd.DataFrame,
    center_by_model: bool = True,
    standardize: bool = False,
    reg_lambda: float = 1e-6,
    tie_rule: str = ">",          # '>' => equal-threshold -> absent ; '>=' => equal -> present
    fit_per_condition: bool = False  # if True: fit separate WLC per condition
):
    """Fit a weighted linear combination (WLC) of binary model decisions.

    Steps, per fit:
      1. Pivot the decisions into a trials x models matrix ``R``
         (1 = present, 0 = absent; a trial is an (image_id, condition) pair).
      2. For each model compute ``Delta_mu = mean(pred | TP=1) - mean(pred | TP=0)``.
      3. Optionally centre / standardise the columns of ``R``, estimate the
         model covariance ``Sigma`` and solve
         ``(Sigma + reg_lambda * I) w = Delta_mu`` for the weights ``w``.
      4. Form the decision variable ``D = X @ w`` and pick the threshold that
         maximises accuracy on these same trials (so the reported accuracy is
         in-sample).

    Args:
        MODELS_DF: Long frame with columns ``image_id``, ``condition``,
            ``model_name``, ``GPT_response`` and ``TP``. It is not modified.
        center_by_model: Subtract each model's mean decision before computing
            the covariance and ``D``.
        standardize: Divide each model's column by its standard deviation
            (zero-variance columns become NaN and are then filled with 0).
        reg_lambda: Ridge term added to the diagonal of ``Sigma``.
        tie_rule: ``">"`` predicts present when ``D > threshold``, ``">="``
            when ``D >= threshold``.
        fit_per_condition: Fit one WLC per condition instead of one pooled fit.

    Returns:
        A dict. With ``fit_per_condition=False`` it has the single key
        ``"ALL"``; otherwise one key per condition plus
        ``"__summary_per_condition__"`` (a DataFrame of the per-fit summaries).
        Each fit is a dict with ``trial_table``, ``by_condition``,
        ``weights_df``, ``R_matrix``, ``X_centered``, ``Sigma``, ``w`` and
        ``summary``.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If ``tie_rule`` is not ``">"`` or ``">="``.
    """
    # ---------- Sanity checks ----------
    req = {"image_id","condition","model_name","GPT_response","TP"}
    missing = req - set(MODELS_DF.columns)
    if missing:
        raise KeyError(f"Missing required columns: {sorted(missing)}")
    # Reject typos explicitly; otherwise any unknown value would silently
    # behave like '>'.
    if tie_rule not in {">", ">="}:
        raise ValueError("tie_rule must be '>' or '>='.")

    df = MODELS_DF.copy()

    # Normalize GPT_response -> binary (1=present, 0=absent)
    df["GPT_response"] = (
        df["GPT_response"].astype(str).str.strip().str.lower()
        .replace({"true":"present","false":"absent"})
    )
    df["pred_bin"] = df["GPT_response"].map({"present":1, "absent":0}).astype("float")

    # Coerce TP
    df["TP"] = pd.to_numeric(df["TP"], errors="coerce").astype(int)

    # ---------- Enforce uniqueness at (image_id, condition, model_name) ----------
    # If a model has multiple rows on the same trial, average its binary predictions (still in [0,1]).
    df_uni = (
        df.groupby(["image_id","condition","model_name"], as_index=False)
          .agg(pred_bin=("pred_bin","mean"))
    )

    def _mode_or_rounded_mean(values):
        # Most frequent truth value; falls back to the rounded mean when the
        # mode is empty.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else int(round(values.mean()))

    # Truth per (image_id, condition)
    truth_ic = (
        df.groupby(["image_id","condition"])["TP"]
          .agg(_mode_or_rounded_mean)
          .astype(int)
    )

    # Key of every raw row, used to select the rows belonging to a given fit.
    row_keys = df.set_index(["image_id","condition"]).index

    # Helper: fit WLC on a given subset (optionally per condition)
    def _fit_core(df_uni_sub, truth_sub, cond_label="ALL"):
        # ----- Matrix R: trials x models -----
        R = df_uni_sub.pivot_table(
                index=["image_id","condition"],
                columns="model_name",
                values="pred_bin",
                aggfunc="mean"
            ).sort_index()

        # Align truth to R
        y = truth_sub.reindex(R.index).to_numpy()

        # ----- Δμ by model: mean(pred|TP=1) - mean(pred|TP=0) -----
        # Compute from the long df (not just R) to use all raw rows
        mask = row_keys.isin(R.index)
        df_sub = df.loc[mask, ["model_name","TP","pred_bin"]].copy()

        means_by_tp = (
            df_sub.groupby(["model_name","TP"])["pred_bin"]
                  .mean().unstack("TP")  # columns {0,1}
        )
        if 0 not in means_by_tp.columns: means_by_tp[0] = np.nan
        if 1 not in means_by_tp.columns: means_by_tp[1] = np.nan
        # A model lacking one truth class gets Δμ = 0, i.e. zero target weight.
        Delta_mu = (means_by_tp[1] - means_by_tp[0]).fillna(0.0)
        Delta_mu = Delta_mu.reindex(R.columns).fillna(0.0).to_numpy(dtype=float)  # (P,)

        # ----- Prepare X for covariance -----
        X = R.copy()  # rows=trials, cols=models
        if center_by_model:
            X = X.subtract(X.mean(axis=0, skipna=True), axis=1)
        if standardize:
            col_stds = X.std(axis=0, ddof=1, skipna=True).replace(0, np.nan)
            X = X.divide(col_stds, axis=1)
        # Missing model predictions -> 0. With center_by_model=True this is the
        # model's own mean; without centering it is a literal 0.
        X = X.fillna(0.0)
        X_np = X.to_numpy(dtype=float)
        N, P = X_np.shape

        # ----- Covariance & weights -----
        Sigma = np.cov(X_np, rowvar=False, ddof=1) if P > 1 else np.array([[np.var(X_np, ddof=1)]])
        Sigma_reg = Sigma + reg_lambda * np.eye(P)
        try:
            w = np.linalg.solve(Sigma_reg, Delta_mu)
        except np.linalg.LinAlgError:
            # Singular even after regularisation: use the pseudo-inverse.
            w = np.linalg.pinv(Sigma_reg) @ Delta_mu

        # ----- Decision variable & threshold sweep -----
        D = X_np @ w
        D_sorted = np.unique(D)
        if len(D_sorted) == 1:
            # Every trial has the same D: predict the more common truth value.
            crit_best = D_sorted[0]
            pred = np.full_like(D, fill_value=int(y.mean() >= 0.5))
            best_acc = np.mean(pred == y)
        else:
            # Candidate thresholds: midpoints between consecutive distinct D
            # values, plus ±inf for the all-present / all-absent rules.
            mids = (D_sorted[:-1] + D_sorted[1:]) / 2.0
            cands = np.concatenate(([-np.inf], mids, [np.inf]))
            best_acc, crit_best, pred = -1.0, None, None
            for c in cands:
                p = (D >= c).astype(int) if tie_rule == ">=" else (D > c).astype(int)
                acc = (p == y).mean()
                if acc > best_acc:  # strict: the lowest best threshold wins ties
                    best_acc, crit_best, pred = acc, c, p

        # ----- Tables -----
        trial_table = (
            pd.DataFrame({
                "D": D,
                "pred": pred.astype(int),
                "truth": y,
            }, index=R.index)
            .reset_index()
            .assign(correct=lambda t: (t["pred"] == t["truth"]).astype(int))
        )

        by_condition = (
            trial_table.groupby("condition", as_index=False)
                       .agg(trials=("image_id","nunique"),
                            accuracy=("correct","mean"),
                            D_mean=("D","mean"),
                            D_std=("D","std"))
                       .sort_values("condition")
        )

        Delta_mu_series = pd.Series(Delta_mu, index=R.columns, name="Delta_mu")
        weights_df = (
            pd.DataFrame({"weight": w}, index=R.columns)
              .join(Delta_mu_series, how="left")
              .reset_index().rename(columns={"index":"model_name"})
              .assign(Delta_mu=lambda d: d["Delta_mu"].astype(float))
              .sort_values("weight", ascending=False)
              .reset_index(drop=True)
        )

        # Rank of the unregularised covariance.
        rank_sigma = int(np.linalg.matrix_rank(Sigma))
        summary = {
            "condition_fit": cond_label,
            "trials": int(N),
            "n_models": int(P),
            "rank_Sigma": rank_sigma,
            "lambda": reg_lambda,
            "best_threshold": float(crit_best),
            "overall_accuracy": float(trial_table["correct"].mean()),
        }

        return {
            "trial_table": trial_table,
            "by_condition": by_condition,
            "weights_df": weights_df,
            "R_matrix": R,
            "X_centered": X,
            "Sigma": Sigma,
            "w": w,
            "summary": summary
        }

    results = {}
    if fit_per_condition:
        for cond, g in df_uni.groupby("condition"):
            truth_sub = truth_ic.loc[truth_ic.index.get_level_values("condition") == cond]
            res = _fit_core(g, truth_sub, cond_label=cond)
            results[cond] = res
        # Also compute a pooled “report” by concatenating summaries
        pooled_summary = pd.DataFrame([v["summary"] for v in results.values()])
        results["__summary_per_condition__"] = pooled_summary
    else:
        res = _fit_core(df_uni, truth_ic, cond_label="ALL")
        results["ALL"] = res

    return results

# =========================
# Example usage
# =========================
# One pooled fit across all conditions (fit_per_condition=False), so the
# result dict has the single key "ALL".
results = wlc_models(models_df, center_by_model=True, standardize=False,
                      reg_lambda=1e-6, tie_rule=">", fit_per_condition=False)
summary = results["ALL"]["summary"]
display(pd.DataFrame([summary]))                 # fit-level summary as a one-row table
display(results["ALL"]["by_condition"])          # accuracy and D statistics per condition
display(results["ALL"]["weights_df"].head(12))   # first 12 rows; weights_df is sorted by descending weight


Unnamed: 0,condition_fit,trials,n_models,rank_Sigma,lambda,best_threshold,overall_accuracy
0,ALL,3000,12,12,1e-06,-0.030921,0.536667


Unnamed: 0,condition,trials,accuracy,D_mean,D_std
0,100_0,1000,0.541,0.025937,0.180225
1,50_50,1000,0.538,-0.019212,0.156988
2,80_20,1000,0.531,-0.006725,0.172722


Unnamed: 0,model_name,weight,Delta_mu
0,0responses_claude-3-7-sonnet-20250219,0.190728,-0.019333
1,0responses_claude-sonnet-4-20250514,0.137368,-0.018667
2,0responses_gemini-2.5-flash-lite-preview-06-17,0.121923,-0.004667
3,0responses_gemini-2.5-pro,0.089671,-0.020667
4,0responses_gpt-5-2025-08-07,-0.033049,-0.031333
5,0responses_gemini-2.5-flash,-0.036858,-0.035333
6,0responses_gpt-5-mini-2025-08-07,-0.056686,-0.029333
7,0responses_o4-mini-2025-04-16,-0.065138,-0.027333
8,0responses_o3-2025-04-16,-0.124537,-0.036667
9,0responses_claude-3-5-haiku-20241022,-0.131421,-0.042667
