In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd() / "notebooks"))  # so we can import _utils from notebooks/

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from _utils import get_data_dir

# Resolve the data directory through the local `_utils` helper imported above.
DATA_DIR = get_data_dir()
DATA_DIR  # bare last expression so the notebook displays the resolved path


WindowsPath('C:/Users/AdamR/OneDrive/UCSB/VIU/HonorsThesis/data')

In [3]:
#list(DATA_DIR.glob("**/*"))

In [6]:
# === Load human + model CSVs for 50_50, 80_20, and 100_0 datasets ===
from pathlib import Path
import pandas as pd

# Root directory that holds one subfolder per dataset.
ROOT = DATA_DIR  # DATA_DIR comes from get_data_dir() in the first cell
# Names of the dataset subfolders (relative to ROOT) that the loading loop visits.
FOLDERS = ["50_50", "80_20", "100_0"]

def safe_read_csv(path: Path, **kwargs) -> pd.DataFrame:
    """Read a CSV with pandas, retrying with the Python engine if parsing fails.

    The first attempt uses ``low_memory=False`` and ``encoding_errors="ignore"``
    as defaults; anything in ``kwargs`` overrides them. If that attempt fails
    for a reason other than an OS-level error, the file is read again with
    ``engine="python"`` and, unless the caller supplied ``sep``, ``sep=None``
    so pandas sniffs the delimiter.

    Args:
        path: Location of the CSV file.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        The parsed DataFrame.

    Raises:
        OSError: If the file is missing or unreadable (no retry is attempted).
        Exception: Whatever the fallback read raises if it fails as well.
    """
    options = dict(low_memory=False, encoding_errors="ignore")
    options.update(kwargs)
    try:
        return pd.read_csv(path, **options)
    except OSError:
        # A missing or unreadable file cannot be fixed by switching engines.
        raise
    except Exception:
        # The Python engine rejects C-parser-only options with a ValueError,
        # so they must be stripped or the fallback can never succeed.
        c_only_options = ("low_memory", "float_precision")
        fallback = {k: v for k, v in options.items() if k not in c_only_options}
        fallback["engine"] = "python"
        # Respect an explicit separator from the caller; otherwise sniff it.
        fallback.setdefault("sep", None)
        return pd.read_csv(path, **fallback)

def load_dataset(folder: Path) -> dict:
    """Load ``human_data.csv`` and every model decision CSV inside ``decisions/``.

    Args:
        folder: Dataset directory expected to contain ``human_data.csv`` and a
            ``decisions/`` subfolder of per-model CSV files.

    Returns:
        A dict with four keys:
            "human":       DataFrame read from ``human_data.csv``.
            "human_path":  Path to ``human_data.csv``.
            "models":      {file stem: DataFrame} for each ``decisions/*.csv``,
                           in sorted path order.
            "model_paths": {file stem: Path} for the same files.

    Raises:
        FileNotFoundError: If ``human_data.csv`` or the ``decisions/`` subfolder
            is missing.
    """
    human_path = folder / "human_data.csv"
    decisions_dir = folder / "decisions"

    # is_file / is_dir also reject a path that exists but is the wrong kind.
    if not human_path.is_file():
        raise FileNotFoundError(f"Missing human_data.csv in {folder}")
    if not decisions_dir.is_dir():
        raise FileNotFoundError(f"Missing 'decisions/' subfolder in {folder}")

    # Load human data
    human_df = safe_read_csv(human_path)

    # List the directory once so "models" and "model_paths" are guaranteed to
    # describe the same files (no second glob, no positional zip).
    model_paths = {csv_path.stem: csv_path for csv_path in sorted(decisions_dir.glob("*.csv"))}
    models = {model_name: safe_read_csv(csv_path) for model_name, csv_path in model_paths.items()}

    return {
        "human": human_df,
        "human_path": human_path,
        "models": models,
        "model_paths": model_paths,
    }

# === Main loading loop ===
# datasets: folder name -> bundle returned by load_dataset
# records:  one summary row per loaded CSV (human file + each model file)
datasets: dict[str, dict] = {}
records = []

for name in FOLDERS:
    folder = ROOT / name
    if not folder.exists():
        print(f"⚠️ Warning: Folder '{name}' not found under {ROOT}")
        continue

    data_bundle = load_dataset(folder)
    datasets[name] = data_bundle

    # Record human file
    h = data_bundle["human"]
    records.append({
        "dataset": name,
        "kind": "human",
        "name": "human_data",
        "n_rows": len(h),
        "n_cols": h.shape[1],
        "path": str(data_bundle["human_path"].resolve())
    })

    # Record model files; take the path from the bundle rather than rebuilding
    # it by hand, so the table always points at the file that was actually read.
    for mname, mdf in data_bundle["models"].items():
        records.append({
            "dataset": name,
            "kind": "model",
            "name": mname,
            "n_rows": len(mdf),
            "n_cols": mdf.shape[1],
            "path": str(data_bundle["model_paths"][mname].resolve())
        })

# === Summary table ===
# Explicit columns keep the sort keys present even when nothing was loaded;
# without them sort_values raises KeyError on an empty frame.
assignment_index = pd.DataFrame.from_records(
    records,
    columns=["dataset", "kind", "name", "n_rows", "n_cols", "path"],
).sort_values(
    ["dataset", "kind", "name"]
).reset_index(drop=True)

if datasets:
    print(f"✅ Loaded datasets: {list(datasets.keys())}")
else:
    print(f"⚠️ Warning: No datasets were loaded from {ROOT}")


✅ Loaded datasets: ['50_50', '80_20', '100_0']


In [8]:
#column titles

def get_csv_columns(csv_path):
    """
    List the column names of a CSV file or of an in-memory DataFrame.

    csv_path may be a pandas DataFrame (its columns are returned directly)
    or a string/Path pointing at a CSV file, in which case only the header
    row is parsed. Raises FileNotFoundError when the file does not exist.
    """
    if not isinstance(csv_path, pd.DataFrame):
        source = Path(csv_path)
        if source.exists():
            # nrows=0 parses the header only, so no data rows are loaded
            header_only = pd.read_csv(source, nrows=0, encoding_errors="ignore")
            return [*header_only.columns]
        raise FileNotFoundError(f"File not found: {source}")

    # A DataFrame already carries its header; no file access is needed.
    return [*csv_path.columns]


In [12]:
# Take the human-response table of the "50_50" dataset from the loaded bundles.
human50_50 = datasets["50_50"]["human"]
# A DataFrame is passed, so the columns are taken from it without re-reading the file.
human50_50_columns = get_csv_columns(human50_50)

# Preview up to the first 30 rows, then print the column names as a plain list.
display(human50_50.head(30))
print(human50_50_columns)

Unnamed: 0,stimID,condition,response,side_selected,cue_points,line1_angle,line2_angle,valid_cue,TP,participantID
0,100,condition_2,6,1,2,14.314827,1.921956,False,True,SA
1,845,condition_2,5,1,2,15.054317,4.22223,False,True,SA
2,245,condition_2,4,1,1,14.314827,6.508956,True,True,SA
3,72,condition_2,4,2,2,8.775056,15.054317,True,True,SA
4,469,condition_2,4,2,2,4.22223,19.885165,True,True,SA
5,468,condition_2,4,2,2,6.508956,17.468023,True,True,SA
6,923,condition_2,4,1,1,13.392498,4.22223,True,True,SA
7,646,condition_2,5,2,1,10.407711,17.468023,False,True,SA
8,672,condition_2,6,1,2,19.885165,8.775056,False,True,SA
9,275,condition_2,6,1,1,15.054317,-1.15345,True,True,SA


['stimID', 'condition', 'response', 'side_selected', 'cue_points', 'line1_angle', 'line2_angle', 'valid_cue', 'TP', 'participantID']
