## Functions

In [63]:
from datetime import datetime, timedelta, timezone
import pandas as pd
import os

def parse_expstart(exp_start_str: str) -> datetime:
    """
    Turn an expStart string into a timezone-aware datetime.

    Example:
        '2025-11-12 14h45.49.380213 +0100'
        -> datetime(2025-11-12 14:45:49.380213+01:00)

    Raises ValueError when the string does not consist of exactly three
    space-separated fields or does not match the expected layout.
    """
    day, clock, utc_offset = exp_start_str.split(" ")

    # '14h45.49.380213' -> '14:45:49.380213'
    # 'h' becomes ':' and only the FIRST '.' becomes ':'; the second '.'
    # separates seconds from microseconds and has to survive.
    normalized_clock = clock.replace("h", ":").replace(".", ":", 1)

    return datetime.strptime(
        f"{day} {normalized_clock} {utc_offset}",
        "%Y-%m-%d %H:%M:%S.%f %z",
    )


def offset_to_str(offset_seconds, base_dt):
    """
    Format ``base_dt + offset_seconds`` using the expStart string layout.

    Returns None when there is no base datetime or when the offset is
    missing (None / NaN as judged by ``pd.isna``).
    """
    # The base check comes first so pd.isna is never consulted without a base.
    nothing_to_format = base_dt is None or pd.isna(offset_seconds)
    if nothing_to_format:
        return None

    shifted = base_dt + timedelta(seconds=float(offset_seconds))
    return shifted.strftime("%Y-%m-%d %Hh%M.%S.%f %z")


In [64]:
# Columns holding the three SAM ratings of a trial, in output order.
_SAM_RATING_COLUMNS = ("rating_sam_1.keys", "rating_sam_2.keys", "rating_sam_3.keys")


def _first_valid(df, column):
    """Return the first non-NA value of ``df[column]``, or None if the column
    is absent or contains only missing values."""
    if column not in df.columns:
        return None
    values = df[column].dropna()
    return None if values.empty else values.iloc[0]


def _none_if_missing(value):
    """Map a missing cell (NaN/None as judged by ``pd.isna``) to None."""
    return None if pd.isna(value) else value


def _build_stimulus(row, exp_start_dt):
    """Build the per-video record for one trial row."""
    # SAM ratings: None for a rating whose column is absent or whose cell is empty.
    sam_values = []
    for col in _SAM_RATING_COLUMNS:
        raw = row.get(col)
        sam_values.append(None if pd.isna(raw) else float(raw))
    trusted = all(v is not None for v in sam_values)

    # SAM timestamps (absolute times, based on r_SAM_* offsets)
    timestamps = {}
    for i in (1, 2, 3):
        timestamps[f"sam{i}_start"] = offset_to_str(row.get(f"r_SAM_{i}.started"), exp_start_dt)
        timestamps[f"sam{i}_end"] = offset_to_str(row.get(f"r_SAM_{i}.stopped"), exp_start_dt)

    return {
        "id": int(row["video_index"]),
        # Empty cells arrive as float NaN; emit None so the record matches the
        # other missing-value fields and does not serialize as the
        # non-standard JSON token NaN.
        "video_id": _none_if_missing(row["video_id"]),
        # video start/end - ONLY using video.started / video.stopped
        "video_start": offset_to_str(row.get("video.started"), exp_start_dt),
        "video_end": offset_to_str(row.get("video.stopped"), exp_start_dt),
        "video_response": {
            "belief": _none_if_missing(row.get("belief")),
            "sam": sam_values,
            "timestamps": timestamps,
            "trusted": trusted,
        },
    }


def parse_participant_csv(path: str) -> dict:
    """
    Read one participant CSV and condense it into a JSON-friendly dict.

    Returned keys:
        participant_id  -- int from the "participant" column, falling back to
                           the filename prefix before the first "_".
        focus           -- {"mean": float | None, "all_ratings": list} built
                           from "stimuli.focus_slider.response".
        opinion         -- first non-empty "opinion_slider.response" as float,
                           else None.
        timestamp_start -- raw "expStart" string, else None.
        timestamp_end   -- "expEndTime" (read as a POSIX epoch) rendered in the
                           expStart layout and timezone; None when either
                           "expEndTime" or "expStart" is unavailable.
        stimuli         -- one record per row with a non-empty "video_index".

    Raises:
        ValueError: if neither the "participant" column nor the filename
            prefix yields an integer.
        KeyError: if trial rows exist but the "video_id" column does not.
    """
    df = pd.read_csv(path)

    # --- participant id ---
    try:
        participant_id = int(df["participant"].dropna().iloc[0])
    except (KeyError, IndexError, ValueError, TypeError, OverflowError):
        # Column missing, empty, or not convertible: fall back to filename prefix.
        participant_id = int(os.path.basename(path).split("_", 1)[0])

    # --- focus (all trials) ---
    focus_ratings = []
    if "stimuli.focus_slider.response" in df.columns:
        focus_ratings = df["stimuli.focus_slider.response"].dropna().tolist()
    focus_mean = float(sum(focus_ratings) / len(focus_ratings)) if focus_ratings else None

    # --- opinion (final slider) ---
    raw_opinion = _first_valid(df, "opinion_slider.response")
    opinion = None if raw_opinion is None else float(raw_opinion)

    # --- experiment start/end timestamps ---
    exp_start = _first_valid(df, "expStart")
    exp_start_dt = None if exp_start is None else parse_expstart(exp_start)

    raw_end = _first_valid(df, "expEndTime")
    if raw_end is not None and exp_start_dt is not None:
        # Express the end time in the same timezone as the start time.
        exp_end_dt = datetime.fromtimestamp(float(raw_end), tz=timezone.utc).astimezone(
            exp_start_dt.tzinfo
        )
        exp_end = exp_end_dt.strftime("%Y-%m-%d %Hh%M.%S.%f %z")
    else:
        exp_end = None

    # --- trial rows (one per video) ---
    if "video_index" in df.columns:
        trial_rows = df[df["video_index"].notna()]
    else:
        trial_rows = pd.DataFrame()

    stimuli = [_build_stimulus(row, exp_start_dt) for _, row in trial_rows.iterrows()]

    return {
        "participant_id": participant_id,
        "focus": {"mean": focus_mean, "all_ratings": focus_ratings},
        "opinion": opinion,
        "timestamp_start": exp_start,
        "timestamp_end": exp_end,
        "stimuli": stimuli,
    }


## Looping over all CSV files

In [65]:
from glob import glob
from pathlib import Path

# The CSV exports live in a "trials" folder that is a sibling of the
# notebook's working directory.
notebook_dir = Path.cwd()
root = notebook_dir.parent
data_folder = root / "trials"
pattern = str(data_folder / "*_manipulation-of-belief_*.csv")

# Parse every matching file and order the records by participant id.
all_participants = sorted(
    map(parse_participant_csv, glob(pattern)),
    key=lambda record: record["participant_id"],
)



In [66]:
import json

# Persist the parsed participants as pretty-printed JSON in the working directory.
with open("experiment_data.json", mode="w", encoding="utf-8") as out_file:
    json.dump(obj=all_participants, fp=out_file, indent=2)
