# Extracting PsychoPy data

## Helper

In [1]:
from datetime import datetime, timedelta, timezone
import pandas as pd
import os

def parse_expstart(exp_start_str: str) -> datetime:
    """
    Parse a PsychoPy ``expStart`` string into a timezone-aware datetime.

    Example:
        '2025-11-12 14h45.49.380213 +0100'
        -> datetime(2025-11-12 14:45:49.380213+01:00)

    Raises ValueError when the input is not exactly three space-separated
    fields, or when the rewritten text does not match the expected format.
    """
    day, clock, utc_offset = exp_start_str.split(" ")

    # The clock field looks like 14h45.49.380213: turn every 'h' and only
    # the first '.' into ':' so it reads 14:45:49.380213.
    clock = clock.replace("h", ":").replace(".", ":", 1)

    return datetime.strptime(
        " ".join((day, clock, utc_offset)),
        "%Y-%m-%d %H:%M:%S.%f %z",
    )


def offset_to_str(offset_seconds, base_dt):
    """Format ``base_dt + offset_seconds`` in the same string layout as expStart.

    Returns None when ``base_dt`` is None or when the offset is missing
    (as judged by ``pd.isna``).
    """
    usable = base_dt is not None and not pd.isna(offset_seconds)
    if not usable:
        return None
    shifted = base_dt + timedelta(seconds=float(offset_seconds))
    return shifted.strftime("%Y-%m-%d %Hh%M.%S.%f %z")


def parse_participant_csv(path: str) -> dict:
    """
    Parse one PsychoPy results CSV into a nested participant record.

    Args:
        path: Path of the CSV file. When the ``participant`` column cannot
            supply an integer id, the file-name prefix before the first
            ``_`` is used instead.

    Returns:
        A dict with the keys ``participant_id``, ``focus`` (``mean`` and
        ``all_ratings``), ``opinion``, ``timestamp_start``,
        ``timestamp_end`` and ``stimuli`` (one entry per row that has a
        non-null ``video_index``). Values whose source column is absent or
        empty are None (or an empty list for the list-valued fields).

    Raises:
        ValueError: If neither the ``participant`` column nor the file-name
            prefix yields an integer, or if ``expStart`` is malformed.
    """
    df = pd.read_csv(path)

    # --- participant id ---
    try:
        participant_id = int(df["participant"].dropna().iloc[0])
    except (KeyError, IndexError, TypeError, ValueError, OverflowError):
        # column missing, empty, or not an integer -> fall back to filename prefix
        participant_id = int(os.path.basename(path).split("_", 1)[0])

    # --- focus (all trials) ---
    focus_ratings = []
    if "stimuli.focus_slider.response" in df.columns:
        focus_ratings = df["stimuli.focus_slider.response"].dropna().tolist()
    focus_mean = float(sum(focus_ratings) / len(focus_ratings)) if focus_ratings else None

    # --- opinion (final slider): first non-null value, if any ---
    opinion = None
    if "opinion_slider.response" in df.columns:
        opinion_values = df["opinion_slider.response"].dropna()
        if not opinion_values.empty:
            opinion = float(opinion_values.iloc[0])

    # --- experiment start/end timestamps ---
    exp_start_dt = None
    exp_start = None
    if "expStart" in df.columns:
        start_values = df["expStart"].dropna()
        if not start_values.empty:
            exp_start = start_values.iloc[0]
            exp_start_dt = parse_expstart(exp_start)

    # expEndTime is read as epoch seconds and rendered in the same UTC
    # offset as expStart, so an end is only produced when a start exists.
    exp_end = None
    if exp_start_dt is not None and "expEndTime" in df.columns:
        end_values = df["expEndTime"].dropna()
        if not end_values.empty:
            exp_end_dt = datetime.fromtimestamp(
                float(end_values.iloc[0]), tz=timezone.utc
            ).astimezone(exp_start_dt.tzinfo)
            exp_end = exp_end_dt.strftime("%Y-%m-%d %Hh%M.%S.%f %z")

    # --- trial rows (one per video) ---
    if "video_index" in df.columns:
        trial_rows = df[df["video_index"].notna()]
    else:
        trial_rows = pd.DataFrame()

    # Loop-invariant: the three SAM rating columns, in output order.
    sam_cols = ("rating_sam_1.keys", "rating_sam_2.keys", "rating_sam_3.keys")

    stimuli = []
    for _, row in trial_rows.iterrows():
        # SAM ratings; None marks a rating whose column or value is missing
        sam_values = [
            float(row[col]) if col in df.columns and not pd.isna(row[col]) else None
            for col in sam_cols
        ]

        # SAM timestamps (absolute times, based on r_SAM_* offsets)
        timestamps = {}
        for n in (1, 2, 3):
            timestamps[f"sam{n}_start"] = offset_to_str(
                row.get(f"r_SAM_{n}.started"), exp_start_dt
            )
            timestamps[f"sam{n}_end"] = offset_to_str(
                row.get(f"r_SAM_{n}.stopped"), exp_start_dt
            )

        # video start/end: start is read from 'video.started', end from
        # 'r_video.stopped'.
        # NOTE(review): the two offsets come from differently named columns;
        # confirm that this mix is intentional.
        video_start = offset_to_str(row.get("video.started"), exp_start_dt)
        video_end = offset_to_str(row.get("r_video.stopped"), exp_start_dt)

        stimuli.append(
            {
                "id": int(row["video_index"]),
                "video_id": row["video_id"],
                "video_start": video_start,
                "video_end": video_end,
                "video_response": {
                    "belief": row.get("belief"),
                    "sam": sam_values,
                    "timestamps": timestamps,
                },
            }
        )

    return {
        "participant_id": participant_id,
        "focus": {"mean": focus_mean, "all_ratings": focus_ratings},
        "opinion": opinion,
        "timestamp_start": exp_start,
        "timestamp_end": exp_end,
        "stimuli": stimuli,
    }


## Looping over all CSV files

In [2]:
from glob import glob
from pathlib import Path
import json

# Every PsychoPy export in ../trials_psychopy whose name matches the pattern.
pattern = str(Path("..") / "trials_psychopy" / "*_manipulation-of-belief_*.csv")

# Parse each file and order the records by participant id (glob order is
# not relied upon).
all_participants = sorted(
    (parse_participant_csv(csv_path) for csv_path in glob(pattern)),
    key=lambda p: p["participant_id"],
)

## Check for nulls

In [3]:
import math

def is_missing(v):
    """Return True when ``v`` counts as a missing value.

    Missing means: None, a float NaN, or a string that is empty or spells
    "nan", "none" or "null" once stripped and lower-cased.
    """
    if v is None:
        return True
    if isinstance(v, float):
        return math.isnan(v)
    if isinstance(v, str):
        return v.strip().lower() in ("", "nan", "none", "null")
    return False

def find_missing(obj, path="root"):
    """Walk nested dicts/lists and collect ``(path, value)`` for missing leaves.

    Dict keys extend the path as ``.key`` and list positions as ``[i]``;
    anything that is neither a dict nor a list is a leaf checked with
    ``is_missing``.
    """
    if isinstance(obj, dict):
        children = ((f"{path}.{k}", v) for k, v in obj.items())
    elif isinstance(obj, list):
        children = ((f"{path}[{i}]", v) for i, v in enumerate(obj))
    else:
        return [(path, obj)] if is_missing(obj) else []

    found = []
    for child_path, child in children:
        found.extend(find_missing(child, child_path))
    return found

# Scan the whole participants structure and report missing leaves
# (at most the first 200 are printed).
missing = find_missing(all_participants, "root")
if missing:
    print(f"Found {len(missing)} missing values. Showing up to 200 entries:\n")
    for path, val in missing[:200]:
        print(f"{path} -> {repr(val)}")
else:
    print("No missing values found.")

No missing values found.


## Enrich with HR data

### Imports + robust HR loader

In [4]:
from pathlib import Path
from datetime import datetime
import json
import os

import pandas as pd

def load_hr_file(path: str) -> pd.DataFrame:
    """
    Load a participantN.csv HR file into a tidy DataFrame.

    The header is the first line whose delimiter-separated fields include
    the exact column names ``ts`` and ``ArtifactCorrectedRR``; any lines
    before it are ignored, so the header need not be on the first line.
    The delimiter is ``;`` when the header contains ``;`` but no ``,``,
    otherwise ``,``.

    Args:
        path: Path of the HR file (UTF-8, with or without a BOM).

    Returns:
        A DataFrame with the header's columns, where ``ts`` is parsed to
        datetime, ``RR`` / ``ArtifactCorrectedRR`` / ``RawArtifact`` are
        numeric when present (``RawArtifact`` as int, blanks as 0), and
        rows without a valid ``ts`` or ``ArtifactCorrectedRR`` are dropped.
        Blank lines and lines whose field count differs from the header
        are skipped.

    Raises:
        ValueError: If no line provides the required header columns.
    """
    required = ("ts", "ArtifactCorrectedRR")

    # utf-8-sig reads plain UTF-8 too and strips a leading BOM, which would
    # otherwise end up glued to the first column name.
    lines = Path(path).read_text(encoding="utf-8-sig").splitlines()

    # Find the header by exact column names. A plain substring test would
    # also match preamble text (e.g. "ts" inside "Artifacts").
    header_idx = None
    header = []
    delim = ","
    for i, line in enumerate(lines):
        candidate_delim = ";" if ";" in line and "," not in line else ","
        candidate = [h.strip() for h in line.split(candidate_delim)]
        if all(col in candidate for col in required):
            header_idx, header, delim = i, candidate, candidate_delim
            break
    if header_idx is None:
        raise ValueError(
            f"Could not find header with 'ts' and 'ArtifactCorrectedRR' columns in {path}"
        )

    data_rows = []
    for line in lines[header_idx + 1:]:
        if not line.strip():
            continue
        parts = [p.strip() for p in line.split(delim)]
        if len(parts) != len(header):
            # best effort: ignore rows that do not line up with the header
            continue
        data_rows.append(parts)

    df = pd.DataFrame(data_rows, columns=header)

    # numeric columns (unparseable cells become NaN)
    for col in ["RR", "ArtifactCorrectedRR", "RawArtifact"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    if "RawArtifact" in df.columns:
        df["RawArtifact"] = df["RawArtifact"].fillna(0).astype(int)

    # timestamp column (unparseable cells become NaT)
    df["ts"] = pd.to_datetime(df["ts"], errors="coerce")

    # keep only valid rows
    df = df.dropna(subset=["ts", "ArtifactCorrectedRR"])

    return df


### Helpers to parse PsychoPy timestamps

In [5]:
from datetime import datetime

def parse_exptimestamp(s: str) -> datetime:
    """
    Turn a PsychoPy timestamp string such as
      '2025-11-12 14h50.45.373178 +0100'
    into a timezone-aware datetime.

    Raises ValueError when the input is not exactly three space-separated
    fields, or when the rewritten text does not match the expected format.
    """
    fields = s.split(" ")
    date, time_part, tz = fields

    # 14h50.45.373178 -> 14:50:45.373178 (every 'h', but only the first '.')
    normalized = time_part.replace("h", ":").replace(".", ":", 1)

    return datetime.strptime(
        f"{date} {normalized} {tz}",
        "%Y-%m-%d %H:%M:%S.%f %z",
    )

def json_time_to_naive(s: str):
    """Parse a PsychoPy timestamp string and drop its timezone info.

    The wall-clock fields are kept as written (no conversion to another
    zone); ``None`` is passed through as ``None``.
    """
    return None if s is None else parse_exptimestamp(s).replace(tzinfo=None)


### Load HR data for all participants

In [6]:
from glob import glob

# participant id -> HR DataFrame returned by load_hr_file
hr_data = {}

for path in glob("../trials_hr/participant*.csv"):
    # 'participant1.csv' -> '1'
    base = os.path.basename(path)
    num_str = base.replace("participant", "").replace(".csv", "")

    try:
        pid = int(num_str)
    except ValueError:
        # what is left of the file name is not an integer id; skip the file
        continue
    else:
        hr_data[pid] = df_hr = load_hr_file(path)
        print(f"Loaded HR data for participant {pid}: {len(df_hr)} rows")

print("HR participants available:", sorted(hr_data))


Loaded HR data for participant 1: 1065 rows
Loaded HR data for participant 10: 966 rows
Loaded HR data for participant 11: 1490 rows
Loaded HR data for participant 12: 1469 rows
Loaded HR data for participant 13: 1570 rows
Loaded HR data for participant 14: 1288 rows
Loaded HR data for participant 15: 1642 rows
Loaded HR data for participant 16: 971 rows
Loaded HR data for participant 17: 971 rows
Loaded HR data for participant 18: 1154 rows
Loaded HR data for participant 19: 1154 rows
Loaded HR data for participant 2: 914 rows
Loaded HR data for participant 20: 971 rows
Loaded HR data for participant 21: 897 rows
Loaded HR data for participant 22: 1066 rows
Loaded HR data for participant 23: 1152 rows
Loaded HR data for participant 3: 1065 rows
Loaded HR data for participant 4: 1268 rows
Loaded HR data for participant 5: 1172 rows
Loaded HR data for participant 6: 1254 rows
Loaded HR data for participant 7: 1176 rows
Loaded HR data for participant 8: 817 rows
HR participants available

### Match HR rows to each video & save new JSON

In [7]:
import copy

# Work on a deep copy of the participants structure, so adding HR rows
# below leaves all_participants untouched.
participants_with_hr = copy.deepcopy(all_participants)

# Columns exported for every HR sample inside a video window.
hr_columns = ["ts", "RR", "ArtifactCorrectedRR", "RawArtifact"]

for p in participants_with_hr:
    pid = p["participant_id"]
    df_hr = hr_data.get(pid)

    if df_hr is None:
        print(f"No HR data for participant {pid}, skipping.")
        # keep the output schema uniform: every stimulus has "heart_rate"
        for stim in p["stimuli"]:
            stim["heart_rate"] = []
        continue

    # Ensure timestamps are clean and sorted. assign() returns a new frame,
    # so the DataFrame cached in hr_data is not modified.
    df_hr = (
        df_hr.assign(ts=pd.to_datetime(df_hr["ts"], errors="coerce"))
        .dropna(subset=["ts"])
        .sort_values("ts")
    )

    for stim in p["stimuli"]:
        # The video window is compared as naive wall-clock time.
        # NOTE(review): this assumes the HR 'ts' values are naive and in the
        # same local time as the PsychoPy timestamps — confirm.
        v_start = json_time_to_naive(stim.get("video_start"))
        v_end = json_time_to_naive(stim.get("video_end"))

        if v_start is None or v_end is None:
            stim["heart_rate"] = []
            continue

        # inclusive window [v_start, v_end]
        mask = (df_hr["ts"] >= v_start) & (df_hr["ts"] <= v_end)
        seg = df_hr.loc[mask, hr_columns].copy()

        # store timestamp as string so JSON can serialize it
        seg["ts"] = seg["ts"].dt.strftime("%Y-%m-%d %H:%M:%S.%f")

        stim["heart_rate"] = seg.to_dict(orient="records")

        print(
            f"Participant {pid}, stim {stim['id']} "
            f"({stim['video_id']}): {len(seg)} HR rows"
        )

# finally, save the merged structures
output_path = "experiment_data_with_hr.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(participants_with_hr, f, ensure_ascii=False, indent=2)

print(f"\nSaved merged data with HR to {output_path}")


Participant 1, stim 1 (4171487-uhd_3840_2160_30fps.mp4): 139 HR rows
Participant 1, stim 2 (5768645-uhd_3840_2160_25fps.mp4): 149 HR rows
Participant 1, stim 3 (18840567-hd_1920_1080_30fps.mp4): 129 HR rows
Participant 1, stim 4 (11946387_3840_2160_30fps.mp4): 150 HR rows
Participant 2, stim 1 (4171487-uhd_3840_2160_30fps.mp4): 140 HR rows
Participant 2, stim 2 (11946387_3840_2160_30fps.mp4): 148 HR rows
Participant 2, stim 3 (5768645-uhd_3840_2160_25fps.mp4): 154 HR rows
Participant 2, stim 4 (18840567-hd_1920_1080_30fps.mp4): 114 HR rows
Participant 3, stim 1 (4171487-uhd_3840_2160_30fps.mp4): 138 HR rows
Participant 3, stim 2 (11946387_3840_2160_30fps.mp4): 149 HR rows
Participant 3, stim 3 (18840567-hd_1920_1080_30fps.mp4): 130 HR rows
Participant 3, stim 4 (5768645-uhd_3840_2160_25fps.mp4): 150 HR rows
Participant 4, stim 1 (4171487-uhd_3840_2160_30fps.mp4): 140 HR rows
Participant 4, stim 2 (18840567-hd_1920_1080_30fps.mp4): 132 HR rows
Participant 4, stim 3 (5768645-uhd_3840_216