In [10]:
from datetime import datetime, timedelta
import pandas as pd
import json
from glob import glob
import os

# ------------------------
# 1. Helpers
# ------------------------

from pathlib import Path
import pandas as pd

def load_hr_file(path: str) -> pd.DataFrame:
    """
    Load a participantN.csv HR file into a DataFrame with:
    RR, ArtifactCorrectedRR, RawArtifact, ts

    Works even if there are extra header/meta lines before the table and
    even if the file starts with a UTF-8 byte-order mark.

    The header is the first line that, once split on its delimiter, contains
    columns named exactly ``RR`` and ``ts``. Matching whole column names
    (rather than substrings) keeps free-text metadata lines such as
    "Artifacts in RR series: 2" from being mistaken for the header.

    Args:
        path: Path to the CSV file (comma- or semicolon-delimited).

    Returns:
        DataFrame with the header's columns. ``RR`` and
        ``ArtifactCorrectedRR`` are numeric, ``RawArtifact`` is int (missing
        values become 0) and ``ts`` is datetime. Rows whose ``ts`` or ``RR``
        could not be parsed are dropped; the original row index is kept.

    Raises:
        ValueError: If no line has both an ``RR`` and a ``ts`` column.
    """
    # "utf-8-sig" decodes plain UTF-8 too, but strips a leading BOM so it
    # cannot end up glued to the first column name.
    lines = Path(path).read_text(encoding="utf-8-sig").splitlines()

    # 1) Find the header line and its delimiter (comma or semicolon)
    header = None
    header_idx = None
    delim = ","
    for i, line in enumerate(lines):
        line_delim = ";" if ";" in line and "," not in line else ","
        fields = [f.strip() for f in line.split(line_delim)]
        if "RR" in fields and "ts" in fields:
            header, header_idx, delim = fields, i, line_delim
            break

    if header is None:
        raise ValueError(f"Could not find header with 'RR' and 'ts' in {path}")

    # 2) Parse data lines
    data_rows = []
    for line in lines[header_idx + 1 :]:
        if not line.strip():
            continue  # skip empty lines
        parts = [p.strip() for p in line.split(delim)]
        if len(parts) != len(header):
            # skip lines that don't match the header shape (footers, meta, etc.)
            continue
        data_rows.append(parts)

    # 3) Build DataFrame (every value is still a string at this point)
    df = pd.DataFrame(data_rows, columns=header)

    # 4) Convert numeric columns safely; unparseable values become NaN
    numeric_cols = ["RR", "ArtifactCorrectedRR", "RawArtifact"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    if "RawArtifact" in df.columns:
        # a missing artifact flag is treated as "no artifact"
        df["RawArtifact"] = df["RawArtifact"].fillna(0).astype(int)

    # 5) Parse timestamps; unparseable values become NaT
    df["ts"] = pd.to_datetime(df["ts"], errors="coerce")

    # 6) Drop rows without timestamp or RR
    df = df.dropna(subset=["ts", "RR"])

    return df




def parse_exptimestamp(s: str) -> datetime:
    """
    Turn an experiment-log timestamp into a timezone-aware datetime.

    Expected shape: '2025-11-12 14h50.45.373178 +0100', i.e. three
    space-separated parts (date, clock, UTC offset) where the clock is
    written as 'HHhMM.SS.ffffff'.
    """
    day, clock, offset = s.split(" ")
    # 'HHhMM.SS.ffffff' -> 'HH:MM:SS.ffffff': the 'h' and only the FIRST '.'
    # become ':'; the second '.' is the fractional-seconds separator.
    clock = clock.replace("h", ":").replace(".", ":", 1)
    normalized = " ".join((day, clock, offset))
    return datetime.strptime(normalized, "%Y-%m-%d %H:%M:%S.%f %z")


def json_time_to_naive(s: str):
    """
    Parse an experiment timestamp string and strip its timezone.

    Returns None when given None. Otherwise returns the same wall-clock time
    as a naive datetime: the UTC offset is discarded, not applied.
    """
    if s is not None:
        return parse_exptimestamp(s).replace(tzinfo=None)
    return None


# ------------------------
# 2. Load experiment JSON
# ------------------------

experiment_json_path = "experiment_data.json"  # adjust if needed

# The file is read as UTF-8; its top-level value must support len()
# (presumably a list with one entry per participant — confirm against the file).
with Path(experiment_json_path).open(encoding="utf-8") as f:
    participants = json.load(f)

print("Loaded {} participants from JSON".format(len(participants)))


# ------------------------
# 3. Load ALL heart-rate CSV files
# ------------------------

hr_folder = "hr_data"  # <- change this if your HR files are elsewhere
hr_files = glob(os.path.join(hr_folder, "participant*.csv"))

hr_data = {}  # dict: {participant_id: DataFrame}

for path in hr_files:
    # "participant12.csv" -> "12"
    base = os.path.split(path)[1]
    num_str = base.replace("participant", "").replace(".csv", "")
    try:
        pid = int(num_str)
    except ValueError:
        # the glob matched, but what is left of the name is not an integer id
        continue
    else:
        hr_data[pid] = df_hr = load_hr_file(path)
        print(f"Loaded HR data for participant {pid}: {len(df_hr)} rows")

print("HR participants available:", sorted(hr_data))


# ------------------------
# 4. Match HR windows to each video for each participant
# ------------------------

for p in participants:
    # Participant id comes from "participant_id", falling back to "id".
    # NOTE(review): hr_data is keyed by int ids parsed from file names; this
    # lookup assumes the JSON ids are ints as well — confirm.
    pid = p.get("participant_id") or p.get("id")
    if pid is None:
        continue

    if pid not in hr_data:
        # nothing to attach for this participant
        print(f"No HR data for participant {pid}, skipping.")
        continue

    # Work on a copy so the frame stored in hr_data is left untouched.
    df_hr = hr_data[pid].copy()
    df_hr["ts"] = pd.to_datetime(df_hr["ts"])  # ensure datetime

    for stim in p["stimuli"]:
        # Video bounds are converted to naive datetimes (UTC offset dropped),
        # so they are compared against the HR timestamps as wall-clock times.
        start, end = (
            json_time_to_naive(stim.get(key))
            for key in ("video_start", "video_end")
        )

        if start is not None and end is not None:
            # every HR sample with video_start <= ts <= video_end (inclusive)
            in_window = df_hr["ts"].between(start, end)
            seg = df_hr.loc[in_window].reset_index(drop=True)
            # list-of-dicts so the segment can be embedded in the JSON output
            stim["heart_rate"] = seg.to_dict("records")
        else:
            # a missing start or end means there is no window to cut
            stim["heart_rate"] = []

    print(f"Attached HR segments for participant {pid}")


# ------------------------
# 5. Save merged JSON
# ------------------------

output_path = "experiment_with_heart_rate.json"

# default=str stringifies anything json cannot encode natively.
# NOTE(review): allow_nan is left at its default, so any NaN float among the
# values would be written as the non-standard token NaN — confirm that
# downstream readers accept that.
with Path(output_path).open("w", encoding="utf-8") as f:
    json.dump(participants, f, default=str, indent=2)

print("Saved merged data with HR to {}".format(output_path))


Loaded 22 participants from JSON
Loaded HR data for participant 1: 1065 rows
Loaded HR data for participant 10: 966 rows
Loaded HR data for participant 11: 1490 rows
Loaded HR data for participant 12: 1469 rows
Loaded HR data for participant 13: 1570 rows
Loaded HR data for participant 14: 1288 rows
Loaded HR data for participant 15: 1637 rows
Loaded HR data for participant 16: 971 rows
Loaded HR data for participant 17: 971 rows
Loaded HR data for participant 18: 1154 rows
Loaded HR data for participant 19: 1154 rows
Loaded HR data for participant 2: 914 rows
Loaded HR data for participant 20: 971 rows
Loaded HR data for participant 21: 897 rows
Loaded HR data for participant 22: 1066 rows
Loaded HR data for participant 23: 1152 rows
Loaded HR data for participant 3: 1065 rows
Loaded HR data for participant 4: 1268 rows
Loaded HR data for participant 5: 1172 rows
Loaded HR data for participant 6: 1254 rows
Loaded HR data for participant 7: 1176 rows
Loaded HR data for participant 8: 8